In [74]:
import requests
import pandas as pd
import io
import os
import numpy as np
from tabulate import tabulate
from dotenv import load_dotenv
from IPython.display import HTML
from yahoo_fin.stock_info import get_data

cwd = os.getcwd()
path = os.path.dirname(cwd)
folder_path = os.path.join(path, 'stock_data')


In [75]:
def to_closest_friday(date):
    day_of_week = date.dayofweek
    if day_of_week == 4: 
        return date
    elif day_of_week < 4: 
        return date + pd.Timedelta(days=(4 - day_of_week))
    else: 
        return date - pd.Timedelta(days=(day_of_week - 4))


def get_ticker_list():
    folder_path = os.path.join(path, 'preprocessing_data')
    txt_files = [file for file in os.listdir(folder_path) if file.endswith('.txt')]
    file_contents = {}
    for file in txt_files:
        with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
            file_contents[file] = f.read()
    final_df = []
    for filename, content in file_contents.items():
        final_df.append(pd.read_html(content)[0])
    return final_df

def calculate_wma(data):
    weights = np.arange(1, 31)[::-1]
    return np.dot(data, weights) / weights.sum()

def get_stock_df(ticker):
    df = get_data(ticker,index_as_date = True, interval="1wk")
    df.index = df.index.map(to_closest_friday)
    return df

def calculate_future_average(row, df):
    three_months_later = row.name + pd.DateOffset(weeks=12)
    six_months_later = row.name + pd.DateOffset(weeks=24)
    future_df = df[(df.index > three_months_later) & (df.index <= six_months_later)]
    if df.index[-1] > six_months_later and not future_df.empty:
        return future_df['close'].mean()
    else:
        return None
    
def label_stocks(row):
    if row['three_to_six'] >= row['close'] * 1.1:
        return 2
    elif row['three_to_six'] <= row['close'] * 0.9:
        return 4
    else:
        return 1


In [76]:
dfs = get_ticker_list()
tickers = []
for df in dfs:
    tickers += list(df['Symbol'])
len(tickers)
# print(tickers)
ticker_list = [item for item in tickers if not isinstance(item, float)]
ticker_list.sort()

  final_df.append(pd.read_html(content)[0])
  final_df.append(pd.read_html(content)[0])
  final_df.append(pd.read_html(content)[0])
  final_df.append(pd.read_html(content)[0])
  final_df.append(pd.read_html(content)[0])
  final_df.append(pd.read_html(content)[0])


In [77]:
####　EXAMPLE 

if not os.path.exists(folder_path):
    os.makedirs(folder_path)

for stock in ticker_list:
    try:
        df = get_stock_df(stock)
        df = df.drop(["adjclose","ticker"], axis = 1)
        df['wma30'] = df['close'].rolling(window=30).apply(calculate_wma, raw=True)
        df['fyh'] = df['high'].rolling(window=5*52, min_periods=1).max()
        df = df.dropna()
        df['three_to_six'] = df.apply(calculate_future_average, df=df, axis=1)
        df = df.dropna()
        df['stage'] = df.apply(label_stocks, axis=1)
        for i in range(1, len(df)):
            if (df.iloc[i-1]['stage'] == 2 or df.iloc[i-1]['stage'] == 3) and df.iloc[i]['stage'] != 4:
                df.iloc[i,8] = 3
        df.to_pickle(folder_path + '/' + stock + '.pkl')
    except:
        print(stock + " not found")

AGM.Anot found
AKO.Anot found
AKO.Bnot found
BF.Anot found
BF.Bnot found
BH.Anot found
BIO.Bnot found
BNRE.Anot found
BRK.Anot found
BRK.Bnot found
CRD.Anot found
CRD.Bnot found
CWEN.Anot found
DYCQnot found
GEF.Bnot found
GTN.Anot found
HEI.Anot found
HVT.Anot found
JVSAnot found
LEGTnot found
LEN.Bnot found
LGF.Anot found
LGF.Bnot found
MKC.Vnot found
MOG.Anot found
MOG.Bnot found
PBR.Anot found
TAP.Anot found
UHAL.Bnot found
WSO.Bnot found


In [71]:
df = pd.read_pickle(folder_path + '/A.pkl')

In [72]:
display(df)

Unnamed: 0,open,high,low,close,volume,wma30,5yh
2000-06-09,57.939915,59.907009,46.852646,50.697426,53087652,52.079103,115.879829
2000-06-16,51.949211,51.949211,44.349072,44.796139,32289606,53.960631,115.879829
2000-06-23,44.438484,57.179901,44.438484,53.737480,38210555,55.856869,115.879829
2000-06-30,53.648067,57.358727,52.686874,52.753933,34770637,57.645234,115.879829
2000-07-07,52.932762,53.066879,47.746780,48.283260,18789259,59.457058,115.879829
...,...,...,...,...,...,...,...
2024-02-02,131.750000,134.619995,129.259995,132.970001,5865600,119.637893,179.570007
2024-02-09,132.740005,136.910004,131.070007,133.339996,8161300,119.809291,179.570007
2024-02-16,133.279999,136.270004,128.259995,134.839996,6249400,119.488474,179.570007
2024-02-23,133.669998,135.449997,129.009995,132.029999,7241000,119.281807,179.570007
