In [1]:
import requests
import pandas as pd
import io
import os
import numpy as np
from tabulate import tabulate
from dotenv import load_dotenv
from IPython.display import HTML
from yahoo_fin.stock_info import get_data

cwd = os.getcwd()
path = os.path.dirname(cwd)
folder_path = os.path.join(path, 'stock_data')


In [2]:

### Get stock list
def get_ticker_list():
    folder_path = os.path.join(path, 'preprocessing_data')
    txt_files = [file for file in os.listdir(folder_path) if file.endswith('.txt')]
    file_contents = {}
    for file in txt_files:
        with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
            file_contents[file] = f.read()
    final_df = []
    for filename, content in file_contents.items():
        final_df.append(pd.read_html(content)[0])
    return final_df


### Stock Processing
def to_closest_friday(date):
    day_of_week = date.dayofweek
    if day_of_week == 4: 
        return date
    elif day_of_week < 4: 
        return date + pd.Timedelta(days=(4 - day_of_week))
    else: 
        return date - pd.Timedelta(days=(day_of_week - 4))

def calculate_wma(data):
    weights = np.arange(1, 31)[::-1]
    return np.dot(data, weights) / weights.sum()

def get_stock_df(ticker):
    df = get_data(ticker,index_as_date = True, interval="1wk")
    df.index = df.index.map(to_closest_friday)
    return df


### Stage labelling

def calculate_wma_10(data):
    weights = np.arange(1, 11)[::-1]
    return np.dot(data, weights) / weights.sum()

def calculate_future_average(row, df):
    one_month_later = row.name + pd.DateOffset(weeks=4)
    four_months_later = row.name + pd.DateOffset(weeks=16)
    # future_df = df[(df.index > one_months_later) & (df.index <= four_months_later)]
    if df.index[-1] > four_months_later:
        return (df.loc[four_months_later]['wma10'] + df.loc[four_months_later]['wma30'])/ (df.loc[one_month_later]['wma10'] + df.loc[one_month_later]['wma30']) 
    else:
        return None
    
def label_stocks(row):
    if row['future_wma_slope'] >= 1.10:
        return 2
    elif row['future_wma_slope'] <= 0.95:
        return 4
    else:
        return 1




In [3]:
dfs = get_ticker_list()
tickers = []
for df in dfs:
    tickers += list(df['Symbol'])
len(tickers)
# print(tickers)
ticker_list = [item for item in tickers if not isinstance(item, float)]
ticker_list.sort()

  final_df.append(pd.read_html(content)[0])
  final_df.append(pd.read_html(content)[0])
  final_df.append(pd.read_html(content)[0])
  final_df.append(pd.read_html(content)[0])
  final_df.append(pd.read_html(content)[0])
  final_df.append(pd.read_html(content)[0])


In [4]:
####　EXAMPLE 

if not os.path.exists(folder_path):
    os.makedirs(folder_path)

missing_list = list()

for stock in ticker_list:
    try:
        df = get_stock_df(stock)
        df = df.drop(["adjclose","ticker"], axis = 1)
        df['wma30'] = df['close'].rolling(window=30).apply(calculate_wma, raw=True)
        df['fyh'] = df['high'].rolling(window=5*52, min_periods=1).max()
        df = df.dropna()
        df['wma10'] = df['close'].rolling(window=10).apply(calculate_wma_10, raw=True)
        df['future_wma_slope'] = df.apply(calculate_future_average, df=df, axis=1)
        df = df.dropna()
        df['stage'] = df.apply(label_stocks, axis=1)
        for i in range(1, len(df)):
            if df.iloc[i]['stage'] == 1:
                if (df.iloc[i-1]['stage'] == 2 and df.iloc[i]['future_wma_slope'] >= 1.05):
                    df.iloc[i,9] = 2
                if (df.iloc[i-1]['stage'] == 2 or df.iloc[i-1]['stage'] == 3) and df.iloc[i]['stage'] != 4:
                    df.iloc[i,9] = 3
        df.to_pickle(folder_path + '/' + stock + '.pkl')
    except:
        missing_list.append(stock)
        print(stock + " not found")

ABLV not found
ABVX not found
AFJK not found
AGM.A not found
AHR not found
AIRE not found
AITR not found
AKO.A not found
AKO.B not found
AMAM not found
AMIX not found
ANL not found
ANRO not found
ANSC not found
ARM not found
AS not found
ATGL not found
ATPC not found
AVBP not found
AXIL not found
BAYA not found
BEEP not found
BF.A not found
BF.B not found
BH.A not found
BIO.B not found
BIRK not found
BNRE.A not found
BOWN not found
BQ not found
BRK.A not found
BRK.B not found
BTSG not found
BUJA not found
CART not found
CCG not found
CCTG not found
CDLR not found
CGON not found
CHRO not found
CLBR not found
CORZ not found
CPBI not found
CRD.A not found
CRD.B not found
CREV not found
CRGX not found
CWEN.A not found
DBD not found
DDC not found
DEC not found
DHAI not found
DTCK not found
DYCQ not found
ECO not found
EGOX not found
ELAB not found
ELPC not found
EXTO not found
FBLG not found
FBYD not found
FEBO not found
FLYX not found
FTEL not found
GEF.B not found
GLAC not found
GMM not f

In [None]:
df = pd.read_pickle(folder_path + '/A.pkl')

In [None]:
display(df)