<a href="https://colab.research.google.com/github/JasonTJH/mycolab/blob/main/USEquityML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialization

In [None]:
# Mount Google Drive at /content/drive; the PATH constants used by the
# model cells below point into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Visualization
#from openpyxl.workbook import Workbook
#from openpyxl import load_workbook
import matplotlib.pyplot as plt
import seaborn as sns
# Data Manipulation
import pandas as pd
import numpy as np
# Machine Learning
from sklearn import preprocessing, neighbors, svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans, MeanShift
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# For oversampling to balance the dataset
from imblearn.over_sampling import SMOTE
# Results Analysis
from collections import Counter
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score, roc_curve
#Model weight manipulation (may cause overfitting though)
from sklearn.model_selection import GridSearchCV
from pylab import rcParams
#
from timeit import default_timer as timer
sns.set()

#from numba import jit, njit
#GPU VERSIONS
#import cupy as np
#import cudf as pd
#import cuml as sklearn



In [None]:
def get_df(path: str):
  """Read the Excel workbook at ``path`` with pandas and return the resulting frame."""
  workbook_frame = pd.read_excel(path)
  return workbook_frame

def add_hurdle_col(dataframe,hurdle):
  """Write a binary target column ``'Y'`` into ``dataframe`` in place.

  ``'Y'`` is set to 1 where ``'Fwd 5Y Return'`` is at least ``hurdle`` and to
  0 where it is below ``hurdle``. Rows matching neither comparison (e.g. a
  NaN return) are not touched. Nothing is returned.
  """
  forward_return = dataframe['Fwd 5Y Return']
  cleared_hurdle = forward_return >= hurdle
  missed_hurdle = forward_return < hurdle
  dataframe.loc[cleared_hurdle, 'Y'] = 1
  dataframe.loc[missed_hurdle, 'Y'] = 0

def get_base_prob(dataframe):
  """Return the fraction of rows whose ``'Y'`` equals 1, rounded to two decimals."""
  positive_rows = int((dataframe['Y'] == 1).sum())
  total_rows = len(dataframe)
  return round(positive_rows/total_rows,2)

def drop_useless_cols(dataframe):
  """Return a new frame without the ``'Ticker'`` and ``'Name'`` columns.

  The input frame is left unchanged.
  """
  return dataframe.drop(columns=['Ticker','Name'])

def clean_div_yield_col(dataframe):
  """Convert the ``'Dvd Yld'`` column to float in place.

  The last character of every entry is discarded before conversion
  (presumably a trailing percent sign - confirm against the data file).
  """
  without_last_char = dataframe['Dvd Yld'].str.slice(stop=-1)
  dataframe['Dvd Yld'] = without_last_char.astype(float)

def clean_KMB_all_cols(dataframe):
  """Expand K/M/B magnitude suffixes into plain floats, column by column, in place.

  Every ``k``/``m``/``b`` (either case) in a column is rewritten to
  ``*1e3``/``*1e6``/``*1e9``, each cell is evaluated with ``pd.eval`` and the
  column is cast to float. A column is replaced only if every one of its
  cells converts; otherwise it is left exactly as it was.

  Nothing is returned.
  """
  repl_dict = {'[kK]': '*1e3', '[mM]': '*1e6', '[bB]': '*1e9'}
  for col in dataframe:
      try:
          expanded = dataframe[col].replace(repl_dict, regex=True)
          # NOTE(review): pd.eval evaluates cell text taken from the data file
          # as an expression - only run this on a trusted workbook.
          dataframe[col] = expanded.map(pd.eval).astype(float)
      except Exception:
          # Best effort by design: columns that are not purely numeric
          # (dates, free text, NaN) are skipped. Catching Exception rather
          # than using a bare except lets KeyboardInterrupt/SystemExit
          # propagate.
          continue

def get_xy(dataframe):
  """Split a labelled frame into features and target (both DataFrames).

  Returns:
    X: ``dataframe`` without the ``'Fwd 5Y Return'``, ``'Y'`` and
       ``'As of date'`` columns.
    y: the ``'Y'`` column as a one-column DataFrame.

  The input frame is not modified.
  """
  # `columns=` replaces the positional axis argument (`drop(labels, 1)`),
  # which pandas deprecated in 1.3 and rejects from 2.0 onwards.
  X = dataframe.drop(columns=['Fwd 5Y Return','Y','As of date'])
  y = dataframe['Y'].to_frame()
  return X,y

def get_oversampled_df(X,y):
  """Oversample ``X``/``y`` with SMOTE and return the result as one frame.

  ``y`` is flattened to 1-D before resampling (``random_state=0``). The
  resampled features keep ``X``'s column names and the resampled target keeps
  ``y``'s column names; the two are joined on their index.
  """
  sampler = SMOTE(random_state=0)
  X_resampled, y_resampled = sampler.fit_resample(X,y.values.ravel())
  feature_frame = pd.DataFrame(X_resampled, columns=X.columns)
  target_frame = pd.DataFrame(y_resampled, columns=y.columns)
  return feature_frame.join(target_frame)

def join_df(X,y):
  """Return ``X`` with the columns of ``y`` joined onto it by index."""
  return X.join(y)

def get_scaled_xy(dataframe):
  """Turn a features-plus-``'Y'`` frame into NumPy arrays for the models.

  Returns:
    X: every column except ``'Y'``, scaled by a ``RobustScaler`` using the
       5th-95th percentile range. The scaler is fit on the rows passed in,
       so each call scales relative to its own data.
    y: the ``'Y'`` column as a 1-D array (unscaled).
  """
  # `columns=` replaces the positional axis argument (`drop(labels, 1)`),
  # which pandas deprecated in 1.3 and rejects from 2.0 onwards.
  X = np.array(dataframe.drop(columns=['Y']))
  y = np.array(dataframe['Y'])
  #Data Scaling
  scaler = RobustScaler(quantile_range=(5, 95))
  X = scaler.fit_transform(X)
  return X, y

In [None]:
def get_EP_results(K_Means_nclusters:int,Mean_Shift_min_df_len:float,base_test_prob:float,X_train, y_train,X_test, y_test,train_date,test_date):
  """Score every algorithm and report its excess probability over the base rate.

  Excess probability is the precision returned by the algorithm minus
  ``base_test_prob``, rounded to two decimals.

  Args:
    K_Means_nclusters: number of clusters passed to ``K_Means``.
    Mean_Shift_min_df_len: currently unused (Mean Shift is disabled below).
    base_test_prob: share of positives in the test set.
    X_train, y_train, X_test, y_test: arrays handed to each algorithm.
    train_date, test_date: only recorded in the outputs.

  Returns:
    EP: dict of excess probability per algorithm plus ``'base_test_prob'``.
    bestalgo: dict describing the algorithm with the highest excess probability.
    summary: human-readable text version of both.
  """
  EP = {'DT' : DT(X_train, y_train,X_test, y_test)-base_test_prob,
        'KNN' : KNN(X_train, y_train,X_test, y_test)-base_test_prob,
        'RF' : RF(X_train, y_train,X_test, y_test)-base_test_prob,
        'SVM' : SVM(X_train, y_train,X_test, y_test)-base_test_prob,
        'KMeans' : K_Means(K_Means_nclusters,X_train, y_train,X_test, y_test)-base_test_prob}
  # Mean Shift is intentionally left out (see Mean_Shift: very slow).

  EP = {k: round(v,2) for k, v in EP.items()}

  # Built with join() rather than backslash continuations, which leaked the
  # source indentation into the text and left a stray ")" after the last line.
  x = "\n".join([
      f"Decision Tree:\t\t{EP['DT']}",
      f"K Nearest Neighbors:\t{EP['KNN']}",
      f"Random Forest:\t\t{EP['RF']}",
      f"Support Vector Machine:\t{EP['SVM']}",
      f"K Means Clustering:\t{EP['KMeans']}"])

  # Ties resolve to the first algorithm in EP's insertion order.
  best_name = max(EP,key=EP.get)
  best_excess = EP[best_name]
  # The former 'MeanShift' special case read an undefined global `base_prob`;
  # it could never trigger because EP has no 'MeanShift' key.
  bestalgo = {'Algo':best_name,
            'ExProb':best_excess,
            'BaseProb':base_test_prob,
            'Precision':round(best_excess + base_test_prob,2),
            'Traindate':train_date,
            'Testdate':test_date}

  y = "\n".join([
      f"The Best Algorithm:\t{bestalgo['Algo']}",
      f"Excess probability:\t{bestalgo['ExProb']}",
      f"Base probability:\t{bestalgo['BaseProb']}",
      "",
      f"Train date:\t\t{bestalgo['Traindate']}",
      f"Test date:\t\t{bestalgo['Testdate']}"])

  # Added after picking the best algorithm so it cannot be selected itself.
  EP['base_test_prob'] = base_test_prob

  return EP, bestalgo, f"Excess Probabilities\n\n{x}\n\n{y}"

def store_results(result_to_store,resultdict,train_date,test_date,hurdle):
  """Save ``result_to_store`` in ``resultdict`` under a hurdle/train/test key.

  The key has the form ``'<hurdle*100>%//<train_date>//<test_date>'``.
  ``resultdict`` is modified in place; nothing is returned.
  """
  run_key = f'{hurdle*100}%//{train_date}//{test_date}'
  resultdict[run_key] = result_to_store

# Algorithms

In [None]:
def DT(X_train, y_train,X_test, y_test):
  """Fit a default decision tree on the training data and return its test precision.

  ``zero_division=0`` is passed to ``precision_score``.
  """
  tree_model = DecisionTreeClassifier().fit(X_train,y_train)
  predicted = tree_model.predict(X_test)
  return precision_score(y_test,predicted,zero_division=0)

def KNN(X_train, y_train,X_test, y_test):
  """Fit a default k-nearest-neighbours classifier and return its test precision.

  ``zero_division=0`` is passed to ``precision_score``.
  """
  knn_model = neighbors.KNeighborsClassifier().fit(X_train,y_train)
  predicted = knn_model.predict(X_test)
  return precision_score(y_test,predicted,zero_division=0)

def RF(X_train, y_train,X_test, y_test):
  """Fit a default random forest on the training data and return its test precision.

  ``zero_division=0`` is passed to ``precision_score``.
  """
  forest_model = RandomForestClassifier().fit(X_train,y_train)
  predicted = forest_model.predict(X_test)
  return precision_score(y_test,predicted,zero_division=0)

def SVM(X_train, y_train,X_test, y_test):
  """Fit an RBF-kernel SVC (``C=1``) on the training data and return its test precision.

  ``zero_division=0`` is passed to ``precision_score``.
  """
  svc_model = svm.SVC(kernel='rbf', C=1).fit(X_train,y_train)
  predicted = svc_model.predict(X_test)
  return precision_score(y_test,predicted,zero_division=0)

def K_Means(num_clusters,X_train, y_train,X_test, y_test):
  """Cluster the training data with K-Means and return test precision.

  K-Means is unsupervised, so its cluster ids carry no class meaning. The
  cluster with the highest share of positives (``y_train == 1``) in the
  training data is therefore treated as the "positive" cluster. Previously
  cluster id 1 was assumed to be class 1, which made the score depend on an
  arbitrary labelling and left ``y_train`` unused.

  Returns:
    Fraction of test rows assigned to the positive cluster whose ``y_test``
    is 1, or 0 when no test row lands in that cluster (or the test set is
    empty).
  """
  clf = KMeans(n_clusters=num_clusters)
  clf.fit(X_train)

  if len(X_test) == 0:
    return 0

  # Decide which cluster stands for "positive" using the training labels.
  train_clusters = clf.labels_
  train_targets = np.asarray(y_train).ravel()
  positive_rates = np.zeros(clf.n_clusters)
  for cluster_id in range(clf.n_clusters):
      members = train_clusters == cluster_id
      if members.any():
          positive_rates[cluster_id] = (train_targets[members] == 1).mean()
  positive_cluster = int(np.argmax(positive_rates))

  # One vectorised predict call instead of one call per test row.
  test_clusters = clf.predict(np.asarray(X_test, dtype=float))
  test_targets = np.asarray(y_test).ravel()
  flagged = test_clusters == positive_cluster
  positive_pred = int(flagged.sum())
  correct_pos = int((test_targets[flagged] == 1).sum())
  precision = correct_pos/positive_pred if positive_pred > 0 else 0
  return precision

def Mean_Shift(dataframe,min_df_len,X_train, y_train,X_test, y_test):
  """Cluster ``dataframe`` with Mean Shift and return the best cluster's hit rate.

  Mean Shift yields cluster ids rather than binary predictions, so the score
  is the share of ``Y == 1`` rows in the best cluster, i.e. already a
  precision. Only clusters holding more than ``min_df_len`` (a fraction) of
  all rows are considered. This algorithm is very slow.

  The features are the scaled non-``'Y'`` columns of ``dataframe`` itself.
  The body used to fit on a notebook-level ``X``, which only worked when that
  global happened to be row-aligned with ``dataframe``.

  Args:
    dataframe: features plus a ``'Y'`` column. A ``'cluster_group'`` column
      is written into it as a side effect.
    min_df_len: minimum cluster size as a fraction of ``len(dataframe)``.
    X_train, y_train, X_test, y_test: accepted for signature parity with the
      other algorithms; not used.

  Returns:
    The highest hit rate among the qualifying clusters, or 0 if none qualifies.
  """
  # Ignore a cluster_group column left behind by an earlier call.
  feature_source = dataframe.drop(columns=['cluster_group'], errors='ignore')
  X, _ = get_scaled_xy(feature_source)

  clf = MeanShift()
  clf.fit(X)
  labels = clf.labels_
  dataframe['cluster_group'] = labels

  min_rows = min_df_len*len(dataframe)
  correct_rates = {}
  for cluster_id in np.unique(labels):
      temp_df = dataframe.loc[dataframe["cluster_group"] == cluster_id]
      if len(temp_df) > min_rows:
          correct_cluster = temp_df.loc[temp_df["Y"] == 1]
          correct_rates[cluster_id] = len(correct_cluster)/len(temp_df)
  # default=0 avoids "max() arg is an empty sequence" when no cluster is large enough.
  return max(correct_rates.values(), default=0)

# Model

## Model for normal data

In [None]:
start = timer()
PATH = '/content/drive/My Drive/YOUR_PATH_TO_DATA'
result1 = {}

#Lists to iterate over
#datepairs = [['2010-06-30','2011-06-30']]#,['2011-06-30','2012-06-30'],['2012-06-30','2013-06-30'],['2013-06-30','2014-06-30']]
datepairs = [['2011-06-30','2012-06-30'],['2011-06-30','2013-06-30'],['2011-06-30','2014-06-30']]
hurdlelist = [1,2,3,4,5] # >x*100% returns in the forward 5Y period

#Parameters shared by every run
K_Means_nclusters = 2
Mean_Shift_min_df_len = 0.1

# The workbook does not change between runs, so read it once instead of
# once per (hurdle, datepair) combination.
raw_df = get_df(PATH)

for hurdle in hurdlelist:
  for datepair in datepairs:
    train_date = datepair[0]
    test_date = datepair[1]

    #-----Start main program-----
    # .copy() gives an independent frame, so the in-place cleaning below
    # never writes into a slice of raw_df.
    df = raw_df.loc[(raw_df["As of date"] == train_date)|(raw_df["As of date"] == test_date)].copy()
    add_hurdle_col(dataframe=df,hurdle=hurdle)
    df = drop_useless_cols(dataframe=df)
    clean_div_yield_col(dataframe=df)
    clean_KMB_all_cols(dataframe=df)
    #-----Start of ML-----
    #Get X, y as individual dataframes
    X, y = get_xy(dataframe=df)
    X_train, y_train = get_xy(dataframe=df.loc[(df["As of date"] == train_date)])
    X_test, y_test = get_xy(dataframe=df.loc[(df["As of date"] == test_date)])

    #Supervised train
    train_df = join_df(X_train,y_train)
    X_train, y_train = get_scaled_xy(dataframe=train_df)

    #Supervised test (base rate is taken before scaling)
    test_df = join_df(X_test,y_test)
    base_test_probability = get_base_prob(dataframe=test_df)
    X_test, y_test = get_scaled_xy(dataframe=test_df)

    #Unsupervised: whole-period arrays, kept as notebook-level X / y / df
    df = join_df(X,y)
    #base_probability = get_base_prob(dataframe=df)
    X, y = get_scaled_xy(dataframe=df)

    EP, bestalgo, summary = get_EP_results(K_Means_nclusters=K_Means_nclusters,Mean_Shift_min_df_len=Mean_Shift_min_df_len,base_test_prob=base_test_probability,\
                                           X_train=X_train, y_train=y_train,X_test=X_test, y_test=y_test,train_date=train_date,test_date=test_date)
    store_results(result_to_store=EP,resultdict=result1,train_date=train_date,test_date=test_date,hurdle=hurdle)
    #-----End of program-----

end = timer()
print (end-start)

result1

143.154163621


{'100%//2011-06-30//2012-06-30': {'DT': 0.0,
  'KMeans': 0.12,
  'KNN': 0.02,
  'RF': 0.18,
  'SVM': -0.38,
  'base_test_prob': 0.38},
 '100%//2011-06-30//2013-06-30': {'DT': 0.01,
  'KMeans': -0.29,
  'KNN': 0.0,
  'RF': -0.0,
  'SVM': -0.29,
  'base_test_prob': 0.29},
 '100%//2011-06-30//2014-06-30': {'DT': 0.0,
  'KMeans': -0.19,
  'KNN': 0.02,
  'RF': -0.06,
  'SVM': -0.19,
  'base_test_prob': 0.19},
 '200%//2011-06-30//2012-06-30': {'DT': 0.04,
  'KMeans': 0.37,
  'KNN': 0.12,
  'RF': -0.13,
  'SVM': -0.13,
  'base_test_prob': 0.13},
 '200%//2011-06-30//2013-06-30': {'DT': -0.0,
  'KMeans': -0.09,
  'KNN': -0.09,
  'RF': -0.09,
  'SVM': -0.09,
  'base_test_prob': 0.09},
 '200%//2011-06-30//2014-06-30': {'DT': 0.01,
  'KMeans': -0.06,
  'KNN': 0.04,
  'RF': -0.06,
  'SVM': -0.06,
  'base_test_prob': 0.06},
 '300%//2011-06-30//2012-06-30': {'DT': 0.11,
  'KMeans': 0.45,
  'KNN': -0.05,
  'RF': 0.95,
  'SVM': -0.05,
  'base_test_prob': 0.05},
 '300%//2011-06-30//2013-06-30': {'DT': 0

## Model for oversampled data

In [None]:
start = timer()
PATH = '/content/drive/My Drive/YOUR_PATH_TO_DATA'
result2 = {}

#Lists to iterate over
#datepairs = [['2010-06-30','2011-06-30']]#,['2011-06-30','2012-06-30'],['2012-06-30','2013-06-30'],['2013-06-30','2014-06-30']]
datepairs = [['2011-06-30','2012-06-30'],['2011-06-30','2013-06-30'],['2011-06-30','2014-06-30']]
hurdlelist = [1,2,3,4,5] # >x*100% returns in the forward 5Y period

#Parameters shared by every run
K_Means_nclusters = 2
Mean_Shift_min_df_len = 0.1

# The workbook does not change between runs, so read it once instead of
# once per (hurdle, datepair) combination.
raw_df = get_df(PATH)

for hurdle in hurdlelist:
  for datepair in datepairs:
    train_date = datepair[0]
    test_date = datepair[1]

    #-----Start main program-----
    # .copy() gives an independent frame, so the in-place cleaning below
    # never writes into a slice of raw_df.
    df = raw_df.loc[(raw_df["As of date"] == train_date)|(raw_df["As of date"] == test_date)].copy()
    add_hurdle_col(dataframe=df,hurdle=hurdle)
    df = drop_useless_cols(dataframe=df)
    clean_div_yield_col(dataframe=df)
    clean_KMB_all_cols(dataframe=df)
    #-----Start of ML-----
    #Get X, y as individual dataframes
    X, y = get_xy(dataframe=df)
    X_train, y_train = get_xy(dataframe=df.loc[(df["As of date"] == train_date)])
    X_test, y_test = get_xy(dataframe=df.loc[(df["As of date"] == test_date)])

    #Supervised train: only the training rows are oversampled
    ostrain_df = get_oversampled_df(X_train,y_train)
    X_train, y_train = get_scaled_xy(dataframe=ostrain_df)

    #Supervised test (left at its natural class balance)
    test_df = join_df(X_test,y_test)
    base_test_probability = get_base_prob(dataframe=test_df)
    X_test, y_test = get_scaled_xy(dataframe=test_df)

    #Unsupervised: whole-period arrays, kept as notebook-level X / y / df
    df = get_oversampled_df(X,y)
    #base_probability = get_base_prob(dataframe=df)
    X, y = get_scaled_xy(dataframe=df)

    EP, bestalgo, summary = get_EP_results(K_Means_nclusters=K_Means_nclusters,Mean_Shift_min_df_len=Mean_Shift_min_df_len,base_test_prob=base_test_probability,\
                                           X_train=X_train, y_train=y_train,X_test=X_test, y_test=y_test,train_date=train_date,test_date=test_date)
    store_results(result_to_store=EP,resultdict=result2,train_date=train_date,test_date=test_date,hurdle=hurdle)
    #-----End of program-----

end = timer()
print (end-start)

result2



152.81782761200003


{'100%//2011-06-30//2012-06-30': {'DT': 0.03,
  'KMeans': 0.12,
  'KNN': 0.07,
  'RF': 0.08,
  'SVM': 0.02,
  'base_test_prob': 0.38},
 '100%//2011-06-30//2013-06-30': {'DT': 0.01,
  'KMeans': -0.29,
  'KNN': 0.04,
  'RF': 0.01,
  'SVM': 0.01,
  'base_test_prob': 0.29},
 '100%//2011-06-30//2014-06-30': {'DT': 0.0,
  'KMeans': -0.19,
  'KNN': 0.02,
  'RF': -0.0,
  'SVM': 0.01,
  'base_test_prob': 0.19},
 '200%//2011-06-30//2012-06-30': {'DT': 0.06,
  'KMeans': 0.37,
  'KNN': 0.06,
  'RF': 0.11,
  'SVM': 0.04,
  'base_test_prob': 0.13},
 '200%//2011-06-30//2013-06-30': {'DT': 0.0,
  'KMeans': -0.09,
  'KNN': 0.02,
  'RF': 0.03,
  'SVM': 0.02,
  'base_test_prob': 0.09},
 '200%//2011-06-30//2014-06-30': {'DT': -0.01,
  'KMeans': -0.06,
  'KNN': -0.01,
  'RF': -0.0,
  'SVM': 0.01,
  'base_test_prob': 0.06},
 '300%//2011-06-30//2012-06-30': {'DT': 0.03,
  'KMeans': 0.45,
  'KNN': 0.03,
  'RF': 0.08,
  'SVM': 0.05,
  'base_test_prob': 0.05},
 '300%//2011-06-30//2013-06-30': {'DT': 0.02,
  'KM

# Model Speed Up

## Model Speed Tester (MULTIPROCESSING)

In [None]:
def test(hurdle,datepair,result):
    """Run one (hurdle, train/test date pair) experiment and store it in ``result``.

    Loads the workbook at the notebook-level ``PATH``, keeps the rows of the
    two dates, labels and cleans them, scores every algorithm with
    ``get_EP_results`` and writes the excess-probability dict into ``result``
    via ``store_results``. Returns nothing.

    Args:
        hurdle: forward-return threshold used to label positives.
        datepair: ``[train_date, test_date]``.
        result: dict-like object that receives the run's results.
    """
    #Setting of parameters
    train_date = datepair[0]
    test_date = datepair[1]
    K_Means_nclusters = 2
    Mean_Shift_min_df_len = 0.1

    #-----Start main program-----
    df = get_df(PATH)
    # .copy() so the in-place cleaning below works on an independent frame.
    df = df.loc[(df["As of date"] == train_date)|(df["As of date"] == test_date)].copy()
    add_hurdle_col(dataframe=df,hurdle=hurdle)
    df = drop_useless_cols(dataframe=df)
    clean_div_yield_col(dataframe=df)
    clean_KMB_all_cols(dataframe=df)
    #-----Start of ML-----
    X_train, y_train = get_xy(dataframe=df.loc[(df["As of date"] == train_date)])
    X_test, y_test = get_xy(dataframe=df.loc[(df["As of date"] == test_date)])
    #Supervised train
    train_df = join_df(X_train,y_train)
    X_train, y_train = get_scaled_xy(dataframe=train_df)
    #Supervised test (base rate is taken before scaling)
    test_df = join_df(X_test,y_test)
    base_test_probability = get_base_prob(dataframe=test_df)
    X_test, y_test = get_scaled_xy(dataframe=test_df)
    # The whole-period "unsupervised" X/y arrays were computed here but never
    # used inside this function, so that work has been removed.

    EP, bestalgo, summary = get_EP_results(K_Means_nclusters=K_Means_nclusters,Mean_Shift_min_df_len=Mean_Shift_min_df_len,base_test_prob=base_test_probability,\
                                           X_train=X_train, y_train=y_train,X_test=X_test, y_test=y_test,train_date=train_date,test_date=test_date)
    store_results(result_to_store=EP,resultdict=result,train_date=train_date,test_date=test_date,hurdle=hurdle)
    #-----End of program-----

PATH = '/content/drive/My Drive/YOUR_PATH_TO_DATA'

#Lists to iterate over
datepairs = [['2010-06-30','2011-06-30'],['2011-06-30','2012-06-30'],['2012-06-30','2013-06-30'],['2013-06-30','2014-06-30']]
#datepairs = [['2011-06-30','2012-06-30'],['2011-06-30','2013-06-30'],['2011-06-30','2014-06-30']]
hurdlelist = [1,2,3,4,5] # >x*100% returns in the forward 5Y period

start = timer()
import multiprocessing as mp

if __name__ == '__main__':
    # A managed dict is shared with the workers so each one can store its result.
    manager = mp.Manager()
    result3 = manager.dict()
    jobs = []
    # One process per (hurdle, datepair) combination, all started up front.
    for hurdle in hurdlelist:
      for datepair in datepairs:
        p = mp.Process(target=test, args=(hurdle,datepair,result3))
        jobs.append(p)
        p.start()

    for proc in jobs:
        proc.join()

    # A crashed worker only prints a traceback and leaves no entry in
    # result3, so report failures explicitly instead of silently returning
    # fewer (or zero) results.
    failed_jobs = [proc.name for proc in jobs if proc.exitcode != 0]
    if failed_jobs:
        print(f'{len(failed_jobs)} of {len(jobs)} worker processes failed: {failed_jobs}')

end = timer()
print (end-start)
result3 = dict(result3)

# Optional profiling of a single run. Kept as comments: as a trailing string
# literal it became the cell's displayed value instead of result3.
#import cProfile, pstats, io
#pr = cProfile.Profile()
#pr.enable()
################################
#test(1,['2011-06-30','2012-06-30'],result3)
################################
#pr.disable()
#s = io.StringIO()
#sortby = 'cumulative'
#ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
#ps.print_stats()
#print(s.getvalue())

result3

Process Process-2:
Traceback (most recent call last):
Process Process-3:
Process Process-5:
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "<ipython-input-10-92fb488df4f0>", line 10, in test
    df = get_df(PATH)
Process Process-4:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "<ipython-input-3-bbbebad92957>", line 2, in get_df
    return pd.read_excel(path)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwarg

0.6081199160000779


"\nimport cProfile, pstats, io\npr = cProfile.Profile()\npr.enable()\n###############################\ntest(1,['2011-06-30','2012-06-30'],result3)\n###############################\npr.disable()\ns = io.StringIO()\nsortby = 'cumulative'\nps = pstats.Stats(pr, stream=s).sort_stats(sortby)\nps.print_stats()\nprint(s.getvalue())\n"

## Model Speed Tester 2 (CONCURRENT FUTURES)

In [None]:
def test(hurdle,datepair,result):
    """Run one (hurdle, train/test date pair) experiment and return its results.

    Loads the workbook at the notebook-level ``PATH``, keeps the rows of the
    two dates, labels and cleans them and scores every algorithm with
    ``get_EP_results``.

    Args:
        hurdle: forward-return threshold used to label positives.
        datepair: ``[train_date, test_date]``.
        result: dict-like object that also receives the results via
            ``store_results``.

    Returns:
        ``(title, EP)`` where ``title`` is
        ``'<hurdle*100>%//<train_date>//<test_date>'`` and ``EP`` is the
        excess-probability dict.
    """
    #Setting of parameters
    train_date = datepair[0]
    test_date = datepair[1]
    K_Means_nclusters = 2
    Mean_Shift_min_df_len = 0.1

    #-----Start main program-----
    df = get_df(PATH)
    # .copy() so the in-place cleaning below works on an independent frame.
    df = df.loc[(df["As of date"] == train_date)|(df["As of date"] == test_date)].copy()
    add_hurdle_col(dataframe=df,hurdle=hurdle)
    df = drop_useless_cols(dataframe=df)
    clean_div_yield_col(dataframe=df)
    clean_KMB_all_cols(dataframe=df)
    #-----Start of ML-----
    X_train, y_train = get_xy(dataframe=df.loc[(df["As of date"] == train_date)])
    X_test, y_test = get_xy(dataframe=df.loc[(df["As of date"] == test_date)])
    #Supervised train
    train_df = join_df(X_train,y_train)
    X_train, y_train = get_scaled_xy(dataframe=train_df)
    #Supervised test (base rate is taken before scaling)
    test_df = join_df(X_test,y_test)
    base_test_probability = get_base_prob(dataframe=test_df)
    X_test, y_test = get_scaled_xy(dataframe=test_df)
    # The whole-period "unsupervised" X/y arrays were computed here but never
    # used inside this function, so that work has been removed.

    EP, bestalgo, summary = get_EP_results(K_Means_nclusters=K_Means_nclusters,Mean_Shift_min_df_len=Mean_Shift_min_df_len,base_test_prob=base_test_probability,\
                                           X_train=X_train, y_train=y_train,X_test=X_test, y_test=y_test,train_date=train_date,test_date=test_date)
    store_results(result_to_store=EP,resultdict=result,train_date=train_date,test_date=test_date,hurdle=hurdle)
    title = f'{hurdle*100}%//{train_date}//{test_date}'
    return title, EP
    #-----End of program-----
    
import concurrent.futures as cf
from itertools import repeat
from multiprocessing import Pool

def main(hurdlelist,datepairs,result):
    """Run ``test`` for each (hurdle, datepair) pair in a process pool.

    ``hurdlelist`` and ``datepairs`` are consumed in lockstep (``zip``), so
    they must already be expanded to one entry per run.

    Args:
        hurdlelist: hurdle for each run.
        datepairs: ``[train_date, test_date]`` for each run.
        result: dict filled in place with ``title -> EP`` as runs complete.
    """
    with cf.ProcessPoolExecutor() as executor:
        futures = [executor.submit(test,hurdle,datepair,result)
                   for hurdle,datepair in zip(hurdlelist,datepairs)]
        for future in cf.as_completed(futures):
            k, v = future.result()
            # Write to the dict that was passed in, not to the notebook-level
            # `result4` the original body reached for.
            result[k] = v


start = timer()

if __name__ == '__main__':
  PATH = '/content/drive/My Drive/YOUR_PATH_TO_DATA'
  #args = [(x,y,result4) for x in hurdlelist for y in datepairs]
  #Lists to iterate over
  #datepairs = [['2010-06-30','2011-06-30']]#,['2011-06-30','2012-06-30'],['2012-06-30','2013-06-30'],['2013-06-30','2014-06-30']]
  datepairs = [['2011-06-30','2012-06-30'],['2011-06-30','2013-06-30'],['2011-06-30','2014-06-30']]
  hurdlelist = [1,2,3,4,5] # >x*100% returns in the forward 5Y period

  # Flattened inputs for main(): four consecutive-year pairs (2010-2013
  # start years) repeated for each of the five hurdles, and each hurdle
  # repeated four times to line up with them.
  datepairs2 = [[f'{year}-06-30',f'{year+1}-06-30'] for year in range(2010,2014)]*5
  hurdlelist2 = [level for level in hurdlelist for _ in range(4)] # >x*100% returns in the forward 5Y period

  result4 = {}
  main(hurdlelist2,datepairs2,result4)
  # Alternative using multiprocessing.Pool (disabled):
  #p = Pool(processes=20)
  #data = p.starmap(test, zip(hurdlelist2,datepairs2,repeat(result4)))
  #p.close()
  #result5 = dict(data)
end = timer()
print (end-start)

result4

160.67076105500007


{'100%//2010-06-30//2011-06-30': {'DT': 0.04,
  'KMeans': -0.25,
  'KNN': 0.04,
  'RF': 0.07,
  'SVM': -0.25,
  'base_test_prob': 0.25},
 '100%//2011-06-30//2012-06-30': {'DT': 0.02,
  'KMeans': 0.12,
  'KNN': 0.02,
  'RF': 0.15,
  'SVM': -0.38,
  'base_test_prob': 0.38},
 '100%//2012-06-30//2013-06-30': {'DT': 0.05,
  'KMeans': -0.29,
  'KNN': 0.06,
  'RF': 0.14,
  'SVM': -0.29,
  'base_test_prob': 0.29},
 '100%//2013-06-30//2014-06-30': {'DT': 0.06,
  'KMeans': -0.19,
  'KNN': 0.08,
  'RF': 0.22,
  'SVM': -0.19,
  'base_test_prob': 0.19},
 '200%//2010-06-30//2011-06-30': {'DT': 0.03,
  'KMeans': -0.07,
  'KNN': 0.03,
  'RF': 0.08,
  'SVM': -0.07,
  'base_test_prob': 0.07},
 '200%//2011-06-30//2012-06-30': {'DT': 0.04,
  'KMeans': 0.37,
  'KNN': 0.12,
  'RF': -0.13,
  'SVM': -0.13,
  'base_test_prob': 0.13},
 '200%//2012-06-30//2013-06-30': {'DT': 0.05,
  'KMeans': -0.09,
  'KNN': 0.09,
  'RF': 0.16,
  'SVM': -0.09,
  'base_test_prob': 0.09},
 '200%//2013-06-30//2014-06-30': {'DT': 0.

# Model Analysis

In [None]:
from numpy import mean

def _excess_probs(results, algo):
  """List ``algo``'s excess probabilities across ``results``.

  Entries equal to minus the run's base probability are skipped; those are
  the runs where the recorded precision was zero.
  """
  return [v[algo] for v in results.values() if v[algo] != -v['base_test_prob']]

def _rounded_mean(values):
  """Mean rounded to two decimals; NaN for an empty list (avoids NumPy's empty-mean warning)."""
  return round(mean(values),2) if values else float('nan')

# Suffix 1 = result3 (multiprocessing run), suffix 2 = result4 (concurrent futures run).
algoDT1 = _excess_probs(result3,'DT')
algoKNN1 = _excess_probs(result3,'KNN')
algoRF1 = _excess_probs(result3,'RF')
algoSVM1 = _excess_probs(result3,'SVM')
algoKMeans1 = _excess_probs(result3,'KMeans')
algoMeanShift1 = []

algoDT2 = _excess_probs(result4,'DT')
algoKNN2 = _excess_probs(result4,'KNN')
algoRF2 = _excess_probs(result4,'RF')
algoSVM2 = _excess_probs(result4,'SVM')
algoKMeans2 = _excess_probs(result4,'KMeans')
algoMeanShift2 = []

_series = {
    'DT1' : algoDT1,
    'KNN1' : algoKNN1,
    'RF1' : algoRF1,
    'SVM1' : algoSVM1,
    'KMeans1' : algoKMeans1,
    'DT2' : algoDT2,
    'KNN2' : algoKNN2,
    'RF2' : algoRF2,
    'SVM2' : algoSVM2,
    'KMeans2' : algoKMeans2}

# [average excess probability, number of runs that contributed]
AVGdict = {f'AVGalgo{name}': [_rounded_mean(values),len(values)] for name, values in _series.items()}

# [raw excess probabilities, number of runs that contributed]
listdict = {name: [values,len(values)] for name, values in _series.items()}

# Notes:
# DT is good for a little additional pct over base prob
# X - KMeans is only good in 1 year, (not suitable for usage)
# KNN is good for hurdle=1,2, cannot predict above that
# RF is very predictive, but only gives predictions for a few periods
# X - SVM is bad
# RANK - RF, KNN, DT           X - KMeans, SVM
AVGdict
#listdict

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


{'AVGalgoDT1': [nan, 0],
 'AVGalgoDT2': [0.04, 15],
 'AVGalgoKMeans1': [nan, 0],
 'AVGalgoKMeans2': [0.24, 4],
 'AVGalgoKNN1': [nan, 0],
 'AVGalgoKNN2': [0.06, 8],
 'AVGalgoRF1': [nan, 0],
 'AVGalgoRF2': [0.15, 7],
 'AVGalgoSVM1': [nan, 0],
 'AVGalgoSVM2': [nan, 0]}

In [None]:
def generate_model_report(y_actual,y_predict):
  """Print accuracy, precision, recall and F1 score, one per line, for the given predictions."""
  labelled_metrics = (
      ("Accuracy:\t ", accuracy_score),
      ("Precision:\t ", precision_score),
      ("Recall:\t\t ", recall_score),
      ("F1 Score:\t ", f1_score),
  )
  # Each metric is computed and printed before the next one is evaluated.
  for label, metric in labelled_metrics:
    print(f"{label}{metric(y_actual,y_predict)}")

def generate_auc_roc_curve(clf, X_test, y_true=None):
  """Plot the ROC curve of ``clf`` on ``X_test`` with its AUC in the legend.

  Args:
    clf: fitted classifier exposing ``predict_proba``.
    X_test: feature rows to score.
    y_true: true labels for ``X_test``. When omitted, the notebook-level
      ``y_test`` is used, which is what the function always did before this
      parameter existed.
  """
  if y_true is None:
    # Backward-compatible fallback to the notebook-level variable.
    y_true = y_test
  # Probability of the second class (column 1) for each row.
  y_pred_proba = clf.predict_proba(X_test)[:, 1]
  fpr, tpr, thresholds = roc_curve(y_true,  y_pred_proba)
  auc = roc_auc_score(y_true, y_pred_proba)
  plt.plot(fpr,tpr,label="AUC ROC Curve with Area Under the curve ="+str(auc))
  plt.legend(loc=4)
  plt.show()

In [None]:
#pd.crosstab(pd.Series(y_predict,name ='Predicted'), pd.Series(y_test, name ='Actual'))

In [None]:
#generate_model_report(y_test,y_predict)

In [None]:
#generate_auc_roc_curve(clf, X_test)

# Backtester (Individual algos)

In [None]:
import itertools
#List for iteration
PATH = '/content/drive/My Drive/YOUR_PATH_TO_DATA'
#datepairs = ([f'{2000+i}-06-30',f'{2000+i+1}-06-30'] for i in range(14)) #2000 to 2014
# A list rather than a one-shot generator, so the loop can be re-run as is.
hurdlelist = list(range(1,2)) # >x*100% returns in the forward 5Y period

#List for iteration
#datepairs = [['2010-06-30','2011-06-30'],['2011-06-30','2012-06-30'],['2012-06-30','2013-06-30'],['2013-06-30','2014-06-30']]
#hurdlelist = [1,2,3,4,5] # >x*100% returns in the forward 5Y period
bt_result = {}

#-----Setting Parameters-----
K_Means_nclusters = 2
Mean_Shift_min_df_len = 0.1

# The workbook does not change between runs, so read it once instead of
# once per date pair.
raw_df = get_df(PATH)

for hurdle in hurdlelist:
  datepairs = ([f'{2000+i}-06-30',f'{2000+i+1}-06-30'] for i in range(14)) #2000 to 2014
  for datepair in datepairs:
    train_date = datepair[0]
    test_date = datepair[1]
    #-----Start main program-----
    # .copy() gives an independent frame, so the in-place cleaning below
    # never writes into a slice of raw_df.
    df = raw_df.loc[(raw_df["As of date"] == train_date)|(raw_df["As of date"] == test_date)].copy()
    add_hurdle_col(dataframe=df,hurdle=hurdle)
    df = drop_useless_cols(dataframe=df)
    clean_div_yield_col(dataframe=df)
    clean_KMB_all_cols(dataframe=df)
    #-----ADDITIONS-----
    #df = df.loc[(df['5Y AVG P/E']!=-10000)&(df['5Y AVG P/FCF']!=-10000)] ##### remove negative 5y returns companies
    #df.drop(['CAGR of Revenue:CQ T12M','Gr PoP of Avg Shares for EPS:Q','Gr PoP of Avg Shares for EPS:Q.1','Dvd Yld'], axis=1, inplace=True)
    #-----END OF ADDITIONS-----
    #-----Start of ML-----
    #Get X, y as individual dataframes
    X, y = get_xy(dataframe=df)
    X_train, y_train = get_xy(dataframe=df.loc[(df["As of date"] == train_date)])
    X_test, y_test = get_xy(dataframe=df.loc[(df["As of date"] == test_date)])
    #Supervised train
    train_df = join_df(X_train,y_train)
    X_train, y_train = get_scaled_xy(dataframe=train_df)
    #Supervised test
    test_df = join_df(X_test,y_test)
    base_test_probability = get_base_prob(dataframe=test_df)
    X_test, y_test = get_scaled_xy(dataframe=test_df)
    #Unsupervised
    df2 = join_df(X,y)
    #base_probability = get_base_prob(dataframe=df)
    X, y = get_scaled_xy(dataframe=df2)
    #-----End of data retrieval-----
    #-----BACKTESTER-----
    #print(f'Hurdle: {hurdle} // Train Date: {train_date} // Test Date: {test_date}')
    # Test-date rows in the same order as X_test, renumbered 0..n-1 so they
    # line up with the prediction array. Identical for every run, so built once.
    test_rows = df[(df['As of date']==test_date)].reset_index()
    run_result = {}
    for i in range(10):
      # Random Forest
      clf = RandomForestClassifier(n_jobs=-1) #neighbors.KNeighborsClassifier() #DecisionTreeClassifier() #RandomForestClassifier(n_jobs=-1)
      clf.fit(X_train,y_train)
      y_predict = clf.predict(X_test)
      #print(precision_score(y_test,y_predict,zero_division=0))
      #Precision x (hurdle+1) > 1
      #print(precision_score(y_test,y_predict,zero_division=0)*(hurdle+1))
      # Predictions arrive as column 0; keep only the rows predicted positive.
      result = test_rows.join(pd.DataFrame(y_predict))
      result = result[(result[0] == 1)]
      # Mean forward return (in %) of the predicted positives; NaN if there were none.
      run_result[f'Run: {i}'] = round(result['Fwd 5Y Return'].mean()*100,0)
    # Average over the runs (Series.mean skips NaN runs).
    bt_result[f'Hurdle: {hurdle} // Train Date: {train_date} // Test Date: {test_date}'] = pd.Series([*run_result.values()]).mean()
  print (f'Run: {hurdle}')

Run: 1


In [None]:
bt_result

{'Hurdle: 1 // Train Date: 2000-06-30 // Test Date: 2001-06-30': 104.9,
 'Hurdle: 1 // Train Date: 2001-06-30 // Test Date: 2002-06-30': 135.9,
 'Hurdle: 1 // Train Date: 2002-06-30 // Test Date: 2003-06-30': 121.3,
 'Hurdle: 1 // Train Date: 2003-06-30 // Test Date: 2004-06-30': 21.1,
 'Hurdle: 1 // Train Date: 2004-06-30 // Test Date: 2005-06-30': -85.0,
 'Hurdle: 1 // Train Date: 2005-06-30 // Test Date: 2006-06-30': 107.8,
 'Hurdle: 1 // Train Date: 2006-06-30 // Test Date: 2007-06-30': 99.11111111111111,
 'Hurdle: 1 // Train Date: 2007-06-30 // Test Date: 2008-06-30': 134.0,
 'Hurdle: 1 // Train Date: 2008-06-30 // Test Date: 2009-06-30': 229.5,
 'Hurdle: 1 // Train Date: 2009-06-30 // Test Date: 2010-06-30': 125.9,
 'Hurdle: 1 // Train Date: 2010-06-30 // Test Date: 2011-06-30': 80.4,
 'Hurdle: 1 // Train Date: 2011-06-30 // Test Date: 2012-06-30': 127.4,
 'Hurdle: 1 // Train Date: 2012-06-30 // Test Date: 2013-06-30': 99.4,
 'Hurdle: 1 // Train Date: 2013-06-30 // Test Date: 201

In [None]:
result

Unnamed: 0,index,Market Cap,CAGR of Revenue:CQ T12M,Gr PoP of Avg Shares for EPS:Q,Gr PoP of Avg Shares for EPS:Q.1,5Y AVG P/E,5Y AVG P/FCF,Dvd Yld,Fwd 5Y Return,As of date,Y,0
9,1879,9.680000e+09,-1.19,33.64,0.25,23.44,-10000.00,0.00,0.135590,2014-06-30,0.0,1.0
20,1890,9.460000e+09,167.96,69.88,23.08,-10000.00,-10000.00,0.00,0.505315,2014-06-30,0.0,1.0
26,1896,9.360000e+09,-6.10,34.85,15.08,29.54,1.21,1.72,1.443848,2014-06-30,1.0,1.0
37,1907,9.110000e+09,-6.02,20.81,-2.70,29.33,10.36,1.40,0.600057,2014-06-30,0.0,1.0
40,1910,8.950000e+09,1.04,-21.58,-3.69,14.11,8.46,1.63,0.076340,2014-06-30,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1683,3553,3.577400e+08,4.54,19.41,4.25,17.27,12.99,2.25,0.843310,2014-06-30,0.0,1.0
1685,3555,3.567600e+08,2.26,52.17,8.38,31.59,9.62,1.16,1.438432,2014-06-30,1.0,1.0
1687,3557,3.563000e+08,8.51,-24.16,-3.96,10.06,10.97,5.40,1.037845,2014-06-30,1.0,1.0
1731,3601,3.312200e+08,35.80,41.56,2.15,21.81,26.76,0.00,1.130724,2014-06-30,1.0,1.0


# Future improvements

In [None]:
# Normalize with max("x/x(absmax)"), l1("x/sum(x)") , l2(euclidean dist)
# Optimize with numba, cprofile, multiprocessing etc
# Binning of returns?
# https://www.oreilly.com/library/view/machine-learning-with/9781491989371/ch04.html

In [None]:
# Display the GPU device name TensorFlow reports for this runtime.
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [None]:
# Print how many CPUs multiprocessing sees on this runtime.
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())

Number of processors:  2


# Other checks

In [None]:
#Check for unbalanced data
# NOTE(review): `os_df` is not defined anywhere in the visible notebook -
# presumably an oversampled frame from get_oversampled_df; define it before
# running this cell.
unique_classes = list(os_df['Y'].unique())
print(unique_classes)

# Per class: total rows / (rows of that class * number of classes).
out_dict = {}
total_rows = os_df.shape[0]
class_count = len(unique_classes)
for classes in unique_classes:
    rows_in_class = os_df.loc[os_df['Y'] == classes].shape[0]
    out_dict[classes] = total_rows/(rows_in_class*class_count)
print(out_dict)

from collections import Counter
from imblearn.over_sampling import SMOTE

#Before resampling
# Uses whichever X / y an earlier cell left in the notebook namespace.
print(Counter(y))

#After resampling
resampler = SMOTE(random_state=0)
X_res, y_res = resampler.fit_resample(X, y)
print(Counter(y_res))