In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor, XGBClassifier
import numpy as np
from bs4 import BeautifulSoup
import requests
import os

In [2]:
# Walk the 'stocks' directory tree and record, for every file found, its path
# (stockpath) and its name up to the first '.' (stocklist). The two lists are
# index-aligned.
stocklist = []
stockpath = []
for dirpath, _dirnames, filenames in os.walk('stocks'):
    for filename in filenames:
        # os.path.join uses the platform's separator instead of a hard-coded '\\'.
        stockpath.append(os.path.join(dirpath, filename))
        stocklist.append(filename.split('.')[0])
        

In [3]:
def processdifferencing(df, differencedcolumn, lags):
    """Append shifted copies of one column as new lag-feature columns.

    For each ``lag`` in ``1..lags`` a column named ``<prefix>_<lag>`` is added,
    holding ``df[differencedcolumn].shift(-lag)`` (i.e. the value found ``lag``
    rows further down). ``<prefix>`` is the part of ``differencedcolumn``
    before its first underscore, so ``'differenced_0'`` yields
    ``differenced_1 .. differenced_<lags>`` and ``'Volume'`` yields
    ``Volume_1 .. Volume_<lags>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    differencedcolumn : str
        Name of the column to shift.
    lags : int
        Number of lag columns to create; values below 1 add nothing.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the lag columns added (or overwritten if a
        column with the same name already existed).
    """
    df = df.copy()
    prefix = differencedcolumn.split('_')[0]
    # Snapshot the source column once: this avoids copying the whole frame on
    # every iteration and guarantees every lag is derived from the original
    # values even if a generated name overwrites the source column.
    source = df[differencedcolumn].copy()
    for lag in range(1, lags + 1):
        df[f'{prefix}_{lag}'] = source.shift(-lag)
    return df

In [4]:
def getvaluestp(x,target,predicted):
    """Return ``x`` when the row is a true positive, otherwise NaN.

    A true positive is ``target == 1`` with ``predicted`` equal to ``target``.
    """
    is_true_positive = target == 1 and target == predicted
    return x if is_true_positive else np.nan
def getvaluesfp(x,target,predicted):
    """Return ``x`` when the row is a false positive, otherwise NaN.

    A false positive is ``target == -1`` with ``predicted == 1``.
    """
    if (target, predicted) != (-1, 1):
        return np.nan
    return x
def getvaluesfn(x,target,predicted):
    """Return ``-x`` when the row is a false negative, otherwise NaN.

    A false negative is ``target == 1`` with ``predicted == -1``.
    """
    missed_positive = target == 1 and predicted == -1
    if not missed_positive:
        return np.nan
    return -x
def getvaluestn(x,target,predicted):
    """Return ``-x`` when the row is a true negative, otherwise NaN.

    A true negative is ``target == -1`` with ``predicted == -1``.
    """
    return -x if (target == -1 and predicted == -1) else np.nan

In [5]:
def getadjustedprediction(x_prob,tp,tn,fp,fn, tax):
    """Choose between buying (1) and not buying (-1) by comparing expected gains.

    Parameters
    ----------
    x_prob : float
        Weight given to the positive outcome; ``1 - x_prob`` weights the negative one.
    tp, tn, fp, fn : float
        Gains associated with the true-positive, true-negative, false-positive
        and false-negative outcomes.
    tax : float
        Cost subtracted from the expected gain of buying.

    Returns
    -------
    int
        ``1`` if the expected gain of buying is strictly greater than that of
        not buying, otherwise ``-1`` (this includes ties and NaN inputs, for
        which the comparison is false).
    """
    p_negative = 1 - x_prob
    expected_if_buy = (x_prob * tp) + (p_negative * fp) - tax
    expected_if_skip = (p_negative * tn) + (x_prob * fn)
    return 1 if expected_if_buy > expected_if_skip else -1

In [6]:
# Per-stock pipeline: build lag features, cross-validate an XGBoost classifier,
# estimate the smoothed gain of each confusion-matrix outcome, and classify the
# first row of every file as a buy or a sell.
#
# NOTE(review): the code treats the first row of each CSV as the most recent
# observation (head(1) is used for the forward-looking prediction) — confirm
# the files are ordered newest-first.

N_LAGS = 30           # lagged columns generated per source column
TARGET_HORIZON = 5    # row offset used to build the target price difference
CV_FOLDS = 5          # folds used by every cross-validation call
EWM_SPAN = 500        # span of the exponentially weighted mean over outcome gains
TAX_RATE = 0.012      # cost of buying, as a fraction of 'Last Price'

# Outcome column -> helper extracting that outcome's gain from a row.
OUTCOME_GAIN_FUNCS = {
    'tp_gain': getvaluestp,
    'fp_gain': getvaluesfp,
    'fn_gain': getvaluesfn,
    'tn_gain': getvaluestn,
}

buys = []
sells = []
for path in stockpath:
    # File name up to its first '.', accepting either '\\' or '/' as separator
    # so the result does not depend on how deep the file sits under 'stocks'.
    stock_name = path.replace('\\', '/').rsplit('/', 1)[-1].split('.')[0]

    stock_df = pd.read_csv(path)
    stock_df['Date'] = pd.to_datetime(stock_df['Date'])

    # Row-to-next-row price difference, and the price TARGET_HORIZON rows
    # earlier in the file minus the current price.
    stock_df['differenced_0'] = stock_df['Last Price'] - stock_df['Last Price'].shift(-1)
    stock_df['target'] = stock_df['Last Price'].shift(TARGET_HORIZON) - stock_df['Last Price']
    stock_df = processdifferencing(stock_df, 'differenced_0', N_LAGS)
    stock_df = processdifferencing(stock_df, 'Volume', N_LAGS)

    # Feature order: differenced_0, Volume, differenced_1, Volume_1, ...
    listofdifferences = []
    for i in range(N_LAGS):
        listofdifferences.append(f'differenced_{i}')
        listofdifferences.append('Volume' if i == 0 else f'Volume_{i}')

    # -1 for a non-positive target, 1 for a positive one, NaN when the target is NaN.
    stock_df['target_classification'] = stock_df['target'].apply(
        lambda x: -1 if x <= 0 else 1 if x > 0 else np.nan
    )

    pos_neg_df = stock_df.dropna().copy()
    X = pos_neg_df[listofdifferences]
    y = pos_neg_df['target_classification']
    # Recent xgboost releases require class labels 0..n-1, so train on 0/1 and
    # translate predictions back to the -1/1 convention used by the helpers.
    y_encoded = (y == 1).astype(int)

    clf = XGBClassifier(max_depth=23)
    clf.fit(X, y_encoded)
    encoded_predictions = cross_val_predict(clf, X, y_encoded, cv=CV_FOLDS)
    predictions = np.where(encoded_predictions == 1, 1, -1)
    predictions_probab = cross_val_predict(clf, X, y_encoded, method='predict_proba', cv=CV_FOLDS)
    score_clf = cross_val_score(clf, X, y_encoded, cv=CV_FOLDS, scoring='f1_weighted')

    # Positional assignment is only valid here, while the rows are still in the
    # same order as X; after this the columns travel with their rows.
    pos_neg_df['predicted'] = predictions
    pos_neg_df['probability_predicted'] = predictions_probab[:, 1]

    # Smooth each outcome's gain in chronological order, then back-fill the
    # leading rows that precede the first occurrence of that outcome.
    pos_neg_df = pos_neg_df.sort_values(by='Date')
    for gain_column, gain_func in OUTCOME_GAIN_FUNCS.items():
        outcome_gain = pos_neg_df.apply(
            lambda row, f=gain_func: f(row['target'], row['target_classification'], row['predicted']),
            axis=1,
        )
        pos_neg_df[gain_column] = outcome_gain.ewm(span=EWM_SPAN, ignore_na=True).mean().bfill()

    pos_neg_df = pos_neg_df.sort_values(by='Date', ascending=False)

    # 'probability_predicted' is intentionally NOT re-assigned from the raw
    # array here: the frame has been re-sorted, so a positional assignment
    # could attach probabilities to the wrong rows.
    pos_neg_df['adjusted_prediction'] = pos_neg_df.apply(
        lambda row: getadjustedprediction(
            row['probability_predicted'],
            row['tp_gain'],
            row['tn_gain'],
            row['fp_gain'],
            row['fn_gain'],
            row['Last Price'] * TAX_RATE,
        ),
        axis=1,
    )
    # Mean predicted probability among rows whose adjusted decision is "buy".
    threshold = pos_neg_df[pos_neg_df['adjusted_prediction'] == 1]['probability_predicted'].mean()

    # Values from the first row after the descending-date sort.
    stock_tp_gain = pos_neg_df['tp_gain'].iloc[0]
    stock_fp_gain = pos_neg_df['fp_gain'].iloc[0]
    stock_tn_gain = pos_neg_df['tn_gain'].iloc[0]
    stock_fn_gain = pos_neg_df['fn_gain'].iloc[0]
    stock_tax = pos_neg_df['Last Price'].iloc[0] * TAX_RATE

    # Score the first row of the full frame with the model fitted on all
    # labelled rows.
    stock_x = stock_df.head(1)[listofdifferences]
    nextweekproba_predict = clf.predict_proba(stock_x)
    probability_of_buy = nextweekproba_predict[:, 1]
    buyorsell = getadjustedprediction(probability_of_buy[0], stock_tp_gain, stock_tn_gain, stock_fp_gain, stock_fn_gain, stock_tax)

    if buyorsell == 1:
        buys.append([stock_name, score_clf.mean()])
    elif buyorsell == -1:
        sells.append([stock_name, score_clf.mean()])

    print(stock_name,buyorsell, ' f1: ',score_clf.mean(), ' adjusted: ',threshold)

KeyboardInterrupt: 