In [296]:
import pandas as pd
import numpy as np
import math
import random
from datetime import date, timedelta

from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate as CV
from surprise.model_selection import GridSearchCV as GSCV
from surprise.model_selection import train_test_split as TTS
from surprise.prediction_algorithms import SVD, SVDpp
from surprise.accuracy import rmse

from sklearn.neighbors import NearestNeighbors as NN
from sklearn.preprocessing import StandardScaler as SS
from sklearn.metrics import mean_squared_error as MSE


np.seed = 42

Note:   
Try also:     
biased = False   
SVD++ -but probably worse    

## Initial Model
SVD on the value-change data

In [4]:
df = pd.read_csv('change.csv', index_col=0)
df.head()

Unnamed: 0,A,AA,AAIC,AAN,AAP,AAT,AB,ABB,ABBV,ABC,...,K,KAI,KAMN,KAR,KB,KBH,KBR,KEN,KEP,KEX
1999-11-01,,,0.25,,,,0.87,,,-1.44,...,-1.18,-0.06,-0.44,,,-0.81,,,0.5,-0.31
1999-11-02,,,0.25,,,,2.69,,,0.0,...,-0.82,-0.06,-0.28,,,0.5,,,0.44,-0.37
1999-11-03,,,-0.13,,,,-1.44,,,-0.76,...,-0.56,0.0,0.47,,,-0.12,,,0.63,0.06
1999-11-04,,,0.13,,,,0.81,,,-0.25,...,-1.19,0.06,0.13,,,0.06,,,-0.5,-0.43
1999-11-05,,,0.25,,,,0.07,,,-0.38,...,-0.25,-0.06,0.66,,,1.19,,,-0.38,-0.06


In [5]:
syms = df.columns

In [10]:
syms

Index(['A', 'AA', 'AAIC', 'AAN', 'AAP', 'AAT', 'AB', 'ABB', 'ABBV', 'ABC',
       ...
       'K', 'KAI', 'KAMN', 'KAR', 'KB', 'KBH', 'KBR', 'KEN', 'KEP', 'KEX'],
      dtype='object', length=1310)

In [1]:
def surprisify(df):
    syms = df.columns
    symID = []
    dateID = []
    value = []

    for sym in syms:
        for row in df[sym].iteritems():
            if not math.isnan(row[1]):
                symID.append(sym)
                dateID.append(row[0])
                value.append(row[1])
                
    sur_dict = {'symID': symID,
              'dateID': dateID,
              'value': value
              }
    sur_df = pd.DataFrame(sur_dict)
    return sur_df

In [6]:
sur_df = surprisify(df)

In [7]:
len(sur_df)

4268702

In [62]:
print(max(sur_df.value))
print(min(sur_df.value))

172.01
-151.59000000000017


In [104]:
def readify(df):
    ma = max(df.value)
    mi = min(df.value)
    reader = Reader(rating_scale=(mi, ma))
    data = Dataset.load_from_df(df[['dateID', 'symID', 'value']], reader)
    return data

In [101]:
ma = max(sur_df.value)
mi = min(sur_df.value)
reader = Reader(rating_scale=(mi, ma))
data = Dataset.load_from_df(sur_df[['dateID', 'symID', 'value']], reader)

In [102]:
#data = readify(sur_df)
train, test = TTS(data, test_size = 0.25)
algo = SVD()
algo.fit(train)
preds = algo.test(test)

rmse(preds)

RMSE: 0.9351


0.9350791967812728

In [105]:
data = readify(sur_df)
train, test = TTS(data, test_size = 0.25)
algo = SVD()
algo.fit(train)
preds = algo.test(test)

rmse(preds)

RMSE: 0.9249


0.9248982429606476

In [85]:
sur_df.value.std()

1.0791845595259724

### SVD on percentage change

In [71]:
pc_df = pd.read_csv('percent_change.csv', index_col=0)
pc_df.head()

Unnamed: 0,A,AA,AAIC,AAN,AAP,AAT,AB,ABB,ABBV,ABC,...,K,KAI,KAMN,KAR,KB,KBH,KBR,KEN,KEP,KEX
1999-11-01,,,4.940711,,,,3.120516,,,-9.517515,...,-2.982811,-0.849858,-4.0,,,-3.681818,,,3.052503,-1.722222
1999-11-02,,,4.64684,,,,9.356522,,,0.0,...,-2.136529,-0.849858,-2.583026,,,2.359604,,,2.550725,-2.107062
1999-11-03,,,-2.28471,,,,-4.438964,,,-5.475504,...,-1.493333,0.0,4.359926,,,-0.551724,,,3.537339,0.34904
1999-11-04,,,2.338129,,,,2.571429,,,-1.886792,...,-3.238095,0.872093,1.155556,,,0.272727,,,-2.547122,-2.484113
1999-11-05,,,4.347826,,,,0.214988,,,-2.923077,...,-0.701656,-0.864553,5.851064,,,5.274823,,,-2.060738,-0.352941


In [72]:
sur_pcdf = surprisify(pc_df)
sur_pcdf.head()

Unnamed: 0,symID,dateID,value
0,A,1999-11-18,-3.296703
1,A,1999-11-19,-5.961807
2,A,1999-11-22,6.51174
3,A,1999-11-23,-5.294118
4,A,1999-11-24,2.317468


In [91]:
ma = max(sur_pcdf.value)
mi = min(sur_pcdf.value)
reader = Reader(rating_scale=(mi, ma))
data = Dataset.load_from_df(sur_pcdf[['dateID', 'symID', 'value']], reader)
train, test = TTS(data, test_size = 0.25)
algo = SVD()
algo.fit(train)
preds = algo.test(test)
rmse(preds)

RMSE: 2.2489


2.2489349015690774

In [84]:
sur_pcdf.value.std()

2.3396620341091756

In [87]:
algo.pu.shape

(5382, 100)

In [88]:
algo.pu[0]

array([ 0.32724222, -0.17112312, -0.14482898, -0.19676111, -0.61177941,
       -0.04043862, -0.0596063 , -0.04582079, -0.37100417, -0.29101331,
        0.00423587, -0.56929856, -0.14427849,  0.05842542, -0.00468503,
        0.20014201, -0.00575569,  0.06437647, -0.23000374, -0.04599039,
        0.42742323, -0.40292844,  0.18552409,  0.0656216 , -0.12046813,
       -0.19203081, -0.33647796,  0.07588266, -0.03175989, -0.36092148,
        0.05765124, -0.40853459, -0.17551208, -0.27035477, -0.0800951 ,
        0.22687023, -0.18648973, -0.58957369,  0.29201327,  0.22552867,
        0.29367648, -0.15139456,  0.27722866, -0.00545925, -0.38351509,
       -0.11018775,  0.11501291,  0.26009605,  0.05024719, -0.20667209,
        0.12395133,  0.44523993, -0.58493641,  0.12050439,  0.41889912,
        0.20972228,  0.0889681 ,  0.10638516, -0.35311202,  0.20540077,
       -0.26313919,  0.36572982,  0.24663976,  0.44654162,  0.04357456,
       -0.0472553 ,  0.23486802,  0.1160844 ,  0.29720638,  0.34

## Proposition:
Some days are similar to others. I.e. The factors affecting stocks on one day have previously been in a similar circumstance and the stocks will change in a similar manner as how they previously did.

Issue:Performing SVD on the data at the end of the day is too late for making a prediction.   
Solution: Use midday values to make a prediction.     
Issue: May be able to get some midday results later, but currently would need to get a paid for account for the API
   
   

## Testing 

How well can the results at the end of the day be predicted by those at midday?   
The closest I can get, currently, to testing this is by grouping days into pairs and using the first day's change (acting as the midday change) to predict the total day-pair change (acting as whole day change).
The main difference is that the underlying factors are more likely to change over two days than over one day.

In [26]:
test_df = df.copy()
dates = {}
for row in range(len(test_df)-1):
    dates[test_df.iloc[:,0].index[row]] = test_df.iloc[row] + test_df.iloc[row+1]

In [30]:
test_df = pd.DataFrame(dates).transpose()
test_df.head()

Unnamed: 0,A,AA,AAIC,AAN,AAP,AAT,AB,ABB,ABBV,ABC,...,K,KAI,KAMN,KAR,KB,KBH,KBR,KEN,KEP,KEX
1999-11-01,,,0.5,,,,3.56,,,-1.44,...,-2.0,-0.12,-0.72,,,-0.31,,,0.94,-0.68
1999-11-02,,,0.12,,,,1.25,,,-0.76,...,-1.38,-0.06,0.19,,,0.38,,,1.07,-0.31
1999-11-03,,,0.0,,,,-0.63,,,-1.01,...,-1.75,0.06,0.6,,,-0.06,,,0.13,-0.37
1999-11-04,,,0.38,,,,0.88,,,-0.63,...,-1.44,0.0,0.79,,,1.25,,,-0.88,-0.49
1999-11-05,,,0.06,,,,-0.37,,,0.06,...,0.62,-0.06,0.97,,,1.63,,,-0.69,0.0


In [49]:
len(test_df)*0.2

1076.2

In [52]:
inds = random.sample(list(test_df.index), 1076)

In [54]:
test_df_test = test_df.loc[inds, :]
test_df_test.head()

Unnamed: 0,A,AA,AAIC,AAN,AAP,AAT,AB,ABB,ABBV,ABC,...,K,KAI,KAMN,KAR,KB,KBH,KBR,KEN,KEP,KEX
2001-07-10,-3.09,,-0.23,,,,-1.62,0.18,,-0.5,...,-0.16,0.02,-0.11,,,0.48,,,0.02,-0.66
2014-02-12,0.52,,0.21,,0.47,0.56,0.74,0.29,0.98,0.45,...,-0.11,1.5,0.73,-0.02,-0.02,0.16,0.59,,0.34,2.09
2018-10-29,-0.28,-0.28,-0.08,,-2.03,0.86,0.17,-0.11,-1.13,1.35,...,1.83,4.64,0.65,0.49,-1.23,0.02,-0.1,-0.02,-0.17,-0.54
2012-02-29,0.24,,-0.34,,-0.41,-0.71,0.31,-0.36,,-0.18,...,-0.38,-0.25,0.42,-0.18,0.25,-0.14,0.09,,-0.01,1.4
2003-04-22,1.15,,0.45,,2.57,,1.01,0.13,,-0.53,...,0.94,0.09,0.05,,-0.6,2.27,,,0.08,0.24


In [56]:
test_df_train = test_df.drop(inds)
test_df_train.head()

Unnamed: 0,A,AA,AAIC,AAN,AAP,AAT,AB,ABB,ABBV,ABC,...,K,KAI,KAMN,KAR,KB,KBH,KBR,KEN,KEP,KEX
1999-11-01,,,0.5,,,,3.56,,,-1.44,...,-2.0,-0.12,-0.72,,,-0.31,,,0.94,-0.68
1999-11-02,,,0.12,,,,1.25,,,-0.76,...,-1.38,-0.06,0.19,,,0.38,,,1.07,-0.31
1999-11-03,,,0.0,,,,-0.63,,,-1.01,...,-1.75,0.06,0.6,,,-0.06,,,0.13,-0.37
1999-11-04,,,0.38,,,,0.88,,,-0.63,...,-1.44,0.0,0.79,,,1.25,,,-0.88,-0.49
1999-11-05,,,0.06,,,,-0.37,,,0.06,...,0.62,-0.06,0.97,,,1.63,,,-0.69,0.0


In [57]:
len(test_df_train)

4305

In [124]:
test_df_test.head()

Unnamed: 0,A,AA,AAIC,AAN,AAP,AAT,AB,ABB,ABBV,ABC,...,K,KAI,KAMN,KAR,KB,KBH,KBR,KEN,KEP,KEX
2001-07-10,-3.09,,-0.23,,,,-1.62,0.18,,-0.5,...,-0.16,0.02,-0.11,,,0.48,,,0.02,-0.66
2014-02-12,0.52,,0.21,,0.47,0.56,0.74,0.29,0.98,0.45,...,-0.11,1.5,0.73,-0.02,-0.02,0.16,0.59,,0.34,2.09
2018-10-29,-0.28,-0.28,-0.08,,-2.03,0.86,0.17,-0.11,-1.13,1.35,...,1.83,4.64,0.65,0.49,-1.23,0.02,-0.1,-0.02,-0.17,-0.54
2012-02-29,0.24,,-0.34,,-0.41,-0.71,0.31,-0.36,,-0.18,...,-0.38,-0.25,0.42,-0.18,0.25,-0.14,0.09,,-0.01,1.4
2003-04-22,1.15,,0.45,,2.57,,1.01,0.13,,-0.53,...,0.94,0.09,0.05,,-0.6,2.27,,,0.08,0.24


In [141]:
test_dates = [np.datetime64(x) for x in test_df_test.index]

In [172]:
#list of previous days
test_dates_prev = [np.datetime64(x) -1 for x in test_df_test.index]


In [59]:
sur_train = surprisify(test_df_train)
sur_train.head()

Unnamed: 0,symID,dateID,value
0,A,1999-11-18,-4.06
1,A,1999-11-19,0.13
2,A,1999-11-22,0.44
3,A,1999-11-24,1.24
4,A,1999-11-29,1.32


In [109]:
train = readify(sur_train)
train = train.build_full_trainset()
test_algo = SVD()
test_algo.fit(train)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x190a1672948>

In [119]:
days_df = pd.DataFrame(test_algo.pu)
days_df.index = test_df_train.index
days_df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
2021-03-16,0.367118,0.064925,0.431814,0.265329,0.192271,0.203466,-0.126263,-0.393558,0.188713,-0.085871,...,-0.121082,0.249686,-0.025685,-0.420226,-0.221207,-0.255014,-0.011437,-0.112349,0.016621,0.141679
2021-03-17,-0.133013,-0.24918,0.036074,0.205301,-0.145316,-0.114851,0.11949,0.039775,0.148908,0.18438,...,-0.08441,0.25842,-0.004861,-0.177897,0.001211,-0.071944,0.124679,-0.119976,0.14891,0.169544
2021-03-18,-1.400811,-0.623412,-0.706768,-0.317822,-0.61092,-0.673829,0.756931,0.414659,0.026496,0.205375,...,-0.00904,0.422216,0.273555,0.494628,0.467456,0.322478,0.586803,-0.001457,0.12054,0.283521
2021-03-19,0.175462,0.195865,-1.139583,0.230722,-0.130534,0.086582,-0.071924,-0.163929,0.422903,0.449381,...,0.13066,0.046565,0.101157,-0.302341,0.104991,-0.21942,0.096827,0.203656,0.357403,-0.451
2021-03-22,0.08309,-0.090418,-0.250634,-0.013273,0.098419,0.028983,0.057824,0.040076,-0.162161,0.031022,...,-0.010107,-0.078166,-0.048035,-0.077645,0.24487,-0.024871,-0.144129,-0.053411,-0.111525,-0.056567


In [176]:
test_dates_prev = []
for date in test_df_test.index:
    while date not in days_df.index:
        date = str(np.datetime64(date)-1)
    test_dates_prev.append(date)

len(test_dates_prev)

1076

In [186]:
ss = SS()
scaled = ss.fit_transform(days_df)

nn = NN(n_neighbors = 15)
nn.fit(scaled)
neighbours = nn.kneighbors(X=days_df.loc[test_dates_prev])

In [244]:
preds = {}
for i, date in enumerate(test_dates):
    weights = neighbours[0][i] / np.sum(np.square(neighbours[0][i]))
    vals = days_df.iloc[neighbours[1][0]-1].transpose() * weights
    preds[date] = vals.transpose().sum()
    
    

In [257]:
preds_df = pd.DataFrame(preds).transpose()
uns_preds = pd.DataFrame(ss.inverse_transform(preds_df), index=preds_df.index)
uns_preds.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
2001-07-10,0.005414,-0.003244,-0.00279,-0.002607,0.003862,0.000838,0.004819,-0.002494,-0.002512,0.002333,...,-0.001919,-0.002547,-0.000593,0.003853,0.002074,-0.001742,0.00063,0.005074,-0.007429,0.005151
2014-02-12,0.004982,-0.0031,-0.002703,-0.002555,0.003667,0.000679,0.004539,-0.00238,-0.002462,0.002297,...,-0.001959,-0.002481,-0.000363,0.003335,0.002389,-0.001471,0.00047,0.004858,-0.006715,0.004751
2018-10-29,0.004061,-0.002744,-0.00237,-0.002254,0.003107,0.00022,0.003927,-0.002083,-0.002405,0.002119,...,-0.001948,-0.002515,7.7e-05,0.002292,0.002927,-0.001017,0.000103,0.004355,-0.005171,0.003928
2012-02-29,0.005414,-0.003257,-0.00282,-0.002653,0.003899,0.000861,0.004835,-0.002508,-0.002517,0.002356,...,-0.001912,-0.002529,-0.0006,0.00383,0.002087,-0.001697,0.000636,0.005108,-0.007445,0.005144
2003-04-22,0.005666,-0.003325,-0.002809,-0.002594,0.003945,0.000905,0.004975,-0.002542,-0.002541,0.002315,...,-0.001851,-0.002661,-0.000726,0.004158,0.001859,-0.00192,0.000736,0.00523,-0.007845,0.005379


In [284]:
true = pd.DataFrame(algo.pu, index=df.index)
true = true.loc[inds]
true.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
2001-07-10,0.120621,-0.018095,-0.022283,0.100175,-0.060575,-0.07748,0.144291,0.04023,-0.059655,0.097492,...,-0.125174,0.066826,-0.037192,0.178491,-0.124973,-0.083445,-0.05078,0.225695,0.038329,0.086916
2014-02-12,0.023591,-0.038009,-0.027012,-0.142473,0.020809,0.046929,0.082563,-0.060934,0.059851,0.045112,...,0.089732,-0.090784,-0.132467,0.035261,-0.006054,0.065521,-0.053941,0.077976,0.104975,-0.028011
2018-10-29,-0.003756,-0.048225,-0.013594,0.012971,-0.009267,0.216581,-0.07341,-0.03469,-0.02081,0.121531,...,-0.012624,-0.062942,0.006067,0.022961,0.090733,-0.077521,0.058229,-0.021905,-0.055225,-0.061579
2012-02-29,0.123621,-0.017243,0.04096,0.021408,0.030324,0.015108,-0.008295,0.003847,0.013248,0.023765,...,0.029704,-0.09654,-0.08498,-0.024416,-0.020232,0.02046,0.028583,-0.04592,-0.067717,0.058292
2003-04-22,-0.089416,0.103192,-0.083241,-0.120813,-0.148799,0.073967,-0.261958,-0.173291,0.483816,0.26019,...,0.031274,-0.101147,-0.012965,0.192092,-0.321017,-0.059583,0.100864,-0.37659,-0.293198,-0.015264


Predicted values are much smaller: as it finds neighbours over so many dimensions there can be greater variation -greater error- in more dimensions. As these are averaged they go some way to cancelling eachother out before being divided.

In [286]:
uns_preds.sum().sum()

17.888558487972375

In [287]:
true.sum().sum()

-23.292922062956823

In [291]:
list_preds = np.reshape(np.array(uns_preds), -1)

In [293]:
list_true = np.reshape(np.array(true), -1)

In [298]:
rmse = MSE(list_true, list_preds, squared=False)
rmse

0.19919319793731463

Try the same again but this time Not Scaled

In [300]:
nn = NN(n_neighbors = 15)
nn.fit(days_df)
neighbours = nn.kneighbors(X=days_df.loc[test_dates_prev])

preds = {}
for i, date in enumerate(test_dates):
    weights = neighbours[0][i] / np.sum(np.square(neighbours[0][i]))
    vals = days_df.iloc[neighbours[1][0]-1].transpose() * weights
    preds[date] = vals.transpose().sum()
    
preds_df = pd.DataFrame(preds).transpose()
preds_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
2001-07-10,-0.009003,0.024943,-0.010204,-0.019985,0.021546,0.043756,0.016364,-0.022423,-0.0522,-0.00742,...,-0.017647,-0.035142,-0.002357,-0.039094,-0.02149,-0.03589,-0.011408,0.001307,-0.064154,0.0033
2014-02-12,-0.005681,0.014913,-0.006647,-0.012092,0.013052,0.026437,0.009698,-0.013465,-0.03139,-0.004321,...,-0.010574,-0.021064,-0.001621,-0.023688,-0.013001,-0.021626,-0.006536,0.000936,-0.038562,0.002176
2018-10-29,-0.003411,0.008606,-0.003722,-0.006879,0.00737,0.014971,0.0055,-0.00763,-0.017813,-0.002539,...,-0.005846,-0.012085,-0.000906,-0.0135,-0.007213,-0.012313,-0.003948,0.000531,-0.021971,0.001316
2012-02-29,-0.009069,0.022993,-0.009322,-0.018,0.019363,0.038857,0.014468,-0.019985,-0.04686,-0.006248,...,-0.015368,-0.031561,-0.002292,-0.035289,-0.01872,-0.032274,-0.010761,0.000894,-0.057291,0.003474
2003-04-22,-0.010735,0.029336,-0.0137,-0.025037,0.026149,0.052838,0.019258,-0.027615,-0.062364,-0.009162,...,-0.020856,-0.041691,-0.003413,-0.04718,-0.026065,-0.04195,-0.011734,0.003336,-0.077418,0.004346


## Test
Some days are similar to others.   
1st test:    
A rough test I want to do to begin -Can tomorrow's changes be approximated by the changes seen in the first days after those most similar to today?
Not expecting too much from this -as its only going back one day- especially if PCA is needed.

In [301]:
ns_list_preds = np.reshape(np.array(preds_df), -1)
rmse = MSE(list_true, ns_list_preds, squared=False)
rmse

0.2003353087024881

So far I've been treating each day's values as inseparable. Also, putting each day separately into the SVD algorithm, I was  only able to look one day back. I hoped there was enough information in each day's state to show where it was going, but that was a bit too much to ask of it.       

I'm next going to go for an opposite approach: consider the day's values separately, but look further back in time.   
Next: LSTMs    
    
Another idea is to try nearest neighbours again but for the SVD, for each day, instead just that day's ~2500 stock values, use all the values for the last week (~12500 stock values) or multiple weeks

# LSTM