In [1]:
import cPickle
import numpy as np
import pandas as pd
import datetime
from sklearn import preprocessing

import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
import operator
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

from pandas_datareader import data, wb
import pandas_datareader as pdr
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
import re
from dateutil import parser
from bt import Strategy#, Portfolio
import quandl

In [2]:
def getStock(symbol, start, end):
    """
    Downloads Stock from Yahoo Finance.
    Computes daily Returns based on Adj Close.
    Returns pandas dataframe.
    """
    
    df =  pdr.data.get_data_yahoo(symbol, start, end)

    df.columns.values[-1] = 'AdjClose'
    df.columns = df.columns + '_' + symbol
    df['Return_%s' %symbol] = df['AdjClose_%s' %symbol].pct_change()
    
    print(df)
    
    return df

d1 = datetime.datetime(1995,5,1)
d2 = datetime.datetime(2017,5,1)

df = getStock("^IXIC", d1, d2)

df.columns

             Open_^IXIC   High_^IXIC    Low_^IXIC  Close_^IXIC  Volume_^IXIC  \
Date                                                                           
1995-05-01   844.719971   846.419983   841.630005   841.630005     315660000   
1995-05-02   841.929993   842.820007   839.429993   841.789978     348490000   
1995-05-03   842.750000   850.270020   842.750000   850.260010     398810000   
1995-05-04   851.500000   856.789978   846.070007   846.750000     421000000   
1995-05-05   849.070007   850.789978   842.570007   843.530029     331890000   
1995-05-08   843.479980   849.840027   842.760010   849.289978     320160000   
1995-05-09   850.419983   851.890015   845.359985   848.169983     381750000   
1995-05-10   850.479980   851.989990   845.750000   847.619995     380030000   
1995-05-11   847.119995   853.840027   846.130005   853.830017     390710000   
1995-05-12   853.739990   859.880005   852.520020   858.940002     404570000   
1995-05-15   859.739990   863.390015   8

Index([u'Open_^IXIC', u'High_^IXIC', u'Low_^IXIC', u'Close_^IXIC',
       u'Volume_^IXIC', u'AdjClose_^IXIC', u'Return_^IXIC'],
      dtype='object')

In [3]:
def getStockFromQuandl(symbol, name, start, end):
    """
    Downloads Stock from Quandl.
    Computes daily Returns based on Adj Close.
    Returns pandas dataframe.
    """
    import quandl
    df =  quandl.get(symbol, trim_start = start, trim_end = end)

    df.columns.values[-1] = 'AdjClose'
    df.columns = df.columns + '_' + name
    df['Return_%s' %name] = df['AdjClose_%s' %name].pct_change()
    
    return df

In [4]:
def getStockDataFromWeb(fout, start_string, end_string):
    """
    Collects predictors data from Yahoo Finance and Quandl.
    Returns a list of dataframes.
    """
    start = start_string
    end = end_string
    
    nasdaq = getStock('^IXIC', start, end)
    frankfurt = getStock('^GDAXI', start, end)
    london = getStock('^FTSE', start, end)
    paris = getStock('^FCHI', start, end)
    hkong = getStock('^HSI', start, end)
    nikkei = getStock('^N225', start, end)
    australia = getStock('^AXJO', start, end)
    djia = getStock('^DJI', start, end)
#    gspc = getStock('^GSPC', start, end)
    
#    djia = getStockFromQuandl("YAHOO/INDEX_DJI", 'Djia', start_string, end_string) 
    
    out =  pdr.data.get_data_yahoo(fout, start, end)
    out.columns.values[-1] = 'AdjClose'
    out.columns = out.columns + '_Out'
    out['Return_Out'] = out['AdjClose_Out'].pct_change()
    
    return [out, nasdaq, djia, frankfurt, london, paris, hkong, nikkei, australia]

d1 = datetime.datetime(1995,5,1)
d2 = datetime.datetime(2017,5,1)

#df = getStock

bigDF = getStockDataFromWeb('GSPC',d1, d2)

             Open_^IXIC   High_^IXIC    Low_^IXIC  Close_^IXIC  Volume_^IXIC  \
Date                                                                           
1995-05-01   844.719971   846.419983   841.630005   841.630005     315660000   
1995-05-02   841.929993   842.820007   839.429993   841.789978     348490000   
1995-05-03   842.750000   850.270020   842.750000   850.260010     398810000   
1995-05-04   851.500000   856.789978   846.070007   846.750000     421000000   
1995-05-05   849.070007   850.789978   842.570007   843.530029     331890000   
1995-05-08   843.479980   849.840027   842.760010   849.289978     320160000   
1995-05-09   850.419983   851.890015   845.359985   848.169983     381750000   
1995-05-10   850.479980   851.989990   845.750000   847.619995     380030000   
1995-05-11   847.119995   853.840027   846.130005   853.830017     390710000   
1995-05-12   853.739990   859.880005   852.520020   858.940002     404570000   
1995-05-15   859.739990   863.390015   8

In [5]:
def addFeatures(dataframe, adjclose, returns, n):
    """
    operates on two columns of dataframe:
    - n >= 2
    - given Return_* computes the return of day i respect to day i-n. 
    - given AdjClose_* computes its moving average on n days

    """
    
    return_n = adjclose[9:] + "Time" + str(n)
    dataframe[return_n] = dataframe[adjclose].pct_change(n)
    
    roll_n = returns[7:] + "RolMean" + str(n)
#    dataframe[roll_n] = pd.rolling_mean(dataframe[returns], n)
    dataframe[roll_n] = pd.DataFrame(dataframe[returns]).rolling(window = n).mean()



In [6]:
def applyRollMeanDelayedReturns(datasets, delta):
    """
    applies rolling mean and delayed returns to each dataframe in the list
    """
    for dataset in datasets:
        columns = dataset.columns    
        adjclose = columns[-2]
        returns = columns[-1]
        for n in delta:
            addFeatures(dataset, adjclose, returns, n)
    
    return datasets    

delta = {2}

h = applyRollMeanDelayedReturns(bigDF, delta)

In [7]:
h[0]

Unnamed: 0_level_0,Open_Out,High_Out,Low_Out,Close_Out,Volume_Out,AdjClose_Out,Return_Out,OutTime2,OutRolMean2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-11-04,25.4500,25.4800,25.4000,25.4000,797000,14.3241,,,
2005-11-07,25.4800,25.4800,25.4000,25.4000,13200,14.3241,0.000000,,
2005-11-08,25.4000,25.4100,25.4000,25.4100,26500,14.3298,0.000398,0.000398,0.000199
2005-11-09,25.4000,25.4100,25.4000,25.4100,210000,14.3298,0.000000,0.000398,0.000199
2005-11-10,25.4300,25.4750,25.4200,25.4600,700000,14.3580,0.001968,0.001968,0.000984
2005-11-11,25.5000,25.5000,25.5000,25.5000,1900,14.3805,0.001567,0.003538,0.001767
2005-11-14,25.5500,25.5500,25.5500,25.5500,2000,14.4087,0.001961,0.003531,0.001764
2005-11-15,25.3700,25.5700,25.3500,25.5500,196400,14.4087,0.000000,0.001961,0.000980
2005-11-16,25.5500,25.5500,25.5500,25.5500,800,14.4087,0.000000,0.000000,0.000000
2005-11-17,25.5400,25.5400,25.5400,25.5400,300,14.4031,-0.000389,-0.000389,-0.000194


In [10]:
def mergeDataframes(datasets, index, cut):
    """
    merges datasets in the list 
    """
    subset = []#tion
    subset = [dataset.iloc[:, index:] for dataset in datasets[1:]]
    
    first = subset[0].join(subset[1:], how = 'outer')
    finance = datasets[0].iloc[:, index:].join(first, how = 'left') 
    finance = finance[finance.index > cut]
    return finance

index = 6

cut = datetime.datetime(2005,11,3)

mergedDF = mergeDataframes(bigDF, index, cut)

mergedDF

Unnamed: 0_level_0,Return_Out,OutTime2,OutRolMean2,Return_^IXIC,^IXICTime2,^IXICRolMean2,Return_^DJI,^DJITime2,^DJIRolMean2,Return_^GDAXI,...,^FCHIRolMean2,Return_^HSI,^HSITime2,^HSIRolMean2,Return_^N225,^N225Time2,^N225RolMean2,Return_^AXJO,^AXJOTime2,^AXJORolMean2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-11-04,,,,0.004263,0.011715,0.005842,0.000776,0.005541,0.002769,-0.003145,...,0.007800,-0.001082,-0.000801,-0.000400,0.013039,0.015006,0.007490,0.001285,0.014182,0.007083
2005-11-07,0.000000,,,0.004061,0.008342,0.004162,0.005267,0.006048,0.003022,0.005798,...,0.000181,-0.015083,-0.016149,-0.008083,-0.001020,0.012006,0.006010,-0.001549,-0.000266,-0.000132
2005-11-08,0.000398,0.000398,0.000199,-0.002833,0.001217,0.000614,-0.004394,0.000851,0.000437,-0.003059,...,0.000547,0.002604,-0.012518,-0.006240,-0.001769,-0.002787,-0.001394,0.009307,0.007744,0.003879
2005-11-09,0.000000,0.000398,0.000199,0.001722,-0.001116,-0.000555,0.000616,-0.003780,-0.001889,0.000509,...,-0.002600,0.013494,0.016133,0.008049,0.002527,0.000754,0.000379,-0.006236,0.003014,0.001536
2005-11-10,0.001968,0.001968,0.000984,0.009592,0.011330,0.005657,0.008903,0.009524,0.004759,0.000832,...,-0.002678,0.002451,0.015978,0.007972,0.000617,0.003145,0.001572,0.003889,-0.002371,-0.001173
2005-11-11,0.001567,0.003538,0.001767,0.002636,0.012253,0.006114,0.004318,0.013259,0.006610,0.014993,...,0.006280,0.007330,0.009800,0.004891,0.005268,0.005888,0.002942,0.006735,0.010649,0.005312
2005-11-14,0.001961,0.003531,0.001764,-0.000690,0.001944,0.000973,0.001042,0.005364,0.002680,0.000330,...,0.007679,-0.007538,-0.000262,-0.000104,-0.002757,0.002497,0.001256,0.007455,0.014239,0.007095
2005-11-15,0.000000,0.001961,0.000980,-0.006456,-0.007142,-0.003573,-0.001003,0.000037,0.000019,0.003570,...,0.000736,-0.000142,-0.007679,-0.003840,-0.001719,-0.004471,-0.002238,-0.001128,0.006318,0.003163
2005-11-16,0.000000,0.000000,0.000000,0.000544,-0.005916,-0.002956,-0.001093,-0.002095,-0.001048,-0.005704,...,-0.003997,0.001581,0.001439,0.000720,0.005613,0.003884,0.001947,0.003889,0.002756,0.001380
2005-11-17,-0.000389,-0.000389,-0.000194,0.014868,0.015420,0.007706,0.004259,0.003161,0.001583,0.003594,...,-0.002189,0.009381,0.010977,0.005481,0.017001,0.022710,0.011307,0.001731,0.005626,0.002810


In [11]:
def applyTimeLag(dataset, lags, delta):
    """
    apply time lag to return columns selected according  to delta.
    Days to lag are contained in the lags list passed as argument.
    Returns a NaN free dataset obtained cutting the lagged dataset
    at head and tail
    """
    
    dataset.Return_Out = dataset.Return_Out.shift(-1)
    
    maxLag = max(lags)
 
    allColumns = dataset.columns[:]
  
    for column in allColumns:
        data_col = pd.Series(dataset[column])
        data_col = data_col.interpolate(method='linear', axis=0, limit=None, inplace=False, limit_direction='forward', downcast=None)
        dataset[column] = data_col

    return dataset.iloc[maxLag:-1,:]

lags = {0, 1, 2}

hhh = applyTimeLag(mergedDF, lags, delta)

hhh

Unnamed: 0_level_0,Return_Out,OutTime2,OutRolMean2,Return_^IXIC,^IXICTime2,^IXICRolMean2,Return_^DJI,^DJITime2,^DJIRolMean2,Return_^GDAXI,...,^FCHIRolMean2,Return_^HSI,^HSITime2,^HSIRolMean2,Return_^N225,^N225Time2,^N225RolMean2,Return_^AXJO,^AXJOTime2,^AXJORolMean2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-11-08,0.000000,0.000398,0.000199,-0.002833,0.001217,0.000614,-0.004394,0.000851,0.000437,-0.003059,...,0.000547,0.002604,-0.012518,-0.006240,-0.001769,-0.002787,-0.001394,0.009307,0.007744,0.003879
2005-11-09,0.001968,0.000398,0.000199,0.001722,-0.001116,-0.000555,0.000616,-0.003780,-0.001889,0.000509,...,-0.002600,0.013494,0.016133,0.008049,0.002527,0.000754,0.000379,-0.006236,0.003014,0.001536
2005-11-10,0.001567,0.001968,0.000984,0.009592,0.011330,0.005657,0.008903,0.009524,0.004759,0.000832,...,-0.002678,0.002451,0.015978,0.007972,0.000617,0.003145,0.001572,0.003889,-0.002371,-0.001173
2005-11-11,0.001961,0.003538,0.001767,0.002636,0.012253,0.006114,0.004318,0.013259,0.006610,0.014993,...,0.006280,0.007330,0.009800,0.004891,0.005268,0.005888,0.002942,0.006735,0.010649,0.005312
2005-11-14,0.000000,0.003531,0.001764,-0.000690,0.001944,0.000973,0.001042,0.005364,0.002680,0.000330,...,0.007679,-0.007538,-0.000262,-0.000104,-0.002757,0.002497,0.001256,0.007455,0.014239,0.007095
2005-11-15,0.000000,0.001961,0.000980,-0.006456,-0.007142,-0.003573,-0.001003,0.000037,0.000019,0.003570,...,0.000736,-0.000142,-0.007679,-0.003840,-0.001719,-0.004471,-0.002238,-0.001128,0.006318,0.003163
2005-11-16,-0.000389,0.000000,0.000000,0.000544,-0.005916,-0.002956,-0.001093,-0.002095,-0.001048,-0.005704,...,-0.003997,0.001581,0.001439,0.000720,0.005613,0.003884,0.001947,0.003889,0.002756,0.001380
2005-11-17,-0.004305,-0.000389,-0.000194,0.014868,0.015420,0.007706,0.004259,0.003161,0.001583,0.003594,...,-0.002189,0.009381,0.010977,0.005481,0.017001,0.022710,0.011307,0.001731,0.005626,0.002810
2005-11-18,-0.000788,-0.004692,-0.002347,0.002977,0.017889,0.008922,0.004301,0.008578,0.004280,0.004663,...,0.004878,0.006447,0.015889,0.007914,0.014664,0.031914,0.015832,0.009073,0.010820,0.005402
2005-11-21,0.005506,-0.005089,-0.002546,0.006556,0.009552,0.004766,0.005011,0.009334,0.004656,0.009195,...,0.006962,0.000151,0.006599,0.003299,0.003919,0.018640,0.009291,-0.003618,0.005422,0.002728


In [12]:
def prepareDataForCrossValidation(dataset):
    """
    generate categorical output column, set features and return whole data set
    """
    le = preprocessing.LabelEncoder()
    
    dataset['UpDown'] = dataset['Return_Out']
    dataset.UpDown[dataset.UpDown >= 0] = 'Up'
    dataset.UpDown[dataset.UpDown < 0] = 'Down'
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
    
    features = dataset.columns[1:-1]
    X = dataset[features]    
    y = dataset.UpDown
    return [X,y]

[X, y] = prepareDataForCrossValidation(hhh)
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

Unnamed: 0_level_0,OutTime2,OutRolMean2,Return_^IXIC,^IXICTime2,^IXICRolMean2,Return_^DJI,^DJITime2,^DJIRolMean2,Return_^GDAXI,^GDAXITime2,...,^FCHIRolMean2,Return_^HSI,^HSITime2,^HSIRolMean2,Return_^N225,^N225Time2,^N225RolMean2,Return_^AXJO,^AXJOTime2,^AXJORolMean2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-11-08,0.000398,0.000199,-0.002833,0.001217,0.000614,-0.004394,0.000851,0.000437,-0.003059,0.002721,...,0.000547,0.002604,-0.012518,-0.006240,-0.001769,-0.002787,-0.001394,0.009307,0.007744,0.003879
2005-11-09,0.000398,0.000199,0.001722,-0.001116,-0.000555,0.000616,-0.003780,-0.001889,0.000509,-0.002552,...,-0.002600,0.013494,0.016133,0.008049,0.002527,0.000754,0.000379,-0.006236,0.003014,0.001536
2005-11-10,0.001968,0.000984,0.009592,0.011330,0.005657,0.008903,0.009524,0.004759,0.000832,0.001342,...,-0.002678,0.002451,0.015978,0.007972,0.000617,0.003145,0.001572,0.003889,-0.002371,-0.001173
2005-11-11,0.003538,0.001767,0.002636,0.012253,0.006114,0.004318,0.013259,0.006610,0.014993,0.015838,...,0.006280,0.007330,0.009800,0.004891,0.005268,0.005888,0.002942,0.006735,0.010649,0.005312
2005-11-14,0.003531,0.001764,-0.000690,0.001944,0.000973,0.001042,0.005364,0.002680,0.000330,0.015328,...,0.007679,-0.007538,-0.000262,-0.000104,-0.002757,0.002497,0.001256,0.007455,0.014239,0.007095
2005-11-15,0.001961,0.000980,-0.006456,-0.007142,-0.003573,-0.001003,0.000037,0.000019,0.003570,0.003901,...,0.000736,-0.000142,-0.007679,-0.003840,-0.001719,-0.004471,-0.002238,-0.001128,0.006318,0.003163
2005-11-16,0.000000,0.000000,0.000544,-0.005916,-0.002956,-0.001093,-0.002095,-0.001048,-0.005704,-0.002154,...,-0.003997,0.001581,0.001439,0.000720,0.005613,0.003884,0.001947,0.003889,0.002756,0.001380
2005-11-17,-0.000389,-0.000194,0.014868,0.015420,0.007706,0.004259,0.003161,0.001583,0.003594,-0.002131,...,-0.002189,0.009381,0.010977,0.005481,0.017001,0.022710,0.011307,0.001731,0.005626,0.002810
2005-11-18,-0.004692,-0.002347,0.002977,0.017889,0.008922,0.004301,0.008578,0.004280,0.004663,0.008273,...,0.004878,0.006447,0.015889,0.007914,0.014664,0.031914,0.015832,0.009073,0.010820,0.005402
2005-11-21,-0.005089,-0.002546,0.006556,0.009552,0.004766,0.005011,0.009334,0.004656,0.009195,0.013901,...,0.006962,0.000151,0.006599,0.003299,0.003919,0.018640,0.009291,-0.003618,0.005422,0.002728


In [13]:
def prepareDataForClassification(dataset, start_test):
    """
    generates categorical output column, attach to dataframe 
    label the categories and split into train and test
    """
    le = preprocessing.LabelEncoder()
    
    dataset['UpDown'] = dataset['Return_Out']
    dataset.UpDown[dataset.UpDown >= 0] = 'Up'
    dataset.UpDown[dataset.UpDown < 0] = 'Down'
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
    
    features = dataset.columns[1:-1]
    X = dataset[features]    
    y = dataset.UpDown    
        
    X_train = X[X.index < start_test]
    y_train = y[y.index < start_test]              
    
    X_test = X[X.index >= start_test]    
    y_test = y[y.index >= start_test]
    
    return [X_train, y_train, X_test, y_test]  


[X_train, y_train, X_test, y_test] = prepareDataForClassification(hhh, datetime.datetime(2016, 5, 1))

print X_train

print y_train

print X_test

print y_test


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


            OutTime2  OutRolMean2  Return_^IXIC  ^IXICTime2  ^IXICRolMean2  \
Date                                                                         
2005-11-08  0.000398     0.000199     -0.002833    0.001217       0.000614   
2005-11-09  0.000398     0.000199      0.001722   -0.001116      -0.000555   
2005-11-10  0.001968     0.000984      0.009592    0.011330       0.005657   
2005-11-11  0.003538     0.001767      0.002636    0.012253       0.006114   
2005-11-14  0.003531     0.001764     -0.000690    0.001944       0.000973   
2005-11-15  0.001961     0.000980     -0.006456   -0.007142      -0.003573   
2005-11-16  0.000000     0.000000      0.000544   -0.005916      -0.002956   
2005-11-17 -0.000389    -0.000194      0.014868    0.015420       0.007706   
2005-11-18 -0.004692    -0.002347      0.002977    0.017889       0.008922   
2005-11-21 -0.005089    -0.002546      0.006556    0.009552       0.004766   
2005-11-22  0.004714     0.002359      0.005304    0.011895     

In [14]:
def performClassification(X_train, y_train, X_test, y_test, method, parameters, fout, savemodel):
    """
    performs classification on daily returns using several algorithms (method).
    method --> string algorithm
    parameters --> list of parameters passed to the classifier (if any)
    fout --> string with name of stock to be predicted
    savemodel --> boolean. If TRUE saves the model to pickle file
    """
   
    if method == 'RF':   
        return performRFClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)
        
    elif method == 'KNN':
        return performKNNClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)
    
    elif method == 'SVM':   
        return performSVMClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)
    
    elif method == 'ADA':
        return performAdaBoostClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)
    
    elif method == 'GTB': 
        return performGTBClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)

    elif method == 'QDA': 
        return performQDAClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)

In [25]:
def performRFClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Random Forest Binary Classification
    """
    clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    clf.fit(X_train, y_train)
    
    if savemodel == True:
        fname_out = '{}_{}_RFClass.pickle'.format(fout, pd.datetime.today().strftime("%m-%d-%Y"))
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)
        f.close()
    clf.feature_importances_
    accuracy = clf.score(X_test, y_test)
    
  #  result = clf.predict(datetime.datetime(2016,5,2))
  #  result
    
    return accuracy, clf.feature_importances_
performRFClass (X_train, y_train, X_test, y_test,2, '^GSPC', 1)

(0.49402390438247012,
 array([ 0.04322427,  0.04423847,  0.04174779,  0.03742108,  0.03680862,
         0.04416843,  0.03648032,  0.03640215,  0.04076629,  0.03424219,
         0.03401312,  0.04042608,  0.03539479,  0.03592187,  0.04320535,
         0.03400552,  0.03487776,  0.04109037,  0.03542281,  0.03607304,
         0.04020278,  0.03743503,  0.03714933,  0.0453831 ,  0.03711585,
         0.03678358]))

In [17]:
def performKNNClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    KNN binary Classification
    """
    clf = neighbors.KNeighborsClassifier()
    clf.fit(X_train, y_train)

    if savemodel == True:
        fname_out = '{}_{}_KNNClass.pickle'.format(fout, pd.datetime.today().strftime("%m-%d-%Y"))
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)    
    
    accuracy = clf.score(X_test, y_test)
    
    return accuracy
performKNNClass (X_train, y_train, X_test, y_test,2, '^GSPC', 1)

0.50199203187250996

In [18]:
def performSVMClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    SVM binary Classification
    """
#    c = parameters[0]
#    g =  parameters[1]
    clf = SVC()
    clf.fit(X_train, y_train)

    if savemodel == True:
        fname_out = '{}-{}_SVM.pickle'.format(fout, pd.datetime.today().strftime("%m-%d-%Y"))
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)    
    
    accuracy = clf.score(X_test, y_test)
    
    return accuracy
performSVMClass (X_train, y_train, X_test, y_test,2, '^GSPC', 1)

0.53784860557768921

In [19]:
def performAdaBoostClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Ada Boosting binary Classification
    """
#    n = parameters[0]
#    l =  parameters[1]
    clf = AdaBoostClassifier()
    clf.fit(X_train, y_train)

    if savemodel == True:
        fname_out = '{}-{}_AdaBoostClass.pickle'.format(fout, pd.datetime.today().strftime("%m-%d-%Y"))
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)    
    
    accuracy = clf.score(X_test, y_test)
    
    return accuracy
performAdaBoostClass (X_train, y_train, X_test, y_test,2, '^GSPC', 1)

0.54183266932270913

In [20]:
def performGTBClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Gradient Tree Boosting binary Classification
    """
    clf = GradientBoostingClassifier(n_estimators=100)
    clf.fit(X_train, y_train)

    if savemodel == True:
        fname_out = '{}-{}_GTBClass.pickle'.format(fout, pd.datetime.today().strftime("%m-%d-%Y"))
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)    
    
    accuracy = clf.score(X_test, y_test)
    
    return accuracy
performGTBClass(X_train, y_train, X_test, y_test,2, '^GSPC', 1)

0.54581673306772904

In [22]:
def performQDAClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Quadratic Discriminant Analysis binary Classification
    """
   # def replaceTiny(x):
   #     if (abs(x) < 0.0001):
   #         x = 0.0001
    
   # X_train = X_train.apply(replaceTiny)
   # X_test = X_test.apply(replaceTiny)
    
    clf = QDA()
    clf.fit(X_train, y_train)

    if savemodel == True:
        fname_out = '{}-{}_QDAClass.pickle'.format(fout, pd.datetime.today().strftime("%m-%d-%Y"))
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)    
    
    accuracy = clf.score(X_test, y_test)
    
    return accuracy

performQDAClass(X_train, y_train, X_test, y_test,2, '^GSPC', 1)

0.52589641434262946

In [49]:
def performFeatureSelection(maxdeltas, maxlags, fout, cut, start_test, path_datasets, savemodel, method, folds, parameters):
    """
    Performs Feature selection for a specific algorithm
    """
    
    for maxlag in range(3, maxlags + 2):
        lags = range(2, maxlag) 
        print ''
        print '============================================================='
        print 'Maximum time lag applied', max(lags)
        print ''
        for maxdelta in range(3, maxdeltas + 2):
            datasets = loadDatasets(path_datasets, fout)
            delta = range(2, maxdelta) 
            print 'Delta days accounted: ', max(delta)
            datasets = applyRollMeanDelayedReturns(datasets, delta)
            finance = mergeDataframes(datasets, 6, cut)
            print 'Size of data frame: ', finance.shape
            print 'Number of NaN after merging: ', count_missing(finance)
            finance = finance.interpolate(method='linear')
            print 'Number of NaN after time interpolation: ', count_missing(finance)
            finance = finance.fillna(finance.mean())
            print 'Number of NaN after mean interpolation: ', count_missing(finance)    
            finance = applyTimeLag(finance, lags, delta)
            print 'Number of NaN after temporal shifting: ', count_missing(finance)
            print 'Size of data frame after feature creation: ', finance.shape
            X_train, y_train, X_test, y_test  = prepareDataForClassification(finance, start_test)
            
            print performCV(X_train, y_train, folds, method, parameters, fout, savemodel)
            print ''

In [24]:
def performTimeSeriesCV(X_train, y_train, number_folds, algorithm, parameters, fout, savemodel):
    """
    Given X_train and y_train (the test set is excluded from the Cross Validation),
    number of folds, the ML algorithm to implement and the parameters to test,
    the function acts based on the following logic: it splits X_train and y_train in a
    number of folds equal to number_folds. Then train on one fold and tests accuracy
    on the consecutive as follows:
    - Train on fold 1, test on 2
    - Train on fold 1-2, test on 3
    - Train on fold 1-2-3, test on 4
    ....
    Returns mean of test accuracies.
    """

    print 'Parameters --------------------------------> ', parameters
    print 'Size train set: ', X_train.shape
    
    # k is the size of each fold. It is computed dividing the number of 
    # rows in X_train by number_folds. This number is floored and coerced to int
    k = int(np.floor(float(X_train.shape[0]) / number_folds))
    print 'Size of each fold: ', k
    
    # initialize to zero the accuracies array. It is important to stress that
    # in the CV of Time Series if I have n folds I test n-1 folds as the first
    # one is always needed to train
    accuracies = np.zeros(number_folds-1)

    # loop from the first 2 folds to the total number of folds    
    for i in range(2, number_folds + 1):
        print ''
        
        # the split is the percentage at which to split the folds into train
        # and test. For example when i = 2 we are taking the first 2 folds out 
        # of the total available. In this specific case we have to split the
        # two of them in half (train on the first, test on the second), 
        # so split = 1/2 = 0.5 = 50%. When i = 3 we are taking the first 3 folds 
        # out of the total available, meaning that we have to split the three of them
        # in two at split = 2/3 = 0.66 = 66% (train on the first 2 and test on the
        # following)
        split = float(i-1)/i
        
        # example with i = 4 (first 4 folds):
        #      Splitting the first       4        chunks at          3      /        4
        print 'Splitting the first ' + str(i) + ' chunks at ' + str(i-1) + '/' + str(i) 
        
        # as we loop over the folds X and y are updated and increase in size.
        # This is the data that is going to be split and it increases in size 
        # in the loop as we account for more folds. If k = 300, with i starting from 2
        # the result is the following in the loop
        # i = 2
        # X = X_train[:(600)]
        # y = y_train[:(600)]
        #
        # i = 3
        # X = X_train[:(900)]
        # y = y_train[:(900)]
        # .... 
        X = X_train[:(k*i)]
        y = y_train[:(k*i)]
        print 'Size of train + test: ', X.shape # the size of the dataframe is going to be k*i

        # X and y contain both the folds to train and the fold to test.
        # index is the integer telling us where to split, according to the
        # split percentage we have set above
        index = int(np.floor(X.shape[0] * split))
        
        # folds used to train the model        
        X_trainFolds = X[:index]        
        y_trainFolds = y[:index]
        
        # fold used to test the model
        X_testFolds = X[(index + 1):]
        y_testFolds = y[(index + 1):]
        
        # i starts from 2 so the zeroth element in accuracies array is i-2. performClassification() is a function 
        #which takes care of a classification problem. This is only an example and you can replace this function with 
        #whatever ML approach you need.
        print "str(i): " + str(i)
        accuracies[i-2] = performClassification(X_train, y_train, X_test, y_test, algorithm, parameters, fout, savemodel)
        
        #performQDAClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)
        #performQDAClass(X_trainFolds, y_trainFolds, X_testFolds, y_testFolds, parameters, fout, savemodel)
        #works...
        #performQDAClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)

        #(X_trainFolds, y_trainFolds, X_testFolds, y_testFolds, algorithm, parameters, fout, savemodel)
        #performClassification(X_trainFolds, y_trainFolds, X_testFolds, y_testFolds, algorithm, parameters, fout, savemodel)
        
        # example with i = 4:
        #      Accuracy on fold         4     :    0.85423
        
        print 'Accuracy on fold ' + str(i) + ': ', accuracies[i-2]
    
    # the function returns the mean of the accuracy on the n-1 folds    
    return accuracies.mean()

performTimeSeriesCV(X, y, 10, 'QDA', [], '^GSPC', 1)

Parameters -------------------------------->  []
Size train set:  (2879, 26)
Size of each fold:  287

Splitting the first 2 chunks at 1/2
Size of train + test:  (574, 26)
str(i): 2
Accuracy on fold 2:  0.541832669323

Splitting the first 3 chunks at 2/3
Size of train + test:  (861, 26)
str(i): 3
Accuracy on fold 3:  0.541832669323

Splitting the first 4 chunks at 3/4
Size of train + test:  (1148, 26)
str(i): 4
Accuracy on fold 4:  0.541832669323

Splitting the first 5 chunks at 4/5
Size of train + test:  (1435, 26)
str(i): 5
Accuracy on fold 5:  0.541832669323

Splitting the first 6 chunks at 5/6
Size of train + test:  (1722, 26)
str(i): 6
Accuracy on fold 6:  0.541832669323

Splitting the first 7 chunks at 6/7
Size of train + test:  (2009, 26)
str(i): 7
Accuracy on fold 7:  0.541832669323

Splitting the first 8 chunks at 7/8
Size of train + test:  (2296, 26)
str(i): 8
Accuracy on fold 8:  0.541832669323

Splitting the first 9 chunks at 8/9
Size of train + test:  (2583, 26)
str(i): 9
A

0.54183266932270913