# Python Programming for Finance: Preprocessing for Machine Learning

## Label all the data

In [1]:
import numpy as np
import pandas as pd
import pickle

In [24]:
def process_data_for_labels(ticker, hm_days=7, csv_path='sp500_joined_closes.csv'):
    """Load the joined close prices and add forward-return columns for one ticker.

    For every horizon ``i`` in ``1..hm_days`` a column named ``'<ticker>_<i>d'``
    is added holding ``(price[t + i] - price[t]) / price[t]``, i.e. the
    fractional change of ``ticker`` ``i`` rows into the future.

    Parameters
    ----------
    ticker : str
        Column name (in the CSV) of the stock to compute forward returns for.
    hm_days : int, optional
        Number of look-ahead horizons to generate (default 7, the value that
        was previously hard-coded).
    csv_path : str, optional
        Path of the CSV to read; its first column is used as the index.

    Returns
    -------
    tickers : list
        The column names of the CSV as loaded, before any columns were added.
    df : pandas.DataFrame
        The loaded frame plus the ``hm_days`` forward-return columns, with
        NaN values replaced by 0.

    Notes
    -----
    Missing prices are filled with 0 *before* the returns are computed, so a
    row whose price is 0 yields ``inf`` (or NaN, then 0) in the new columns;
    only NaN is cleaned up here, ``inf`` is left for the caller to handle.
    The last ``hm_days`` rows have no future price for some horizons and end
    up with a forward return of 0 there.
    """
    df = pd.read_csv(csv_path, index_col=0)
    # Snapshot the original columns before the forward-return columns are added.
    tickers = df.columns.values.tolist()
    df = df.fillna(0)

    price = df[ticker]
    for i in range(1, hm_days + 1):
        # shift(-i) pulls the price from i rows ahead onto the current row.
        df['{}_{}d'.format(ticker, i)] = (price.shift(-i) - price) / price

    # Shifting past the end of the frame (and 0/0 divisions) produce NaN.
    df = df.fillna(0)
    return tickers, df
    

In [25]:
# Build the forward-return columns for XOM and preview the first rows.
tickers,df = process_data_for_labels('XOM')
df.head()

Unnamed: 0_level_0,MMM,ABT,ABBV,ABMD,ACN,ATVI,ADBE,AMD,AAP,AES,...,ZBH,ZION,ZTS,XOM_1d,XOM_2d,XOM_3d,XOM_4d,XOM_5d,XOM_6d,XOM_7d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1999-12-31,30.180643,10.078859,0.0,18.375,0.0,1.15106,16.693562,14.46875,0.0,29.916197,...,0.0,44.564438,0.0,-0.027929,-0.046547,0.005431,0.057409,0.054306,0.039566,0.042669
2000-01-03,29.101397,9.714566,0.0,18.25,0.0,1.235628,16.274673,15.5,0.0,29.015705,...,0.0,41.78801,0.0,-0.019154,0.034318,0.087789,0.084597,0.069433,0.072626,0.066241
2000-01-04,27.945047,9.437006,0.0,17.8125,0.0,1.198042,14.909401,14.625,0.0,27.865088,...,0.0,39.764465,0.0,0.054516,0.109032,0.105777,0.090317,0.093572,0.087062,0.112286
2000-01-05,28.75449,9.41966,0.0,18.0,0.0,1.20274,15.204173,15.0,0.0,28.165257,...,0.0,39.717422,0.0,0.051698,0.048611,0.033951,0.037037,0.030864,0.054784,0.033951
2000-01-06,31.067183,9.74926,0.0,18.03125,0.0,1.179249,15.32829,16.0,0.0,28.390371,...,0.0,40.282116,0.0,-0.002935,-0.016874,-0.01394,-0.01981,0.002934,-0.016874,-0.005136


# Creating Targets For Machine Learning 

In [26]:
def buy_sell_hold(*args, requirement=0.02):
    """Map a sequence of future fractional price changes to a trading label.

    The values are scanned in the order given and the first one that crosses
    the threshold decides the label.

    Parameters
    ----------
    *args : float
        Fractional price changes (e.g. 0.03 for +3%), one per look-ahead
        horizon.
    requirement : float, optional
        Threshold a change must exceed in magnitude to trigger a signal.
        Keyword-only; defaults to 0.02 (2%).

    Returns
    -------
    int
        ``1`` (buy) if the first threshold-crossing change is above
        ``+requirement``, ``-1`` (sell) if it is below ``-requirement``,
        and ``0`` (hold) if no change crosses the threshold.
    """
    for change in args:
        # If the price rises by more than the requirement, we buy.
        if change > requirement:
            return 1
        # If the price falls by more than the requirement, we sell.
        if change < -requirement:
            return -1
    # No horizon moved far enough in either direction: hold.
    return 0

In [27]:
from collections import Counter
def extract_featuresets(ticker):
    """Build the feature matrix and buy/sell/hold labels for one ticker.

    Parameters
    ----------
    ticker : str
        Ticker whose forward returns are turned into labels.

    Returns
    -------
    X : numpy.ndarray
        Row-over-row percent change of every original ticker column, with
        inf and NaN replaced by 0.
    y : numpy.ndarray
        The ``'<ticker>_target'`` labels produced by ``buy_sell_hold``.
    df : pandas.DataFrame
        The labelled frame after rows containing inf/NaN were dropped.

    Notes
    -----
    The "Data Spread" line is printed before rows are dropped, so its counts
    can be larger than the number of labels returned in ``y``.
    """
    tickers, df = process_data_for_labels(ticker)
    target_col = '{}_target'.format(ticker)

    # The forward-return columns are exactly the ones the helper appended to
    # the original ticker columns; deriving them here (instead of spelling out
    # _1d.._7d) keeps the labels in step with the helper's look-ahead window.
    original_cols = set(tickers)
    horizon_cols = [col for col in df.columns if col not in original_cols]

    # map() walks the horizon columns in lockstep, one row at a time.
    df[target_col] = list(map(buy_sell_hold, *(df[col] for col in horizon_cols)))

    labels = df[target_col].values.tolist()
    print("Data Spread", Counter(str(label) for label in labels))

    # Rows whose forward returns blew up to +/-inf are discarded.
    df = df.fillna(0).replace([np.inf, -np.inf], np.nan).dropna()

    # Features: percent change of every company's price from the previous row.
    df_vals = df[tickers].pct_change()
    df_vals = df_vals.replace([np.inf, -np.inf], 0).fillna(0)

    X = df_vals.values
    y = df[target_col].values

    return X, y, df


In [28]:
# Run the feature extraction for AAPL; the cell displays the returned (X, y, df) tuple.
extract_featuresets('AAPL')

Data Spread Counter({'1': 2432, '-1': 2030, '0': 486})


(array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.03575954, -0.03614427,  0.        , ...,  0.        ,
         -0.06230143,  0.        ],
        [-0.03973521, -0.02857153,  0.        , ...,  0.        ,
         -0.04842406,  0.        ],
        ...,
        [ 0.011557  ,  0.00681403,  0.00609095, ...,  0.0196283 ,
          0.01342107,  0.00423187],
        [ 0.01783559,  0.00712418,  0.00227048, ...,  0.00758339,
          0.01874063,  0.00723545],
        [ 0.00848092,  0.0058949 , -0.00724865, ...,  0.00738173,
          0.00784891, -0.00205243]]),
 array([ 1, -1, -1, ...,  0,  0,  0]),
                    MMM        ABT       ABBV        ABMD         ACN  \
 Date                                                                   
 1999-12-31   30.180643  10.078859   0.000000   18.375000    0.000000   
 2000-01-03   29.101397   9.714566   0.000000   18.250000    0.000000   
 2000-01-04   27.945047   9.437006   0.000

# Machine Learning Classification

In [29]:
from sklearn import svm,cross_validation,neighbors
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

In [37]:
def do_ml(ticker, test_size=0.25, random_state=None):
    """Train a voting classifier on one ticker's feature set and score it.

    Parameters
    ----------
    ticker : str
        Ticker whose buy/sell/hold labels are predicted.
    test_size : float, optional
        Fraction of the samples held out for testing (default 0.25).
    random_state : int or None, optional
        Seed forwarded to the train/test split and to the estimators that
        accept one. The default ``None`` keeps the original unseeded
        behaviour; pass an integer for reproducible results.

    Returns
    -------
    float
        Accuracy of the classifier on the held-out test split.

    Notes
    -----
    NOTE(review): the split is random rather than chronological, so training
    rows may come from later dates than test rows - confirm this is intended.
    """
    # sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
    # in 0.20; prefer its replacement and fall back only on old installs.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        train_test_split = cross_validation.train_test_split

    X, y, _ = extract_featuresets(ticker)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Combine a linear SVM, k-nearest neighbours and a random forest by vote.
    clf = VotingClassifier([
        ('lsvc', svm.LinearSVC(random_state=random_state)),
        ('KNN', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier(random_state=random_state)),
    ])
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)

    # Show how the predicted labels are distributed across buy/sell/hold.
    predictions = clf.predict(X_test)
    print('Predicted Spread:', Counter(predictions))

    return confidence
    

In [39]:
# Train and evaluate on AAPL; the cell's output is the test-set accuracy.
do_ml('AAPL')

Data Spread Counter({'1': 2432, '-1': 2030, '0': 486})
Predicted Spread: Counter({1: 728, -1: 505, 0: 4})


0.4834276475343573