In [1]:
import numpy as np
import pandas as pd

PROCESSING THE DATA

In [25]:
def process_data(ticker, hm_days=7, csv_path='sp500_tickers_joined.csv'):
    """Load the joined price table and add forward-return columns for one ticker.

    For every horizon ``i`` in ``1..hm_days`` a column named
    ``'{ticker}_{i}d'`` is appended. It holds the fractional change between a
    row's price and the price ``i`` rows later:
    ``(price[t + i] - price[t]) / price[t]``.

    Parameters
    ----------
    ticker : str
        Name of the price column to compute forward returns for.
    hm_days : int, optional
        Number of look-ahead horizons to generate (default 7).
    csv_path : str or path-like, optional
        CSV file whose first column is used as the index and whose remaining
        columns are one price series per ticker.

    Returns
    -------
    tickers : list
        Column labels as read from the CSV, captured before the
        forward-return columns are appended.
    df : pandas.DataFrame
        The price table with missing values replaced by 0 and the
        forward-return columns added.

    Raises
    ------
    KeyError
        If ``ticker`` is not a column of the CSV.

    Notes
    -----
    Missing prices are filled with 0 *before* the division, so a row whose
    ``ticker`` price is 0 produces ``inf``/``-inf`` (or NaN for 0/0, which is
    then filled with 0). The last ``i`` rows of the ``i``-day column have no
    future price and end up as 0 as well. Infinite values are left in place
    for the caller to handle.
    """
    df = pd.read_csv(csv_path, index_col=0)

    # Remember the original ticker columns; everything added below is derived.
    tickers = df.columns.tolist()
    if ticker not in tickers:
        raise KeyError('{!r} is not a column of {}'.format(ticker, csv_path))

    df = df.fillna(0)
    price = df[ticker]

    for i in range(1, hm_days + 1):
        # Column name format is <ticker>_<days ahead>d.
        df['{}_{}d'.format(ticker, i)] = (price.shift(-i) - price) / price

    # shift(-i) leaves NaN at the tail (and 0/0 gives NaN); store those as 0.
    return tickers, df.fillna(0)

In [26]:
# Example: run the preprocessing for a single ticker. As the last expression of
# the cell, the returned (tickers, df) tuple is displayed below.
process_data('MMM')

(['MMM',
  'ABT',
  'ABBV',
  'ABMD',
  'ACN',
  'ATVI',
  'ADBE',
  'AMD',
  'AAP',
  'AES',
  'AET',
  'AMG',
  'AFL',
  'A',
  'APD',
  'AKAM',
  'ALK',
  'ALB',
  'ARE',
  'ALXN',
  'ALGN',
  'ALLE',
  'AGN',
  'ADS',
  'LNT',
  'ALL',
  'GOOGL',
  'GOOG',
  'MO',
  'AMZN'],
                MMM      ABT    ABBV    ABMD     ACN    ATVI    ADBE    AMD  \
 Date                                                                         
 2010-01-01   82.67  25.8328    0.00    8.73   41.50  11.110   36.78   9.68   
 2010-01-04   83.02  26.0577    0.00    8.74   42.07  11.300   37.09   9.70   
 2010-01-05   82.50  25.8471    0.00    8.53   42.33  11.320   37.70   9.71   
 2010-01-06   83.67  25.9907    0.00    8.40   42.78  11.260   37.62   9.57   
 2010-01-07   83.73  26.2060    0.00    8.40   42.74  10.990   36.89   9.47   
 2010-01-08   84.32  26.3400    0.00    8.23   42.57  10.900   36.69   9.43   
 2010-01-11   83.98  26.4739    0.00    8.08   42.53  10.890   36.21   9.14   
 2010-01-

# Creating the targets (hold, buy or sell)

The labels are buy, sell or hold. If the price rises by more than 2% within the next `hm_days` days (specified above), we buy the share; if it falls by more than 2%, we sell it. When both happen inside the window, the earliest move decides the label. If the price stays within ±2%, we hold (we neither buy nor sell the stock).

The special syntax `*args` in a Python function definition lets the function accept a variable number of positional arguments.
The number of arguments can vary, so the function can take in more arguments than the formal parameters it declares explicitly.

In [22]:
def buy_sell_hold(*args, requirement=0.02):
    """Turn one row of forward returns into a buy / sell / hold label.

    The returns are scanned in the order given, and the first one that moves
    past the threshold in either direction decides the label.

    Parameters
    ----------
    *args : float
        Fractional price changes (0.03 means +3%), one per look-ahead
        horizon. ``*args`` is used so any number of horizons can be passed.
    requirement : float, keyword-only, optional
        Threshold a change must exceed to trigger a trade (default 0.02,
        i.e. a 2% move).

    Returns
    -------
    int
        1 (buy) if the first qualifying change is above ``requirement``,
        -1 (sell) if it is below ``-requirement``, and 0 (hold) if no change
        qualifies. NaN values never qualify because both comparisons are
        False for NaN.
    """
    for change in args:
        if change > requirement:
            return 1      # buy
        if change < -requirement:
            return -1     # sell

    return 0      # hold

The idea here is to map the above function over the forward-return columns of the pandas DataFrame, row by row.

In [27]:
from collections import Counter

In [32]:
def features(ticker):
    """Build the feature matrix and buy / sell / hold labels for one ticker.

    Steps:

    1. ``process_data(ticker)`` supplies the price table plus the
       ``'{ticker}_{i}d'`` forward-return columns.
    2. ``buy_sell_hold`` is applied row-wise to those columns and the result is
       stored in ``'{ticker}_target'``.
    3. The label distribution is printed (computed before any row is dropped).
    4. Rows containing infinite values are removed.
    5. The features are the row-to-row percent change of every original ticker
       column, with infinities and NaN replaced by 0.

    Parameters
    ----------
    ticker : str
        Ticker whose forward returns define the labels.

    Returns
    -------
    X : numpy.ndarray
        Percent-change features, one row per remaining date and one column
        per entry of ``tickers``.
    y : numpy.ndarray
        Labels from the ``'{ticker}_target'`` column, aligned with ``X``.
    df : pandas.DataFrame
        The cleaned frame including forward-return and target columns.

    Raises
    ------
    KeyError
        If the frame has no ``'{ticker}_1d'`` forward-return column.
    """
    tickers, df = process_data(ticker)

    # Collect the consecutive '<ticker>_<i>d' columns that are actually
    # present instead of assuming a fixed number of look-ahead days.
    future_cols = []
    day = 1
    while '{}_{}d'.format(ticker, day) in df.columns:
        future_cols.append('{}_{}d'.format(ticker, day))
        day += 1
    if not future_cols:
        raise KeyError('{}_1d'.format(ticker))

    target_col = '{}_target'.format(ticker)
    df[target_col] = list(map(buy_sell_hold, *(df[col] for col in future_cols)))

    # Label distribution. Counter accepts any hashable value; the labels are
    # stringified only so the printed summary keeps its established format.
    labels = df[target_col].values.tolist()
    print('Data Distribution', Counter(str(label) for label in labels))

    # Cleaning: a percent change computed from a zero price is infinite.
    # Turn those into NaN and drop the affected rows.
    df = df.fillna(0).replace([np.inf, -np.inf], np.nan).dropna()

    # The raw features are daily price levels, so convert them to the
    # row-to-row percent change of every ticker. Infinities (change from a
    # zero price) and the undefined first row become 0.
    df_vals = df[tickers].pct_change().replace([np.inf, -np.inf], 0).fillna(0)

    # Training and testing data
    X = df_vals.values
    y = df[target_col].values

    return X, y, df

In [33]:
# Example: build features and labels for a single ticker. The call prints the
# label distribution, and the cell then displays the returned (X, y, df) tuple.
features('MMM')

Data Distribution Counter({'0': 915, '1': 745, '-1': 552})


(array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.0042337 ,  0.00870599,  0.        , ...,  0.        ,
          0.00815079, -0.00460898],
        [-0.00626355, -0.00808206,  0.        , ...,  0.        ,
          0.00050531,  0.00589993],
        ...,
        [-0.01292407, -0.00627615, -0.02748092, ..., -0.01041168,
         -0.00105727, -0.01134805],
        [ 0.00501227,  0.00307692, -0.02155939, ..., -0.00188311,
          0.00582113, -0.00840934],
        [-0.00356234, -0.01888925, -0.00470639, ..., -0.02654308,
          0.00087689, -0.03061195]]),
 array([0, 0, 1, ..., 0, 0, 0], dtype=int64),
                MMM      ABT    ABBV    ABMD     ACN    ATVI    ADBE    AMD  \
 Date                                                                         
 2010-01-01   82.67  25.8328    0.00    8.73   41.50  11.110   36.78   9.68   
 2010-01-04   83.02  26.0577    0.00    8.74   42.07  11.300   37.09   9.70   
 2010-01-05

# Using Machine Learning

In [48]:
from sklearn import neighbors, model_selection,svm
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

The voting classifier lets us combine several classifiers and gives each of them a vote on the final prediction.

In [39]:
def machine_learning(ticker, random_state=None):
    """Fit a k-nearest-neighbours classifier for one ticker and print its results.

    The features and labels come from ``features(ticker)``. A quarter of the
    rows is held out as the test set, and the test accuracy and the
    distribution of predicted labels are printed.

    Parameters
    ----------
    ticker : str
        Ticker to build the data set for.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps the
        split random on every call; pass an integer for a reproducible run.

    Returns
    -------
    None
        Results are reported through ``print`` only.
    """
    X, y, _ = features(ticker)

    # NOTE(review): train_test_split shuffles rows by default, so test rows are
    # interleaved in time with training rows - confirm this is acceptable for
    # forward-looking labels, or switch to a chronological split.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.25, random_state=random_state)

    clf = neighbors.KNeighborsClassifier()
    clf.fit(X_train, y_train)

    # Predict once and reuse the result: accuracy is the share of matching
    # labels, which is what clf.score() computes after predicting again.
    predictions = clf.predict(X_test)
    confidence = float(np.mean(predictions == y_test))
    print('accuray:', confidence)
    print('distribution in  prediction', Counter(predictions))

In [45]:
# Evaluate the k-nearest-neighbours model on two example tickers.
machine_learning('MMM')
machine_learning('GOOGL')

Data Distribution Counter({'0': 915, '1': 745, '-1': 552})
accuray: 0.37613019891500904
distribution in  prediction Counter({0: 292, 1: 137, -1: 124})
Data Distribution Counter({'1': 948, '-1': 752, '0': 512})
accuray: 0.4213381555153707
distribution in  prediction Counter({1: 246, -1: 194, 0: 113})


Now we use the voting classifier

In [46]:
def machine_learning_voting(ticker, random_state=None):
    """Fit a voting ensemble for one ticker and print its results.

    The ensemble combines a linear SVM, a k-nearest-neighbours classifier and
    a random forest. The features and labels come from ``features(ticker)``.
    A quarter of the rows is held out as the test set, and the test accuracy
    and the distribution of predicted labels are printed.

    Parameters
    ----------
    ticker : str
        Ticker to build the data set for.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split`` and to the stochastic ensemble
        members (``LinearSVC`` and ``RandomForestClassifier``). The default
        ``None`` keeps every call random; pass an integer for a reproducible
        run.

    Returns
    -------
    None
        Results are reported through ``print`` only.
    """
    X, y, _ = features(ticker)

    # NOTE(review): train_test_split shuffles rows by default, so test rows are
    # interleaved in time with training rows - confirm this is acceptable for
    # forward-looking labels, or switch to a chronological split.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.25, random_state=random_state)

    clf = VotingClassifier([
        ('lsvc', svm.LinearSVC(random_state=random_state)),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier(random_state=random_state)),
    ])
    clf.fit(X_train, y_train)

    # Predict once and reuse the result: accuracy is the share of matching
    # labels, which is what clf.score() computes after predicting again.
    predictions = clf.predict(X_test)
    confidence = float(np.mean(predictions == y_test))
    print('accuray:', confidence)
    print('distribution in  prediction', Counter(predictions))

In [49]:
# Evaluate the voting ensemble on three example tickers.
machine_learning_voting('MMM')
machine_learning_voting('GOOGL')
machine_learning_voting('AMZN')

Data Distribution Counter({'0': 915, '1': 745, '-1': 552})


  if diff:
  if diff:


accuray: 0.4267631103074141
distribution in  prediction Counter({0: 399, 1: 82, -1: 72})
Data Distribution Counter({'1': 948, '-1': 752, '0': 512})


  if diff:
  if diff:


accuray: 0.3743218806509946
distribution in  prediction Counter({1: 365, -1: 161, 0: 27})
Data Distribution Counter({'1': 1128, '-1': 773, '0': 311})
accuray: 0.49547920433996384
distribution in  prediction Counter({1: 454, -1: 97, 0: 2})


  if diff:
  if diff:
