In [55]:
import numpy as np
import pandas as pd
import pickle
from collections import Counter
from sklearn import svm, model_selection, neighbors
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

In [21]:
def process_data_for_labels(ticker):
    hm_days = 7 
    df = pd.read_csv('sp500_joined_closes.csv', index_col=0)
    tickers = df.columns.values.tolist()
    # 1. Fill missing values
    df.fillna(0, inplace=True)
    # 2. FIX: Convert the ticker column to numbers so we can do math
    df[ticker] = pd.to_numeric(df[ticker], errors='coerce')
       # Now this math will work because both sides are numbers
    for i in range(1, hm_days + 1):
     df['{}_{}d'.format(ticker, i)] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]
     df.fillna(0, inplace=True)
    return tickers, df
# Call the function
tickers_list, processed_df = process_data_for_labels('AVGO')

In [31]:
def buy_sell_hold(*args):
    cols=[c for c in args]
    requirement=0.01
    for col in cols:
        if col>requirement:
            return 1
        if col<requirement:
            return -1
            return 0

In [33]:
def extract_featuresets(ticker):
    tickers, df=process_data_for_labels(ticker)
    df['{}_target'.format(ticker)]=list(map(buy_sell_hold, 
                                           df['{}_1d'.format(ticker)],
                                           df['{}_2d'.format(ticker)],
                                           df['{}_3d'.format(ticker)],
                                           df['{}_4d'.format(ticker)],
                                           df['{}_5d'.format(ticker)],
                                           df['{}_6d'.format(ticker)],
                                           df['{}_7d'.format(ticker)]
                                           ))
    vals=df['{}_target'.format(ticker)].values.tolist()
    str_vals=[str(i) for i in vals]
    print('Data spread:', Counter(str_vals))
    df.fillna(0,inplace=True)

    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)
    df_vals = df[tickers].apply(pd.to_numeric, errors='coerce').pct_change()
    df_vals = df_vals.replace([np.inf, -np.inf], 0)
    df_vals.fillna(0, inplace=True)

    x = df_vals.values
    y=df['{}_target'.format(ticker)].values

    return x,y,df

X, y, processed_df = extract_featuresets('AVGO')

Data spread: Counter({'-1': 1734, '1': 788})


In [56]:
def do_ml(ticker):
    x,y,df=extract_featuresets(ticker)
    x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.25)
    #clf=neighbors.KNeighborsClassifier()
    clf=VotingClassifier([('lsvc',svm.LinearSVC()),
                          ('knn',neighbors.KNeighborsClassifier()),
                           ('rfor',RandomForestClassifier())])

    clf.fit(x_train,y_train)
    confidence=clf.score(x_test,y_test)
    print('accuracy',confidence)
    prediction=clf.predict(x_test)
    print('predicted spread',Counter(prediction))
    return confidence
result = do_ml('AVGO')

Data spread: Counter({'-1': 1734, '1': 788})
accuracy 0.6952380952380952
predicted spread Counter({np.int64(-1): 604, np.int64(1): 26})
