In [1]:
import nbimporter
import helper_methods as hm
import preprocessing as pp
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn.ensemble import RandomForestClassifier

Importing Jupyter notebook from helper_methods.ipynb
Importing Jupyter notebook from preprocessing.ipynb


### Random Forest Classifier Implementation

### Finding Optimal Parameters

In [2]:
def print_parameters_accuracy(accuracies):
    print('#Features \t Estimators \t Accuracy')
    for i in range(len(accuracies)):
        print(accuracies[i][0], '\t\t', accuracies[i][1], '\t\t', accuracies[i][2])
    print()

In [3]:
def find_optimal_values(max_features, max_estimators, num_splits = 10, symbol_name = 'AAPL', use_implementation = True):
    accuracies = list()
    for num_features in range(1, max_features + 1, 1):
        print('Features:', num_features)
        
        X_train, X_test, Y_train, Y_test = hm.prepare_data(num_features, symbol_name, is_binary_ouput=True)
        X_train, X_test, Y_train, Y_test = X_train.values, X_test.values, Y_train.values, Y_test.values
        
        for n_estimators in range(100, max_estimators + 100, 100):
            print('Estimators ------------------------>', n_estimators)
            rf_tscv = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
            if use_implementation:
                neighbour_accuracy = hm.timeSeriesCV(X_train, Y_train, num_splits, rf_tscv, is_classification=True)
            else:
                neighbour_accuracy = hm.rolling_cross_validation(X_train, Y_train, num_splits, rf_tscv, is_classification=True)
            accuracies.append([num_features, n_estimators, neighbour_accuracy])
    
    print_parameters_accuracy(accuracies)
    
    # Sorting the accuracies
    accuracies.sort(reverse=True, key=lambda x: x[2])
    print_parameters_accuracy(accuracies)
    
    return accuracies[0][0], accuracies[0][1]

In [4]:
def get_data_ready(symbol_name):
    start_time = time.time()
    num_features, n_estimators = find_optimal_values(max_features=15, max_estimators=1100, num_splits=10, symbol_name = symbol_name)
    end_time = time.time()
    print('Time taken:', end_time - start_time)
    
    X_train, X_test, Y_train, Y_test = hm.prepare_data(num_features)
    X_train, X_test, Y_train, Y_test = X_train.values, X_test.values, Y_train.values, Y_test.values
    return X_train, X_test, Y_train, Y_test, n_estimators  

### 1. SkLearn RF Classifier

In [5]:
def sklearn_RF_forecast(X_train, X_test, Y_train, Y_test, n_estimators):
    print('SKLEARN INBUILT')
    clf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
    clf.fit(X_train, Y_train)
    print('Accuracy score --', clf.score(X_test, Y_test), '\n')

### 2. Predicting using Implementation

### Running RF

In [6]:
def forecast(X_train, X_test, Y_train, Y_test, n_estimators):
    print('Number of estimators --', n_estimators)
    sklearn_RF_forecast(X_train, X_test, Y_train, Y_test, n_estimators)
    
# X_train, X_test, Y_train, Y_test = hm.prepare_data(2)
# X_train, X_test, Y_train, Y_test = X_train.values, X_test.values, Y_train.values, Y_test.values
# forecast(X_train, X_test, Y_train, Y_test)

In [7]:
def run_RF(symbol_name):
    X_train, X_test, Y_train, Y_test, n_estimators = get_data_ready(symbol_name)
    forecast(X_train, X_test, Y_train, Y_test, n_estimators)

In [8]:
run_RF(symbol_name = 'INX')

Features: 1
Estimators ------------------------> 100
Implemented Rolling Cross Validation
Accuracy: 0.4997165532879818 

Estimators ------------------------> 200
Implemented Rolling Cross Validation
Accuracy: 0.4994331065759636 

Estimators ------------------------> 300
Implemented Rolling Cross Validation
Accuracy: 0.49943310657596374 

Estimators ------------------------> 400
Implemented Rolling Cross Validation
Accuracy: 0.49943310657596374 

Estimators ------------------------> 500
Implemented Rolling Cross Validation
Accuracy: 0.49943310657596374 

Estimators ------------------------> 600
Implemented Rolling Cross Validation
Accuracy: 0.49943310657596374 

Estimators ------------------------> 700
Implemented Rolling Cross Validation
Accuracy: 0.49943310657596374 

Estimators ------------------------> 800
Implemented Rolling Cross Validation
Accuracy: 0.49943310657596374 

Estimators ------------------------> 900
Implemented Rolling Cross Validation
Accuracy: 0.49943310657596374 



Accuracy: 0.5170068027210885 

Estimators ------------------------> 1000
Implemented Rolling Cross Validation
Accuracy: 0.5133219954648527 

Estimators ------------------------> 1100
Implemented Rolling Cross Validation
Accuracy: 0.5110544217687074 

Features: 8
Estimators ------------------------> 100
Implemented Rolling Cross Validation
Accuracy: 0.5177607274793976 

Estimators ------------------------> 200
Implemented Rolling Cross Validation
Accuracy: 0.5100880932082978 

Estimators ------------------------> 300
Implemented Rolling Cross Validation
Accuracy: 0.5183290707587384 

Estimators ------------------------> 400
Implemented Rolling Cross Validation
Accuracy: 0.5089514066496164 

Estimators ------------------------> 500
Implemented Rolling Cross Validation
Accuracy: 0.5134981528843422 

Estimators ------------------------> 600
Implemented Rolling Cross Validation
Accuracy: 0.5103722648479683 

Estimators ------------------------> 700
Implemented Rolling Cross Validation
Accur

Accuracy: 0.5103722648479683 

Estimators ------------------------> 800
Implemented Rolling Cross Validation
Accuracy: 0.5132139812446719 

Estimators ------------------------> 900
Implemented Rolling Cross Validation
Accuracy: 0.5083830633702756 

Estimators ------------------------> 1000
Implemented Rolling Cross Validation
Accuracy: 0.5126456379653311 

Estimators ------------------------> 1100
Implemented Rolling Cross Validation
Accuracy: 0.5188974140380789 

Features: 15
Estimators ------------------------> 100
Implemented Rolling Cross Validation
Accuracy: 0.5072463768115942 

Estimators ------------------------> 200
Implemented Rolling Cross Validation
Accuracy: 0.5069622051719238 

Estimators ------------------------> 300
Implemented Rolling Cross Validation
Accuracy: 0.5129298096050015 

Estimators ------------------------> 400
Implemented Rolling Cross Validation
Accuracy: 0.5120772946859904 

Estimators ------------------------> 500
Implemented Rolling Cross Validation
Accu

Number of estimators -- 1000
SKLEARN INBUILT
Accuracy score -- 0.5045941807044411 

