In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

pd.options.mode.chained_assignment = None

In [2]:
# Read the training set from the CSV file in the working directory.
training_data = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Preview the first five rows of the raw training data.
training_data.head(n=5)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [4]:
y = training_data['target']  # Label: the target column
X = training_data.drop('target', axis=1)  # Features: every column except the target

## Imputation

In [5]:
# Count the missing values in the target column (vectorized, returned as a plain int).
int(y.isna().sum())

88

In [6]:
# Impute missing target values with the mean of the target column.
# Open question: would the median or mode give more accurate results? Worth experimenting.

target_mean = training_data['target'].mean()
y = y.fillna(target_mean)

In [7]:
# Confirm that no missing values remain in the target after imputation.
int(y.isna().sum())

0

In [8]:
# True if at least one feature value is missing anywhere in X.
X.isna().to_numpy().any()

True

In [9]:
# Missing-value mask for the first five rows (True marks a missing entry).
X.head().isna()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,time_id,row_id
0,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False


In [10]:
# Number of missing values in each feature column.
nan_count = X.isna().sum()
nan_count

stock_id                         0
date_id                          0
seconds_in_bucket                0
imbalance_size                 220
imbalance_buy_sell_flag          0
reference_price                220
matched_size                   220
far_price                  2894342
near_price                 2857180
bid_price                      220
bid_size                         0
ask_price                      220
ask_size                         0
wap                            220
time_id                          0
row_id                           0
dtype: int64

In [11]:
# Boolean mask over the columns: True where at least one value is missing.
condition = nan_count.ne(0)
col_names = nan_count.loc[condition].index  # Labels of the affected columns
print(col_names)

nan_cols = col_names.tolist()  # Plain Python list for later column selection
print(nan_cols)

Index(['imbalance_size', 'reference_price', 'matched_size', 'far_price',
       'near_price', 'bid_price', 'ask_price', 'wap'],
      dtype='object')
['imbalance_size', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'ask_price', 'wap']


In [12]:
# Look up the dtype of every column that contains missing values.
nan_col_types = training_data.dtypes.loc[nan_cols]
nan_col_types

imbalance_size     float64
reference_price    float64
matched_size       float64
far_price          float64
near_price         float64
bid_price          float64
ask_price          float64
wap                float64
dtype: object

All columns that contain missing values have dtype `float64`.

In [13]:
# For each column that has missing values, add a boolean '<column>_na' indicator
# so the information about which entries were originally null survives imputation.
for col in ('imbalance_size', 'reference_price', 'matched_size', 'far_price',
            'near_price', 'bid_price', 'ask_price', 'wap'):
    X[col + '_na'] = X[col].isna()
X.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,time_id,row_id,imbalance_size_na,reference_price_na,matched_size_na,far_price_na,near_price_na,bid_price_na,ask_price_na,wap_na
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,...,0,0_0_0,False,False,False,True,True,False,False,False
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,...,0,0_0_1,False,False,False,True,True,False,False,False
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,...,0,0_0_2,False,False,False,True,True,False,False,False
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,...,0,0_0_3,False,False,False,True,True,False,False,False
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,...,0,0_0_4,False,False,False,True,True,False,False,False


In [14]:
# Mean-impute every feature column that contained missing values.
#
# Each column is filled with its OWN mean. The previous version filled
# 'reference_price' with the mean of 'imbalance_size' (a copy-paste slip), which
# injects size-scale values (hundreds of thousands and up in the sample rows)
# into a price column whose values sit around 1.0.
imputed_cols = [
    'imbalance_size', 'reference_price', 'matched_size', 'far_price',
    'near_price', 'bid_price', 'ask_price', 'wap',
]

# DataFrame.mean skips NaN by default, so each mean is taken over observed values only.
column_means = X[imputed_cols].mean()

# Passing a Series to fillna matches its index to column names, so only the
# listed columns are filled; the boolean '*_na' indicator columns are untouched.
X = X.fillna(column_means)

In [15]:
# Verify that imputation left no missing feature values behind.
X.isna().to_numpy().any()

False

In [16]:
# 'row_id' is a string key (e.g. '0_0_0'); scikit-learn estimators need numeric
# input and fail when converting it to float, so leave it out of the model matrix.
model_features = X.drop(columns=['row_id'])

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): rows carry date_id / time_id, so a random split may mix past and
# future observations between train and test - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    model_features, y, test_size=0.25, random_state=42
)

In [None]:
# Forest sizes (number of trees) to compare.
n_estimators = [8, 64, 100]

# Test-set mean absolute error for each forest size, in the same order as n_estimators.
error_scores = []

for n in n_estimators:
    # Random forest regressor; the fixed seed makes each fit reproducible.
    model = RandomForestRegressor(n_estimators=n, random_state=42)

    # Fit on the training split.
    model.fit(X_train, y_train)

    # Predict on the held-out test split.
    y_pred = model.predict(X_test)

    # MAE is an error metric (lower is better), not an accuracy.
    mae = mean_absolute_error(y_test, y_pred)
    error_scores.append(mae)

# After the loop, `model`, `y_pred` and `mae` belong to the last forest size tried.

# Plot error against forest size; the y-axis is labelled with the metric actually plotted.
fig, ax = plt.subplots()
ax.plot(n_estimators, error_scores, marker='o')
ax.set_xlabel("Number of Trees")
ax.set_ylabel("Mean Absolute Error")
ax.set_title("Random Forest Error vs. Number of Trees")
plt.show()

In [None]:
# Disabled alternative: fit a single 8-tree forest. While this stays commented
# out, later cells use whichever `model` an earlier cell fitted last.
# model = RandomForestRegressor(n_estimators=8, random_state=42)
# model.fit(X_train, y_train)

In [None]:
# Predict the test-set targets with the most recently fitted `model`.
y_pred = model.predict(X=X_test)

In [None]:
# Mean absolute error of the predictions against the held-out test targets.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Absolute Error: {mae}')