In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pickle

In [3]:
spark = SparkSession.builder.appName('Model Training').getOrCreate()

In [4]:
df_fe_data = spark.read.parquet(r'..\data\feature_engineering_output.parquet').select('Date','Volume','vol_moving_avg', 'adj_close_rolling_med')

In [5]:
data = df_fe_data.toPandas()

data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True)

In [None]:
# (data.isnull().sum()/data.shape[0])*100

In [6]:
data.dropna(inplace=True)

In [7]:
features = ['vol_moving_avg', 'adj_close_rolling_med']
target = 'Volume'

X = pd.DataFrame(data[features])
y = pd.DataFrame(data[target])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
scaler_x = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled = scaler_x.fit_transform(X_train.values)
X_train_scaled = pd.DataFrame(X_train_scaled,index=X_train.index,columns=features)

X_test_scaled = scaler_x.transform(X_test.values)
X_test_scaled = pd.DataFrame(X_test_scaled,index=X_test.index,columns=features)

y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1,1))
y_train['scaled_volume'] = y_train_scaled

y_test_scaled = scaler_y.transform(y_test.values.reshape(-1,1))
y_test['scaled_volume'] = y_test_scaled

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['scaled_volume'] = y_train_scaled
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['scaled_volume'] = y_test_scaled


In [9]:
# Create a RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train['scaled_volume'])

RandomForestRegressor(random_state=42)

In [10]:
# Make predictions on test data
y_pred = model.predict(X_test)

# Calculate the Mean Absolute Error and Mean Squared Error
mae = mean_absolute_error(y_test['scaled_volume'], y_pred)
mse = mean_squared_error(y_test['scaled_volume'], y_pred)

print('Mean Absolute Error is {} and Mean squared Error is {}'.format(mae,mse))

Mean Absolute Error is 0.5004576307566186 and Mean squared Error is 0.8840711285259968


In [None]:
# Saving trained model to disk
filename = r'..\trained_model\random_forest_regressor_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(model, file)