In [1]:
# importing required libraries

import pandas as pd
import numpy as np


import sklearn
import joblib


from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# creating empty lists

# Parallel accumulators: position i of each list describes the same evaluated model
# (its label, mean absolute error and mean squared error).
models, mae_models, mse_models = [], [], []

In [3]:
# function to append results in the created lists

def test_eval(algorithm, mae, mse):
    """Record one evaluation result in the module-level result lists.

    Parameters
    ----------
    algorithm : label appended to ``models``.
    mae : value appended to ``mae_models``.
    mse : value appended to ``mse_models``.

    Returns
    -------
    None. The three lists are extended in place by one element each.
    """
    for bucket, value in ((models, algorithm), (mae_models, mae), (mse_models, mse)):
        bucket.append(value)

In [4]:
# function for pre-processing

def preprocessing(df):
    """Return a cleaned copy of ``df`` indexed by its parsed ``Date`` values.

    The input frame is never modified. This keeps earlier notebook cells that
    still hold the raw frame valid, and avoids pandas' SettingWithCopyWarning
    when the caller passes a slice such as ``df.iloc[:3000]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Either has a ``'Date'`` column, or is already indexed by an index named
        ``'Date'`` (for example the result of a previous call). The second
        case makes re-running a cell on an already processed frame safe.

    Returns
    -------
    pandas.DataFrame
        New frame with a datetime index named ``'Date'``; every row that
        contains at least one NaN value is removed.

    Raises
    ------
    KeyError
        If ``'Date'`` is neither a column nor the name of the index.
    """
    out = df.copy()
    if 'Date' in out.columns:
        out['Date'] = pd.to_datetime(out['Date'])
        out = out.set_index('Date')
    elif out.index.name == 'Date':
        # Already indexed by date: only make sure the index is datetime-typed.
        out.index = pd.to_datetime(out.index)
    else:
        raise KeyError('Date')
    # Remove rows with NaN values
    return out.dropna()

In [5]:
# code to show the output for the 'preprocessing' function

df = pd.read_parquet('output2.parquet')
# Keep the first 3000 rows. .copy() detaches them from the loaded frame so that
# preprocessing(), which writes to the frame it receives, is not writing into a
# slice (avoids pandas' SettingWithCopyWarning).
df = df.iloc[:3000].copy()
df = preprocessing(df)
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Symbol,Security Name,vol_moving_avg,adj_close_rolling_med
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1999-12-31,56.866951,57.179901,54.542202,55.302216,47.562416,1931100.0,A,"Agilent Technologies, Inc. Common Stock",5.739950e+06,2937500.0
2000-01-03,56.330471,56.464592,48.193848,51.502148,44.294170,4674300.0,A,"Agilent Technologies, Inc. Common Stock",3.810883e+06,2937500.0
2000-01-04,48.730328,49.266811,46.316166,47.567955,40.910591,4765000.0,A,"Agilent Technologies, Inc. Common Stock",3.461913e+06,2937500.0
2000-01-05,47.389126,47.567955,43.141987,44.617310,38.372894,5758600.0,A,"Agilent Technologies, Inc. Common Stock",3.434607e+06,2937500.0
2000-01-06,44.080830,44.349072,41.577251,42.918453,36.911816,2534400.0,A,"Agilent Technologies, Inc. Common Stock",3.319900e+06,2750800.0
...,...,...,...,...,...,...,...,...,...,...
2011-10-14,23.719599,23.998569,23.619457,23.848354,21.768467,8137000.0,A,"Agilent Technologies, Inc. Common Stock",7.882997e+06,7472500.0
2011-10-17,23.619457,23.934191,23.433475,23.655222,21.592176,6988800.0,A,"Agilent Technologies, Inc. Common Stock",7.851737e+06,7330500.0
2011-10-18,23.626610,24.120173,22.989986,23.948498,21.859880,8460100.0,A,"Agilent Technologies, Inc. Common Stock",7.781300e+06,7330500.0
2011-10-19,23.826895,24.184549,23.490702,23.597998,21.539948,4883300.0,A,"Agilent Technologies, Inc. Common Stock",7.648057e+06,7132600.0


In [6]:
# function for splitting the dataset

def split_data(df, features, target, test_size=0.2, random_state=42, shuffle=True):
    """Split ``df`` into train and test feature/target sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing the feature and target columns.
    features : list of str
        Column names used as model inputs.
    target : str
        Column name of the value to predict.
    test_size : float or int, default 0.2
        Passed to ``train_test_split``; share (or count) of rows held out.
    random_state : int or None, default 42
        Seed for the shuffle, so the split is reproducible.
    shuffle : bool, default True
        Whether rows are shuffled before splitting. With the default, held-out
        rows are interleaved in time with the training rows when ``df`` is
        date-indexed; pass ``shuffle=False`` to hold out the last rows instead.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X = df[features]
    y = df[target]
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        shuffle=shuffle,
    )
    return X_train, X_test, y_train, y_test

In [7]:
# code to show the output for the 'split_data' function

# Column to predict, and the columns used to predict it.
target = 'Volume'
features = ['vol_moving_avg', 'adj_close_rolling_med']

# split_data returns (X_train, X_test, y_train, y_test); the tuple is kept whole
# so the cell displays all four parts.
data = split_data(df=df, features=features, target=target)
data

(            vol_moving_avg  adj_close_rolling_med
 Date                                             
 2002-01-14    3.542970e+06              3243150.0
 2006-03-03    3.819057e+06              3226750.0
 2002-09-16    4.171093e+06              3579850.0
 2009-05-29    4.246730e+06              4096850.0
 2010-05-27    5.788637e+06              5006250.0
 ...                    ...                    ...
 2006-07-10    4.099400e+06              3679700.0
 2004-05-12    3.835250e+06              3459400.0
 2004-07-02    4.417400e+06              4230100.0
 2005-02-25    3.706697e+06              3235600.0
 2003-06-06    3.354940e+06              3154500.0
 
 [2376 rows x 2 columns],
             vol_moving_avg  adj_close_rolling_med
 Date                                             
 2009-07-17    4.166710e+06              4060850.0
 2004-11-11    3.799253e+06              3567150.0
 2009-07-24    4.354320e+06              4123200.0
 2001-04-26    3.933967e+06              3343950.0
 20

In [8]:
# function for random forest algorithm

def randomForest(X_train, X_test, y_train, y_test):
    """Fit a random forest regressor, log its test errors, and return it.

    A ``RandomForestRegressor`` with 100 trees and ``random_state=42`` is
    fitted on the training data. Its predictions for ``X_test`` are scored
    against ``y_test`` with mean absolute error and mean squared error, and
    both scores are recorded through ``test_eval`` under the label
    "Random Forest".

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_test, y_test : held-out features and targets used for scoring.

    Returns
    -------
    The fitted ``RandomForestRegressor``.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
    predictions = forest.predict(X_test)
    test_eval(
        "Random Forest",
        mean_absolute_error(y_test, predictions),
        mean_squared_error(y_test, predictions),
    )
    return forest

In [9]:
# code to show the output for the model
# data is the (X_train, X_test, y_train, y_test) tuple produced by split_data
randomForest(*data)

In [10]:
if __name__ == '__main__':
    # Example location: data/problem2/output/output.parquet
    # strip() drops whitespace typed or pasted around the path.
    location = input("Enter data input location: ").strip()

    # Keep the first 3000 rows. .copy() detaches them from the loaded frame so
    # preprocessing(), which writes to the frame it receives, is not writing
    # into a slice (avoids pandas' SettingWithCopyWarning).
    df = pd.read_parquet(location).iloc[:3000].copy()

    features = ['vol_moving_avg', 'adj_close_rolling_med']
    target = 'Volume'
    df = preprocessing(df)
    X_train, X_test, y_train, y_test = split_data(df, features, target)
    model = randomForest(X_train, X_test, y_train, y_test)

    # NOTE(review): models / mae_models / mse_models are module-level lists that
    # every randomForest() call appends to. If the demo cell above has already
    # run in this kernel, this table holds an extra "Random Forest" row.
    eval_ros_df = pd.DataFrame({'model': models,
                                'mae': mae_models,
                                'mse': mse_models})
    # Save the model as a pickle in a file
    joblib.dump(model, 'model.pkl')

Enter data input location:  output2.parquet
