<a href="https://colab.research.google.com/github/MWFK/TunisAir-Stock-Scrapping-Predicting/blob/main/Basic_Classical_ML_Prediction_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libs

In [109]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Data

In [110]:
url = 'https://raw.githubusercontent.com/MWFK/TunisAir-Stock-Scrapping-Predicting/main/TAIR_21_20_19_18_17.csv'
column_names = ['symbole', 'date', 'ouverture', 'haut', 'bas', 'cloture', 'volume']

data = pd.read_csv(url, names=column_names, sep=';', header=0, decimal=',')
data.head()

Unnamed: 0,symbole,date,ouverture,haut,bas,cloture,volume
0,TAIR,02/01/2017,0.6,0.6,0.59,0.6,4651
1,TAIR,03/01/2017,0.6,0.6,0.59,0.6,1159
2,TAIR,04/01/2017,0.6,0.6,0.59,0.6,19524
3,TAIR,05/01/2017,0.6,0.6,0.59,0.6,2205
4,TAIR,06/01/2017,0.59,0.6,0.59,0.6,42941


# Processing

In [111]:
column_names = ['symbole', 'date', 'ouverture', 'haut', 'bas', 'volume', 'cloture']
data = data[column_names]
data.head(10)

Unnamed: 0,symbole,date,ouverture,haut,bas,volume,cloture
0,TAIR,02/01/2017,0.6,0.6,0.59,4651,0.6
1,TAIR,03/01/2017,0.6,0.6,0.59,1159,0.6
2,TAIR,04/01/2017,0.6,0.6,0.59,19524,0.6
3,TAIR,05/01/2017,0.6,0.6,0.59,2205,0.6
4,TAIR,06/01/2017,0.59,0.6,0.59,42941,0.6
5,TAIR,09/01/2017,0.59,0.6,0.59,7313,0.6
6,TAIR,10/01/2017,0.6,0.6,0.59,34185,0.6
7,TAIR,11/01/2017,0.58,0.6,0.58,37960,0.58
8,TAIR,12/01/2017,0.58,0.59,0.57,31085,0.59
9,TAIR,13/01/2017,0.58,0.59,0.58,15664,0.59


In [112]:
# Elimination of the Look-Ahead Bias
data.drop(['ouverture', 'haut', 'bas'], axis=1, inplace=True)
data.head()

Unnamed: 0,symbole,date,volume,cloture
0,TAIR,02/01/2017,4651,0.6
1,TAIR,03/01/2017,1159,0.6
2,TAIR,04/01/2017,19524,0.6
3,TAIR,05/01/2017,2205,0.6
4,TAIR,06/01/2017,42941,0.6


In [115]:
# data["date"] = pd.to_datetime(data["date"]).dt.date
data["date"] = pd.to_datetime(data["date"])
print(data.dtypes)
print(data.iloc[1,1])
data.head()

symbole            object
date       datetime64[ns]
volume              int64
cloture           float64
dtype: object
2017-03-01 00:00:00


Unnamed: 0,symbole,date,volume,cloture
0,TAIR,2017-02-01,4651,0.6
1,TAIR,2017-03-01,1159,0.6
2,TAIR,2017-04-01,19524,0.6
3,TAIR,2017-05-01,2205,0.6
4,TAIR,2017-06-01,42941,0.6


In [116]:
# There're many other interesting feature engineering when it comes to date, but we're keeping it basic
data['date'] = data['date'].dt.strftime("%Y%m%d").astype(int)
data.head()

Unnamed: 0,symbole,date,volume,cloture
0,TAIR,20170201,4651,0.6
1,TAIR,20170301,1159,0.6
2,TAIR,20170401,19524,0.6
3,TAIR,20170501,2205,0.6
4,TAIR,20170601,42941,0.6


In [117]:
# Scaling the data
scaler = MinMaxScaler()
cols = ['date','volume', 'cloture']
data[cols] = scaler.fit_transform(data[cols])
data.head()

Unnamed: 0,symbole,date,volume,cloture
0,TAIR,0.002409,0.003506,0.42
1,TAIR,0.004842,0.000872,0.42
2,TAIR,0.007275,0.014725,0.42
3,TAIR,0.009708,0.001661,0.42
4,TAIR,0.012141,0.032388,0.42


In [119]:
X = data.iloc[:,1:4]  # we do not start with the firt feature, because it has the name of the stock, and for now we'll not use the date 
X = X.astype(float)
print(X.dtypes)  
X.head()

date       float64
volume     float64
cloture    float64
dtype: object


Unnamed: 0,date,volume,cloture
0,0.002409,0.003506,0.42
1,0.004842,0.000872,0.42
2,0.007275,0.014725,0.42
3,0.009708,0.001661,0.42
4,0.012141,0.032388,0.42


In [120]:
X = X.values      
X

array([[2.40864192e-03, 3.50601408e-03, 4.20000000e-01],
       [4.84161355e-03, 8.71977685e-04, 4.20000000e-01],
       [7.27458518e-03, 1.47248066e-02, 4.20000000e-01],
       ...,
       [9.85694127e-01, 3.14771875e-03, 3.60000000e-01],
       [9.85718457e-01, 3.16657640e-03, 3.60000000e-01],
       [9.85742786e-01, 1.24490655e-02, 3.60000000e-01]])

In [121]:
Y = data.cloture # we're going to predict the closing prices of the TAIR
Y = Y.astype(float)
Y = Y.values
Y

array([0.42, 0.42, 0.42, ..., 0.36, 0.36, 0.36])

In [122]:
# Split the data
# random_state to reproduce the same result, and test size will the 33% of the train data
# By default Shuffle is True and since we're dealing with a time series we should not shuffle the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=73, shuffle=False) 

print("X_train = ",X_train.shape)
print("Y_train",Y_train.shape)
print("X_test = ",X_test.shape)
print("Y_test",Y_test.shape)

X_train =  (844, 3)
Y_train (844,)
X_test =  (212, 3)
Y_test (212,)


# Linear Regression

In [123]:
lr = LinearRegression()
lr_model = lr.fit(X_train, Y_train)

In [124]:
Y_pred_lr = lr_model.predict(X_test) 
# The Last ten prediction
Y_pred_lr[-10:] 

array([0.42, 0.42, 0.42, 0.4 , 0.4 , 0.38, 0.36, 0.36, 0.36, 0.36])

In [125]:
print('LR')
print('Mean Absolute Error:',     metrics.mean_absolute_error(Y_test, Y_pred_lr))  
print('Mean Squared Error:',      metrics.mean_squared_error(Y_test, Y_pred_lr))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_test, Y_pred_lr)))

LR
Mean Absolute Error: 9.086023338323805e-17
Mean Squared Error: 1.0450895320863566e-32
Root Mean Squared Error: 1.022296205649985e-16


# DTR model

In [126]:
dtr = DecisionTreeRegressor()  
dtr_model = dtr.fit(X_train, Y_train)

In [127]:
Y_pred_dtr = dtr_model.predict(X_test) 
# The Last ten prediction
Y_pred_lr[-10:] 

array([0.42, 0.42, 0.42, 0.4 , 0.4 , 0.38, 0.36, 0.36, 0.36, 0.36])

In [128]:
print('DTR')
print('Mean Absolute Error:',     metrics.mean_absolute_error(Y_test, Y_pred_dtr))  
print('Mean Squared Error:',      metrics.mean_squared_error(Y_test, Y_pred_dtr))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_test, Y_pred_dtr)))

DTR
Mean Absolute Error: 1.6182024274017611e-16
Mean Squared Error: 4.468157470978387e-32
Root Mean Squared Error: 2.1138016631127878e-16


# RFR

In [129]:
rfr = RandomForestRegressor()  
rfr_model = rfr.fit(X_train, Y_train)

In [130]:
Y_pred_rfr = dtr_model.predict(X_test) 
# The Last ten prediction
Y_pred_lr[-10:] 

array([0.42, 0.42, 0.42, 0.4 , 0.4 , 0.38, 0.36, 0.36, 0.36, 0.36])

In [131]:
print('RFR')
print('Mean Absolute Error:',     metrics.mean_absolute_error(Y_test, Y_pred_rfr))  
print('Mean Squared Error:',      metrics.mean_squared_error(Y_test, Y_pred_rfr))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_test, Y_pred_rfr)))

RFR
Mean Absolute Error: 1.6182024274017611e-16
Mean Squared Error: 4.468157470978387e-32
Root Mean Squared Error: 2.1138016631127878e-16
