<a href="https://colab.research.google.com/github/MWFK/TunisAir-Stock-Scrapping-Predicting/blob/main/Predict_TAIR_Stock_Value_with_TF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libs

In [22]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import tensorflow as tf

# Data

In [40]:
url = 'https://raw.githubusercontent.com/MWFK/TunisAir-Stock-Scrapping-Predicting/main/TAIR_21_20_19_18_17.csv'
column_names = ['symbole', 'date', 'ouverture', 'haut', 'bas', 'cloture', 'volume']

data = pd.read_csv(url, names=column_names,sep=';', header=0, decimal=',')
data

Unnamed: 0,symbole,date,ouverture,haut,bas,cloture,volume
0,TAIR,02/01/2017,0.60,0.60,0.59,0.60,4651
1,TAIR,03/01/2017,0.60,0.60,0.59,0.60,1159
2,TAIR,04/01/2017,0.60,0.60,0.59,0.60,19524
3,TAIR,05/01/2017,0.60,0.60,0.59,0.60,2205
4,TAIR,06/01/2017,0.59,0.60,0.59,0.60,42941
...,...,...,...,...,...,...,...
1051,TAIR,14/06/2021,0.59,0.59,0.58,0.58,15785
1052,TAIR,15/06/2021,0.58,0.58,0.57,0.57,28264
1053,TAIR,16/06/2021,0.58,0.58,0.57,0.57,4176
1054,TAIR,17/06/2021,0.57,0.57,0.56,0.57,4201


# Processing

In [41]:
column_names = ['symbole', 'date', 'ouverture', 'haut', 'bas', 'volume', 'cloture']
data = data[column_names]
data

Unnamed: 0,symbole,date,ouverture,haut,bas,volume,cloture
0,TAIR,02/01/2017,0.60,0.60,0.59,4651,0.60
1,TAIR,03/01/2017,0.60,0.60,0.59,1159,0.60
2,TAIR,04/01/2017,0.60,0.60,0.59,19524,0.60
3,TAIR,05/01/2017,0.60,0.60,0.59,2205,0.60
4,TAIR,06/01/2017,0.59,0.60,0.59,42941,0.60
...,...,...,...,...,...,...,...
1051,TAIR,14/06/2021,0.59,0.59,0.58,15785,0.58
1052,TAIR,15/06/2021,0.58,0.58,0.57,28264,0.57
1053,TAIR,16/06/2021,0.58,0.58,0.57,4176,0.57
1054,TAIR,17/06/2021,0.57,0.57,0.56,4201,0.57


In [6]:
# convert the pandas dataframe feature data type from TimeStamp to Date
# data['date'] = data['date'].apply(lambda x:x.toordinal())

In [42]:
x = data.iloc[:,2:6]  # we do not start with the firt feature, because it has the name of the stock   
print(x.dtypes)  
x = x.astype(float)
print(x.dtypes)  
x = x.values      
x

ouverture    float64
haut         float64
bas          float64
volume         int64
dtype: object
ouverture    float64
haut         float64
bas          float64
volume       float64
dtype: object


array([[6.0000e-01, 6.0000e-01, 5.9000e-01, 4.6510e+03],
       [6.0000e-01, 6.0000e-01, 5.9000e-01, 1.1590e+03],
       [6.0000e-01, 6.0000e-01, 5.9000e-01, 1.9524e+04],
       ...,
       [5.8000e-01, 5.8000e-01, 5.7000e-01, 4.1760e+03],
       [5.7000e-01, 5.7000e-01, 5.6000e-01, 4.2010e+03],
       [5.7000e-01, 5.7000e-01, 5.6000e-01, 1.6507e+04]])

In [43]:
y = data.cloture # we're going to predict the closing prices of the TAIR
y = y.values
y

array([0.6 , 0.6 , 0.6 , ..., 0.57, 0.57, 0.57])

In [44]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=73) # random_state to reproduce the same result, and test size will the 33% of the train data
print("X_train = ",X_train.shape)
print("y_train",y_train.shape)
print("X_test = ",X_test.shape)
print("y_test",y_test.shape)

X_train =  (844, 4)
y_train (844,)
X_test =  (212, 4)
y_test (212,)


In [46]:
SEED = 1111
np.random.seed(SEED)
#  initial parameters from Keras Tuner bayesian optimization

# fit
def create_mlp(num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate):

    inp = tf.keras.layers.Input(shape=(num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(dropout_rates[0])(x)
    for i in range(len(hidden_units)):
        x = tf.keras.layers.Dense(hidden_units[i])(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
        x = tf.keras.layers.Dropout(dropout_rates[i + 1])(x)
    
    x = tf.keras.layers.Dense(num_labels)(x)
    out = tf.keras.layers.Activation("sigmoid")(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001), # RectifiedAdam Optimizer (known to be robust to the choice in learning rate)
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=tf.keras.metrics.AUC(name="AUC"),
    ) 

    return model

num_columns = 4
num_labels = 1
epochs = 10 
batch_size = 4096
hidden_units = [160, 160, 160]
dropout_rates = [0.2, 0.2, 0.2, 0.2]
label_smoothing = 1e-2
#Label Smoothing is a regularization technique that introduces noise for the labels. 
#This accounts for the fact that datasets may have mistakes in them, so maximizing the likelihood of directly can be harmful.
#Assume for a small constant e, the training set label y is correct with probability 1 - e and incorrect otherwise. 
#Label Smoothing regularizes a model based on a softmax with k output values by replacing the hard 0 and 1 classification targets 
#with e/k-1 targets of and 1 - e respectively.
learning_rate = 1e-2
#normally the model training with a batch size of 4096 and learning rate 1e-3 starts to overfit 
#on the train set after only 10 epochs. 

tf.keras.backend.clear_session()
tf.random.set_seed(SEED)

clf = create_mlp(num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate)
clf.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=2)

Epoch 1/10
1/1 - 2s - loss: 0.7797 - AUC: 0.0000e+00
Epoch 2/10
1/1 - 0s - loss: 0.7314 - AUC: 0.0000e+00
Epoch 3/10
1/1 - 0s - loss: 0.7450 - AUC: 0.0000e+00
Epoch 4/10
1/1 - 0s - loss: 0.7200 - AUC: 0.0000e+00
Epoch 5/10
1/1 - 0s - loss: 0.7148 - AUC: 0.0000e+00
Epoch 6/10
1/1 - 0s - loss: 0.7080 - AUC: 0.0000e+00
Epoch 7/10
1/1 - 0s - loss: 0.7111 - AUC: 0.0000e+00
Epoch 8/10
1/1 - 0s - loss: 0.6979 - AUC: 0.0000e+00
Epoch 9/10
1/1 - 0s - loss: 0.6945 - AUC: 0.0000e+00
Epoch 10/10
1/1 - 0s - loss: 0.6979 - AUC: 0.0000e+00


<tensorflow.python.keras.callbacks.History at 0x7f1b0e279090>

In [51]:
print(len(y_test))
print(y_test)

212
[0.62 0.56 0.62 0.4  0.71 0.77 0.59 0.61 0.57 0.71 0.57 0.71 0.58 0.61
 0.53 0.72 0.6  0.41 0.54 0.51 0.84 0.58 0.6  0.83 0.6  0.42 0.63 0.54
 0.6  0.57 0.63 0.53 0.74 0.54 0.64 0.54 0.45 0.6  0.67 0.69 0.54 0.6
 0.55 0.6  0.57 0.59 0.6  0.56 0.5  0.56 0.55 0.57 0.6  0.74 0.62 0.42
 0.55 0.61 0.54 0.61 0.83 0.61 0.66 0.58 0.45 0.63 0.81 0.59 0.59 0.68
 0.58 0.89 0.64 0.55 0.56 0.54 0.41 0.63 0.65 0.72 0.56 0.58 0.64 0.56
 0.6  0.63 0.6  0.64 0.51 0.54 0.55 0.43 0.72 0.61 0.63 0.6  0.79 0.67
 0.39 0.69 0.63 0.63 0.59 0.6  0.47 0.61 0.57 0.57 0.61 0.57 0.65 0.62
 0.6  0.54 0.55 0.58 0.59 0.43 0.65 0.54 0.4  0.54 0.54 0.56 0.51 0.61
 0.45 0.62 0.83 0.61 0.64 0.63 0.58 0.53 0.52 0.41 0.8  0.63 0.57 0.58
 0.43 0.72 0.55 0.56 0.59 0.65 0.63 0.55 0.55 0.69 0.51 0.61 0.75 0.56
 0.55 0.58 0.54 0.56 0.55 0.55 0.56 0.71 0.46 0.63 0.59 0.77 0.62 0.81
 0.72 0.57 0.62 0.59 0.61 0.4  0.59 0.52 0.63 0.57 0.55 0.55 0.57 0.54
 0.43 0.69 0.61 0.55 0.41 0.65 0.6  0.47 0.63 0.53 0.56 0.56 0.55 0.62
 0.