# Data and Rolling labeling

In [2]:
import pytse_client as tse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pytse_client.download import download_financial_indexes
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode

# Fetch the price history for one symbol through pytse_client and wrap it in
# a DataFrame. adjust=True and write_to_csv=True are passed straight through
# to tse.download; the result is keyed by the symbol name.
symbol = "فولاد"
tickers = tse.download(symbols=symbol, adjust=True, write_to_csv=True)
df = pd.DataFrame.from_dict(tickers[symbol])
# Empty placeholder column; the labeling cell below fills it in.
df["label"] = ''
df.head()

Unnamed: 0,date,open,high,low,adjClose,value,volume,count,yesterday,close,label
0,2007-03-11,16.0,16.0,16.0,16.0,889437216900,468077431,7736,16.0,16.0,
1,2007-03-12,16.0,16.0,16.0,16.0,193879458000,100041000,9214,16.0,16.0,
2,2007-03-13,16.0,16.0,16.0,16.0,249241504527,126270939,5862,16.0,16.0,
3,2007-03-14,16.0,16.0,16.0,16.0,51666379451,26705128,1901,16.0,16.0,
4,2007-03-17,16.0,16.0,16.0,16.0,28239006789,14877283,1514,16.0,16.0,


In [None]:
#Labeling

# For every row, compare the adjusted close with the extremes of the *next* k
# rows. Reversing the series, taking a rolling max/min (min_periods=1),
# shifting by one and reversing back gives a forward-looking window that
# excludes the current row; the last row has no future data and gets NaN.
# NOTE(review): because the current row is excluded, a row is labelled -1/1
# only when the future max/min exactly equals its own price - confirm that
# this is the intended definition of a top/bottom.
k = 30
price = df["adjClose"]
# Named future_max/future_min so the builtins max() and min() stay usable in
# the rest of the notebook.
future_max = price[::-1].rolling(k, 1).max().shift(1)[::-1]
future_min = price[::-1].rolling(k, 1).min().shift(1)[::-1]
# Kept for inspection: columns are price, forward max, forward min (in that order).
ts = pd.concat([price, future_max, future_min], axis=1)

#filling the label column
# -1 when the price equals the forward max, 1 when it equals the forward min,
# 0 otherwise. np.select uses the first matching condition, so the max check
# takes precedence when both match. NaN never compares equal, so rows without
# a forward window fall through to 0.
# Assigning by column name (instead of a hard-coded position) and comparing
# whole Series (instead of using index labels as positions) keeps this
# correct for any index and column order.
df["label"] = np.select(
    [price.eq(future_max), price.eq(future_min)],
    [-1, 1],
    default=0,
)

# Number of rows labelled 0.
(df["label"] == 0).sum()

## Random Forest Classification

In [None]:
# Target: the last column of df, cast to integer class ids.
y = df.iloc[:, -1].values.astype('int')

# Features: the columns at positions 1 through 9 (position 0 and the last
# column are left out).
feature_columns = slice(1, 10)
X = df.iloc[:, feature_columns].values

In [None]:
#Splitting the dataset into the Training set and Test set
# A quarter of the rows is held out for evaluation; the fixed random_state
# makes the split reproducible between runs.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
#Feature scaling
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so the test rows do not influence the statistics.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
#Training the Random Forest Classification model on the Training set
from sklearn.ensemble import RandomForestClassifier
# Ten trees, entropy split criterion, fixed seed for repeatable results.
classifier = RandomForestClassifier(
    random_state=0,
    criterion='entropy',
    n_estimators=10,
)
classifier.fit(X_train, y_train)

In [None]:
#Predicting the Test set results

from sklearn.metrics import confusion_matrix

# Predict the held-out rows and tabulate true classes against predicted ones.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# All three class-wise metrics use average='macro': the score is computed per
# class and then averaged with equal weight for every class.

# Precision = TP / (TP + FP): of the rows predicted as a class, the share that
# really belongs to it. Favour it when false positives are the costly error.
precision = precision_score(y_test, y_pred, average='macro')

# Recall = TP / (TP + FN): of the rows that really belong to a class, the
# share the model finds. Favour it when false negatives are the costly error.
recall = recall_score(y_test, y_pred, average='macro')

# F1 = 2 * Precision * Recall / (Precision + Recall): a single score that
# balances precision against recall.
f1 = f1_score(y_test, y_pred, average='macro')

# Accuracy = (TP + TN) / (TP + TN + FP + FN): share of all predictions that
# are correct.
accuracy = accuracy_score(y_test, y_pred)

for caption, score in (
    ('Precision: ', precision),
    ('Recall: ', recall),
    ('F1: ', f1),
    ('Accuracy: ', accuracy),
):
    print(caption, score)

# Data and Zigzag labeling

In [39]:
# Load the dataset for this section from the working directory. A plain
# relative path is used so it resolves on every OS and contains no backslash
# escape sequence.
df = pd.read_csv("dataset.csv")
# The 'jdate' column is not used further (presumably a Jalali date - confirm).
df = df.drop(columns='jdate')

#Taking care of missing data
# Fill gaps with the column mean. numeric_only=True restricts the means to
# numeric columns, so non-numeric columns are left untouched instead of
# triggering a warning (older pandas) or a TypeError (pandas 2.x).
df = df.fillna(df.mean(numeric_only=True))



Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



In [40]:
# Target: the last column of df, cast to integer class ids.
y = df.iloc[:, -1].values.astype('int')

# Features: every column from position 2 up to, but not including, the last.
feature_columns = slice(2, -1)
X = df.iloc[:, feature_columns].values

In [41]:
#Splitting the dataset into the Training set and Test set
# A quarter of the rows is held out for evaluation; the fixed random_state
# makes the split reproducible between runs.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [42]:
#Feature scaling
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so the test rows do not influence the statistics.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [43]:
#Training the Random Forest Classification model on the Training set
from sklearn.ensemble import RandomForestClassifier
# Ten trees, entropy split criterion, fixed seed for repeatable results.
classifier = RandomForestClassifier(
    random_state=0,
    criterion='entropy',
    n_estimators=10,
)
classifier.fit(X_train, y_train)

In [44]:
#Predicting the Test set results

from sklearn.metrics import confusion_matrix

# Predict the held-out rows and tabulate true classes against predicted ones.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[  0  29   1]
 [  5 803   2]
 [  0  12   0]]


In [45]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# All three class-wise metrics use average='macro': the score is computed per
# class and then averaged with equal weight for every class, so rare classes
# count as much as the dominant one (unlike accuracy).

# Precision = TP / (TP + FP): of the rows predicted as a class, the share that
# really belongs to it. Favour it when false positives are the costly error.
precision = precision_score(y_test, y_pred, average='macro')

# Recall = TP / (TP + FN): of the rows that really belong to a class, the
# share the model finds. Favour it when false negatives are the costly error.
recall = recall_score(y_test, y_pred, average='macro')

# F1 = 2 * Precision * Recall / (Precision + Recall): a single score that
# balances precision against recall.
f1 = f1_score(y_test, y_pred, average='macro')

# Accuracy = (TP + TN) / (TP + TN + FP + FN): share of all predictions that
# are correct.
accuracy = accuracy_score(y_test, y_pred)

for caption, score in (
    ('Precision: ', precision),
    ('Recall: ', recall),
    ('F1: ', f1),
    ('Accuracy: ', accuracy),
):
    print(caption, score)

Precision:  0.31714060031595576
Recall:  0.3304526748971193
F1:  0.3236598145908908
Accuracy:  0.9424882629107981
