In [155]:
import pandas as pd
import seaborn as sb
import numpy as np

In [174]:
df = pd.read_csv('processed_datasets/GMSL_analysis_and_prediction.csv', delimiter=',')

In [175]:
df.head().append(df.tail())

Unnamed: 0,Date,Extent,WaterTemp,O2ml,SiO3,NO3,LandAverageTemperature,LandAndOceanAverageTemperature,CO2,IsGMSLIncreased,GMSL
0,1969-01,11.412998,13.265,5.595,7.0,3.25,1.966,13.518,324.0,0,-35.8
1,1969-02,11.41918,12.435,5.695,6.4,5.2,2.45,13.747,324.42,0,-37.2
2,1969-03,11.42531,15.535,5.49,4.0,0.3,5.131,14.576,325.64,1,-36.2
3,1969-04,11.431388,12.07,5.3125,8.5,6.4,8.576,15.518,326.66,0,-37.4
4,1969-05,11.437415,12.12,5.945,6.0,3.7,11.354,16.329,327.34,0,-38.6
535,2013-08,12.338645,12.053062,5.263625,7.47125,8.925,14.742,17.462,395.2,1,71.6
536,2013-09,12.298467,12.200875,5.30275,6.7575,8.1,13.154,16.894,393.45,0,68.8
537,2013-10,13.236306,12.348688,5.341875,6.04375,7.275,10.256,15.905,393.7,0,66.4
538,2013-11,13.4056,12.4965,5.381,5.33,6.45,7.424,15.107,395.16,0,59.7
539,2013-12,13.066,13.06225,5.42725,4.62625,4.275,4.724,14.339,396.84,0,58.5


In [176]:
df.describe()

Unnamed: 0,Extent,WaterTemp,O2ml,SiO3,NO3,LandAverageTemperature,LandAndOceanAverageTemperature,CO2,IsGMSLIncreased,GMSL
count,540.0,540.0,540.0,540.0,540.0,540.0,540.0,540.0,540.0,540.0
mean,11.633582,12.703859,5.140705,8.623137,7.488843,9.091504,15.553526,355.062907,0.525926,8.211111
std,1.2579,1.178649,0.613406,5.566774,4.980329,4.156488,1.233037,34.925471,0.49979,31.076247
min,8.441054,9.1875,2.32,1.9,0.0,1.882,13.298,-99.99,0.0,-42.6
25%,10.98198,11.967042,5.050313,5.451974,3.85,4.95975,14.38625,338.025,0.0,-15.725
50%,11.750654,12.453371,5.307197,7.508352,6.741667,9.3425,15.581,355.39,1.0,1.9
75%,12.585463,13.27875,5.483333,10.0,9.917708,13.15325,16.72975,373.89,1.0,29.925
max,14.050833,18.6275,6.15,41.4,22.9,15.482,17.609,399.78,1.0,82.4


In [177]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Date                            540 non-null    object 
 1   Extent                          540 non-null    float64
 2   WaterTemp                       540 non-null    float64
 3   O2ml                            540 non-null    float64
 4   SiO3                            540 non-null    float64
 5   NO3                             540 non-null    float64
 6   LandAverageTemperature          540 non-null    float64
 7   LandAndOceanAverageTemperature  540 non-null    float64
 8   CO2                             540 non-null    float64
 9   IsGMSLIncreased                 540 non-null    int64  
 10  GMSL                            540 non-null    float64
dtypes: float64(9), int64(1), object(1)
memory usage: 46.5+ KB


#### Podela skupa podataka na training i test

In [188]:
from sklearn.model_selection import train_test_split

X = df.drop(["IsGMSLIncreased", "Date", "GMSL"], axis=1)
Y = df["IsGMSLIncreased"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=60)

## Random Forest

In [189]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

#### Obučavanje - Random Forest

In [190]:
rfc.fit(X_train, Y_train)

RandomForestClassifier()

#### Predikcija - Random Forest

In [191]:
rfc_predictions = rfc.predict(X_test)

#### F1 Score - Random Forest

In [192]:
from sklearn.metrics import f1_score

f1_sc = f1_score(Y_test, rfc_predictions, average=None)
print("F1 score: ", f1_sc)

F1 score:  [0.5323741  0.64864865]


#### Report - Random Forest

In [193]:
from sklearn.metrics import classification_report

print(classification_report(Y_test, rfc_predictions))

              precision    recall  f1-score   support

           0       0.60      0.48      0.53        77
           1       0.60      0.71      0.65        85

    accuracy                           0.60       162
   macro avg       0.60      0.59      0.59       162
weighted avg       0.60      0.60      0.59       162



#### Tuning Hyperparameters

In [194]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

max_features = ['auto', 'sqrt',]

max_depth = [int(x) for x in np.linspace(10, 100, num = 5)]
max_depth.append(None)

random_grid = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth
 }

rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

rfc_random.fit(X_train, Y_train)

print(rfc_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'n_estimators': 300, 'max_features': 'auto', 'max_depth': 55}


#### Best params: {'n_estimators': 100, 'max_features': 'sqrt', 'max_depth': 10}

In [195]:
rfc_best = RandomForestClassifier(n_estimators=100, max_depth=10, max_features='sqrt')

In [196]:
rfc_best.fit(X_train,Y_train)

RandomForestClassifier(max_depth=10, max_features='sqrt')

In [197]:
rfc_predictions_best = rfc_best.predict(X_test)

In [198]:
print(classification_report(Y_test, rfc_predictions_best))

              precision    recall  f1-score   support

           0       0.60      0.53      0.57        77
           1       0.62      0.68      0.65        85

    accuracy                           0.61       162
   macro avg       0.61      0.61      0.61       162
weighted avg       0.61      0.61      0.61       162

