# Random Forest training

Imports:


In [156]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score,classification_report,accuracy_score
import numpy as np

Loading datasets:

In [157]:
# Load the pre-processed ("treated") training and test sets.
# Both paths are relative to the notebook's working directory.
trainDf = pd.read_csv("training_data_treated.csv")
testDf = pd.read_csv("test_data_treated.csv")

Feature selection:

In [158]:

def featureSelect(trainDf):
    """Return only the columns that are fed to the model.

    Despite the parameter name, any DataFrame containing the listed columns
    can be passed in (this notebook calls it on both the training and the
    test frame).

    Parameters
    ----------
    trainDf : pandas.DataFrame
        Frame containing at least the selected feature columns.

    Returns
    -------
    pandas.DataFrame
        A frame holding the selected columns, in the order listed below.

    Raises
    ------
    KeyError
        If any of the selected columns is missing from the frame.
    """
    # An earlier variant of this selection used "avg_temperature" in place
    # of "dayYear"; the list below is the one currently in use.
    selected_columns = [
        "delay_in_seconds",
        "avg_atm_pressure",
        "record_date_month",
        "record_date_day",
        "dayYear",
        "record_date_isWeekend",
        "record_date_hour",
        "N101",
        "IC5",
        "Affected_Count",
    ]
    return trainDf[selected_columns]

In [159]:
# x: feature matrix built from the training frame.
x  = featureSelect(trainDf)
# y: target kept as a single-column DataFrame; later cells flatten it with
# .values.ravel() before passing it to fit().
y = trainDf[["incidents"]]

Feature selection test using PCA

In [160]:
from sklearn.decomposition import PCA

# Disabled experiment: replace the selected features in `x` with their first
# 10 principal components. Uncomment the lines below to try it.
# NOTE(review): if this is re-enabled, the same fitted `pca` must also be
# applied to the test features before predicting, otherwise the model is
# trained and queried in different feature spaces.
#pca = PCA(n_components=10)
#principalComponents = pca.fit_transform(x)
#x = pd.DataFrame(data = principalComponents
#             , columns = ['principal component 1', 'principal component 2','principal component 3','principal component 4','principal component 5','principal component 6','principal component 7','principal component 8','principal component 9','principal component 10'])

Classifier:

In [161]:
def randomForestModel():
    """Build the (unfitted) random forest classifier used throughout the notebook.

    Every call returns a new estimator with the same fixed hyperparameters and
    seed, so results are reproducible for a given training set.

    Returns
    -------
    RandomForestClassifier
        An unfitted classifier; call ``fit`` on it before predicting.
    """
    return RandomForestClassifier(
        random_state=13032001,
        n_estimators=1777,
        criterion='entropy',
        # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for a
        # classifier it meant sqrt(n_features), so 'sqrt' is the same model.
        max_features='sqrt',
        class_weight='balanced',
        max_depth=34,
        min_samples_split=2,
        min_samples_leaf=1,
        n_jobs=-1,
        # Each tree is built on the whole training set (no bootstrap sampling).
        bootstrap=False,
    )


Code for hyperparameter tuning:

In [162]:
from sklearn.model_selection import GridSearchCV

# Search space for the random forest hyperparameter search.
# Every list below is one axis of the grid; the search tries all combinations.

# Number of trees in the forest: 20 evenly spaced values from 1000 to 2000
n_estimators = [int(n) for n in np.linspace(start = 1000, stop = 2000, num = 20)]

# Criterion measuring the quality of a split.
# 'log_loss' is left out: scikit-learn treats it as the same criterion as
# 'entropy', so listing both only repeats identical fits.
criterion = ['gini', 'entropy']

# Number of features to consider at every split.
# 'auto' is left out: it was removed in scikit-learn 1.3 and, for a
# classifier, was just another name for 'sqrt'.
max_features = ['sqrt', 'log2']

# Maximum number of levels in a tree (None = grow until the leaves are pure)
max_depth = [int(d) for d in np.linspace(20, 150, num = 10)]
max_depth.append(None)

# Minimum number of samples required to split a node.
# An integer value must be at least 2; 1 is rejected by scikit-learn.
min_samples_split = [2, 4, 6]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Use all available cores when fitting each forest
n_jobs = [-1]

# NOTE(review): this seed differs from the one hard-coded in
# randomForestModel() (13032001) - confirm which one is intended.
random_state = [13122001]

class_weight = ['balanced']

# Method of selecting samples for training each tree (False = no bootstrap)
bootstrap = [False]

# Create the parameter grid
random_grid = {'n_estimators': n_estimators,
               'criterion' : criterion,
               'max_features': max_features,
               'max_depth': max_depth,
               'n_jobs' : n_jobs,
               'random_state' : random_state,
               'class_weight' : class_weight,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [163]:

# Base model: used as the estimator for the (disabled) hyperparameter search
# and reused by the evaluation cells further down.
clf = randomForestModel()

# Hyperparameter search, disabled by default.
# GridSearchCV is exhaustive: it fits every combination in `random_grid`,
# each with 3-fold cross-validation, so expect a very long run time.
#rf_random = GridSearchCV(estimator = clf, param_grid = random_grid, cv = 3, verbose=2)

# Uncomment the search above and the lines below to perform the hyperparameter search
#rf_random.fit(x, y.values.ravel())
#print(rf_random.best_params_)


Small test using SMOTE oversampling

In [164]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Hold out 30% of the training data for evaluation (fixed seed, so the split
# is the same on every run). The NearMiss cell below reuses this split.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)


# Oversample the training part only; the hold-out part is left untouched.
# The dict maps each class label to the number of samples requested for it.
# NOTE(review): these counts are hard-coded - presumably chosen for this
# particular split; verify them if the data or the split changes.
oversample = SMOTE(sampling_strategy={0:1398,1:762,2:700,3:762,4:762},random_state=13122001)
x_over, y_over = oversample.fit_resample(X_train, y_train)

# Fit the shared `clf` on the oversampled data and predict the hold-out part.
clf.fit(x_over,y_over.values.ravel())
predictions = clf.predict(X_test)
  
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       630
           1       0.89      0.92      0.91       215
           2       0.88      0.87      0.88       166
           3       0.90      0.90      0.90       311
           4       0.91      0.90      0.91       178

    accuracy                           0.94      1500
   macro avg       0.92      0.92      0.92      1500
weighted avg       0.94      0.94      0.94      1500



Small test using NearMiss undersampling

In [165]:
# Apply NearMiss undersampling to the training part of the split created in
# the SMOTE cell above (X_train / y_train must already exist).
from imblearn.under_sampling import NearMiss
# The dict maps each class label to the number of samples requested for it.
# NOTE(review): these counts are hard-coded - presumably chosen for this
# particular split; verify them if the data or the split changes.
nr = NearMiss(sampling_strategy={0:1100,1:503,2:412,3:762,4:425})

X_train_miss, y_train_miss = nr.fit_resample(X_train, y_train)

# Fit the shared `clf` on the undersampled data and predict the same
# hold-out part as in the SMOTE test.
clf.fit(X_train_miss,y_train_miss.values.ravel())
predictions = clf.predict(X_test)
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       630
           1       0.90      0.93      0.91       215
           2       0.89      0.86      0.87       166
           3       0.91      0.91      0.91       311
           4       0.92      0.90      0.91       178

    accuracy                           0.94      1500
   macro avg       0.92      0.92      0.92      1500
weighted avg       0.94      0.94      0.94      1500



Training and validation with 10-fold cross-validation

In [166]:
# 10-fold cross-validation of `clf` on the full training set.
# Prints accuracy and weighted F1 for each fold, then the mean accuracy.
scores = []
# shuffle=True without a random_state: the folds, and therefore the scores,
# change from run to run. This was a deliberate choice in this notebook.
kf = KFold(n_splits=10, shuffle=True)

for counter, (train, test) in enumerate(kf.split(x), start=1):
    # KFold.split yields positional indices, so select rows with .iloc
    # (.loc would only be correct for a default 0..n-1 index).
    clf.fit(x.iloc[train], y.iloc[train].values.ravel())

    # Predict the held-out fold once and derive both metrics from it;
    # accuracy_score on these predictions is what clf.score computes.
    y_true = y.iloc[test].values.ravel()
    y_predicted = clf.predict(x.iloc[test])
    score = accuracy_score(y_true, y_predicted)
    scores.append(score)
    f1 = f1_score(y_true, y_predicted, average="weighted")
    print("["+ str(counter) +"] Acc:",score,"F1Score:", f1)
print("Average:",np.mean(scores))

[1] Acc: 0.946 F1Score: 0.9458757017804462
[2] Acc: 0.94 F1Score: 0.9401964089229568
[3] Acc: 0.93 F1Score: 0.9298087949095163
[4] Acc: 0.94 F1Score: 0.9400846194760961
[5] Acc: 0.92 F1Score: 0.919794965899812
[6] Acc: 0.932 F1Score: 0.9320466600638274
[7] Acc: 0.95 F1Score: 0.9497936656379635
[8] Acc: 0.944 F1Score: 0.9440985257301536
[9] Acc: 0.934 F1Score: 0.9339468452874173
[10] Acc: 0.944 F1Score: 0.9439691511065221
Average: 0.938


### Create submission

In [167]:
# Training data for the final model: all rows of the training frame.
x_train = featureSelect(trainDf)
# Note: this rebinds `y_train`, which previously held the target of the
# hold-out split created in the SMOTE cell.
y_train = trainDf[["incidents"]]


# Features of the test frame, selected the same way as the training features.
x_test =featureSelect(testDf)

In [168]:
# Create a fresh, unfitted RF classifier so nothing carries over from the
# experiments above.
clf = randomForestModel()

# Train on the full training set prepared for the submission.
# x_train / y_train are used rather than the x / y left over from earlier
# cells, so this step does not depend on whatever those names currently hold
# and always matches the feature space of x_test.
clf.fit(x_train, y_train.values.ravel())

# Predict the class of every row of the test dataset.
y_pred = clf.predict(x_test)


In [169]:
# Write the predictions to the submission file: a "RowId,Incidents" header
# followed by one row per prediction, with RowId starting at 1.

# Numeric class label -> label text used in the submission file
replace_map = {0:'None', 1:'Low', 2:'Medium',3:'High',4:'Very_High'}

print(y_pred.size)

# The with-block closes (and therefore flushes) the file, so the complete
# content is on disk before any later cell reads it back.
with open("Submissions/forestSub.csv", "w") as f:
    f.write("RowId,Incidents\n")
    for i in range(y_pred.size):
        f.write(str(i+1) + "," + replace_map[y_pred[i]] + "\n")

1206


How many lines changed compared to the best submission:

In [170]:
import os

# Count how many lines differ between the best submission so far and the
# submission that was just written.
filename1 = "Submissions/bestSub.csv"
filename2 = "Submissions/forestSub.csv"

with open(filename1) as file1, open(filename2) as file2:
    lines_file_1 = file1.readlines()
    lines_file_2 = file2.readlines()

# Lines are compared pairwise by position ...
count = sum(line_1 != line_2 for line_1, line_2 in zip(lines_file_1, lines_file_2))
# ... and lines that exist in only one of the files count as changed too
# (a plain zip would silently ignore them).
count += abs(len(lines_file_1) - len(lines_file_2))

print(count)

370
