# Random Forest training

Imports:


In [100]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, mean_absolute_error,mean_squared_error,classification_report
import numpy as np

Loading datasets:

In [118]:
# Read the training and test CSV files (relative to the working directory)
# into DataFrames; the training file is read first.
trainDf, testDf = (
    pd.read_csv(csv_name)
    for csv_name in ("training_data_treated.csv", "test_data_treated.csv")
)

Feature selection:

In [102]:
# Predictor columns taken from the training frame.
feature_columns = [
    "delay_in_seconds",
    "avg_temperature",
    "avg_atm_pressure",
    "avg_humidity",
    "record_date_month",
    "record_date_day",
    "record_date_isWeekend",
    "record_date_hour",
    "N101",
    "R206",
    "N105",
    "N206",
    "N309",
    "IC5",
    "N310",
    "EM579",
    "N207-4",
]
x = trainDf[feature_columns]

# Double brackets keep the target as a one-column DataFrame; the
# cross-validation cell slices it by row and flattens it with .values.ravel().
y = trainDf[["incidents"]]

Classifier:

In [103]:
# Random forest configuration evaluated in the cross-validation cell.
# max_features="sqrt" is exactly what the old "auto" alias meant for
# classifiers; "auto" was deprecated in scikit-learn 1.1 and removed in 1.3,
# so spelling it out keeps the notebook runnable on current releases.
clf = RandomForestClassifier(
    random_state=13122001,
    n_estimators=1777,
    criterion="entropy",
    max_features="sqrt",
    class_weight="balanced",
    max_depth=42,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,        # train the trees on every available core
    bootstrap=False,  # no bootstrap resampling: each tree is built on the whole training set
)

Code for hyperparameter tuning:

In [104]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each RandomForestClassifier hyperparameter.
# Every list below becomes one axis of the search grid.

# Number of trees in the forest: 20 evenly spaced values from 1000 to 2000.
n_estimators = [int(v) for v in np.linspace(start=1000, stop=2000, num=20)]

# Split-quality criterion. "log_loss" is omitted on purpose: for classifiers it
# is the same Shannon-entropy criterion as "entropy", so it would only add
# duplicate fits to the grid.
criterion = ['gini', 'entropy']

# Number of features considered at every split. The old "auto" alias is
# omitted: it was identical to "sqrt" for classifiers and no longer exists in
# scikit-learn >= 1.3.
max_features = ['sqrt', 'log2']

# Maximum depth of each tree: 10 values from 20 to 150, plus None (unlimited).
max_depth = [int(v) for v in np.linspace(20, 150, num=10)]
max_depth.append(None)

# Minimum number of samples required to split a node. Integer values must be
# >= 2; a value of 1 is rejected by scikit-learn and makes every such
# candidate fail during the search.
min_samples_split = [2, 4, 6]

# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]

# Fixed (single-valued) settings, kept in the grid so every candidate shares them.
n_jobs = [-1]
random_state = [13122001]
class_weight = ['balanced']

# Whether each tree is built on a bootstrap sample (fixed to False here).
bootstrap = [False]

# Assemble the parameter grid handed to the search object.
random_grid = {
    'n_estimators': n_estimators,
    'criterion': criterion,
    'max_features': max_features,
    'max_depth': max_depth,
    'n_jobs': n_jobs,
    'random_state': random_state,
    'class_weight': class_weight,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

In [105]:

# Exhaustive grid search over random_grid, using clf as the base estimator.
# Every parameter combination is evaluated with 3-fold cross-validation and
# progress is logged (verbose=2). The search object is left at its default
# n_jobs; parallelism comes from the forest's own n_jobs=-1 setting.
rf_random = GridSearchCV(
    estimator=clf,
    param_grid=random_grid,
    cv=3,
    verbose=2,
)

# Uncomment the next two lines to run the hyperparameter search
#rf_random.fit(x, y.values.ravel())
#print(rf_random.best_params_)


Validation and training with 10-fold cross-validation (KFold):

In [106]:
# 10-fold cross-validation: for each split, refit the forest on the training
# folds and record its accuracy on the held-out fold, then print the mean.
scores = []
y_flat = y.values.ravel()  # 1-D target vector for fit() and for comparing predictions
kf = KFold(n_splits=10)
for train, test in kf.split(x):
    # kf.split yields positional row indices, so rows are selected with .iloc
    # (label-based .loc only matches when the index is a default RangeIndex).
    clf.fit(x.iloc[train], y_flat[train])
    y_predicted = clf.predict(x.iloc[test])
    # Fold accuracy (fraction of correct predictions, i.e. what clf.score
    # reports for a classifier), computed from the predictions already made
    # so the held-out fold is not predicted twice.
    score = np.mean(y_predicted == y_flat[test])
    scores.append(score)
    #print(confusion_matrix(y_flat[test], y_predicted))
    #print(score)
print(np.mean(scores))

0.9314000000000002


Results so far:

Features: "delay_in_seconds","avg_temperature","avg_atm_pressure","avg_humidity","record_date_month","record_date_day","record_date_isWeekend","record_date_hour","N101","R206","N105","N206","N309","IC5","N310","EM579","N207-4"

Clf : RandomForestClassifier(n_estimators = 100,random_state=13122001) 

Acc: 0.9251999999999999

%------------------------------------------------------------------------------------------------------------------------

Features : "delay_in_seconds","avg_temperature","avg_atm_pressure","avg_humidity","avg_wind_speed","avg_rain","record_date_month","record_date_day","record_date_isWeekend","record_date_hour","N101","R206","N105","N206","N309","IC5","N310","EM579","N207-4"


Clf : RandomForestClassifier(n_estimators = 100,random_state=13122001) 

Acc: 0.9187999999999998

%------------------------------------------------------------------------------------------------------------------------

Features : "delay_in_seconds","avg_temperature","avg_atm_pressure","avg_humidity","record_date_month","record_date_isWeekend","record_date_hour","N101","R206","N105","N206","N309","IC5","N310","EM579","N207-4"

Clf : RandomForestClassifier(n_estimators = 100,random_state=13122001) 

Acc: 0.8848

%------------------------------------------------------------------------------------------------------------------------

Features: "delay_in_seconds","avg_temperature","avg_atm_pressure","avg_humidity","record_date_month","record_date_day","record_date_isWeekend","record_date_hour","N101","R206","N105","N206","N309","IC5","N310","EM579","N207-4"

Clf : RandomForestClassifier(n_estimators = 1400,random_state=13122001,min_samples_split= 2,min_samples_leaf= 1,max_depth= 40,bootstrap= False) 


Acc: 0.9282

%------------------------------------------------------------------------------------------------------------------------

Features: "delay_in_seconds","avg_temperature","avg_atm_pressure","avg_humidity","record_date_month","record_date_day","record_date_isWeekend","record_date_hour","N101","R206","N105","N206","N309","IC5","N310","EM579","N207-4"

Clf : RandomForestClassifier(n_estimators = 1400,random_state=13122001,min_samples_split= 2,min_samples_leaf= 1,max_depth= 40,bootstrap= False,criterion="entropy") 


Acc: 0.9308

%------------------------------------------------------------------------------------------------------------------------

Features: "delay_in_seconds","avg_temperature","avg_atm_pressure","avg_humidity","record_date_month","record_date_day","record_date_isWeekend","record_date_hour","N101","R206","N206","N309","IC5","N310","EM579","N207-4"

Clf : RandomForestClassifier(n_estimators = 1400,random_state=13122001,min_samples_split= 2,min_samples_leaf= 1,max_depth= 40,bootstrap= False,criterion="entropy") 


Acc: 0.9326


### Create submission

In [119]:
# Column list shared by the training and test frames, so both are selected
# with exactly the same features in the same order.
submission_features = [
    "delay_in_seconds",
    "avg_temperature",
    "avg_atm_pressure",
    "avg_humidity",
    "record_date_month",
    "record_date_day",
    "record_date_isWeekend",
    "record_date_hour",
    "N101",
    "R206",
    "N105",
    "N206",
    "N309",
    "IC5",
    "N310",
    "EM579",
    "N207-4",
]

x_train = trainDf[submission_features]
y_train = trainDf[["incidents"]]  # one-column DataFrame; flattened when fitting
x_test = testDf[submission_features]

In [120]:
# Build the random forest used for the submission.
# max_features="sqrt" replaces the "auto" alias, which meant the same thing
# for classifiers but was deprecated in scikit-learn 1.1 and removed in 1.3.
clf = RandomForestClassifier(
    random_state=13122001,
    n_estimators=1777,
    criterion="entropy",
    max_features="sqrt",
    class_weight="balanced",
    max_depth=42,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,        # train the trees on every available core
    bootstrap=False,  # no bootstrap resampling: each tree is built on the whole training set
)

# Fit on the full training set; ravel() turns the one-column target frame
# into the 1-D array that fit() expects.
clf.fit(x_train, y_train.values.ravel())

# Predict the incident class for every row of the test set.
y_pred = clf.predict(x_test)


In [123]:
# Maps the numeric class codes predicted by the model to the label strings
# written to the submission file.
replace_map = {0:'None', 1:'Low', 2:'Medium',3:'High',4:'Very_High'}

print(y_pred.size)

# Write one "RowId,Incidents" line per prediction, with RowId starting at 1.
# The with-block guarantees the file is flushed and closed when the cell
# finishes, so a later cell reading the CSV sees every row.
with open("Submissions/forestSub.csv", "w") as f:
    f.write("RowId,Incidents\n")
    for row_id, label in enumerate(y_pred, start=1):
        f.write(f"{row_id},{replace_map[label]}\n")

1206


How many lines changed compared to the best submission:

In [124]:
import os

from itertools import zip_longest

filename1 = "Submissions/bestSub.csv"
filename2 = "Submissions/forestSub.csv"

count = 0   # line positions where the two files differ
count2 = 0  # line positions where the two files are identical

# Compare the files line by line. zip_longest pads the shorter file with None,
# so rows present in only one file are counted as differences instead of
# being silently skipped (plain zip stops at the shorter file).
with open(filename1) as file1, open(filename2) as file2:
    for line_file_1, line_file_2 in zip_longest(file1, file2):
        if line_file_1 != line_file_2:
            count += 1
        else:
            count2 += 1

print(count)

14
