In [9]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score,classification_report,accuracy_score
import numpy as np

In [10]:
# Load the train and test tables from CSV files addressed relative to the
# current working directory. Judging by the file names these are already
# pre-processed ("treated") exports — TODO confirm where they are produced.
trainDf = pd.read_csv("training_data_treated.csv")
# The test table is passed through the same featureSelect() as the training
# table later on, so it must carry the same feature columns.
testDf = pd.read_csv("test_data_treated.csv")

In [11]:

def featureSelect(trainDf):
    """Return only the model-input columns of a data frame.

    Despite the parameter name, this notebook calls it on both the training
    and the test frame.

    Parameters
    ----------
    trainDf : pandas.DataFrame
        Frame that contains every column listed in ``feature_columns``.

    Returns
    -------
    pandas.DataFrame
        The nine feature columns, in the order listed below.

    Raises
    ------
    KeyError
        If any of the listed columns is missing from ``trainDf``.
    """
    feature_columns = [
        "delay_in_seconds",
        "avg_temperature",
        "avg_atm_pressure",
        "record_date_month",
        "record_date_day",
        "record_date_isWeekend",
        "record_date_hour",
        "N101",
        "Affected_Count",
    ]
    # An earlier variant of this selection (kept here for reference) used:
    # "delay_in_seconds", "record_date_month", "dayYear", "record_date_isWeekend",
    # "record_date_hour", "N101", "IC5", "Affected_Count"
    return trainDf[feature_columns]

In [12]:
# Feature matrix used by the cross-validation cell: the columns chosen by featureSelect().
x = featureSelect(trainDf)
# Target, kept as a one-column DataFrame holding the "incidents" column.
y = trainDf.loc[:, ["incidents"]]

In [13]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

# Hyperparameter search space for a random-forest classifier. Every entry is a
# list of candidate values — presumably meant for the GridSearchCV /
# RandomizedSearchCV imported at the top of this cell (no search is run here).

# Number of trees in the forest: 20 values from 1000 to 2000 (linspace, truncated to int)
n_estimators = [int(v) for v in np.linspace(start = 1000, stop = 2000, num = 20)]

# Criterion of the quality of a split
criterion = ['gini','entropy', 'log_loss']

# Number of features to consider at every split.
# 'auto' was removed from this list: for classifiers it only duplicated 'sqrt',
# and scikit-learn releases that dropped the option reject it outright.
max_features = ['sqrt','log2']

# Maximum number of levels in tree: 10 values from 20 to 150, plus None (unlimited)
max_depth = [int(v) for v in np.linspace(20, 150, num = 10)]
max_depth.append(None)

# Minimum number of samples required to split a node.
# scikit-learn requires an integer value here to be >= 2, so the former
# candidate 1 could only produce failed fits and has been dropped.
min_samples_split = [2, 4, 6]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,4]

# Single-valued entries: fixed settings carried along with every candidate
n_jobs = [-1]

random_state = [13122001]

class_weight = ['balanced']

# Method of selecting samples for training each tree
bootstrap = [False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'criterion' : criterion,
               'max_features': max_features,
               'max_depth': max_depth,
               'n_jobs' :n_jobs,
               'random_state' : random_state,
               'class_weight' : class_weight,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [14]:
import xgboost as xgb
def retXGBoost():
    """Build a new, unfitted ``xgb.XGBClassifier`` with this notebook's fixed settings.

    Each call constructs a separate estimator with identical hyperparameters
    (including the same ``random_state``): a ``gbtree`` booster with the
    ``multi:softprob`` objective, ``mlogloss`` as evaluation metric,
    2000 estimators at learning rate 0.05, and 16 parallel jobs.

    NOTE(review): both ``gamma=15`` and ``min_split_loss=0`` are passed. The
    XGBoost parameter documentation lists these as aliases of one setting, so
    which value is actually applied cannot be told from this file — confirm
    and keep only one.

    NOTE(review): ``use_label_encoder`` is reported as deprecated by recent
    XGBoost releases — confirm against the installed version.

    Returns
    -------
    xgb.XGBClassifier
        The configured, not-yet-fitted classifier.
    """
    return  xgb.XGBClassifier(
              booster='gbtree',
              colsample_bynode=0.8, colsample_bytree=1, eval_metric='mlogloss',
              gamma=15, importance_type="total_cover",
              interaction_constraints='', learning_rate=0.05,
              max_delta_step=15, max_depth=6, min_child_weight=1, 
              n_estimators=2000, n_jobs=16, min_split_loss =0,
              objective='multi:softprob', random_state=13122001,
              subsample=0.5,
              use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [None]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

# Hyperparameter search space for the XGBoost classifier. Every entry is a list
# of candidate values keyed by estimator parameter name.
# Note: this cell rebinds n_estimators, max_depth and random_grid, which the
# random-forest grid cell also assigns — whichever cell ran last wins.

# Number of boosting rounds (trees): 20 values from 1000 to 2000 (linspace, truncated to int)
n_estimators = [int(v) for v in np.linspace(start = 1000, stop = 2000, num = 20)]
max_depth = [2,4,6,10]
max_leaves = [2,6,8,10]
grow_policy = [ 'lossguide']
eta = [ 0.05,0.1,0.01]
boost = ["gbtree"]
gamma = [5 ,10 ,15,20]
min_child_weight = [1,2,4,8]
min_split_loss = [0]
colsample_bytree = [1]
colsample_bynode = [0.8]
max_delta_step = [15]
importance_type = [ "total_cover","total_gain"]

# NOTE(review): XGBoost documents 'eta' as an alias of 'learning_rate' and
# 'min_split_loss' as an alias of 'gamma'. retXGBoost() already fixes
# learning_rate, and this grid carries both gamma and min_split_loss — confirm
# which spelling the search should vary and drop the other.
random_grid = {'n_estimators': n_estimators,
               'importance_type' : importance_type,
               "max_leaves":max_leaves,
               'max_depth': max_depth,
               "max_delta_step":max_delta_step,
               'min_split_loss':min_split_loss,
               "colsample_bytree":colsample_bytree,
               "colsample_bynode":colsample_bynode,
               'eta' : eta,
               "booster":boost,
               # Key was misspelled "gama", so the gamma candidates never
               # reached the real parameter.
               "gamma":gamma,
               'grow_policy': grow_policy,
               'min_child_weight': min_child_weight
               }

In [15]:

# Fresh, unfitted XGBoost classifier from retXGBoost(); the cross-validation
# cell that follows fits and scores this same object once per fold.
clf = retXGBoost()

In [16]:
# K-fold cross-validation of `clf` on the training features `x` and labels `y`.
# Prints accuracy and weighted F1 per fold, then the mean accuracy.
# NOTE(review): the output stored under this cell lists 10 folds, while the
# code requests 5 — re-run the cell or confirm the intended fold count.
scores = []
kf = KFold(n_splits=5)
for counter, (train, test) in enumerate(kf.split(x), start=1):
    # kf.split yields positional row numbers, so select with .iloc; label-based
    # .loc only matched while the frames carried a default 0..n-1 index.
    x_fold_train, x_fold_test = x.iloc[train], x.iloc[test]
    y_fold_train = y.iloc[train].values.ravel()
    y_fold_test = y.iloc[test].values.ravel()

    clf.fit(x_fold_train, y_fold_train)

    # Predict once and derive both metrics from the same predictions;
    # accuracy_score on them is the value clf.score() reports for a classifier.
    y_predicted = clf.predict(x_fold_test)
    score = accuracy_score(y_fold_test, y_predicted)
    scores.append(score)
    f1 = f1_score(y_fold_test, y_predicted, average="weighted")
    print("["+ str(counter) +"] Acc:",score,"F1Score:", f1)
print("Average:",np.mean(scores))

[1] Acc: 0.934 F1Score: 0.9339555532673441
[2] Acc: 0.928 F1Score: 0.927945466190974
[3] Acc: 0.936 F1Score: 0.9359111651357293
[4] Acc: 0.916 F1Score: 0.9164153022962701
[5] Acc: 0.952 F1Score: 0.9517778691427993
[6] Acc: 0.924 F1Score: 0.9241336417193518
[7] Acc: 0.948 F1Score: 0.9478902464498763
[8] Acc: 0.928 F1Score: 0.9273067324726519
[9] Acc: 0.924 F1Score: 0.9233715177760393
[10] Acc: 0.948 F1Score: 0.9483538978725226
Average: 0.9338000000000001


In [17]:
# Inputs for the final model: the same feature selection is applied to the
# training and the test frame so both carry identical columns.
x_train = featureSelect(trainDf)
x_test = featureSelect(testDf)

# Training target, kept as a one-column DataFrame holding "incidents".
y_train = trainDf.loc[:, ["incidents"]]

In [18]:
# Final model: a fresh XGBoost classifier from retXGBoost(), fitted on the
# whole training set.
# A random-forest alternative tried earlier, kept for reference:
#clf2 = RandomForestClassifier(n_estimators = 1400,random_state=13122001,min_samples_split= 2,min_samples_leaf= 1,max_depth= 40,bootstrap= False,criterion="entropy")
clf = retXGBoost()

# The labels are handed over as a flat 1-D array taken from the single
# "incidents" column of y_train.
clf.fit(x_train, y_train["incidents"].to_numpy())

# Predicted class for every test row; the submission cell writes these out.
y_pred = clf.predict(x_test)


In [22]:
# Write the predictions to a submission CSV with a 1-based RowId and the
# textual incident level.

# Maps the numeric class predicted by the model to the label written to the file.
replace_map = {0:'None', 1:'Low', 2:'Medium',3:'High',4:'Very_High'}

print(y_pred.size)

# The context manager closes (and therefore flushes) the file before the next
# cell reads it back; the handle was previously left open, so trailing rows
# could still be sitting in the write buffer.
with open("Submissions/xgboostSub.csv", "w") as f:
    f.write("RowId,Incidents\n")
    for row_id, predicted_class in enumerate(y_pred, start=1):
        f.write(str(row_id) + "," + replace_map[predicted_class] + "\n")

1206


In [23]:
import os
from itertools import zip_longest

# Count how many lines differ between the best submission so far and the
# XGBoost submission written by the previous cell.
filename1 = "Submissions/bestSub.csv"
filename2 = "Submissions/xgboostSub.csv"
count = 0   # lines that differ
count2 = 0  # lines that are identical (the shared header line counts here too)

with open(filename1) as file1, open(filename2) as file2:
    # zip_longest pads the shorter file with None, so rows present in only one
    # file count as differences; plain zip would silently stop at the end of
    # the shorter file.
    for line_file_1, line_file_2 in zip_longest(file1, file2):
        if line_file_1 != line_file_2:
            count += 1
        else:
            count2 += 1

print(count)

50
