In [677]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [678]:
def compute_hot_encoding_features(X):
        one_hot_encoding = pd.get_dummies(X[['Accident_Type_Code','Violations']])
        
        return one_hot_encoding

In [688]:
def data_preparation(df):
#     df['Accident_Type_Code'] = pd.Categorical(df['Accident_Type_Code'])
#     df['Violations'] = pd.Categorical(df['Violations'])
# #     df['Days_Since_Inspection'] = pd.Categorical(df['Days_Since_Inspection'])
# #     df['Total_Safety_Complaints'] = pd.Categorical(df['Total_Safety_Complaints'])
#     one_hot_encoding = compute_hot_encoding_features(df)
#     df = pd.concat([df, one_hot_encoding], axis=1)
    df = df.drop(['Accident_Type_Code','Violations'],axis=1, inplace=False)
        
    return df

In [689]:
def Standization(df):
    scaler_X = preprocessing.StandardScaler().fit(df[['Safety_Score','Days_Since_Inspection','Total_Safety_Complaints','Control_Metric','Turbulence_In_gforces','Cabin_Temperature','Max_Elevation','Violations','Adverse_Weather_Metric']])
    scaled_data = scaler_X.transform(df[['Safety_Score','Days_Since_Inspection','Total_Safety_Complaints','Control_Metric','Turbulence_In_gforces','Cabin_Temperature','Max_Elevation','Violations','Adverse_Weather_Metric']])
    scaled_data_df = pd.DataFrame(data = scaled_data,  columns = ['Safety_Score','Days_Since_Inspection','Total_Safety_Complaints','Control_Metric','Turbulence_In_gforces','Cabin_Temperature','Max_Elevation','Violations','Adverse_Weather_Metric'])
    df[['Safety_Score','Days_Since_Inspection','Total_Safety_Complaints','Control_Metric','Turbulence_In_gforces','Cabin_Temperature','Max_Elevation','Violations','Adverse_Weather_Metric']] = scaled_data_df[['Safety_Score','Days_Since_Inspection','Total_Safety_Complaints','Control_Metric','Turbulence_In_gforces','Cabin_Temperature','Max_Elevation','Violations','Adverse_Weather_Metric']]
         
    return df

In [690]:
def Normalization(df):
    scaler_X = preprocessing.MinMaxScaler().fit(df[['Safety_Score','Days_Since_Inspection','Total_Safety_Complaints','Control_Metric','Turbulence_In_gforces','Cabin_Temperature','Max_Elevation','Violations','Adverse_Weather_Metric']])
    scaled_data = scaler_X.transform(df[['Safety_Score','Days_Since_Inspection','Total_Safety_Complaints','Control_Metric','Turbulence_In_gforces','Cabin_Temperature','Max_Elevation','Violations','Adverse_Weather_Metric']])
    scaled_data_df = pd.DataFrame(data = scaled_data,  columns = ['Safety_Score','Days_Since_Inspection','Total_Safety_Complaints','Control_Metric','Turbulence_In_gforces','Cabin_Temperature','Max_Elevation','Violations','Adverse_Weather_Metric'])
    df[['Safety_Score','Days_Since_Inspection','Total_Safety_Complaints','Control_Metric','Turbulence_In_gforces','Cabin_Temperature','Max_Elevation','Violations','Adverse_Weather_Metric']] = scaled_data_df[['Safety_Score','Days_Since_Inspection','Total_Safety_Complaints','Control_Metric','Turbulence_In_gforces','Cabin_Temperature','Max_Elevation','Violations','Adverse_Weather_Metric']]
         
    return df

# Training_Data

In [691]:
class_map = {
    'Minor_Damage_And_Injuries': 0,
    'Significant_Damage_And_Fatalities': 1,
    'Significant_Damage_And_Serious_Injuries': 2,
    'Highly_Fatal_And_Damaging': 3
}
inverse_class_map = {
    0: 'Minor_Damage_And_Injuries',
    1: 'Significant_Damage_And_Fatalities',
    2: 'Significant_Damage_And_Serious_Injuries',
    3: 'Highly_Fatal_And_Damaging'
}

In [692]:
def data_reading():
    data = pd.read_csv("../data/data_folder/train.csv")
    
    X = data.drop(['Severity', 'Accident_ID'],axis=1, inplace=False)
    Y = data['Severity']
    Y = Y.map(class_map).astype(np.uint8)
    
    X = data_preparation(X)
#     X = Standization(X)
#     X = Normalization(X)
    return X,Y

# Models

In [716]:
def model_xgboost(dtrain):
    
    
    param = {
        'max_depth': 5,  # the maximum depth of each tree
#         'booster':'dart',
        #'gamma': 3,
        #'min_child_weight': 1,
        #'subsample': 1,
        #'colsample_bytree': 1,
        #'alpha': 0.1,
        'eta': 0.1,  # the training step for each iteration
        'silent': 1,  # logging mode - quiet
        'objective': 'multi:softprob',  # error evaluation for multiclass training
        'num_class': 4    # the number of classes that exist in this datset
    }  
    nrounds = 1000  # the number of training iterations
    
    
    bst_xgb = xgb.train(param, dtrain,nrounds)
    
    return bst_xgb

In [717]:
def model_rf(X_train, y_train):
    
    clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=100, max_features=5, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
    
    bst_rf = clf.fit(X_train, y_train)
    
    return bst_rf

In [718]:
def model_svm(X_train, y_train):
    clf = SVC(C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, 
              probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False,
              max_iter=-1, decision_function_shape='ovr', random_state=None)
    
    bst_svc = clf.fit(X_train, y_train)
    
    return bst_svc

# Training & Evaluation


In [719]:
X,Y = data_reading()
print(X.columns)    
X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=0.1)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
    
xgb_model = model_xgboost(dtrain)            #XGBoost model training
rf_model =  model_rf(X_train, y_train)       #RF model training
svm_model =  model_svm(X_train, y_train)     #SVM model training

Index(['Safety_Score', 'Days_Since_Inspection', 'Total_Safety_Complaints',
       'Control_Metric', 'Turbulence_In_gforces', 'Cabin_Temperature',
       'Max_Elevation', 'Adverse_Weather_Metric'],
      dtype='object')


  if getattr(data, 'base', None) is not None and \


### XGBoost Performance

In [720]:
train_preds = xgb_model.predict(dtrain)
train_preds = np.asarray([np.argmax(line) for line in train_preds])
train_score = f1_score(y_train, train_preds, average='weighted')
print('train_F1_score_XGB:', train_score ,'/n')

cv_preds = xgb_model.predict(dtest)
cv_preds_xgboost = np.asarray([np.argmax(line) for line in cv_preds])
score_xgb = f1_score(y_test, cv_preds_xgboost, average='weighted')
print('CV_F1_score_XGB:', score_xgb )

train_F1_score_XGB: 1.0 /n
CV_F1_score_XGB: 0.9489774040171063


### RF Performance

In [657]:
train_y_pred_rf = rf_model.predict(X_train)
train_score_rf = f1_score(y_train, train_y_pred_rf, average='weighted')
print('train_F1_score_RF:', train_score_rf ,'/n')

cv_y_pred_rf = rf_model.predict(X_test)
score_rf = f1_score(y_test, cv_y_pred_rf, average='weighted')
print('CV_F1_score_RF:', score_rf ,'/n')

train_F1_score_RF: 1.0 /n
CV_F1_score_RF: 0.8941295600644422 /n


### SVM Performance

In [565]:
train_y_pred_svm = svm_model.predict(X_train)
train_score_svm = f1_score(y_train, train_y_pred_svm, average='weighted')
print('train_F1_score_RF:', train_score_svm ,'/n')

cv_y_pred_svm = svm_model.predict(X_test)
score_svm = f1_score(y_test, cv_y_pred_svm, average='weighted')
print('CV_F1_score_RF:', score_svm ,'/n')

  'precision', 'predicted', average, warn_for)


train_F1_score_RF: 0.1424753254617015 /n
CV_F1_score_RF: 0.14256704980842913 /n


  'precision', 'predicted', average, warn_for)


### Ensamble

In [574]:
ensamble_pred = ((cv_y_pred_rf*0.5)+(cv_preds_xgboost*0.5)).astype(int)

In [575]:
ensamble_score = f1_score(y_test, ensamble_pred, average='weighted')
print(ensamble_score)

0.9641363776844659


# Submission_Data

In [702]:
def test_set(test_data):
    test_data_set = test_data.drop(['Accident_ID'],axis=1, inplace=False)
    
    test_data_set = data_preparation(test_data_set)
#     test_data_set = Standization(test_data_set)
#     test_data_set = Normalization(test_data_set)
    
    dtest_1 = xgb.DMatrix(test_data_set)
    
    prob_xgb = xgb_model.predict(dtest_1)
    pred_xgb = np.asarray([np.argmax(line) for line in prob_xgb])     #prediction of XGBoost
    
    pred_rf = rf_model.predict(test_data_set)                             #pediction of RF
    
    pred_svm = svm_model.predict(test_data_set)                             #pediction of SVM
    
    ensamble_pred_test = ((pred_rf*0.4)+(pred_xgb*0.6))             #prediction of ensamble
    
    return pred_xgb, pred_rf, pred_svm, ensamble_pred_test

In [703]:
test_data = pd.read_csv("../data/data_folder/test.csv")

In [704]:
final_pred_xgb, final_pred_rf, pred_svm, final_pred_ensamble = test_set(test_data)

In [705]:
#Choose prediction of your choice for sbmission 

submission = pd.DataFrame([test_data['Accident_ID'], np.vectorize(inverse_class_map.get)(final_pred_xgb)], index=['Accident_ID', 'Severity']).T
submission.to_csv( "../data/data_folder/submission.csv",index=False)