In [1]:
import pandas as pd

preprocessed_accidents_df = pd.read_csv('dataset/pre-processed_dataset_no_state.csv')


In [2]:
preprocessed_accidents_df.drop('Unnamed: 0',axis=1,inplace=True)

In [3]:

preprocessed_accidents_df['Severity'].info

<bound method Series.info of 0         1.0
1         0.0
2         0.0
3         0.0
4         1.0
         ... 
606663    0.0
606664    0.0
606665    0.0
606666    0.0
606667    0.0
Name: Severity, Length: 606668, dtype: float64>

In [4]:
print(preprocessed_accidents_df.columns)

Index(['Severity', 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng',
       'Distance(mi)', 'Side', 'Temperature', 'Wind_Chill', 'Humidity',
       ...
       'Thunder / Windy', 'Thunder and Hail / Windy',
       'Thunder in the Vicinity', 'Thunderstorm', 'Thunderstorms and Rain',
       'Tornado', 'Volcanic Ash', 'Widespread Dust', 'Wintry Mix',
       'Wintry Mix / Windy'],
      dtype='object', length=125)


In [5]:
from numpy import mean
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from imblearn.over_sampling._smote.base import SMOTE, Counter
from imblearn.under_sampling import RandomUnderSampler

def execute_classifier(classifier):

    y = preprocessed_accidents_df['Severity']
    X = preprocessed_accidents_df.drop('Severity',axis=1)

    oversampler = SMOTE()
    X,y = oversampler.fit_resample(X,y)

    sampler = RandomUnderSampler(random_state=42)
    X, y = sampler.fit_resample(X, y)

    pipeline_estimators = [('scaling',StandardScaler()),('clf',classifier)]
    pipe = Pipeline(pipeline_estimators)
    skf = StratifiedKFold(10, shuffle = True, random_state = 21)

    results_validation = cross_validate(pipe, 
                        X,
                        y,
                        scoring = {'fscore': make_scorer(f1_score),
                                   'accuracy': make_scorer(accuracy_score)},
                        error_score= 'raise',
                        return_estimator = True,
                        cv = skf,
                        n_jobs = -1) 
    
    metrics = results_validation['test_fscore']

    print(results_validation)
    print("Mean fscore: ",mean(metrics))

In [6]:
from sklearn.tree import DecisionTreeClassifier

#decisionTree = DecisionTreeClassifier()
#execute_classifier(decisionTree)

In [1]:
from sklearn.ensemble import RandomForestClassifier

#randomForest = RandomForestClassifier()
#execute_classifier(randomForest)

In [None]:
from sklearn.linear_model import LogisticRegression

#logisticRegression = LogisticRegression()
#execute_classifier(logisticRegression)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
#knn = KNeighborsClassifier(10) 
#execute_classifier(knn)

In [9]:
from sklearn.ensemble import AdaBoostClassifier

adaboost = AdaBoostClassifier()
execute_classifier(adaboost)