In [1]:
import matplotlib.pyplot as plt
import matplotlib.patches as pt
import seaborn as sns
import pandas as pd
import numpy as np
import math
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Extract_Clean

In [2]:
#To ordinal categorical feature
def to_ordinal(df,column,category):
        df[column] = pd.Categorical(df[column], categories = category )
        return df

In [3]:
#Replacing synonyms of null entries with NaN 
def replace_to_null(df,lookuptable):
    df =df.replace('Data missing or out of range',np.nan)
    for column in df :   
        if ((column == 'local_authority_highway') | (column == 'lsoa_of_accident_location') | (column =='second_road_class')) :
            df[column]=df[column].replace('-1',np.nan)
            newrow =pd.DataFrame({'feature': [column] , 'oldvalue':[-1] , 'newvalue' : [np.nan] })
            lookuptable=pd.concat([lookuptable,newrow])
        elif ((column == 'road_type') | (column == 'second_road_number') |(column == 'weather_conditions')) :
            df[column]=df[column].replace(' ',np.nan)
            newrow =pd.DataFrame({'feature': [column] , 'oldvalue':[' '] , 'newvalue' : [np.nan]})
            lookuptable=pd.concat([lookuptable,newrow])
            df[column]=df[column].replace('',np.nan)
            newrow =pd.DataFrame({'feature': [column] , 'oldvalue':[''] , 'newvalue' : [np.nan]})
            lookuptable=pd.concat([lookuptable,newrow])    
    return df,lookuptable        

In [4]:
#Replacing synonyms entries to shorter interpret. 
def replace_x_to_y(df,column,x,y,lookuptable):
        df[column]=df[column].replace(x,y)
        newrow =pd.DataFrame({'feature': [column] , 'oldvalue':[x] , 'newvalue' : [y] })
        lookuptable=pd.concat([lookuptable,newrow])
        return df,lookuptable

In [5]:
#find corresping frequent weather condition on same date and district to impute with
def correspondingweather(df,district,date):
    if df.loc[(df['local_authority_ons_district']==district)&(df['date']==date),'weather_conditions'].any():
        x= df.loc[(df['local_authority_ons_district']==district)&(df['date']==date),'weather_conditions'].value_counts().idxmax()
    else :
        x=np.nan
    return x

In [6]:
#Dropping a column
def drop_column(df,column):
    df = df.drop(column, axis=1)
    return df
#perform CCA
def cca(df,column):
    df=df.dropna(axis='index', subset=[column])
    return df
#Handling duplicated rows
def duplicates(df,lookuptable):
    #Same accidents with same conditions ( 6 duplicate rows ) : remove immidiately
    df=df.drop_duplicates(subset=['longitude','latitude','date','time', 'accident_severity' , 'weather_conditions' ,'number_of_casualties', 'number_of_vehicles' ,'pedestrian_crossing_physical_facilities' , 'road_surface_conditions' ,'junction_detail'])
    #For number_of_casualties we will consider the the latest as may someone die later or there was who came back to life.
    df=df.drop_duplicates(subset=['longitude','latitude','date','time', 'accident_severity' , 'weather_conditions' , 'number_of_vehicles' ,'pedestrian_crossing_physical_facilities' , 'road_surface_conditions' ,'junction_detail'],keep='last')
    #For accident_severity  we will consider it when slight as it is the most occurance and this agrees also with the corresponding number_of_vehicles which will be considered as first appearance.
    df=df.drop_duplicates(subset=['longitude','latitude','date','time'  ],keep='first')
    # for did_police_officer_attend_scene_of_accident we have 2 No category , one with the notice that it is self reported we can compine both in one category : just No
    df,lookuptable=replace_x_to_y(df,'did_police_officer_attend_scene_of_accident','No - accident was reported using a self completion  form (self rep only)','No',lookuptable)
    return df,lookuptable


## Handling missing Data

In [7]:
def missing(df,lookupTable):
    #perform cca
    df=cca(df,'longitude')
    df=cca(df,'light_conditions')
    df=cca(df,'junction_detail')
    df=cca(df,'speed_limit')
    df=cca(df,'road_surface_conditions')
    
    #perform frequancy category imputation
    maxlocal_authority_highway=df[df['local_authority_highway'].isnull()]['police_force'].value_counts().idxmax()
    df,lookupTable=replace_x_to_y(df,'local_authority_highway',np.nan,maxlocal_authority_highway,lookupTable)
    maxpedestrian_crossing_human_control ,maxpedestrian_crossing_physical_facilities = df[df['did_police_officer_attend_scene_of_accident']=='No'][['pedestrian_crossing_human_control','pedestrian_crossing_physical_facilities']].value_counts().idxmax()
    #replace null pedestrian_crossing_human_control and pedestrian_crossing_physical_facilities of No did_police_officer_attend_scene_of_accident category with most frequent category in the case of No attending
    df.loc[((df['did_police_officer_attend_scene_of_accident']=='No')| (df['police_force']=='Lincolnshire')) & (df['pedestrian_crossing_human_control'].isnull()) ,'pedestrian_crossing_human_control']=df.loc[((df['did_police_officer_attend_scene_of_accident']=='No')| (df['police_force']=='Lincolnshire')) & (df['pedestrian_crossing_human_control'].isnull()) ,'pedestrian_crossing_human_control'].fillna(maxpedestrian_crossing_human_control)
    df.loc[((df['did_police_officer_attend_scene_of_accident']=='No')| (df['police_force']=='Lincolnshire')) & (df['pedestrian_crossing_physical_facilities'].isnull()) ,'pedestrian_crossing_physical_facilities']=df.loc[((df['did_police_officer_attend_scene_of_accident']=='No')| (df['police_force']=='Lincolnshire')) & (df['pedestrian_crossing_physical_facilities'].isnull()) ,'pedestrian_crossing_physical_facilities'].fillna(maxpedestrian_crossing_physical_facilities)
    maxcarriageway_hazards=df.loc[df['did_police_officer_attend_scene_of_accident']=="No",'carriageway_hazards'].value_counts().idxmax()
    df.loc[(df['did_police_officer_attend_scene_of_accident']=='No') & (df['carriageway_hazards'].isnull()) ,'carriageway_hazards']=df.loc[(df['did_police_officer_attend_scene_of_accident']=='No') & (df['carriageway_hazards'].isnull()) ,'carriageway_hazards'].fillna(maxcarriageway_hazards)
    df.loc[((df['first_road_class']=='A')|(df['first_road_class']=='B')|(df['first_road_class']=='Unclassified') | (df['first_road_class']=='C')) & (df['road_type'].isnull()), ['road_type']]=df.loc[((df['first_road_class']=='A')|(df['first_road_class']=='B')|(df['first_road_class']=='Unclassified') | (df['first_road_class']=='C')) & (df['road_type'].isnull()), ['road_type']].replace(np.nan , 'Single carriageway')
    df.loc[((df['first_road_class']=='A(M)')|(df['first_road_class']=='Motorway')) & (df['road_type'].isnull()), ['road_type']]=df.loc[((df['first_road_class']=='A(M)')|(df['first_road_class']=='Motorway')) & (df['road_type'].isnull()), ['road_type']].replace(np.nan , 'Dual carriageway')
    df.loc[((df['speed_limit']==20)|(df['speed_limit']==30)|(df['speed_limit']==40)) & (df['trunk_road_flag'].isnull()), ['trunk_road_flag']]=df.loc[((df['speed_limit']==20)|(df['speed_limit']==30)|(df['speed_limit']==40)) & (df['trunk_road_flag'].isnull()), ['trunk_road_flag']].replace(np.nan , 'Non-trunk')
    df.loc[((df['speed_limit']==50)|(df['speed_limit']==60)|(df['speed_limit']==70)) & (df['trunk_road_flag'].isnull()), ['trunk_road_flag']]=df.loc[((df['speed_limit']==50)|(df['speed_limit']==60)|(df['speed_limit']==70)) & (df['trunk_road_flag'].isnull()), ['trunk_road_flag']].replace(np.nan , 'Trunk (Roads managed by Highways England)')
    for index , row in df.iterrows() : 
        if (df.loc[index,'weather_conditions'])!= (df.loc[index,'weather_conditions']) :
            df.loc[index,'weather_conditions'] =  correspondingweather(df,df.loc[index,'local_authority_ons_district'],df.loc[index,'date'])
    
    # inpute with missing category Arbitrary value 
    df.loc[(df['junction_detail']=='Not at junction or within 20 metres' ) & (df['junction_control'].isnull()), ['junction_control']]=df.loc[(df['junction_detail']=='Not at junction or within 20 metres' ) & (df['junction_control'].isnull()), ['junction_control']].replace(np.nan , 'Not at junction or within 20 metres')
    df.loc[(df['junction_detail']=='Not at junction or within 20 metres' ) & (df['second_road_number'].isnull()), ['second_road_number']]=df.loc[(df['junction_detail']=='Not at junction or within 20 metres' ) & (df['second_road_number'].isnull()), ['second_road_number']].replace(np.nan  , -1)
    df.loc[df.index[(df['junction_detail']=='Not at junction or within 20 metres' ) & (df['second_road_class'].isnull())].tolist(), 'second_road_class'] = 'No road'
    df.loc[df.index[df['lsoa_of_accident_location'].isnull()].tolist(), 'lsoa_of_accident_location'] = 'missing'
    newrow =pd.DataFrame({'feature': ['second_road_class','lsoa_of_accident_location'] , 'oldvalue':[np.nan,np.nan] , 'newvalue' : ['No road','missing']})
    lookupTable=pd.concat([lookupTable,newrow])
    #apply cca 
    df=cca(df,'junction_control')
    df=cca(df,'second_road_class')
    df=cca(df,'second_road_number')
    df=cca(df,'weather_conditions')
    
    #drop redundent column
    df=drop_column(df,'local_authority_district')
    df=drop_column(df,'local_authority_ons_district')
    
    #change types
    df[['first_road_number','second_road_number']] = df[['first_road_number','second_road_number']].astype(str).astype(float).astype(int)

    return df , lookupTable

## Handeling outliers 

In [8]:
# For outliers
def frequency(column):
    label_freq = (column.value_counts() / len(column))*100
    return(label_freq)
def rare(column, threshold):
    label_freq = frequency(column)
    i=0
    x=[]
    while( i < len(label_freq.values) ):
        if label_freq.values[i] <= threshold :
            x.append(label_freq.index[i])
        i= i+1    
    return(x)
def removeoutliers(df,column,rare):
    for index , row in df.iterrows() : 
        if (df.loc[index,column]) in rare :
            df=df.drop(index)
    return df   
def replacerarecategory(df,column,rare):
    for index , row in df.iterrows() : 
        if (df.loc[index,column]) in rare :
            # change 
            df[column]=df[column].replace(df.loc[index,column],'rare')
    return df        
def filllookuptable(cleaningdflookupTable, column ,old , new):
    newrow =pd.DataFrame({'feature': [column] , 'oldvalue':[old]  , 'newvalue' : [new] })
    cleaningdflookupTable=pd.concat([cleaningdflookupTable,newrow])
    return cleaningdflookupTable
def handle_categoricaloutliers(df,lookupTable):
    for column in df :
        percentage=5
        if ((column=='police_force')|(column=='accident_severity') |(column=='day_of_week')|(column=='local_authority_highway')|(column=='first_road_class')|(column=='second_road_class')|(column=='road_type')|(column=='junction_detail')|(column=='junction_control')|(column=='pedestrian_crossing_human_control')|(column=='pedestrian_crossing_physical_facilities')|(column=='light_conditions')|(column=='weather_conditions')|(column=='road_surface_conditions')|(column=='special_conditions_at_site')|(column=='carriageway_hazards') ):
            if (column=='police_force')|(column=='local_authority_highway') :
                percentage =0.5
            rareofcolumn =rare(df[column],percentage)
            for i in rareofcolumn :
                lookupTable = filllookuptable(lookupTable,column,i,'rare')
            df=replacerarecategory(df,column,rareofcolumn)   
    return  df,lookupTable
def outliers(df,lookupTable):
    cap = df['number_of_vehicles'].quantile(0.99)
    df["number_of_vehicles"] = np.where(df["number_of_vehicles"] >cap,cap,df['number_of_vehicles'])
    lookupTable=filllookuptable(lookupTable, 'number_of_vehicles' ,'>5' , '5')
    minfirst=((df['first_road_number'].value_counts()/len(df['first_road_number']))*100).min()
    minsecond=((df['second_road_number'].value_counts()/len(df['second_road_number']))*100).min()
    rarefirst_road_number=rare(df.first_road_number, minfirst)
    df=removeoutliers(df,'first_road_number',rarefirst_road_number)
    raresecond_road_number=rare(df.second_road_number, minsecond)
    df=removeoutliers(df,'second_road_number',raresecond_road_number)
    # Categorical data
    df,lookupTable=handle_categoricaloutliers(df,lookupTable)
    return df,lookupTable

In [9]:
def Extract_Clean(csv):
    #extract csv
    df = pd.read_csv(csv , low_memory=False , index_col=0) 
    # initailize lookup table
    cleaningdflookupTable = pd.DataFrame(columns =['feature','oldvalue', 'newvalue'])
    # change to ordinal
    categories = ["Slight", "Serious" , "Fatal"]
    df=to_ordinal(df,'accident_severity',categories)
    # sorting df 
    df = df.sort_values(by= ['date' ,'time','accident_severity'])
    df['date']= pd.to_datetime(df['date'], format='%d/%m/%Y')
    #drop not useful columns
    df=drop_column(df,'accident_reference')
    df=drop_column(df,'accident_year')
    # change some entries values
    df,cleaningdflookupTable=replace_to_null(df,cleaningdflookupTable)
    df,cleaningdflookupTable=replace_x_to_y(df,'first_road_number','first_road_class is C or Unclassified. These roads do not have official numbers so recorded as zero ',0,cleaningdflookupTable)
    df,cleaningdflookupTable=replace_x_to_y(df,'second_road_number','first_road_class is C or Unclassified. These roads do not have official numbers so recorded as zero ',0,cleaningdflookupTable)
    # duplicates 
    df,cleaningdflookupTable = duplicates(df,cleaningdflookupTable)
    # missing values
    df,cleaningdflookupTable = missing(df,cleaningdflookupTable)
    # outliers
    df,cleaningdflookupTable = outliers(df,cleaningdflookupTable)
    df.to_csv('2016_Accidents_UK_cleaned.csv',index=False)
    cleaningdflookupTable.to_csv('cleaningdflookupTable.csv',index=False)
    return df

In [10]:
df_cleaning = Extract_Clean('2016_Accidents_UK.csv')

In [11]:
df_cleaning.accident_severity.value_counts()

Slight     110761
Serious     21063
rare         1655
Name: accident_severity, dtype: int64

# Encoding_Load 

In [12]:
def fillDTlookuptable(DTlookuptable,column,old):
    i=0
    while(i<len(old)):
        newrow =pd.DataFrame({'feature': [column] , 'oldvalue':[old[i]] , 'newvalue' : [i] })
        DTlookuptable=pd.concat([DTlookuptable,newrow])        
        i=i+1
    return DTlookuptable

def fillDTlookuptable3(DTlookuptable,replace_map_comp):
    for i in replace_map_comp.keys():
        for j in replace_map_comp[i].keys() :
            newrow =pd.DataFrame({'feature': [i] , 'oldvalue':[j] , 'newvalue' : [replace_map_comp[i][j]] })
            DTlookuptable=pd.concat([DTlookuptable,newrow])                    
    return DTlookuptable

def fillDTlookuptable4(DTlookuptable,column,dictionary):
    for i in dictionary.keys() :
        newrow =pd.DataFrame({'feature': [column] , 'oldvalue':[i] , 'newvalue' : [dictionary[i]] })
        DTlookuptable=pd.concat([DTlookuptable,newrow])               
    return DTlookuptable

def group4(df,column,DTlookuptable):
    if (len(frequency(df[column]).values) == len(set(frequency(df[column]).values))) : #perform Count/Frequency Encoding
        dictionary=dict(frequency(df[column]))
        freqencoding=[]
        for i in df[column] :
            if i in dictionary.keys():
                freqencoding.append(dictionary[i])
        df[column] = freqencoding   
        DTlookuptable=fillDTlookuptable4(DTlookuptable,column,dictionary)
    else : #perform binary Encoding
        encoder = ce.BinaryEncoder(cols=[column])
        df = encoder.fit_transform(df) 
    return df , DTlookuptable   

In [13]:
def encoding(df,lookuptable):
    # splitting for lsoa_of_accident_location feature :
    # Replacing 'missing' to same formate :'N01000000'
    df['lsoa_of_accident_location']=df['lsoa_of_accident_location'].replace('missing','M01000000')
    lookuptable.loc[lookuptable.feature=='lsoa_of_accident_location','newvalue']=lookuptable.loc[lookuptable.feature=='lsoa_of_accident_location','newvalue'].replace('missing','M01000000')
    # splitting by 010
    #DTdf.lsoa_of_accident_location= DTdf.lsoa_of_accident_location.str.split("010",expand=True)
    df['lsoa_of_accident_location_1'] = df['lsoa_of_accident_location'].astype(str).str[0]
    df['lsoa_of_accident_location_2'] = df['lsoa_of_accident_location'].astype(str).str[-5:]
    df=df.drop(['lsoa_of_accident_location'], axis=1)
    # convert lsoa_of_accident_location_2 type to int 
    df['lsoa_of_accident_location_2'] = df['lsoa_of_accident_location_2'].astype(str).astype(int)
    Class = ["Unclassified","Motorway", "C" , "B" , "A(M)" ,"A"]
    light =["Daylight", "Darkness - lights lit" , "Darkness - no lighting" ,"Darkness - lighting unknown"]
    severity = ["Slight", "Serious" , "rare"]
    df.first_road_class = pd.Categorical(df.first_road_class, categories = Class )
    df.second_road_class = pd.Categorical(df.second_road_class, categories = Class )
    df.light_conditions = pd.Categorical(df.light_conditions, categories = light )
    df.accident_severity = pd.Categorical(df.accident_severity, categories = severity )
    lookuptable=fillDTlookuptable(lookuptable,'first_road_class',Class)
    lookuptable=fillDTlookuptable(lookuptable,'second_road_class',Class)
    lookuptable=fillDTlookuptable(lookuptable,'light_conditions',light)
    lookuptable=fillDTlookuptable(lookuptable,'accident_severity',severity)
    for columnname in df:
        # Group 1 : label encoding
        if ((columnname=='accident_severity') | (columnname=='light_conditions')  | (columnname=='first_road_class')  | (columnname=='second_road_class')) :
            df[columnname] = df[columnname].cat.codes
            
        else :
            # Group 2 : Binary encoding
            if  ((columnname=='police_force') | (columnname=='local_authority_highway') | (columnname=='lsoa_of_accident_location_1')) :
                encoder = ce.BinaryEncoder(cols=[columnname])
                df = encoder.fit_transform(df) 
            #Group 3 : # Replace value encoding 
            if ((columnname=='urban_or_rural_area') | (columnname=='did_police_officer_attend_scene_of_accident') | (columnname=='trunk_road_flag') ) :
                labels = df[columnname].astype('category').cat.categories.tolist()
                replace_map_comp = {columnname : {k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}}
                lookuptable= fillDTlookuptable3(lookuptable,replace_map_comp)
                df.replace(replace_map_comp, inplace=True)
            else :
            # Group 4 : # Count/Frequency Encoding or Binary encoding
                if ((columnname=='day_of_week') | (columnname=='road_type') | (columnname=='junction_detail') | (columnname=='junction_control') | (columnname=='pedestrian_crossing_human_control') | (columnname=='pedestrian_crossing_physical_facilities') | (columnname=='weather_conditions') | (columnname=='road_surface_conditions') | (columnname=='special_conditions_at_site') | (columnname=='carriageway_hazards') ) :
                    df,lookuptable=group4(df,columnname,lookuptable)
    return df,lookuptable       

In [14]:
def time_to_float(time):
    hours, minutes = time.split(':')
    total_seconds = int(hours) * 3600 + int(minutes) * 60
    return float(total_seconds)

In [15]:
def Encoding_Load(csv,lookuptable):
    # extract csv
    df = pd.read_csv(csv, index_col=0) 
    # initailize lookup table
    EncodinglookupTable = pd.read_csv(lookuptable) 
    # Discretization
    df['date']= pd.to_datetime(df['date'])
    df['week_number']=df['date'].dt.isocalendar().week
    # Drop date column 
    df = df.drop(['date'], axis=1)
    #converting 'time' column to be float 
    df['time'] = df['time'].apply(time_to_float)
    # Encoding
    df,EncodinglookupTable=encoding(df,EncodinglookupTable)
    df.to_csv('2016_Accidents_UK_cleaned_encoded.csv',index=False)
    EncodinglookupTable.to_csv('EncodinglookupTable.csv',index=False)    
    return df

In [16]:
df = Encoding_Load('2016_Accidents_UK_cleaned.csv','cleaningdflookupTable.csv')

In [71]:
ML_X = df.drop(['accident_severity'], axis=1)
ML_Y = df['accident_severity']

# Normalization 

In [72]:
# Create a StandardScaler object
scaler = StandardScaler()
# Fit the scaler to the data
scaler.fit(ML_X)
# Normalize the entire dataset
ML_X_norm = scaler.transform(ML_X)
ML_X = pd.DataFrame(ML_X_norm, columns=ML_X.columns)

# Features selection

In [19]:
def featureselection(df,target,threshold):
    corr_matrix = df.corr()
    target_var = target
    corr_with_target = corr_matrix[target_var].abs().sort_values(ascending=False)
    corr_threshold = threshold
    selected_columns = list(corr_with_target[( corr_with_target > corr_threshold) & (corr_with_target < 1)].index)
    ML_X = df[selected_columns].reset_index()
    return ML_X 

# Splitting ( 60% Training set - 20% Validation set - 20% Test set)

## Random sampling

In [20]:
def random_sampling(ML_X,ML_Y):
    # Split data into training and testing sets
    X_train, X_test_random, y_train, y_test_random = train_test_split(ML_X,ML_Y, test_size=0.2, random_state=42)

    # Split training set into training and validation sets
    X_train_random, X_val_random, y_train_random, y_val_random = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
    return X_train_random , y_train_random , X_val_random , y_val_random , X_test_random , y_test_random

## Stratified K-Fold sampling

In [78]:
def stf_sampling(ML_X,ML_Y):
    # Split the dataset into training and testing sets
    X_train, X_test_stf, y_train, y_test_stf = train_test_split(ML_X, ML_Y, test_size=0.2, random_state=42)

    # Define the number of folds
    splits = 5

    # StratifiedKFold cross-validation method
    stf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
    return X_train , y_train ,  X_test_stf , y_test_stf , stf
def Stf(stf,model , X_train, y_train):
    # Iterate over the folds and split the training set into training and validation sets
    accuracy=[]
    #precision=[]
    #recall=[]
    for train_index, val_index in stf.split(X_train, y_train):
        X_train_stf, X_val_stf = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_stf, y_val_stf = y_train.iloc[train_index], y_train.iloc[val_index]
        # Train and evaluate the model 
        model.fit(X_train_stf, y_train_stf)
        y_pred = model.predict(X_val_stf)
        accuracy.append(accuracy_score(y_val_stf, y_pred))
        #precision.append(precision_score(y_val_stf, y_pred))
        #recall.append(recall_score(y_val_stf, y_pred))
    avg_accuracy = np.mean(accuracy)
#     avg_precision = np.mean(precision)
#     avg_recall = np.mean(recall)
    return avg_accuracy , model  # ,avg_precision,avg_recall

In [22]:
# # Finally, evaluate your model on the testing set using the best hyperparameters found during cross-validation
# model.fit(X_train_full, y_train_full)
# test_score = model.score(X_test, y_test)
# print("Test score: {}".format(test_score))

# KNN

In [44]:
#create a KNN model with random/Stratified K-Fold sampling with feature selection
x = featureselection(df,'accident_severity',0.018)
n_neighbors_values = [9 , 20 , 50 ,80 , 150]
# first is random sampling
KNN_X_train_random , KNN_y_train_random , KNN_X_val_random , KNN_y_val_random , KNN_X_test_random , KNN_y_test_random = random_sampling(x,ML_Y)
# loop over the hyperparameters and fit a KNN model for each value
KNN_random_best_accuracy = 0
KNN_random_best_n_neighbors = None
best_knn_random = None
for n in n_neighbors_values:
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(KNN_X_train_random, KNN_y_train_random)
    y_pred = knn.predict(KNN_X_val_random)
    accuracy = accuracy_score(KNN_y_val_random, y_pred)
    if accuracy > KNN_random_best_accuracy:
        KNN_random_best_accuracy = accuracy
        KNN_random_best_n_neighbors = n
        best_knn_random = knn

# evaluate the performance of the KNN model on the test set
y_pred = best_knn_random.predict(KNN_X_test_random)
accuracy_score_KNN_random = accuracy_score(KNN_y_test_random, y_pred)
precision_score_KNN_random = precision_score(KNN_y_test_random, y_pred ,average='weighted' , zero_division=0)
recall_score_KNN_random = recall_score(KNN_y_test_random, y_pred , average='weighted')
cm = confusion_matrix(KNN_y_test_random, y_pred)

In [45]:
accuracy_score_KNN_random , precision_score_KNN_random , recall_score_KNN_random , KNN_random_best_n_neighbors 

(0.830011986814504, 0.793857468955879, 0.830011986814504, 50)

In [46]:
cm

array([[22156,     0,     0],
       [ 4202,     2,     0],
       [  335,     1,     0]], dtype=int64)

In [56]:
# second is stf sampling
KNN_stf_best_accuracy = 0
KNN_stf_best_n_neighbors = None
best_knn_stf = None
KNN_X_train_stf , KNN_y_train_stf ,  KNN_X_test_stf , KNN_y_test_stf , stf= stf_sampling(x,ML_Y)
n_neighbors_values = [9 , 20 , 30]
for n in n_neighbors_values:
    knn = KNeighborsClassifier(n_neighbors=n)    
    avg_accuracy = Stf(stf,knn, KNN_X_train_stf, KNN_y_train_stf)
    if avg_accuracy > KNN_stf_best_accuracy:
        KNN_stf_best_accuracy = avg_accuracy
        KNN_stf_best_n_neighbors = n
        best_knn_stf = knn

# evaluate the performance of the KNN model on the test set
y_pred = best_knn_stf.predict(KNN_X_test_stf)
accuracy_score_KNN_stf = accuracy_score(KNN_y_test_stf, y_pred)
precision_score_KNN_stf = precision_score(KNN_y_test_stf, y_pred ,average='weighted' , zero_division=0)
recall_score_KNN_stf = recall_score(KNN_y_test_stf, y_pred , average='weighted')
confusion_matrix_KNN_stf = confusion_matrix(KNN_y_test_random, y_pred)

In [57]:
accuracy_score_KNN_stf , precision_score_KNN_stf , recall_score_KNN_stf , KNN_stf_best_n_neighbors

(0.8298996104285287, 0.7575153794909943, 0.8298996104285287, 30)

In [58]:
confusion_matrix_KNN_stf

array([[22145,    11,     0],
       [ 4194,    10,     0],
       [  334,     2,     0]], dtype=int64)

### KNN is quite fast classification technique but it faied to predict class 'rare' for any test point but we can see in stf sampling the KNN performed better in pridection even though it is still iniffeccient 

# Logistic Regression

In [84]:
#create a logistic regression model with L2 regularization applying different c strenght values and random/Stratified K-Fold sampling
c=[0.001 , 0.01 ,0.1]
# Fisrt Try Random sampling 
X_train_random , y_train_random , X_val_random , y_val_random , X_test_random , y_test_random = random_sampling(ML_X,ML_Y)
models_random = []
for C in c:
    model = LogisticRegression(solver='sag', C=C, max_iter=10000)
    # fit the model on the training set
    model.fit(X_train_random, y_train_random)
    models_random.append(model)
    
# evaluate the performance of each model on the validation set
validation_scores_random = []
for model in models_random:
    score = model.score(X_val_random, y_val_random)
    validation_scores_random.append(score)

# select the best hyperparameter based on the validation score
best_index = validation_scores_random.index(max(validation_scores_random))
best_model = models_random[best_index]
best_c = c[best_index]
y_pred = best_model.predict(X_test_random)
# evaluate the best model on the test set
accuracy_score_random = accuracy_score(y_test_random, y_pred)
precision_score_random = precision_score(y_test_random, y_pred ,average='weighted' , zero_division=0)
recall_score_random = recall_score(y_test_random, y_pred , average='weighted')
confusion_matrix_LR_random = confusion_matrix(y_test_random, y_pred)

In [85]:
accuracy_score_random , precision_score_random , recall_score_random , confusion_matrix_LR_random , best_c

(0.8299745280191789,
 0.773919656904943,
 0.8299745280191789,
 array([[22154,     2,     0],
        [ 4201,     2,     1],
        [  335,     0,     1]], dtype=int64),
 0.01)

In [None]:
# second Try stf sampling 
LR_X_train_stf , LR_y_train_stf ,  LR_X_test_stf , LR_y_test_stf , stf= stf_sampling(x,ML_Y)

models_stf = []
validation_accuracy = []

for C in c:
    model = LogisticRegression(solver='sag', C=C, max_iter=10000)
    # fit the model on the training set
    avg_accuracy = Stf(stf , model, LR_X_train_stf , LR_y_train_stf)
    models_stf.append(model)
    validation_accuracy.append(avg_accuracy)

# select the best hyperparameter based on the validation score
LR_stf_best_index = validation_accuracy.index(max(validation_accuracy))
best_LR_stf = models_stf[best_index]
best_C = c[best_index]

# evaluate the best model on the test set
y_pred = best_LR_stf.predict(LR_X_test_stf)
# evaluate the best model on the test set
accuracy_score_stf = accuracy_score(y_test_stf, y_pred)
precision_score_stf = precision_score(y_test_stf, y_pred ,average='weighted' , zero_division=0)
recall_score_stf = recall_score(y_test_stf, y_pred , average='weighted' , zero_division=0)
confusion_matrix_LR_stf = confusion_matrix(y_test_random, y_pred)

In [70]:
accuracy_score_stf , precision_score_stf , recall_score_stf ,confusion_matrix_LR_stf ,  best_C

(0.8299370692238538, 0.6887955388718798, 0.8299370692238538)

#  Naive Bayes 

In [67]:
#create a Naive Bayes model with L2 regularization applying different alpha strenght values and random/Stratified K-Fold sampling
alpha_values=[0.00001 , 0.0001 , 0.001 , 0.01]
# Fisrt Try Random sampling 
NB_X_train_random , NB_y_train_random , NB_X_val_random , NB_y_val_random , NB_X_test_random , NB_y_test_random = random_sampling(ML_X,ML_Y)
scaler = MinMaxScaler()
NB_X_train_random = scaler.fit_transform(NB_X_train_random)
NB_X_val_random = scaler.transform(NB_X_val_random)
NB_X_test_random = scaler.transform(NB_X_test_random)
NB_models_random = []
NB_validation_scores_random = []
for a in alpha_values :
    nb = MultinomialNB(alpha=a)
    nb.fit(NB_X_train_random, NB_y_train_random)
    NB_models_random.append(nb)
    score = nb.score(NB_X_val_random, NB_y_val_random)
    NB_validation_scores_random.append(score)
    
# select the best alpha based on the validation score
NB_random_best_index = NB_validation_scores_random.index(max(NB_validation_scores_random))
NB_random_best_model = NB_models_random[NB_random_best_index]
NB_random_best_alpha = alpha_values[NB_random_best_index]
y_pred = NB_random_best_model.predict(NB_X_test_random)

# evaluate the best model on the test set
accuracy_score_NB_random = accuracy_score(NB_y_test_random, y_pred)
precision_score_NB_random = precision_score(NB_y_test_random, y_pred ,average='weighted' , zero_division=0)
recall_score_NB_random = recall_score(NB_y_test_random, y_pred , average='weighted')    
confusion_matrix_NB_random = confusion_matrix(NB_y_test_random, y_pred)

In [68]:
accuracy_score_NB_random , precision_score_NB_random , recall_score_NB_random , NB_random_best_alpha , confusion_matrix_NB_random

(0.8299370692238538,
 0.6887955388718798,
 0.8299370692238538,
 1e-05,
 array([[22156,     0,     0],
        [ 4204,     0,     0],
        [  336,     0,     0]], dtype=int64))

In [79]:
# second Try stf sampling
NB_stf_best_accuracy = 0
best_NB_stf = None
NB_stf_best_alpha = None
NB_X_train_stf , NB_y_train_stf ,  NB_X_test_stf , NB_y_test_stf , stf= stf_sampling(ML_X,ML_Y)
# Scale the features using StandardScaler
scaler = StandardScaler()
NB_X_train_stf = scaler.fit_transform(NB_X_train_stf)
NB_X_train_stf = pd.DataFrame(NB_X_train_stf, columns=ML_X.columns)
NB_X_test_stf = scaler.transform(NB_X_test_stf) 
NB_X_test_stf = pd.DataFrame(NB_X_test_stf, columns=ML_X.columns)

for a in alpha_values:
    nb = GaussianNB(var_smoothing=a)
    avg_accuracy, nb = Stf(stf,nb, NB_X_train_stf, NB_y_train_stf)  
    if avg_accuracy > NB_stf_best_accuracy:
        NB_stf_best_accuracy = avg_accuracy
        best_NB_stf = nb
        NB_stf_best_alpha = a


# evaluate the performance of the KNN model on the test set
y_pred = best_NB_stf.predict(NB_X_test_stf)
accuracy_score_NB_stf = accuracy_score(NB_y_test_stf, y_pred)
precision_score_NB_stf = precision_score(NB_y_test_stf, y_pred ,average='weighted' , zero_division=0)
recall_score_NB_stf = recall_score(NB_y_test_stf, y_pred , average='weighted')
confusion_matrix_NB_stf = confusion_matrix(NB_y_test_stf, y_pred)

In [80]:
 accuracy_score_NB_stf , precision_score_NB_stf ,  recall_score_NB_stf , NB_stf_best_alpha

(0.6777045250224752, 0.7411663538297, 0.6777045250224752, 0.01)

In [81]:
confusion_matrix_NB_stf

array([[17765,   483,  3908],
       [ 3009,   145,  1050],
       [  148,     6,   182]], dtype=int64)

## stf Naive Bayes  has proved a better performance when regularization parameter is high 0.01

# Neural Networks

In [83]:
# apply stf sampling and Create a neural network classifier with initializain of hypermater 
NN_X_train_stf , NN_y_train_stf ,  NN_X_test_stf , NN_y_test_stf , stf= stf_sampling(ML_X,ML_Y)
NN_stf_best_accuracy = 0
best_NN_stf = None
NN_stf_best_activation = None
NN_stf_best_reg = None
NN_stf_best_learning_rate = None
clf = MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', alpha=0.0001, max_iter=500, random_state=42 , learning_rate_init = 0.001)

# Train the classifier on the training set
avg_accuracy , clf = Stf(stf,clf, NB_X_train_stf, NB_y_train_stf) 
#val_loss = clf.score(X_val, y_val)

# Evaluate the classifier on the testing set
y_pred = clf.predict(NN_X_test_stf)
test_accuracy = accuracy_score(NN_y_test_stf, y_pred)

# Print the results
print("Validation accuracy: ", avg_accuracy)
print("Test accuracy: ", test_accuracy)

Validation accuracy:  0.7875972254791828
Test accuracy:  0.7891069823194486


In [90]:
# when learning rate is decreased to be 0.0001
X_train, X_test_stf, y_train, y_test_stf = train_test_split(ML_X, ML_Y, test_size=0.2, random_state=42)
# Define the number of folds
splits = 5
# StratifiedKFold cross-validation method
stf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
accuracy=[]
clf = MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', alpha=0.0001, max_iter=300, random_state=42 , learning_rate_init = 0.0001)
for train_index, val_index in stf.split(X_train, y_train):
        X_train_stf, X_val_stf = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_stf, y_val_stf = y_train.iloc[train_index], y_train.iloc[val_index]
        # Train and evaluate the model 
        clf.fit(X_train_stf, y_train_stf)
        y_pred = model.predict(X_val_stf)
        accuracy.append(accuracy_score(y_val_stf, y_pred))
avg_accuracy = np.mean(accuracy)



In [92]:
avg_accuracy

(0.8297856409991692, 0.824030717362802)

In [93]:
# evaluate the performance of the KNN model on the test set
y_pred = clf.predict(X_test_stf)
accuracy_score_NN_stf = accuracy_score(y_test_stf, y_pred)
precision_score_NN_stf = precision_score(y_test_stf, y_pred ,average='weighted' , zero_division=0)
recall_score_NN_stf = recall_score(y_test_stf, y_pred , average='weighted')
confusion_matrix_NN_stf = confusion_matrix(y_test_stf, y_pred)

In [94]:
accuracy_score_NN_stf , precision_score_NN_stf , recall_score_NN_stf , confusion_matrix_NN_stf

(0.8264534012586155,
 0.7537664275773187,
 0.8264534012586155,
 array([[21907,   247,     2],
        [ 4050,   152,     2],
        [  295,    37,     4]], dtype=int64))

In [95]:
#  max_iter= 300
# when learning rate is decreased to be 0.0001
X_train, X_test_stf, y_train, y_test_stf = train_test_split(ML_X, ML_Y, test_size=0.2, random_state=42)
# Define the number of folds
splits = 5
# StratifiedKFold cross-validation method
stf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
accuracy=[]
clf = MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', alpha=0.0001, max_iter=300, random_state=42 , learning_rate_init = 0.0001)
for train_index, val_index in stf.split(X_train, y_train):
        X_train_stf, X_val_stf = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_stf, y_val_stf = y_train.iloc[train_index], y_train.iloc[val_index]
        # Train and evaluate the model 
        clf.fit(X_train_stf, y_train_stf)
        y_pred = model.predict(X_val_stf)
        accuracy.append(accuracy_score(y_val_stf, y_pred))
avg_accuracy = np.mean(accuracy)



In [96]:
# evaluate the performance of the KNN model on the test set
y_pred = clf.predict(X_test_stf)
accuracy_score_NN_stf = accuracy_score(y_test_stf, y_pred)
precision_score_NN_stf = precision_score(y_test_stf, y_pred ,average='weighted' , zero_division=0)
recall_score_NN_stf = recall_score(y_test_stf, y_pred , average='weighted')
confusion_matrix_NN_stf = confusion_matrix(y_test_stf, y_pred)
accuracy_score_NN_stf , precision_score_NN_stf , recall_score_NN_stf , confusion_matrix_NN_stf

(0.8161147737488762,
 0.7432218928913371,
 0.8161147737488762,
 array([[21499,   649,     8],
        [ 3916,   283,     5],
        [  279,    52,     5]], dtype=int64))

In [97]:
# we achieved more validation average accuracy  when decreasing the learning rate to be 0.0001 

## Neural Network has achieved best classification performance due to  accuracy_score, precision_score, recall_score, and confusion_matrix cobinations