In [348]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings raised by later cells.
warnings.filterwarnings('ignore')
# Load the raw dataset; 'KSI.csv' is resolved relative to the current working directory.
sigma_df = pd.read_csv('KSI.csv')

In [349]:
sigma_df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18194 entries, 0 to 18193
Data columns (total 57 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   X                  18194 non-null  float64
 1   Y                  18194 non-null  float64
 2   INDEX_             18194 non-null  int64  
 3   ACCNUM             13264 non-null  float64
 4   YEAR               18194 non-null  int64  
 5   DATE               18194 non-null  object 
 6   TIME               18194 non-null  int64  
 7   STREET1            18194 non-null  object 
 8   STREET2            16510 non-null  object 
 9   OFFSET             3402 non-null   object 
 10  ROAD_CLASS         17818 non-null  object 
 11  DISTRICT           18089 non-null  object 
 12  WARDNUM            17332 non-null  float64
 13  LATITUDE           18194 non-null  float64
 14  LONGITUDE          18194 non-null  float64
 15  LOCCOORD           18099 non-null  object 
 16  ACCLOC             127

In [350]:
sigma_df.describe()

Unnamed: 0,X,Y,INDEX_,ACCNUM,YEAR,TIME,WARDNUM,LATITUDE,LONGITUDE,FATAL_NO,ObjectId
count,18194.0,18194.0,18194.0,13264.0,18194.0,18194.0,17332.0,18194.0,18194.0,827.0,18194.0
mean,-8838345.0,5420748.0,38188700.0,424844400.0,2012.934869,1362.615917,2521.028,43.710459,-79.396201,29.073761,9097.5
std,11625.33,8682.16,37264630.0,1065503000.0,4.754258,630.816048,184480.3,0.056369,0.104432,17.803627,5252.299734
min,-8865305.0,5402162.0,3363207.0,25301.0,2006.0,0.0,1.0,43.589678,-79.63839,1.0,1.0
25%,-8846591.0,5413242.0,5391370.0,1021229.0,2009.0,920.0,7.0,43.661727,-79.47028,14.0,4549.25
50%,-8838448.0,5419556.0,7644612.0,1197308.0,2012.0,1450.0,13.0,43.702745,-79.397132,28.0,9097.5
75%,-8829671.0,5427813.0,80782610.0,1365020.0,2017.0,1850.0,22.0,43.756345,-79.318286,42.0,13645.75
max,-8807929.0,5443099.0,81706060.0,4008024000.0,2022.0,2359.0,17162220.0,43.855445,-79.122974,78.0,18194.0


In [351]:
sigma_df.isnull().sum()

X                        0
Y                        0
INDEX_                   0
ACCNUM                4930
YEAR                     0
DATE                     0
TIME                     0
STREET1                  0
STREET2               1684
OFFSET               14792
ROAD_CLASS             376
DISTRICT               105
WARDNUM                862
LATITUDE                 0
LONGITUDE                0
LOCCOORD                95
ACCLOC                5450
TRAFFCTL                34
VISIBILITY              20
LIGHT                    0
RDSFCOND                25
ACCLASS                  5
IMPACTYPE                4
INVTYPE                 16
INVAGE                   0
INJURY                8567
FATAL_NO             17367
INITDIR               5052
VEHTYPE               3228
MANOEUVER             7660
DRIVACT               8951
DRIVCOND              8954
PEDTYPE              15134
PEDACT               15112
PEDCOND              15110
CYCLISTYPE           17420
CYCACT               17428
C

In [352]:
# Plotting Graphs


In [353]:
# Columns kept from the raw frame (passed to drop_unnecassary_columns below).
features = [
    'ROAD_CLASS', 'DISTRICT', 'LOCCOORD', 'TRAFFCTL', 'VISIBILITY', 'LIGHT',
    'RDSFCOND', 'ACCLASS', 'IMPACTYPE', 'INVTYPE', 'INVAGE', 'INJURY',
    'INITDIR', 'MANOEUVER', 'DRIVACT', 'PEDTYPE', 'CYCACT',
    'PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE', 'MOTORCYCLE', 'TRUCK',
    'TRSN_CITY_VEH', 'PASSENGER', 'SPEEDING', 'AG_DRIV', 'REDLIGHT', 'ALCOHOL',
    'NEIGHBOURHOOD_158',
]

# Yes/No flag columns.
binary_features = [
    'PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE', 'MOTORCYCLE', 'TRUCK',
    'TRSN_CITY_VEH', 'PASSENGER', 'SPEEDING', 'AG_DRIV', 'REDLIGHT', 'ALCOHOL',
]

# Columns whose missing values are filled with the most frequent value
# (read as a global by fill_missing_values).
mode_features = [
    'ROAD_CLASS', 'DISTRICT', 'LOCCOORD', 'TRAFFCTL', 'VISIBILITY', 'LIGHT',
    'RDSFCOND', 'IMPACTYPE', 'INVTYPE', 'INJURY', 'INITDIR',
    'INVAGE_Categorized',
]

# Column that is replaced by an integer code per distinct value.
hash_features = ['NEIGHBOURHOOD_158']

# Candidate columns for one-hot encoding; the binary flags are spliced in
# from binary_features so the two lists cannot drift apart.
columns_to_onehot_encode = [
    'ROAD_CLASS', 'DISTRICT', 'LOCCOORD', 'TRAFFCTL', 'VISIBILITY', 'LIGHT',
    'RDSFCOND', 'IMPACTYPE', 'INVTYPE', 'INJURY', 'INITDIR',
    'DRIVACT', 'PEDTYPE', 'CYCACT',
    *binary_features,
    'INVAGE_Categorized',
]

In [354]:
# For dropping unnecessary columns
def drop_unnecassary_columns(df, features):
    """
    Keeps only the columns listed in `features`, removing all others in place.

    Parameters:
    - df: pandas.DataFrame, the dataframe to trim (modified in place).
    - features: list, the column names to keep; names absent from df are ignored.

    Returns:
    - pandas.DataFrame, the same dataframe object that was passed in.
    """
    unwanted = [name for name in df.columns if name not in features]
    df.drop(columns=unwanted, inplace=True)
    return df
# NOTE: the helper drops columns with inplace=True and returns its argument, so after
# this line `df` and `sigma_df` are the same, already-trimmed DataFrame object.
df = drop_unnecassary_columns(sigma_df, features)

# For converting age into categories
def categorize_age(df):
    """
    Collapses the 'INVAGE' age-range labels into coarse age groups.

    Side effects: the dataframe is modified in place, and 'INVAGE' is removed
    from the module-level `features` list.

    Parameters:
    - df: pandas.DataFrame, the original dataframe; must contain an 'INVAGE' column.

    Returns:
    - pandas.DataFrame, the same dataframe with 'INVAGE' replaced by 'INVAGE_Categorized'.
    """
    ranges_by_group = {
        'kid': ('0 to 4', '5 to 9', '10 to 14'),
        'teenager': ('15 to 19',),
        'youth': ('20 to 24', '25 to 29'),
        'adult': ('30 to 34', '35 to 39', '40 to 44', '45 to 49', '50 to 54', '55 to 59', '60 to 64'),
        'old': ('65 to 69', '70 to 74', '75 to 79', '80 to 84', '85 to 89', '90 to 94', 'Over 95'),
    }
    # Invert the table so each age-range label points at its group.
    group_of = {
        label: group
        for group, labels in ranges_by_group.items()
        for label in labels
    }

    def age_category(age_range):
        # 'unknown' and any other unexpected value fall back to 'unknown'.
        return group_of.get(age_range, 'unknown')

    df['INVAGE_Categorized'] = df['INVAGE'].apply(age_category)
    df.drop(columns='INVAGE', inplace=True)
    features.remove('INVAGE')

    return df

# NOTE: categorize_age drops 'INVAGE' from df and removes it from the module-level
# `features` list, so this cell cannot be re-run on the same `df` after the first run.
df = categorize_age(df)


In [355]:
# Handling missing values
from sklearn.impute import SimpleImputer

def fill_with_mode(df, mode_features):
    """
    Replaces missing values in the given columns with each column's most frequent value.

    The dataframe is modified in place, one column at a time.

    Parameters:
    - df: pandas.DataFrame, the dataframe to process.
    - mode_features: list, the column names whose missing values are filled.

    Returns:
    - pandas.DataFrame, the same dataframe with the listed columns imputed.
    """
    # A single imputer is reused; fit_transform fits it again on every column.
    most_frequent = SimpleImputer(strategy='most_frequent')
    for column in mode_features:
        single_column = df[[column]]
        # The imputer returns a 2-D array; take its only column.
        df[column] = most_frequent.fit_transform(single_column)[:, 0]

    return df

def target_class_remove_missing_rows(df, target_column='ACCLASS'):
    """
    Removes rows with missing values in the target class column.

    The rows are dropped in place and the index is NOT reset, so the surviving
    rows keep their original labels (the index may contain gaps afterwards).

    Parameters:
    - df: pandas.DataFrame, the dataframe to process (modified in place).
    - target_column: str, the name of the target variable column
      (defaults to 'ACCLASS', matching the other target helpers).

    Returns:
    - pandas.DataFrame, the same dataframe with rows missing the target class removed.
    """
    df.dropna(subset=[target_column], inplace=True)
    return df


def hash_features_neighbourhood(df):
    """
    Replaces 'NEIGHBOURHOOD_158' with an integer code per distinct value.

    Codes are assigned in order of first appearance (0, 1, 2, ...). The
    dataframe is modified in place: the source column is removed and
    'NEIGHBOURHOOD_158_encoded' is added.

    Parameters:
    - df: pandas.DataFrame, must contain a 'NEIGHBOURHOOD_158' column.

    Returns:
    - pandas.DataFrame, the same dataframe with the encoded column.
    """
    # Take the source column out of the frame; it is only needed for the lookup.
    neighbourhoods = df.pop('NEIGHBOURHOOD_158')

    # Distinct names in order of appearance, paired with consecutive integers.
    distinct_names = neighbourhoods.unique()
    code_by_name = dict(zip(distinct_names, range(len(distinct_names))))

    df['NEIGHBOURHOOD_158_encoded'] = neighbourhoods.map(code_by_name)
    return df

def preprocess_high_null_values(df):
    """
    Replaces four sparsely populated columns with 'Yes'/'No' missing-value flags.

    For each of 'MANOEUVER', 'PEDTYPE', 'CYCACT' and 'DRIVACT' a
    '<name>_missing_info?' column is added that holds 'Yes' where the original
    value is null and 'No' otherwise; the original columns are then dropped.
    The dataframe is modified in place.

    Parameters:
    - df: pandas.DataFrame, must contain the four source columns.

    Returns:
    - pandas.DataFrame, the same dataframe with the flag columns.
    """
    flag_column_for = {
        'MANOEUVER': 'maneuver_missing_info?',
        'PEDTYPE': 'pedtype_missing_info?',
        'CYCACT': 'cycact_missing_info?',
        'DRIVACT': 'drivact_missing_info?',
    }
    yes_no = {True: 'Yes', False: 'No'}

    for source_column, flag_column in flag_column_for.items():
        df[flag_column] = df[source_column].isnull().replace(yes_no)

    df.drop(columns=list(flag_column_for), inplace=True)
    return df

def fill_missing_values(df, features):
    """
    Fills missing values and runs the remaining cleaning steps on a dataframe.

    Steps, in order:
    1. Binary flag columns listed in `features` get missing values filled with 'No'.
    2. All binary flag columns are converted from 'Yes'/'No' to 1/0.
    3. preprocess_high_null_values, hash_features_neighbourhood,
       target_class_remove_missing_rows and fill_with_mode are applied.

    Note: step 3 passes the module-level `mode_features` list to fill_with_mode;
    it is not a parameter of this function.

    Parameters:
    - df: pandas.DataFrame, the dataframe to process (modified in place).
    - features: list, a list of column names whose missing values need to be filled.

    Returns:
    - pandas.DataFrame, the dataframe with missing values filled.
    """
    binary_flag_columns = [
        'PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE', 'MOTORCYCLE', 'TRUCK',
        'TRSN_CITY_VEH', 'PASSENGER', 'SPEEDING', 'AG_DRIV', 'REDLIGHT', 'ALCOHOL',
    ]

    for feature in binary_flag_columns:
        column = df[feature]
        if feature in features:
            # Fill with 'No' assuming missing values imply the absence.
            # Assigning the result back (instead of fillna(..., inplace=True) on
            # df[feature]) keeps this working under pandas Copy-on-Write, where
            # the chained in-place form silently leaves df unchanged.
            column = column.fillna('No')
        df[feature] = column.replace({'Yes': 1, 'No': 0})

    # Non-binary columns are intentionally left untouched here: their missing
    # values are handled by the steps below (flag columns / mode imputation).
    df = preprocess_high_null_values(df)
    df = hash_features_neighbourhood(df)
    df = target_class_remove_missing_rows(df)
    df = fill_with_mode(df, mode_features)
    return df

# Run the full missing-value handling on the trimmed frame.
df = fill_missing_values(df, features)
# NOTE(review): this is not the last expression of the cell, so its result is not displayed;
# move it to the end of the cell (or wrap it in display()) to see the remaining null counts.
df.isnull().sum()
# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)



In [356]:
df

Unnamed: 0,ROAD_CLASS,DISTRICT,LOCCOORD,TRAFFCTL,VISIBILITY,LIGHT,RDSFCOND,ACCLASS,IMPACTYPE,INVTYPE,INJURY,INITDIR,PEDESTRIAN,CYCLIST,AUTOMOBILE,MOTORCYCLE,TRUCK,TRSN_CITY_VEH,PASSENGER,SPEEDING,AG_DRIV,REDLIGHT,ALCOHOL,INVAGE_Categorized,maneuver_missing_info?,pedtype_missing_info?,cycact_missing_info?,drivact_missing_info?,NEIGHBOURHOOD_158_encoded
0,Major Arterial,Toronto and East York,Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Driver,Major,South,1,0,1,0,0,0,0,0,1,0,0,unknown,No,Yes,Yes,No,0
1,Major Arterial,Toronto and East York,Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Pedestrian,Fatal,North,1,0,1,0,0,0,0,0,1,0,0,old,Yes,No,Yes,Yes,0
2,Major Arterial,Scarborough,Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Motorcycle Driver,Fatal,East,0,0,1,1,0,0,0,0,1,1,0,adult,No,Yes,Yes,No,1
3,Major Arterial,Toronto and East York,Intersection,No Control,Clear,Dark,Wet,Non-Fatal Injury,Approaching,Passenger,Major,East,0,0,1,0,0,0,1,1,1,0,1,adult,Yes,Yes,Yes,Yes,2
4,Major Arterial,Scarborough,Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Driver,Major,South,0,0,1,1,0,0,0,0,1,1,0,unknown,No,Yes,Yes,No,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18189,Local,Toronto and East York,Intersection,Stop Sign,Clear,Dark,Wet,Non-Fatal Injury,Pedestrian Collisions,Pedestrian,Minimal,North,1,0,1,0,0,0,0,0,1,0,0,kid,Yes,No,Yes,Yes,72
18190,Local,Toronto and East York,Intersection,Stop Sign,Clear,Dark,Wet,Non-Fatal Injury,Pedestrian Collisions,Pedestrian,Minor,North,1,0,1,0,0,0,0,0,1,0,0,kid,Yes,No,Yes,Yes,72
18191,Local,Toronto and East York,Intersection,Stop Sign,Clear,Dark,Wet,Non-Fatal Injury,Pedestrian Collisions,Pedestrian,Minimal,North,1,0,1,0,0,0,0,0,1,0,0,kid,Yes,No,Yes,Yes,72
18192,Major Arterial,Toronto and East York,Mid-Block,No Control,Rain,"Dark, artificial",Wet,Non-Fatal Injury,Pedestrian Collisions,Driver,Major,East,1,0,1,0,0,0,0,0,0,0,0,adult,No,Yes,Yes,No,79


In [357]:
df.dtypes

ROAD_CLASS                   object
DISTRICT                     object
LOCCOORD                     object
TRAFFCTL                     object
VISIBILITY                   object
LIGHT                        object
RDSFCOND                     object
ACCLASS                      object
IMPACTYPE                    object
INVTYPE                      object
INJURY                       object
INITDIR                      object
PEDESTRIAN                    int64
CYCLIST                       int64
AUTOMOBILE                    int64
MOTORCYCLE                    int64
TRUCK                         int64
TRSN_CITY_VEH                 int64
PASSENGER                     int64
SPEEDING                      int64
AG_DRIV                       int64
REDLIGHT                      int64
ALCOHOL                       int64
INVAGE_Categorized           object
maneuver_missing_info?       object
pedtype_missing_info?        object
cycact_missing_info?         object
drivact_missing_info?       

In [358]:
from sklearn.preprocessing import OneHotEncoder

def apply_one_hot_encoding_to_object_columns(df):
    """
    Applies one-hot encoding to columns of dtype 'object' in a dataframe using sklearn.

    Every object column except 'ACCLASS' is replaced by its indicator columns
    (first category dropped). The input dataframe is not modified.

    Parameters:
    - df: pandas.DataFrame, the original dataframe.

    Returns:
    - pandas.DataFrame, a new dataframe in which the object columns are replaced
      by one-hot encoded columns appended after the remaining columns. Row labels
      (the index) are preserved.
    """
    object_columns = [
        col for col in df.select_dtypes(include=['object']).columns
        if col != 'ACCLASS'
    ]
    if not object_columns:
        # Nothing to encode; the encoder cannot be fitted on zero columns.
        return df.copy()

    try:
        encoder = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only know the `sparse` keyword.
        encoder = OneHotEncoder(drop='first', sparse=False)

    # Fit all columns at once instead of concatenating inside a loop.
    encoded_array = encoder.fit_transform(df[object_columns])
    encoded_df = pd.DataFrame(
        encoded_array,
        columns=encoder.get_feature_names_out(object_columns),
        # Reuse df's index: rows may have been dropped upstream, and concat
        # aligns on index labels, so a fresh RangeIndex would misalign rows
        # and introduce NaN-filled rows.
        index=df.index,
    )

    return pd.concat([df.drop(columns=object_columns), encoded_df], axis=1)

# Applying the one-hot encoding function to the dataframe
data_encoded = apply_one_hot_encoding_to_object_columns(df)
# Preview of the encoded dataframe.
# NOTE(review): head() is not the last expression of this cell, so nothing is displayed here.
data_encoded.head()

def target_classes_encoding(df, target_column='ACCLASS'):
    """
    Replaces the values in the target column with 0 and 1.

    'Fatal' becomes 1 and every other label (e.g. 'Non-Fatal Injury') becomes 0.
    Values that are already 0/1 and missing values are left as they are, so the
    function can safely be applied more than once. The dataframe is modified in place.

    Parameters:
    - df: pandas.DataFrame, the original dataframe.
    - target_column: str, the name of the target variable column.

    Returns:
    - pandas.DataFrame, the modified dataframe with the target variable replaced.
    """
    def _to_binary(label):
        # Keep missing values and already-encoded values untouched.
        if pd.isna(label) or label in (0, 1):
            return label
        return int(label == 'Fatal')

    df[target_column] = df[target_column].map(_to_binary)
    return df

# Encode the target on both frames: data_encoded was derived from df earlier in this
# cell and is a separate object, so encoding df alone would leave
# data_encoded['ACCLASS'] (the frame that gets split) with raw string labels.
df = target_classes_encoding(df)
data_encoded = target_classes_encoding(data_encoded)





In [359]:
from sklearn.model_selection import train_test_split

def split_data(df, target_column='ACCLASS', test_size=0.2, random_state=42):
    """
    Separates the target column from the features and splits both into train and test parts.

    Parameters:
    - df: pandas.DataFrame, the original dataframe.
    - target_column: str, the name of the target variable column.
    - test_size: float, the proportion of the dataset to include in the test split.
    - random_state: int, the seed used by the random number generator.

    Returns:
    - (X_train, y_train, X_test, y_test): training features, training target,
      test features, test target -- note that both training pieces come first.
    """
    target = df[target_column]
    predictors = df.drop(columns=target_column)

    train_features, test_features, train_target, test_target = train_test_split(
        predictors,
        target,
        test_size=test_size,
        random_state=random_state,
    )

    return train_features, train_target, test_features, test_target

# Split the encoded data into a training set and a test set, and separate the target variable
X_train, y_train, X_test, y_test = split_data(data_encoded)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# Impute-then-encode pipeline: fill missing values with each column's most frequent
# value, then one-hot encode while dropping the first category of every feature.
# NOTE(review): the pipeline is only constructed here; no visible cell fits or applies it.
# NOTE(review): `sparse` appears to have been renamed `sparse_output` in newer
# scikit-learn releases -- confirm against the installed version.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse=False))
])





