In [121]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import sklearn

warnings.filterwarnings('ignore')
sigma_df = pd.read_csv('KSI.csv')

In [122]:
features = [ 'ROAD_CLASS', 'DISTRICT', 'LOCCOORD', 'TRAFFCTL', 'VISIBILITY', 'LIGHT', 'RDSFCOND', 'IMPACTYPE', 'INVTYPE', 'INVAGE', 'INJURY', 'INITDIR', 'MANOEUVER','DRIVACT','PEDTYPE','CYCACT', 'PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE', 'MOTORCYCLE', 'TRUCK', 'TRSN_CITY_VEH', 'PASSENGER', 'SPEEDING', 'AG_DRIV', 'REDLIGHT', 'ALCOHOL', 'NEIGHBOURHOOD_158']
binary_features = ['PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE', 'MOTORCYCLE', 'TRUCK', 'TRSN_CITY_VEH', 'PASSENGER', 'SPEEDING', 'AG_DRIV', 'REDLIGHT', 'ALCOHOL']
mode_features = ['ROAD_CLASS', 'DISTRICT','LOCCOORD', 'TRAFFCTL', 'VISIBILITY', 'LIGHT', 'RDSFCOND','IMPACTYPE', 'INVTYPE', 'INJURY', 'INITDIR' , 'INVAGE']
hash_features = ['NEIGHBOURHOOD_158']
columns_to_onehot_encode = ['ROAD_CLASS', 'DISTRICT', 'LOCCOORD', 'TRAFFCTL', 'VISIBILITY', 'LIGHT', 'RDSFCOND', 'IMPACTYPE', 'INVTYPE', 'INJURY', 'INITDIR', 'DRIVACT','PEDTYPE','CYCACT','PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE', 'MOTORCYCLE', 'TRUCK', 'TRSN_CITY_VEH', 'PASSENGER', 'SPEEDING', 'AG_DRIV', 'REDLIGHT', 'ALCOHOL', 'INVAGE_Categorized']

In [123]:
sigma_df['NEIGHBOURHOOD_158'].shape

(18194,)

In [124]:
sigma_df = sigma_df.dropna(subset=['ACCLASS'], inplace=False)
sigma_df.shape  

(18189, 57)

In [125]:
def drop_unnecassary_columns(df, features):
    Y = df['ACCLASS']
    for column in df.columns:
        if column not in features:
            df.drop(column, axis=1, inplace=True)
    return df
df = drop_unnecassary_columns(sigma_df, features)

In [126]:
def preprocess_high_null_values(df):
    df['maneuver_missing_info?'] = df['MANOEUVER'].isnull().replace({True: 'Yes', False: 'No'})
    df['pedtype_missing_info?'] = df['PEDTYPE'].isnull().replace({True: 'Yes', False: 'No'})
    df['cycact_missing_info?'] = df['CYCACT'].isnull().replace({True: 'Yes', False: 'No'})
    df['drivact_missing_info?'] = df['DRIVACT'].isnull().replace({True: 'Yes', False: 'No'})
    df.drop(['MANOEUVER', 'PEDTYPE', 'CYCACT', 'DRIVACT'], axis=1, inplace=True)
    return df
df = preprocess_high_null_values(df)

In [127]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

def categorize_age(X):
    def age_category(age_range):
        if age_range in ['0 to 4', '5 to 9', '10 to 14']:
            return 'kid'
        elif age_range == '15 to 19':
            return 'teenager'
        elif age_range in ['20 to 24', '25 to 29']:
            return 'youth'
        elif age_range in ['30 to 34', '35 to 39', '40 to 44', '45 to 49', '50 to 54', '55 to 59', '60 to 64']:
            return 'adult'
        elif age_range in ['65 to 69', '70 to 74', '75 to 79', '80 to 84', '85 to 89', '90 to 94', 'Over 95']:
            return 'old'
        else:
            return 'unknown'
    # Assuming X has a single column relevant for age categorization
    transformed_column = X.iloc[:, 0].apply(age_category)
    print(X.columns[0])
    return pd.DataFrame(transformed_column, columns=[X.columns[0]])

def frequency_encode(X):
    # Assume X is a DataFrame with a single column to encode
    freq = X.iloc[:, 0].value_counts(normalize=True)
    transformed_column = X.iloc[:, 0].map(freq)
    return pd.DataFrame(transformed_column, columns=[X.columns[0]])


# Create FunctionTransformer objects
age_categorizer_transformer = FunctionTransformer(categorize_age, validate=False)
frequency_encoder_transformer = FunctionTransformer(frequency_encode, validate=False)
binary_features = ['PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE', 'MOTORCYCLE', 'TRUCK', 'TRSN_CITY_VEH', 'PASSENGER', 'SPEEDING', 'AG_DRIV', 'REDLIGHT', 'ALCOHOL']
clmn_trfm1 = ColumnTransformer(
    transformers=[
        ('fill_mode', SimpleImputer(strategy='most_frequent'), mode_features),
        ('binary_fill', SimpleImputer(strategy='constant', fill_value='No'), binary_features),
        ('age_categorization', age_categorizer_transformer, ['INVAGE']),
        ('neighbourhood_frequency_encoding', frequency_encoder_transformer, ['NEIGHBOURHOOD_158']),
    ],
    remainder='passthrough'
)

x_transformed = clmn_trfm1.fit_transform(df)
x_transformed = pd.DataFrame(x_transformed)
x_transformed

INVAGE


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,Major Arterial,Toronto and East York,Intersection,Traffic Signal,Clear,Daylight,Dry,Pedestrian Collisions,Driver,Major,...,No,Yes,No,No,unknown,0.002749,No,Yes,Yes,No
1,Major Arterial,Toronto and East York,Intersection,Traffic Signal,Clear,Daylight,Dry,Pedestrian Collisions,Pedestrian,Fatal,...,No,Yes,No,No,old,0.002749,Yes,No,Yes,Yes
2,Major Arterial,Scarborough,Intersection,Traffic Signal,Clear,Daylight,Dry,Turning Movement,Motorcycle Driver,Fatal,...,No,Yes,Yes,No,adult,0.008302,No,Yes,Yes,No
3,Major Arterial,Toronto and East York,Intersection,No Control,Clear,Dark,Wet,Approaching,Passenger,Major,...,Yes,Yes,No,Yes,adult,0.002254,Yes,Yes,Yes,Yes
4,Major Arterial,Scarborough,Intersection,Traffic Signal,Clear,Daylight,Dry,Turning Movement,Driver,Major,...,No,Yes,Yes,No,unknown,0.008302,No,Yes,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18184,Local,Toronto and East York,Intersection,Stop Sign,Clear,Dark,Wet,Pedestrian Collisions,Pedestrian,Minimal,...,No,Yes,No,No,kid,0.003134,Yes,No,Yes,Yes
18185,Local,Toronto and East York,Intersection,Stop Sign,Clear,Dark,Wet,Pedestrian Collisions,Pedestrian,Minor,...,No,Yes,No,No,kid,0.003134,Yes,No,Yes,Yes
18186,Local,Toronto and East York,Intersection,Stop Sign,Clear,Dark,Wet,Pedestrian Collisions,Pedestrian,Minimal,...,No,Yes,No,No,kid,0.003134,Yes,No,Yes,Yes
18187,Major Arterial,Toronto and East York,Mid-Block,No Control,Rain,"Dark, artificial",Wet,Pedestrian Collisions,Driver,Major,...,No,No,No,No,adult,0.004948,No,Yes,Yes,No


In [128]:
def convert_yes_no_to_binary_and_reshape(X):
    """
    Converts 'Yes' to 1 and 'No' to 0 for a specific column in a numpy array, ensuring 2D output.
    Assumes the column to transform is the first column in X.
    """
    # Convert 'Yes'/'No' to 1/0
    transformed = np.where(X.flatten() == 'Yes', 1, 0)
    # Ensure the output is 2D
    return transformed.reshape(-1, 1)

from sklearn.preprocessing import FunctionTransformer

# Wrap the custom function in FunctionTransformer
yes_no_transformer = FunctionTransformer(convert_yes_no_to_binary_and_reshape, validate=False)

In [129]:
x_transformed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,Major Arterial,Toronto and East York,Intersection,Traffic Signal,Clear,Daylight,Dry,Pedestrian Collisions,Driver,Major,...,No,Yes,No,No,unknown,0.002749,No,Yes,Yes,No
1,Major Arterial,Toronto and East York,Intersection,Traffic Signal,Clear,Daylight,Dry,Pedestrian Collisions,Pedestrian,Fatal,...,No,Yes,No,No,old,0.002749,Yes,No,Yes,Yes
2,Major Arterial,Scarborough,Intersection,Traffic Signal,Clear,Daylight,Dry,Turning Movement,Motorcycle Driver,Fatal,...,No,Yes,Yes,No,adult,0.008302,No,Yes,Yes,No
3,Major Arterial,Toronto and East York,Intersection,No Control,Clear,Dark,Wet,Approaching,Passenger,Major,...,Yes,Yes,No,Yes,adult,0.002254,Yes,Yes,Yes,Yes
4,Major Arterial,Scarborough,Intersection,Traffic Signal,Clear,Daylight,Dry,Turning Movement,Driver,Major,...,No,Yes,Yes,No,unknown,0.008302,No,Yes,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18184,Local,Toronto and East York,Intersection,Stop Sign,Clear,Dark,Wet,Pedestrian Collisions,Pedestrian,Minimal,...,No,Yes,No,No,kid,0.003134,Yes,No,Yes,Yes
18185,Local,Toronto and East York,Intersection,Stop Sign,Clear,Dark,Wet,Pedestrian Collisions,Pedestrian,Minor,...,No,Yes,No,No,kid,0.003134,Yes,No,Yes,Yes
18186,Local,Toronto and East York,Intersection,Stop Sign,Clear,Dark,Wet,Pedestrian Collisions,Pedestrian,Minimal,...,No,Yes,No,No,kid,0.003134,Yes,No,Yes,Yes
18187,Major Arterial,Toronto and East York,Mid-Block,No Control,Rain,"Dark, artificial",Wet,Pedestrian Collisions,Driver,Major,...,No,No,No,No,adult,0.004948,No,Yes,Yes,No


In [130]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Identify object columns, excluding 'ACCLASS' if present in your column selection logic
object_columns = [col for col in df.columns if df[col].dtype == 'object']

# Create the ColumnTransformer with OneHotEncoder
clmn_trfm2 = ColumnTransformer(
    transformers=[
        # Apply OneHotEncoder to object columns, drop the first category to avoid dummy variable trap
        ('one_hot_encoder', OneHotEncoder(drop='first', sparse=False), object_columns),
    ],
    remainder='passthrough'  # Keep other columns unchanged
)

x_transformed = clmn_trfm2.fit_transform(df)
x_transformed[0]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1.,
       1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [131]:
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator

class ArrayToDataFrame(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            return X
        else:
            num_columns = X.shape[1]
            column_names = [f'Column_{i}' for i in range(num_columns)]
            return pd.DataFrame(X, columns=column_names)

pipeline = Pipeline([
    ('preprocessing_1', clmn_trfm1),
    ('preprocessing_2', clmn_trfm2),
    # ('to_dataframe', ArrayToDataFrame()) ,
    ('binarization', yes_no_transformer),
    ('to_dataframe', ArrayToDataFrame()) 
])

x_transformed = pipeline.fit_transform(df)
x_transformed[0]


INVAGE


ValueError: Specifying the columns using strings is only supported for pandas DataFrames