# Model Building

In [70]:
import pandas as pd
import numpy as np
import seaborn as sb

# For data splitting
from sklearn.model_selection import train_test_split

# Import the encoder from sklearn
from sklearn.preprocessing import OneHotEncoder

# For resampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTEN

In [71]:
# Load the cleaned San Francisco stops data from the notebook's working directory
df = pd.read_csv(filepath_or_buffer="./sanfrancisco_cleaned_2.csv")
df.head()

Unnamed: 0,subject_race,subject_sex,outcome,contraband_found,search_vehicle,reason_for_stop,ageband
0,white,male,with valid reason,search not conducted,False,Mechanical or Non-Moving Violation (V.C.),adult
1,white,male,with valid reason,search not conducted,False,Moving Violation,adult
2,asian/pacific islander,female,with valid reason,search not conducted,False,Moving Violation,adult
3,hispanic,male,with valid reason,search not conducted,False,Moving Violation,adult
4,hispanic,female,with valid reason,search not conducted,False,Moving Violation,adult


# Feature Selection

In [72]:
# Columns kept for modelling (the response "outcome" is included here and
# separated from the predictors later on).
feature_list = [
    "subject_race",
    "subject_sex",
    "outcome",
    "contraband_found",
    "search_vehicle",
    "reason_for_stop",
    "ageband",
]
# Work on an independent copy so the loaded frame stays untouched
model_df = df.loc[:, feature_list].copy()
model_df.head()

Unnamed: 0,subject_race,subject_sex,outcome,contraband_found,search_vehicle,reason_for_stop,ageband
0,white,male,with valid reason,search not conducted,False,Mechanical or Non-Moving Violation (V.C.),adult
1,white,male,with valid reason,search not conducted,False,Moving Violation,adult
2,asian/pacific islander,female,with valid reason,search not conducted,False,Moving Violation,adult
3,hispanic,male,with valid reason,search not conducted,False,Moving Violation,adult
4,hispanic,female,with valid reason,search not conducted,False,Moving Violation,adult


In [73]:
# Count missing values per selected column
model_df.isnull().sum()

subject_race        0
subject_sex         0
outcome             0
contraband_found    0
search_vehicle      0
reason_for_stop     0
ageband             0
dtype: int64

## Data Preprocessing for model

In [74]:
# Separate the response column from the predictor columns
response = "outcome"
y = model_df[[response]]
X = model_df.drop(columns=response)

# 75% of the rows go to training, the remaining 25% to testing.
# NOTE(review): no `stratify` argument is passed, so the class ratio in each
# part is whatever the seeded shuffle produces.
split_parts = train_test_split(X, y, random_state=20, train_size=0.75)
X_train, X_test, y_train, y_test = split_parts

# Class balance of the training labels
y_train.value_counts()

outcome             
with valid reason       622237
without valid reason     10930
dtype: int64

In [75]:
# Put the training predictors and response back side by side, then renumber
# the rows from 0 so the frame no longer carries the shuffled split index.
train_parts = [X_train, y_train]
train_df = pd.concat(train_parts, axis="columns").reset_index(drop=True)
train_df.head()

Unnamed: 0,subject_race,subject_sex,contraband_found,search_vehicle,reason_for_stop,ageband,outcome
0,white,male,search not conducted,False,Moving Violation,adult,with valid reason
1,white,female,search not conducted,False,Mechanical or Non-Moving Violation (V.C.),youth,with valid reason
2,white,male,search not conducted,False,Moving Violation,youth,with valid reason
3,white,male,True,True,Moving Violation,adult,with valid reason
4,other,male,search not conducted,False,Moving Violation,adult,with valid reason


### Resampling Outcome

In [76]:
# Class balance of the training labels before any resampling
# (same call as at the end of the train/test split cell).
y_train.value_counts()

outcome             
with valid reason       622237
without valid reason     10930
dtype: int64

#### Combination of Random Undersampling and Random Oversampling

In [77]:
# Random undersampling: cut the "with valid reason" class down to 400,000 rows.
# Only that class is named in the strategy dict.
undersampled_strategy = {"with valid reason": 400_000}
under_sampler = RandomUnderSampler(
    random_state=20,
    sampling_strategy=undersampled_strategy,
)
undersampled = under_sampler.fit_resample(X_train, y_train)
X_under, y_under = undersampled

# Class balance after undersampling
y_under.value_counts()

outcome             
with valid reason       400000
without valid reason     10930
dtype: int64

In [78]:
# Random oversampling: grow the "without valid reason" class to 400,000 rows,
# starting from the already-undersampled training data.
strategy = {"without valid reason": 400_000}
over_sampler = RandomOverSampler(
    random_state=20,
    sampling_strategy=strategy,
)
oversampled = over_sampler.fit_resample(X_under, y_under)
X_over, y_over = oversampled

# Class balance after oversampling
y_over.value_counts()

outcome             
with valid reason       400000
without valid reason    400000
dtype: int64

In [79]:
# Recombine the randomly oversampled predictors with their labels
oversampled_parts = [X_over, y_over]
train_oversampled_df = pd.concat(oversampled_parts, axis="columns")
train_oversampled_df.head()

Unnamed: 0,subject_race,subject_sex,contraband_found,search_vehicle,reason_for_stop,ageband,outcome
0,white,male,search not conducted,False,Mechanical or Non-Moving Violation (V.C.),adult,with valid reason
1,black,female,search not conducted,False,Mechanical or Non-Moving Violation (V.C.),adult,with valid reason
2,white,female,search not conducted,False,Moving Violation,youth,with valid reason
3,white,male,search not conducted,False,Moving Violation,adult,with valid reason
4,black,male,search not conducted,False,Moving Violation,adult,with valid reason


#### Combination of Random Undersampling and SMOTEN Oversampling

In [80]:
# SMOTEN oversampling: grow the "without valid reason" class to 400,000 rows,
# starting from the same undersampled data used for random oversampling.
strategy = {"without valid reason": 400_000}
smoten_sampler = SMOTEN(
    random_state=20,
    sampling_strategy=strategy,
)
smoten_resampled = smoten_sampler.fit_resample(X_under, y_under)
X_smoten, y_smoten = smoten_resampled

# Class balance after SMOTEN
y_smoten.value_counts()

  stats.mode(X_class[nn_indices[samples_indices]], axis=1).mode, axis=1


outcome             
with valid reason       400000
without valid reason    400000
dtype: int64

In [81]:
# Recombine the SMOTEN-resampled predictors with their labels
smoten_parts = [X_smoten, y_smoten]
train_smoten_df = pd.concat(smoten_parts, axis="columns")
train_smoten_df.head()

Unnamed: 0,subject_race,subject_sex,contraband_found,search_vehicle,reason_for_stop,ageband,outcome
0,white,male,search not conducted,False,Mechanical or Non-Moving Violation (V.C.),adult,with valid reason
1,black,female,search not conducted,False,Mechanical or Non-Moving Violation (V.C.),adult,with valid reason
2,white,female,search not conducted,False,Moving Violation,youth,with valid reason
3,white,male,search not conducted,False,Moving Violation,adult,with valid reason
4,black,male,search not conducted,False,Moving Violation,adult,with valid reason


##### Combining test_df

In [82]:
# Put the held-out predictors and response side by side and renumber rows from 0
test_parts = [X_test, y_test]
test_df = pd.concat(test_parts, axis="columns").reset_index(drop=True)
test_df.head()

Unnamed: 0,subject_race,subject_sex,contraband_found,search_vehicle,reason_for_stop,ageband,outcome
0,white,female,search not conducted,False,Moving Violation,adult,with valid reason
1,white,male,search not conducted,False,Mechanical or Non-Moving Violation (V.C.),adult,with valid reason
2,black,female,search not conducted,False,Moving Violation,youth,with valid reason
3,black,male,search not conducted,False,Mechanical or Non-Moving Violation (V.C.),adult,with valid reason
4,white,male,search not conducted,False,Moving Violation,adult,with valid reason


### Encoding categorical values

##### Ordinal categorical variables (mapped to integers) - ageband, outcome (binary target)
##### Nominal categorical variables (one-hot encoded) - subject_race, subject_sex, contraband_found, search_vehicle, reason_for_stop

In [83]:
# Columns that are mapped to integer codes
ordinal_cat = [
    "ageband",
    "outcome",
]
# Columns that are one-hot encoded
nominal_cat = [
    "subject_race",
    "subject_sex",
    "contraband_found",
    "search_vehicle",
    "reason_for_stop",
]

# No numerical features are used; the list is kept so the encoding cells can
# select numerical columns uniformly.
numerical = list()

In [84]:
# Function for handling ordinal categorical variables

def encode_ord_cat(df):
    """Integer-encode the ``ageband`` and ``outcome`` columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``ageband`` and an ``outcome`` column. It is not
        modified; the encoding is applied to a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which ``ageband`` is replaced by 0-3
        (children < youth < adult < senior) and ``outcome`` by 0
        ("without valid reason") or 1 ("with valid reason"). Missing values
        stay missing.

    Raises
    ------
    ValueError
        If either column holds a non-null label that has no integer code.
        ``Series.map`` would otherwise turn such labels into NaN silently.
    """
    # label encoding
    mappings = {
        "ageband": {"children": 0, "youth": 1, "adult": 2, "senior": 3},
        "outcome": {"without valid reason": 0, "with valid reason": 1},
    }

    model_df_ord = df.copy()

    for column, mapping in mappings.items():
        values = model_df_ord[column]
        # Non-null labels absent from the mapping would become NaN after .map
        unknown = values[values.notna() & ~values.isin(list(mapping))]
        if not unknown.empty:
            labels = sorted(map(str, unknown.unique()))
            raise ValueError(
                f"Column {column!r} contains labels with no encoding: {labels}"
            )
        model_df_ord[column] = values.map(mapping)

    return model_df_ord

In [85]:
# Function for handling Nominal Categorical Variables

def encode_norm_cat(df, encoder=None):
    """One-hot encode the nominal categorical columns of ``df``.

    The columns to encode are taken from the module-level ``nominal_cat`` list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``nominal_cat``.
    encoder : OneHotEncoder, optional
        An already-fitted encoder to reuse, e.g. one fitted on the training
        data, so that several frames share the same output columns. When
        omitted, a new ``OneHotEncoder`` is fitted on ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        Dense one-hot columns named by ``get_feature_names_out``, carrying
        the same index as ``df`` so the result aligns row-for-row when it is
        concatenated with other columns derived from ``df``.
    """
    model_df_cat = df[nominal_cat]
    if encoder is None:
        encoder = OneHotEncoder()
        encoder.fit(model_df_cat)

    encoded = encoder.transform(model_df_cat)
    # A supplied encoder may be configured for dense output; densify only
    # when the result is a sparse matrix.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    model_df_cat_ohe = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(model_df_cat.columns),
        index=df.index,  # keep the caller's row labels instead of a fresh 0..n-1 index
    )
    return model_df_cat_ohe

In [86]:
# Encode the imbalanced (un-resampled) training dataset

# One-hot block, integer-coded block and (currently empty) numerical block
model_df_norm = encode_norm_cat(train_df)
model_df_ord = encode_ord_cat(train_df)[ordinal_cat]
model_df_num = train_df[numerical]

# Combine all features column-wise, keeping the row order of the source frame
encoded_parts = [model_df_norm, model_df_ord, model_df_num]
train_df_encoded = pd.concat(encoded_parts, axis=1, sort=False)
train_df_encoded = train_df_encoded.reindex(index=model_df_ord.index)
train_df_encoded.head()

Unnamed: 0,subject_race_asian/pacific islander,subject_race_black,subject_race_hispanic,subject_race_other,subject_race_white,subject_sex_female,subject_sex_male,contraband_found_False,contraband_found_True,contraband_found_search not conducted,...,search_vehicle_True,reason_for_stop_Assistance to Motorist,reason_for_stop_BOLO/APB/Warrant,reason_for_stop_DUI Check,reason_for_stop_MPC Violation,reason_for_stop_Mechanical or Non-Moving Violation (V.C.),reason_for_stop_Moving Violation,reason_for_stop_Traffic Collision,ageband,outcome
0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2,1
1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2,1
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2,1


In [87]:
# Encode the randomly oversampled training dataset

# One-hot block, integer-coded block and (currently empty) numerical block
model_df_norm = encode_norm_cat(train_oversampled_df)
model_df_ord = encode_ord_cat(train_oversampled_df)[ordinal_cat]
model_df_num = train_oversampled_df[numerical]

# Combine all features column-wise, keeping the row order of the source frame
encoded_parts = [model_df_norm, model_df_ord, model_df_num]
train_df_oversampled_encoded = pd.concat(encoded_parts, axis=1, sort=False)
train_df_oversampled_encoded = train_df_oversampled_encoded.reindex(index=model_df_ord.index)
train_df_oversampled_encoded.head()

Unnamed: 0,subject_race_asian/pacific islander,subject_race_black,subject_race_hispanic,subject_race_other,subject_race_white,subject_sex_female,subject_sex_male,contraband_found_False,contraband_found_True,contraband_found_search not conducted,...,search_vehicle_True,reason_for_stop_Assistance to Motorist,reason_for_stop_BOLO/APB/Warrant,reason_for_stop_DUI Check,reason_for_stop_MPC Violation,reason_for_stop_Mechanical or Non-Moving Violation (V.C.),reason_for_stop_Moving Violation,reason_for_stop_Traffic Collision,ageband,outcome
0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,1
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,1
2,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2,1
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2,1


In [88]:
# Encode the SMOTEN-oversampled training dataset

# One-hot block, integer-coded block and (currently empty) numerical block
model_df_norm = encode_norm_cat(train_smoten_df)
model_df_ord = encode_ord_cat(train_smoten_df)[ordinal_cat]
model_df_num = train_smoten_df[numerical]

# Combine all features column-wise, keeping the row order of the source frame
encoded_parts = [model_df_norm, model_df_ord, model_df_num]
train_df_smoten_encoded = pd.concat(encoded_parts, axis=1, sort=False)
train_df_smoten_encoded = train_df_smoten_encoded.reindex(index=model_df_ord.index)
train_df_smoten_encoded.head()

Unnamed: 0,subject_race_asian/pacific islander,subject_race_black,subject_race_hispanic,subject_race_other,subject_race_white,subject_sex_female,subject_sex_male,contraband_found_False,contraband_found_True,contraband_found_search not conducted,...,search_vehicle_True,reason_for_stop_Assistance to Motorist,reason_for_stop_BOLO/APB/Warrant,reason_for_stop_DUI Check,reason_for_stop_MPC Violation,reason_for_stop_Mechanical or Non-Moving Violation (V.C.),reason_for_stop_Moving Violation,reason_for_stop_Traffic Collision,ageband,outcome
0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,1
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,1
2,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2,1
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2,1


In [89]:
# Encode all datasets

# Test dataset
model_df_ord = encode_ord_cat(test_df)[ordinal_cat]
model_df_norm = encode_norm_cat(test_df)
model_df_num = test_df[numerical]
test_df_encoded = pd.concat([model_df_norm, model_df_ord, model_df_num],
                            sort=False, axis=1).reindex(index=model_df_ord.index)

# encode_norm_cat fits its one-hot encoder on whichever frame it is given, so
# the test columns depend on the categories that happen to occur in the test
# rows. Align them to the training layout: a category seen only in training
# becomes an all-zero column, a category seen only in the test rows is dropped,
# and the column order matches train_df_encoded. This is a no-op when both
# frames already have identical columns.
test_df_encoded = test_df_encoded.reindex(columns=train_df_encoded.columns, fill_value=0.0)
test_df_encoded.head()

Unnamed: 0,subject_race_asian/pacific islander,subject_race_black,subject_race_hispanic,subject_race_other,subject_race_white,subject_sex_female,subject_sex_male,contraband_found_False,contraband_found_True,contraband_found_search not conducted,...,search_vehicle_True,reason_for_stop_Assistance to Motorist,reason_for_stop_BOLO/APB/Warrant,reason_for_stop_DUI Check,reason_for_stop_MPC Violation,reason_for_stop_Mechanical or Non-Moving Violation (V.C.),reason_for_stop_Moving Violation,reason_for_stop_Traffic Collision,ageband,outcome
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2,1
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,1
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,1
4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2,1


In [90]:
# Summarise the column dtypes and non-null counts of the encoded (imbalanced) training frame
train_df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 633167 entries, 0 to 633166
Data columns (total 21 columns):
 #   Column                                                     Non-Null Count   Dtype  
---  ------                                                     --------------   -----  
 0   subject_race_asian/pacific islander                        633167 non-null  float64
 1   subject_race_black                                         633167 non-null  float64
 2   subject_race_hispanic                                      633167 non-null  float64
 3   subject_race_other                                         633167 non-null  float64
 4   subject_race_white                                         633167 non-null  float64
 5   subject_sex_female                                         633167 non-null  float64
 6   subject_sex_male                                           633167 non-null  float64
 7   contraband_found_False                                     633167 non-null  float64

### Export the files

In [91]:
# Destination path -> encoded frame, written in this order
exports = {
    "./train.csv": train_df_encoded,
    "./train_oversampled.csv": train_df_oversampled_encoded,
    "./train_smoten.csv": train_df_smoten_encoded,
    "./test.csv": test_df_encoded,
}

# The row index is not written; the files contain only the encoded columns
for file_path, encoded_frame in exports.items():
    encoded_frame.to_csv(path_or_buf=file_path, index=False)