# Preprocessing Pipelines:

## Importing the data

In [79]:
import pandas as pd
import scipy.sparse as sp
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [80]:
# import df_clean
df = pd.read_csv('data/df_clean.csv')

## Split into categorical and numerical

In [81]:
# split df into categorical and numerical
df_cat = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

## Preprocessing Numericals

In [82]:
preprocessing_pipeline_num = Pipeline([
    ('scaler', StandardScaler())  # Scale the features using standardization
])

In [83]:
# put df_num through preprocessing_pipeline_num
df_num_processed = preprocessing_pipeline_num.fit_transform(df_num)

In [84]:
df_num_processed

array([[-1.73194836, -0.67938194, -1.26618593, ..., -0.35405882,
         0.39334448, -1.3973756 ],
       [-1.73174346, -1.55008598,  0.24158079, ..., -0.35405882,
         0.36925466, -0.1715541 ],
       [-1.73153855,  0.1913221 , -0.50117188, ..., -0.35405882,
         0.37067171, -0.1715541 ],
       ...,
       [ 1.73153855,  0.1913221 , -0.4213136 , ...,  1.01692569,
         0.40042972,  1.05426739],
       [ 1.73174346,  0.1913221 ,  0.60200311, ...,  2.38791021,
        -2.4634249 ,  2.28008889],
       [ 1.73194836,  0.1913221 , -0.7124783 , ..., -0.35405882,
         0.38200809, -0.1715541 ]])

In [85]:
# turn df_num_processed back into a dataframe
df_num_processed = pd.DataFrame(df_num_processed, columns=df_num.columns)

In [86]:
df_num_processed

Unnamed: 0.1,Unnamed: 0,POSTED_SPEED_LIMIT,STREET_NO,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,...,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,CRASH_UNIT_ID,UNIT_NO,NUM_PASSENGERS,VEHICLE_YEAR,OCCUPANT_CNT
0,-1.731948,-0.679382,-1.266186,-0.259909,-0.367855,-0.039246,-0.129993,-0.284566,-0.206624,-0.981936,...,0.666788,0.453218,-1.043827,0.091821,-0.079025,-0.409642,0.646149,-0.354059,0.393344,-1.397376
1,-1.731743,-1.550086,0.241581,-0.259909,-0.367855,-0.039246,-0.129993,-0.284566,-0.206624,-0.168300,...,-0.425936,0.959535,1.288111,0.072645,-0.093852,-0.553598,0.646149,-0.354059,0.369255,-0.171554
2,-1.731539,0.191322,-0.501172,-1.932545,-0.367855,-0.039246,-0.129993,-0.284566,-0.206624,-0.981936,...,1.031030,-0.053100,0.996619,0.083237,-0.083597,-0.065144,-0.876968,-0.354059,0.370672,-0.171554
3,-1.731334,0.191322,-0.288452,-0.259909,-0.367855,-0.039246,-0.129993,-0.284566,-0.206624,-0.168300,...,-0.608056,0.453218,0.122142,0.087256,-0.087612,-1.321571,0.646149,-0.354059,-2.463425,-0.171554
4,-1.731129,0.191322,1.456769,-0.259909,-0.367855,-0.039246,-0.129993,-0.284566,-0.206624,-0.168300,...,0.302547,0.959535,1.288111,0.131145,-0.083670,-1.498573,-0.876968,-0.354059,0.393344,-0.171554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16901,1.731129,0.191322,0.117200,1.412728,-0.367855,-0.039246,-0.129993,-0.284566,-0.206624,-0.981936,...,0.848909,1.465852,-0.460843,0.105171,-0.091723,-0.285949,-0.876968,-0.354059,-2.463425,-0.171554
16902,1.731334,0.191322,-1.019191,-0.259909,-0.367855,-0.039246,-0.129993,-0.284566,-0.206624,-0.168300,...,-1.154419,-0.559417,0.413634,0.088548,-0.094228,-1.278574,-0.876968,-0.354059,0.379174,-0.171554
16903,1.731539,0.191322,-0.421314,-0.259909,1.177610,-0.039246,5.152863,-0.284566,-0.206624,-0.168300,...,1.577392,-1.065734,0.122142,0.081731,-0.091494,1.276318,-0.876968,1.016926,0.400430,1.054267
16904,1.731743,0.191322,0.602003,-0.259909,-0.367855,-0.039246,-0.129993,-0.284566,-0.206624,1.458971,...,1.213150,0.453218,0.996619,0.067386,-0.091307,0.934155,-0.876968,2.387910,-2.463425,2.280089


## Preprocessing Categoricals

In [87]:
# Create the pipeline with OneHotEncoder
preprocessing_pipeline_categorical = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

# Fit and transform the data
df_cat_processed = preprocessing_pipeline_categorical.fit_transform(df_cat)

# Convert the transformed data back to a DataFrame
# df_cat_processed = pd.DataFrame(df_cat_processed)

# # Apply one-hot encoding using pandas get_dummies
# df_cat_processed = pd.get_dummies(df_cat)

# # Print the encoded DataFrame
# print(df_cat_processed)

In [88]:
df_cat_processed

<16906x50858 sparse matrix of type '<class 'numpy.float64'>'
	with 456462 stored elements in Compressed Sparse Row format>

# Put it all back together

In [89]:
import scipy.sparse as sp

# Assuming df_numerical is your preprocessed numerical DataFrame
# Assuming sparse_matrix is your sparse matrix obtained from one-hot encoding

# Convert the numerical DataFrame to a sparse matrix
df_num_processed = sp.csr_matrix(df_num_processed.values)

# Concatenate the sparse matrix and the sparse categorical matrix horizontally
combined_sparse = sp.hstack((df_num_processed, df_cat_processed), format='csr')


In [90]:
combined_sparse

<16906x50879 sparse matrix of type '<class 'numpy.float64'>'
	with 794582 stored elements in Compressed Sparse Row format>

In [91]:
sp.save_npz('data/combined_sparse.npz', combined_sparse)

# Pipeline for target