# Preprocessing Pipelines:

## Importing the data

In [53]:
import pandas as pd
import scipy.sparse as sp
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [54]:
# import df_clean
df = pd.read_csv('data/df_clean.csv')

## Split into categorical and numerical

In [55]:
# split df into categorical and numerical
df_cat = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

## Preprocessing Numericals

In [56]:
preprocessing_pipeline_num = Pipeline([
    ('scaler', StandardScaler())  # Scale the features using standardization
])

In [57]:
# put df_num through preprocessing_pipeline_num
df_num_processed = preprocessing_pipeline_num.fit_transform(df_num)

In [58]:
df_num_processed

array([[-1.73195674,  1.05051842, -0.14538348, ..., -0.35296737,
         0.40975318, -0.15203607],
       [-1.73176859,  0.21714293, -0.02399762, ..., -0.35296737,
        -2.37685669, -0.15203607],
       [-1.73158045, -0.61623257,  1.28493485, ..., -0.35296737,
         0.40005339, -1.42270702],
       ...,
       [ 1.73158045,  0.21714293, -0.4660386 , ..., -0.35296737,
         0.40975318, -0.15203607],
       [ 1.73176859,  0.21714293, -0.73792889, ..., -0.35296737,
         0.39173929, -0.15203607],
       [ 1.73195674,  0.21714293, -0.72564998, ..., -0.35296737,
         0.42222433, -0.15203607]])

In [59]:
# turn df_num_processed back into a dataframe
df_num_processed = pd.DataFrame(df_num_processed, columns=df_num.columns)

In [60]:
df_num_processed

Unnamed: 0.1,Unnamed: 0,POSTED_SPEED_LIMIT,STREET_NO,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,...,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,CRASH_UNIT_ID,UNIT_NO,NUM_PASSENGERS,VEHICLE_YEAR,OCCUPANT_CNT
0,-1.731957,1.050518,-0.145383,-0.237731,-0.374261,-0.038322,-0.133749,-0.290553,-0.210556,-0.123340,...,-1.315000,0.943819,1.274518,0.034349,-0.065242,0.986158,-0.888333,-0.352967,0.409753,-0.152036
1,-1.731769,0.217143,-0.023998,-0.237731,-0.374261,-0.038322,-0.133749,-0.290553,-0.210556,-0.123340,...,0.495470,0.438539,0.398273,0.109276,-0.099772,0.866995,-0.888333,-0.352967,-2.376857,-0.152036
2,-1.731580,-0.616233,1.284935,2.971290,1.179491,-0.038322,-0.133749,-0.290553,2.689037,-0.944361,...,1.762800,-0.572022,1.566600,0.056109,-0.079076,-0.501737,3.787031,-0.352967,0.400053,-1.422707
3,-1.731392,0.217143,-0.674780,-0.237731,2.733243,-0.038322,-0.133749,-0.290553,5.588630,-1.765382,...,0.133376,-1.077302,0.398273,0.118024,-0.082985,-0.113160,0.670122,-0.352967,0.408367,-0.152036
4,-1.731204,0.217143,-1.206282,-0.237731,-0.374261,-0.038322,-0.133749,-0.290553,-0.210556,3.160742,...,0.676517,0.438539,-1.354217,0.091957,-0.094180,0.162239,0.670122,5.441803,0.406982,4.930648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18407,1.731204,0.217143,-1.251889,-0.237731,-0.374261,-0.038322,-0.133749,-0.290553,-0.210556,-0.123340,...,-0.771859,0.438539,0.398273,0.092452,-0.077116,-0.129643,-0.888333,-0.352967,0.413910,-0.152036
18408,1.731392,1.050518,-0.225723,-0.237731,-0.374261,-0.038322,-0.133749,-0.290553,-0.210556,-0.123340,...,-0.409765,1.449100,1.274518,0.036244,-0.086181,1.498471,-0.888333,-0.352967,0.420839,-0.152036
18409,1.731580,0.217143,-0.466039,-0.237731,-0.374261,-0.038322,-0.133749,-0.290553,-0.210556,-0.123340,...,-0.952906,-0.066741,0.398273,0.103369,-0.084765,-1.553495,0.670122,-0.352967,0.409753,-0.152036
18410,1.731769,0.217143,-0.737929,-0.237731,-0.374261,-0.038322,-0.133749,-0.290553,-0.210556,-0.123340,...,0.133376,0.438539,-0.477972,0.099194,-0.079454,-1.337913,0.670122,-0.352967,0.391739,-0.152036


## Preprocessing Categoricals

In [61]:
# Create the pipeline with OneHotEncoder
preprocessing_pipeline_categorical = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

# Fit and transform the data
df_cat_processed = preprocessing_pipeline_categorical.fit_transform(df_cat)

# Convert the transformed data back to a DataFrame
# df_cat_processed = pd.DataFrame(df_cat_processed)

# # Apply one-hot encoding using pandas get_dummies
# df_cat_processed = pd.get_dummies(df_cat)

# # Print the encoded DataFrame
# print(df_cat_processed)

In [62]:
df_cat_processed

<18412x55257 sparse matrix of type '<class 'numpy.float64'>'
	with 497124 stored elements in Compressed Sparse Row format>

# Put it all back together

In [63]:
import scipy.sparse as sp

# Assuming df_numerical is your preprocessed numerical DataFrame
# Assuming sparse_matrix is your sparse matrix obtained from one-hot encoding

# Convert the numerical DataFrame to a sparse matrix
df_num_processed = sp.csr_matrix(df_num_processed.values)

# Concatenate the sparse matrix and the sparse categorical matrix horizontally
combined_sparse = sp.hstack((df_num_processed, df_cat_processed), format='csr')


In [64]:
combined_sparse

<18412x55278 sparse matrix of type '<class 'numpy.float64'>'
	with 865364 stored elements in Compressed Sparse Row format>

In [65]:
sp.save_npz('data/combined_sparse.npz', combined_sparse)

# Pipeline for target