# Preprocessing Pipelines:

## Importing the data

In [27]:
import pandas as pd
import scipy.sparse as sp
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [28]:
# import df_clean
df = pd.read_csv('data/df_clean.csv')

## Split into categorical and numerical

In [29]:
# split df into categorical and numerical
df_cat = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

## Preprocessing Numericals

In [30]:
preprocessing_pipeline_num = Pipeline([
    ('scaler', StandardScaler())  # Scale the features using standardization
])

In [31]:
# put df_num through preprocessing_pipeline_num
df_num_processed = preprocessing_pipeline_num.fit_transform(df_num)

In [32]:
df_num_processed

array([[-1.73193471,  0.25711228, -0.32712804, ..., -0.32086131,
         0.44848668, -0.0721804 ],
       [-1.7317025 ,  0.25711228,  0.46318237, ..., -0.32086131,
         0.44722369, -0.0721804 ],
       [-1.73147029,  0.25711228,  0.24419375, ..., -0.32086131,
         0.45480164, -0.0721804 ],
       ...,
       [ 1.73147029,  1.08471359,  1.08003645, ..., -0.32086131,
         0.46111661, -0.0721804 ],
       [ 1.7317025 ,  0.25711228, -0.2852817 , ..., -0.32086131,
         0.46111661, -0.0721804 ],
       [ 1.73193471, -0.57048903, -0.31281789, ..., -0.32086131,
         0.45353865, -1.34950807]])

In [33]:
# turn df_num_processed back into a dataframe
df_num_processed = pd.DataFrame(df_num_processed, columns=df_num.columns)

In [34]:
df_num_processed

Unnamed: 0.1,Unnamed: 0,POSTED_SPEED_LIMIT,STREET_NO,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,...,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,CRASH_UNIT_ID,UNIT_NO,NUM_PASSENGERS,VEHICLE_YEAR,OCCUPANT_CNT
0,-1.731935,0.257112,-0.327128,3.101601,-0.335980,-0.030891,-0.124713,-0.249071,-0.197034,2.268793,...,0.148461,-0.565408,-0.466844,0.101268,-0.084868,1.675478,0.695775,-0.320861,0.448487,-0.072180
1,-1.731702,0.257112,0.463182,-0.228351,-0.335980,-0.030891,-0.124713,-0.249071,-0.197034,-0.041399,...,-0.211747,0.951181,-1.345741,0.063190,-0.072135,0.087819,-0.872136,-0.320861,0.447224,-0.072180
2,-1.731470,0.257112,0.244194,-0.228351,-0.335980,-0.030891,-0.124713,-0.249071,-0.197034,-0.041399,...,-0.932161,-0.059879,-0.759810,0.067961,-0.091767,1.634168,-0.872136,-0.320861,0.454802,-0.072180
3,-1.731238,0.257112,1.754131,3.101601,1.338085,-0.030891,-0.124713,-0.249071,2.772009,3.038857,...,0.508668,1.456710,-0.173878,0.031497,-0.081715,-0.859501,0.695775,-0.320861,0.461117,-0.072180
4,-1.731006,0.257112,0.122991,-0.228351,-0.335980,-0.030891,-0.124713,-0.249071,-0.197034,-0.041399,...,0.508668,0.951181,0.997984,0.057957,-0.089804,0.395849,0.695775,-0.320861,0.454802,-0.072180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14913,1.731006,0.257112,-0.796327,-0.228351,-0.335980,-0.030891,-0.124713,-0.249071,-0.197034,-0.041399,...,-0.752058,-1.576468,0.997984,0.089805,-0.084526,-1.532765,0.695775,-0.320861,0.456065,-0.072180
14914,1.731238,-0.570489,-0.620052,-0.228351,-0.335980,-0.030891,-0.124713,-0.249071,-0.197034,-0.811463,...,0.148461,1.456710,-1.345741,0.107434,-0.078856,0.068841,0.695775,-0.320861,0.453539,-1.349508
14915,1.731470,1.084714,1.080036,-0.228351,-0.335980,-0.030891,-0.124713,-0.249071,-0.197034,-0.041399,...,1.229082,-0.565408,-1.345741,0.047682,-0.083342,-1.014947,0.695775,-0.320861,0.461117,-0.072180
14916,1.731702,0.257112,-0.285282,-0.228351,-0.335980,-0.030891,-0.124713,-0.249071,-0.197034,-0.041399,...,1.589289,-0.059879,-0.173878,0.113092,-0.084207,-0.263717,-0.872136,-0.320861,0.461117,-0.072180


## Preprocessing Categoricals

In [35]:
# Create the pipeline with OneHotEncoder
preprocessing_pipeline_categorical = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

# Fit and transform the data
df_cat_processed = preprocessing_pipeline_categorical.fit_transform(df_cat)

# Convert the transformed data back to a DataFrame
# df_cat_processed = pd.DataFrame(df_cat_processed)

# # Apply one-hot encoding using pandas get_dummies
# df_cat_processed = pd.get_dummies(df_cat)

# # Print the encoded DataFrame
# print(df_cat_processed)

In [36]:
df_cat_processed

<14918x45272 sparse matrix of type '<class 'numpy.float64'>'
	with 402786 stored elements in Compressed Sparse Row format>

# Put it all back together

In [37]:
import scipy.sparse as sp

# Assuming df_numerical is your preprocessed numerical DataFrame
# Assuming sparse_matrix is your sparse matrix obtained from one-hot encoding

# Convert the numerical DataFrame to a sparse matrix
df_num_processed = sp.csr_matrix(df_num_processed.values)

# Concatenate the sparse matrix and the sparse categorical matrix horizontally
combined_sparse = sp.hstack((df_num_processed, df_cat_processed), format='csr')


In [38]:
combined_sparse

<14918x45293 sparse matrix of type '<class 'numpy.float64'>'
	with 701146 stored elements in Compressed Sparse Row format>

In [39]:
sp.save_npz('data/combined_sparse.npz', combined_sparse)

# Pipeline for target