# Preprocessing Pipelines:

## Importing the data

In [40]:
import pandas as pd
import scipy.sparse as sp
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [41]:
# import df_clean
df = pd.read_csv('data/df_clean.csv')

## Split into categorical and numerical

In [42]:
# split df into categorical and numerical
df_cat = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

## Preprocessing Numericals

In [43]:
preprocessing_pipeline_num = Pipeline([
    ('scaler', StandardScaler())  # Scale the features using standardization
])

In [44]:
# put df_num through preprocessing_pipeline_num
df_num_processed = preprocessing_pipeline_num.fit_transform(df_num)

In [45]:
df_num_processed

array([[-1.73204849,  0.25341247, -0.520899  , ..., -0.33147988,
         0.4525077 , -0.07047688],
       [-1.73204384,  0.25341247,  0.72517678, ..., -0.33147988,
         0.45123319, -0.07047688],
       [-1.7320392 ,  0.25341247,  0.37989927, ..., -0.33147988,
         0.45888027, -0.07047688],
       ...,
       [ 1.7320392 ,  0.25341247,  0.07564482, ..., -0.33147988,
         0.45378222, -1.35577012],
       [ 1.73204384,  0.25341247, -0.16707501, ..., -0.33147988,
         0.4716254 , -0.07047688],
       [ 1.73204849,  0.25341247,  0.07256809, ...,  1.18957089,
         0.44995868,  1.21481636]])

In [46]:
# turn df_num_processed back into a dataframe
df_num_processed = pd.DataFrame(df_num_processed, columns=df_num.columns)

In [47]:
df_num_processed

Unnamed: 0.1,Unnamed: 0,POSTED_SPEED_LIMIT,STREET_NO,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,...,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,CRASH_UNIT_ID,UNIT_NO,NUM_PASSENGERS,VEHICLE_YEAR,OCCUPANT_CNT
0,-1.732048,0.253412,-0.520899,3.155190,-0.337120,-0.0316,-0.1229,-0.256319,-0.194771,2.483579,...,0.140107,-0.566452,-0.466318,0.098876,-0.081949,1.695863,-0.001058,-0.331480,0.452508,-0.070477
1,-1.732044,0.253412,0.725177,-0.230431,-0.337120,-0.0316,-0.1229,-0.256319,-0.194771,-0.039188,...,-0.218928,0.946012,-1.342600,0.059352,-0.068732,0.096212,-0.001287,-0.331480,0.451233,-0.070477
2,-1.732039,0.253412,0.379899,-0.230431,-0.337120,-0.0316,-0.1229,-0.256319,-0.194771,-0.039188,...,-0.936997,-0.062297,-0.758412,0.064304,-0.089110,1.654240,-0.001287,-0.331480,0.458880,-0.070477
3,-1.732035,0.253412,2.760605,3.155190,1.345313,-0.0316,-0.1229,-0.256319,2.848621,3.324502,...,0.499141,1.450167,-0.174224,0.026455,-0.078676,-0.858263,-0.001058,-0.331480,0.465253,-0.070477
4,-1.732030,0.253412,0.188800,-0.230431,-0.337120,-0.0316,-0.1229,-0.256319,-0.194771,-0.039188,...,0.499141,0.946012,0.994152,0.053920,-0.087072,0.406568,-0.001058,-0.331480,0.458880,-0.070477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745892,1.732030,0.253412,-1.148210,-0.230431,-0.337120,-0.0316,-0.1229,-0.256319,-0.194771,-0.039188,...,-0.936997,-1.070607,-0.758412,0.102611,-0.073993,-0.957205,-0.001058,-0.331480,0.460155,-0.070477
745893,1.732035,0.253412,-0.674394,-0.230431,-0.337120,-0.0316,-0.1229,-0.256319,-0.194771,-0.880111,...,0.678658,-1.574762,-0.466318,0.080179,-0.075864,-0.306943,-0.001058,-0.331480,0.463978,-1.355770
745894,1.732039,0.253412,0.075645,-0.230431,-0.337120,-0.0316,-0.1229,-0.256319,-0.194771,-0.880111,...,1.217210,-0.566452,0.117870,0.069570,-0.073056,1.253322,-0.001058,-0.331480,0.453782,-1.355770
745895,1.732044,0.253412,-0.167075,-0.230431,-0.337120,-0.0316,-0.1229,-0.256319,-0.194771,-0.039188,...,1.037693,1.450167,1.286246,0.097939,-0.083910,1.433384,-0.001058,-0.331480,0.471625,-0.070477


## Preprocessing Categoricals

In [48]:
# Create the pipeline with OneHotEncoder
preprocessing_pipeline_categorical = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

# Fit and transform the data
df_cat_processed = preprocessing_pipeline_categorical.fit_transform(df_cat)

# Convert the transformed data back to a DataFrame
# df_cat_processed = pd.DataFrame(df_cat_processed)

# # Apply one-hot encoding using pandas get_dummies
# df_cat_processed = pd.get_dummies(df_cat)

# # Print the encoded DataFrame
# print(df_cat_processed)

In [49]:
df_cat_processed

<745897x1215739 sparse matrix of type '<class 'numpy.float64'>'
	with 20139219 stored elements in Compressed Sparse Row format>

# Put it all back together

In [50]:
import scipy.sparse as sp

# Assuming df_numerical is your preprocessed numerical DataFrame
# Assuming sparse_matrix is your sparse matrix obtained from one-hot encoding

# Convert the numerical DataFrame to a sparse matrix
df_num_processed = sp.csr_matrix(df_num_processed.values)

# Concatenate the sparse matrix and the sparse categorical matrix horizontally
combined_sparse = sp.hstack((df_num_processed, df_cat_processed), format='csr')


In [51]:
combined_sparse

<745897x1215760 sparse matrix of type '<class 'numpy.float64'>'
	with 35057158 stored elements in Compressed Sparse Row format>

In [52]:
sp.save_npz('data/combined_sparse.npz', combined_sparse)

# Pipeline for target