# Preprocessing Pipelines:

## Importing the data

In [40]:
import pandas as pd
import scipy.sparse as sp
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [41]:
# import df_clean
df = pd.read_csv('data/df_clean.csv')

## Split into categorical and numerical

In [42]:
# split df into categorical and numerical
df_cat = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

## Preprocessing Numericals

In [43]:
preprocessing_pipeline_num = Pipeline([
    ('scaler', StandardScaler())  # Scale the features using standardization
])

In [44]:
# put df_num through preprocessing_pipeline_num
df_num_processed = preprocessing_pipeline_num.fit_transform(df_num)

In [45]:
df_num_processed

array([[-1.73186267,  1.05282525, -0.13463675, ..., -0.38402085,
         0.41761049, -0.17303482],
       [-1.73148639,  0.21543331, -0.01245143, ..., -0.38402085,
        -2.31268499, -0.17303482],
       [-1.7311101 , -0.62195863,  1.30510182, ..., -0.38402085,
         0.40810672, -1.51730368],
       ...,
       [ 1.7311101 ,  0.21543331, -0.68341127, ..., -0.38402085,
         0.42575657, -0.17303482],
       [ 1.73148639,  0.21543331,  2.91858366, ..., -0.38402085,
         0.41217976, -0.17303482],
       [ 1.73186267,  0.21543331, -1.2131164 , ..., -0.38402085,
         0.42847193, -1.51730368]])

In [46]:
# turn df_num_processed back into a dataframe
df_num_processed = pd.DataFrame(df_num_processed, columns=df_num.columns)

In [47]:
df_num_processed

Unnamed: 0.1,Unnamed: 0,POSTED_SPEED_LIMIT,STREET_NO,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,...,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,CRASH_UNIT_ID,UNIT_NO,NUM_PASSENGERS,VEHICLE_YEAR,OCCUPANT_CNT
0,-1.731863,1.052825,-0.134637,-0.236568,-0.379233,-0.039026,-0.12709,-0.30044,-0.207306,-0.128499,...,-1.313035,0.951055,1.284959,0.030066,-0.061942,0.988888,-0.889789,-0.384021,0.417610,-0.173035
1,-1.731486,0.215433,-0.012451,-0.236568,-0.379233,-0.039026,-0.12709,-0.30044,-0.207306,-0.128499,...,0.508706,0.444611,0.402107,0.107432,-0.097597,0.869747,-0.889789,-0.384021,-2.312685,-0.173035
2,-1.731110,-0.621959,1.305102,3.070707,1.220043,-0.039026,-0.12709,-0.30044,2.706370,-0.959816,...,1.783925,-0.568279,1.579243,0.052534,-0.076227,-0.498721,3.761771,-0.384021,0.408107,-1.517304
3,-1.730734,0.215433,-0.667520,-0.236568,2.819319,-0.039026,-0.12709,-0.30044,5.620046,-1.791132,...,0.144358,-1.074723,0.402107,0.116466,-0.080263,-0.110220,0.660731,-0.384021,0.416253,-0.173035
4,-1.730358,0.215433,-1.202522,-0.236568,-0.379233,-0.039026,-0.12709,-0.30044,-0.207306,3.196767,...,0.690880,0.444611,-1.363596,0.089550,-0.091822,0.165127,0.660731,5.804681,0.414895,5.204041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9201,1.730358,-0.621959,-0.171010,-0.236568,-0.379233,-0.039026,-0.12709,-0.30044,-0.207306,-0.959816,...,-0.766513,-1.074723,1.579243,0.086067,-0.084749,1.550928,-0.889789,-0.384021,-2.312685,-1.517304
9202,1.730734,0.215433,-0.365235,-0.236568,-0.379233,-0.039026,-0.12709,-0.30044,-0.207306,-0.128499,...,0.690880,0.444611,0.107823,0.042862,-0.064242,-1.275107,0.660731,-0.384021,0.420326,-0.173035
9203,1.731110,0.215433,-0.683411,-0.236568,-0.379233,-0.039026,-0.12709,-0.30044,-0.207306,-0.128499,...,1.783925,-0.061834,1.284959,0.097729,-0.074512,0.989253,0.660731,-0.384021,0.425757,-0.173035
9204,1.731486,0.215433,2.918584,-0.236568,-0.379233,-0.039026,-0.12709,-0.30044,-0.207306,-0.128499,...,0.326532,-0.568279,-1.657880,0.028009,-0.073030,1.057049,-0.889789,-0.384021,0.412180,-0.173035


## Preprocessing Categoricals

In [48]:
# Create the pipeline with OneHotEncoder
preprocessing_pipeline_categorical = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

# Fit and transform the data
df_cat_processed = preprocessing_pipeline_categorical.fit_transform(df_cat)

# Convert the transformed data back to a DataFrame
# df_cat_processed = pd.DataFrame(df_cat_processed)

# # Apply one-hot encoding using pandas get_dummies
# df_cat_processed = pd.get_dummies(df_cat)

# # Print the encoded DataFrame
# print(df_cat_processed)

In [49]:
df_cat_processed

<9206x28397 sparse matrix of type '<class 'numpy.float64'>'
	with 248562 stored elements in Compressed Sparse Row format>

# Put it all back together

In [50]:
import scipy.sparse as sp

# Assuming df_numerical is your preprocessed numerical DataFrame
# Assuming sparse_matrix is your sparse matrix obtained from one-hot encoding

# Convert the numerical DataFrame to a sparse matrix
df_num_processed = sp.csr_matrix(df_num_processed.values)

# Concatenate the sparse matrix and the sparse categorical matrix horizontally
combined_sparse = sp.hstack((df_num_processed, df_cat_processed), format='csr')


In [51]:
combined_sparse

<9206x28418 sparse matrix of type '<class 'numpy.float64'>'
	with 432682 stored elements in Compressed Sparse Row format>

In [52]:
sp.save_npz('data/combined_sparse.npz', combined_sparse)

# Pipeline for target