# Preprocessing Pipelines:

## Importing the data

In [1]:
import pandas as pd
import scipy.sparse as sp
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# import df_clean
df = pd.read_csv('data/df_clean.csv')

## Split into categorical and numerical

In [3]:
# split df into categorical and numerical
df_cat = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

## Preprocessing Numericals

In [4]:
preprocessing_pipeline_num = Pipeline([
    ('scaler', StandardScaler())  # Scale the features using standardization
])

In [5]:
# put df_num through preprocessing_pipeline_num
df_num_processed = preprocessing_pipeline_num.fit_transform(df_num)

In [6]:
df_num_processed

array([[-1.73200983, -0.6844618 , -1.27359555, ..., -0.3584101 ,
         0.39644782, -1.4304843 ],
       [-1.73192787, -1.55929935,  0.23276214, ..., -0.3584101 ,
         0.37220737, -0.17300886],
       [-1.7318459 ,  0.19037574, -0.50929642, ..., -0.3584101 ,
         0.37363328, -0.17300886],
       ...,
       [ 1.7318459 ,  0.19037574, -0.43622024, ...,  1.05480036,
         0.38931828,  1.08446657],
       [ 1.73192787, -0.6844618 , -0.60390712, ..., -0.3584101 ,
         0.37505919, -0.17300886],
       [ 1.73200983,  0.19037574, -0.19051484, ..., -0.3584101 ,
         0.393596  , -0.17300886]])

In [7]:
# turn df_num_processed back into a dataframe
df_num_processed = pd.DataFrame(df_num_processed, columns=df_num.columns)

In [8]:
df_num_processed

Unnamed: 0.1,Unnamed: 0,POSTED_SPEED_LIMIT,STREET_NO,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,...,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,CRASH_UNIT_ID,UNIT_NO,NUM_PASSENGERS,VEHICLE_YEAR,OCCUPANT_CNT
0,-1.732010,-0.684462,-1.273596,-0.259737,-0.374394,-0.034904,-0.134348,-0.284785,-0.214535,-0.966772,...,0.672635,0.438595,-1.050659,0.094217,-0.081524,-0.396888,0.639990,-0.35841,0.396448,-1.430484
1,-1.731928,-1.559299,0.232762,-0.259737,-0.374394,-0.034904,-0.134348,-0.284785,-0.214535,-0.161510,...,-0.420330,0.944357,1.281226,0.075550,-0.095957,-0.540806,0.639990,-0.35841,0.372207,-0.173009
2,-1.731846,0.190376,-0.509296,-1.876969,-0.374394,-0.034904,-0.134348,-0.284785,-0.214535,-0.966772,...,1.036957,-0.067168,0.989741,0.085861,-0.085974,-0.052480,-0.878688,-0.35841,0.373633,-0.173009
3,-1.731764,0.190376,-0.296775,-0.259737,-0.374394,-0.034904,-0.134348,-0.284785,-0.214535,-0.161510,...,-0.602490,0.438595,0.115284,0.089774,-0.089883,-1.308577,0.639990,-0.35841,-2.478184,-0.173009
4,-1.731682,0.190376,1.446815,-0.259737,-0.374394,-0.034904,-0.134348,-0.284785,-0.214535,-0.161510,...,0.308314,0.944357,1.281226,0.132498,-0.086045,-1.485532,-0.878688,-0.35841,0.396448,-0.173009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42260,1.731682,0.190376,0.729468,-1.876969,-0.374394,-0.034904,-0.134348,-0.284785,-0.214535,-0.966772,...,-0.238169,-0.067168,-1.342144,0.122528,-0.085624,0.564374,-0.878688,-0.35841,0.380763,-0.173009
42261,1.731764,0.190376,0.656392,-0.259737,-0.374394,-0.034904,-0.134348,-0.284785,-0.214535,-0.161510,...,0.308314,1.450120,-0.759173,-11.409902,11.412713,0.632665,-0.878688,-0.35841,-2.478184,-0.173009
42262,1.731846,0.190376,-0.436220,-0.259737,-0.374394,-0.034904,-0.134348,-0.284785,-0.214535,0.643752,...,1.583439,1.450120,-0.176202,0.089741,-0.088607,-0.827783,-0.878688,1.05480,0.389318,1.084467
42263,1.731928,-0.684462,-0.603907,-0.259737,-0.374394,-0.034904,-0.134348,-0.284785,-0.214535,-0.161510,...,-0.602490,1.450120,1.281226,0.086833,-0.083214,-1.136097,0.639990,-0.35841,0.375059,-0.173009


## Preprocessing Categoricals

In [9]:
# Create the pipeline with OneHotEncoder
preprocessing_pipeline_categorical = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

# Fit and transform the data
df_cat_processed = preprocessing_pipeline_categorical.fit_transform(df_cat)

# Convert the transformed data back to a DataFrame
# df_cat_processed = pd.DataFrame(df_cat_processed)

# # Apply one-hot encoding using pandas get_dummies
# df_cat_processed = pd.get_dummies(df_cat)

# # Print the encoded DataFrame
# print(df_cat_processed)

In [10]:
df_cat_processed

<42265x121632 sparse matrix of type '<class 'numpy.float64'>'
	with 1141155 stored elements in Compressed Sparse Row format>

# Put it all back together

In [11]:
import scipy.sparse as sp

# Assuming df_numerical is your preprocessed numerical DataFrame
# Assuming sparse_matrix is your sparse matrix obtained from one-hot encoding

# Convert the numerical DataFrame to a sparse matrix
df_num_processed = sp.csr_matrix(df_num_processed.values)

# Concatenate the sparse matrix and the sparse categorical matrix horizontally
combined_sparse = sp.hstack((df_num_processed, df_cat_processed), format='csr')


In [12]:
combined_sparse

<42265x121653 sparse matrix of type '<class 'numpy.float64'>'
	with 1986454 stored elements in Compressed Sparse Row format>

In [13]:
sp.save_npz('data/combined_sparse.npz', combined_sparse)

# Pipeline for target