# Preprocessing Pipelines:

## Importing the data

In [57]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [58]:
# import df_clean
df = pd.read_csv('data/df_clean.csv')

## Split into categorical and numerical

In [59]:
# split df into categorical and numerical
df_cat = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

## Preprocessing Numericals

In [60]:
preprocessing_pipeline_num = Pipeline([
    ('scaler', StandardScaler())  # Scale the features using standardization
])

In [61]:
# put df_num through preprocessing_pipeline_num
df_num_processed = preprocessing_pipeline_num.fit_transform(df_num)

In [62]:
df_num_processed

array([[-1.73204965,  0.25409834, -1.09426593, ...,  1.18327123,
         0.46773941,  1.19695084],
       [-1.73204732,  0.25409834, -1.09426593, ..., -0.32943142,
         0.45882345, -0.06903189],
       [-1.732045  ,  0.25409834,  0.39233957, ..., -0.32943142,
         0.44863378, -0.06903189],
       ...,
       [ 1.732045  , -1.38371563,  0.0578967 , ...,  1.18327123,
         0.45500232,  1.19695084],
       [ 1.73204732, -1.38371563,  0.0578967 , ..., -0.32943142,
         0.46009716, -0.06903189],
       [ 1.73204965, -1.38371563,  0.0578967 , ..., -0.32943142,
         0.44226524, -1.33501463]])

In [63]:
# turn df_num_processed back into a dataframe
df_num_processed = pd.DataFrame(df_num_processed, columns=df_num.columns)

In [64]:
df_num_processed

Unnamed: 0.1,Unnamed: 0,POSTED_SPEED_LIMIT,STREET_NO,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,...,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,CRASH_UNIT_ID,UNIT_NO,NUM_PASSENGERS,VEHICLE_YEAR,OCCUPANT_CNT
0,-1.732050,0.254098,-1.094266,-0.230366,1.348329,-0.032438,-0.12319,-0.25632,2.840255,-0.039009,...,0.140730,-1.070998,-1.051104,0.088024,-0.074436,-0.403396,-0.001001,1.183271,0.467739,1.196951
1,-1.732047,0.254098,-1.094266,-0.230366,1.348329,-0.032438,-0.12319,-0.25632,2.840255,-0.039009,...,0.140730,-1.070998,-1.051104,0.088024,-0.074436,-0.403394,-0.000678,-0.329431,0.458823,-0.069032
2,-1.732045,0.254098,0.392340,-0.230366,-0.337399,-0.032438,-0.12319,-0.25632,-0.194936,-0.039009,...,-0.936284,-0.062592,0.700958,0.113300,-0.088725,-0.707556,-0.001001,-0.329431,0.448634,-0.069032
3,-1.732043,0.254098,0.392340,-0.230366,-0.337399,-0.032438,-0.12319,-0.25632,-0.194936,-0.039009,...,-0.936284,-0.062592,0.700958,0.113300,-0.088725,-0.707554,-0.000678,-0.329431,0.463918,-0.069032
4,-1.732040,0.254098,-1.176142,-0.230366,-0.337399,-0.032438,-0.12319,-0.25632,-0.194936,-0.039009,...,-2.372302,0.945815,0.116937,0.088562,-0.085212,1.256434,-0.001001,-0.329431,-2.101331,-0.069032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1491789,1.732040,-1.383716,0.284444,1.468405,-0.337399,-0.032438,-0.12319,-0.25632,-0.194936,-0.876691,...,-0.397777,-0.062592,1.576989,0.106066,-0.088651,-0.532000,-0.000355,-0.329431,0.466466,-1.335015
1491790,1.732043,-1.383716,0.057897,-0.230366,-0.337399,-0.032438,-0.12319,-0.25632,-0.194936,-0.039009,...,0.858739,-1.575201,0.116937,0.108209,-0.087520,0.258508,-0.001001,-0.329431,0.443539,-0.069032
1491791,1.732045,-1.383716,0.057897,-0.230366,-0.337399,-0.032438,-0.12319,-0.25632,-0.194936,-0.039009,...,0.858739,-1.575201,0.116937,0.108209,-0.087520,0.263095,-0.001001,1.183271,0.455002,1.196951
1491792,1.732047,-1.383716,0.057897,-0.230366,-0.337399,-0.032438,-0.12319,-0.25632,-0.194936,-0.039009,...,0.858739,-1.575201,0.116937,0.108209,-0.087520,0.258511,-0.000678,-0.329431,0.460097,-0.069032


## Preprocessing Categoricals

In [65]:
# Create the pipeline with OneHotEncoder
preprocessing_pipeline_categorical = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

# Fit and transform the data
df_cat_processed = preprocessing_pipeline_categorical.fit_transform(df_cat)

# Convert the transformed data back to a DataFrame
# df_cat_processed = pd.DataFrame(df_cat_processed)

# # Apply one-hot encoding using pandas get_dummies
# df_cat_processed = pd.get_dummies(df_cat)

# # Print the encoded DataFrame
# print(df_cat_processed)

In [66]:
df_cat_processed

<1491794x1518665 sparse matrix of type '<class 'numpy.float64'>'
	with 40278438 stored elements in Compressed Sparse Row format>

# Put it all back together

In [67]:
import scipy.sparse as sp

# Assuming df_numerical is your preprocessed numerical DataFrame
# Assuming sparse_matrix is your sparse matrix obtained from one-hot encoding

# Convert the numerical DataFrame to a sparse matrix
df_num_processed = sp.csr_matrix(df_num_processed.values)

# Concatenate the sparse matrix and the sparse categorical matrix horizontally
combined_sparse = sp.hstack((df_num_processed, df_cat_processed), format='csr')


In [71]:
combined_sparse

<1491794x1518686 sparse matrix of type '<class 'numpy.float64'>'
	with 70114318 stored elements in Compressed Sparse Row format>

In [68]:
# combine df_cat_processed and df_num_processed
# df_processed = pd.concat([df_num_processed, df_cat_processed], axis=1)

In [69]:
# df_processed.head().T

In [70]:
# export df_processed as csv
# df_processed.to_csv('data/df_processed.csv', index=False)