# Preprocessing Pipelines:

## Importing the data

In [70]:
import pandas as pd
import scipy.sparse as sp
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [71]:
# import df_clean
df = pd.read_csv('data/df_clean.csv')

In [72]:
# Extract the feature names
feature_names = df.columns.tolist()

# Create a dataframe to store the feature names
feature_names_df = pd.DataFrame({'Feature Name': feature_names})

# Export the feature names to a CSV file
feature_names_df.to_csv('data/feature_names.csv', index=False)

## Split into categorical and numerical

In [73]:
# split df into categorical and numerical
df_cat = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

## Preprocessing Numericals

In [74]:
preprocessing_pipeline_num = Pipeline([
    ('scaler', StandardScaler())  # Scale the features using standardization
])

In [75]:
# put df_num through preprocessing_pipeline_num
df_num_processed = preprocessing_pipeline_num.fit_transform(df_num)

In [76]:
df_num_processed

array([[-1.73204261, -0.67728721, -1.26319354, ..., -0.35784396,
         0.3984094 , -1.4346986 ],
       [-1.73202622, -1.5516648 ,  0.23283039, ..., -0.35784396,
         0.37427325, -0.17222214],
       [-1.73200983,  0.19709038, -0.50413757, ..., -0.35784396,
         0.37569302, -0.17222214],
       ...,
       [ 1.73200983,  0.19709038, -1.25968751, ...,  1.06112321,
         0.38421166,  1.09025432],
       [ 1.73202622,  0.19709038, -0.95641573, ...,  1.06112321,
         0.40408849, -0.17222214],
       [ 1.73204261,  0.19709038,  0.76013763, ..., -0.35784396,
         0.39556985, -1.4346986 ]])

In [77]:
# turn df_num_processed back into a dataframe
df_num_processed = pd.DataFrame(df_num_processed, columns=df_num.columns)

In [78]:
df_num_processed

Unnamed: 0.1,Unnamed: 0,POSTED_SPEED_LIMIT,STREET_NO,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,...,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,CRASH_UNIT_ID,UNIT_NO,NUM_PASSENGERS,VEHICLE_YEAR,OCCUPANT_CNT
0,-1.732043,-0.677287,-1.263194,-0.255489,-0.375647,-0.033062,-0.13469,-0.285158,-0.216465,-0.970574,...,0.670234,0.435013,-1.054037,0.092215,-0.079054,-0.396359,-0.002124,-0.357844,0.398409,-1.434699
1,-1.732026,-1.551665,0.232830,-0.255489,-0.375647,-0.033062,-0.13469,-0.285158,-0.216465,-0.156065,...,-0.422759,0.942270,1.275244,0.073038,-0.093881,-0.540146,-0.002124,-0.357844,0.374273,-0.172222
2,-1.732010,0.197090,-0.504138,-1.880307,-0.375647,-0.033062,-0.13469,-0.285158,-0.216465,-0.970574,...,1.034565,-0.072244,0.984084,0.083630,-0.083626,-0.052266,-0.002245,-0.357844,0.375693,-0.172222
3,-1.731993,0.197090,-0.293074,-0.255489,-0.375647,-0.033062,-0.13469,-0.285158,-0.216465,-0.156065,...,-0.604924,0.435013,0.110603,0.087650,-0.087641,-1.307216,-0.002124,-0.357844,-2.463854,-0.172222
4,-1.731977,0.197090,1.438555,-0.255489,-0.375647,-0.033062,-0.13469,-0.285158,-0.216465,-0.156065,...,0.305903,0.942270,1.275244,0.131539,-0.083699,-1.484010,-0.002245,-0.357844,0.398409,-0.172222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211319,1.731977,0.197090,-1.011811,-0.255489,-0.375647,-0.033062,-0.13469,-0.285158,-0.216465,-0.156065,...,-0.240593,-1.594015,-0.180557,0.098126,-0.081188,1.245487,-0.002124,-0.357844,0.385631,-0.172222
211320,1.731993,0.197090,-0.207177,-0.255489,-0.375647,-0.033062,-0.13469,-0.285158,-0.216465,-0.970574,...,1.216731,1.449527,0.984084,0.107559,-0.086220,-0.047047,-0.002124,-0.357844,0.391311,-1.434699
211321,1.732010,0.197090,-1.259688,-0.255489,-0.375647,-0.033062,-0.13469,-0.285158,-0.216465,1.472954,...,-2.426579,0.435013,1.275244,0.104416,-0.078901,-0.541439,-0.002245,1.061123,0.384212,1.090254
211322,1.732026,0.197090,-0.956416,-0.255489,-0.375647,-0.033062,-0.13469,-0.285158,-0.216465,-0.156065,...,0.852400,0.435013,0.692924,0.091944,-0.081462,0.380758,-0.002124,1.061123,0.404088,-0.172222


## Preprocessing Categoricals

In [79]:
# Create the pipeline with OneHotEncoder
preprocessing_pipeline_categorical = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

# Fit and transform the data
df_cat_processed = preprocessing_pipeline_categorical.fit_transform(df_cat)

# Convert the transformed data back to a DataFrame
# df_cat_processed = pd.DataFrame(df_cat_processed)

# # Apply one-hot encoding using pandas get_dummies
# df_cat_processed = pd.get_dummies(df_cat)

# # Print the encoded DataFrame
# print(df_cat_processed)

In [80]:
df_cat_processed

<211324x490965 sparse matrix of type '<class 'numpy.float64'>'
	with 5705748 stored elements in Compressed Sparse Row format>

# Put it all back together

In [81]:
import scipy.sparse as sp

# Assuming df_numerical is your preprocessed numerical DataFrame
# Assuming sparse_matrix is your sparse matrix obtained from one-hot encoding

# Convert the numerical DataFrame to a sparse matrix
df_num_processed = sp.csr_matrix(df_num_processed.values)

# Concatenate the sparse matrix and the sparse categorical matrix horizontally
combined_sparse = sp.hstack((df_num_processed, df_cat_processed), format='csr')


In [82]:
combined_sparse

<211324x490986 sparse matrix of type '<class 'numpy.float64'>'
	with 9932228 stored elements in Compressed Sparse Row format>

In [83]:
sp.save_npz('data/combined_sparse.npz', combined_sparse)

# Pipeline for target