# Preprocessing Pipelines:

## Importing the data

In [15]:
import pandas as pd
import scipy.sparse as sp
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [16]:
# import df_clean
df = pd.read_csv('data/df_clean.csv')

In [17]:
# Extract the feature names
feature_names = df.columns.tolist()

# Create a dataframe to store the feature names
feature_names_df = pd.DataFrame({'Feature Name': feature_names})

# Export the feature names to a CSV file
feature_names_df.to_csv('data/feature_names.csv', index=False)

## Split into categorical and numerical

In [18]:
# split df into categorical and numerical
df_cat = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

## Preprocessing Numericals

In [19]:
preprocessing_pipeline_num = Pipeline([
    ('scaler', StandardScaler())  # Scale the features using standardization
])

In [20]:
# put df_num through preprocessing_pipeline_num
df_num_processed = preprocessing_pipeline_num.fit_transform(df_num)

In [21]:
df_num_processed

array([[-1.73203032, -0.68290943, -1.2677108 , ..., -0.36159775,
         0.39608588, -1.44844162],
       [-1.73198934, -1.55850896,  0.23261637, ..., -0.36159775,
         0.37183017, -0.17316667],
       [-1.73194835,  0.19269011, -0.50647145, ..., -0.36159775,
         0.37325698, -0.17316667],
       ...,
       [ 1.73194835,  0.19269011, -1.10350813, ..., -0.36159775,
         0.39037865, -0.17316667],
       [ 1.73198934,  0.19269011,  0.97064935, ..., -0.36159775,
         0.38039101, -0.17316667],
       [ 1.73203032,  0.19269011,  2.06099726, ..., -0.36159775,
         0.4017931 , -0.17316667]])

In [22]:
# turn df_num_processed back into a dataframe
df_num_processed = pd.DataFrame(df_num_processed, columns=df_num.columns)

In [23]:
df_num_processed

Unnamed: 0.1,Unnamed: 0,POSTED_SPEED_LIMIT,STREET_NO,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,...,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,CRASH_UNIT_ID,UNIT_NO,NUM_PASSENGERS,VEHICLE_YEAR,OCCUPANT_CNT
0,-1.732030,-0.682909,-1.267711,-0.255929,-0.375920,-0.033224,-0.133136,-0.285641,-0.21524,-0.981804,...,0.671875,0.433897,-1.054530,0.093614,-0.080803,-0.394137,0.640844,-0.361598,0.396086,-1.448442
1,-1.731989,-1.558509,0.232616,-0.255929,-0.375920,-0.033224,-0.133136,-0.285641,-0.21524,-0.158302,...,-0.419964,0.940205,1.278859,0.074801,-0.095349,-0.537751,0.640844,-0.361598,0.371830,-0.173167
2,-1.731948,0.192690,-0.506471,-1.859233,-0.375920,-0.033224,-0.133136,-0.285641,-0.21524,-0.981804,...,1.035822,-0.072410,0.987185,0.085192,-0.085288,-0.050456,-0.880270,-0.361598,0.373257,-0.173167
3,-1.731907,0.192690,-0.294801,-0.255929,-0.375920,-0.033224,-0.133136,-0.285641,-0.21524,-0.158302,...,-0.601938,0.433897,0.112164,0.089135,-0.089227,-1.303902,0.640844,-0.361598,-2.480355,-0.173167
4,-1.731866,0.192690,1.441809,-0.255929,-0.375920,-0.033224,-0.133136,-0.285641,-0.21524,-0.158302,...,0.307929,0.940205,1.278859,0.132192,-0.085360,-1.480484,-0.880270,-0.361598,0.396086,-0.173167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84524,1.731866,-3.309708,-0.066254,-0.255929,-0.375920,-0.033224,-0.133136,-0.285641,-0.21524,-0.981804,...,-1.147858,-0.578718,0.987185,0.079110,-0.089093,-0.022137,-0.880270,-0.361598,-2.480355,-0.173167
84525,1.731907,0.192690,1.217129,-0.255929,-0.375920,-0.033224,-0.133136,-0.285641,-0.21524,-0.158302,...,-0.056018,0.433897,-0.471183,0.060974,-0.084782,-0.863190,-0.880270,-0.361598,-2.480355,-0.173167
84526,1.731948,0.192690,-1.103508,-0.255929,-0.375920,-0.033224,-0.133136,-0.285641,-0.21524,-0.158302,...,1.399769,-1.591333,-0.179509,0.095870,-0.079911,-0.798614,-0.880270,-0.361598,0.390379,-0.173167
84527,1.731989,0.192690,0.970649,-0.255929,1.141783,-0.033224,-0.133136,1.732832,-0.21524,-0.158302,...,1.035822,0.940205,-0.471183,0.110986,-0.101152,-0.306746,-0.880270,-0.361598,0.380391,-0.173167


## Preprocessing Categoricals

In [24]:
# Create the pipeline with OneHotEncoder
preprocessing_pipeline_categorical = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

# Fit and transform the data
df_cat_processed = preprocessing_pipeline_categorical.fit_transform(df_cat)

# Convert the transformed data back to a DataFrame
# df_cat_processed = pd.DataFrame(df_cat_processed)

# # Apply one-hot encoding using pandas get_dummies
# df_cat_processed = pd.get_dummies(df_cat)

# # Print the encoded DataFrame
# print(df_cat_processed)

In [25]:
df_cat_processed

<84529x229127 sparse matrix of type '<class 'numpy.float64'>'
	with 1944167 stored elements in Compressed Sparse Row format>

# Put it all back together

In [26]:
import scipy.sparse as sp

# Assuming df_numerical is your preprocessed numerical DataFrame
# Assuming sparse_matrix is your sparse matrix obtained from one-hot encoding

# Convert the numerical DataFrame to a sparse matrix
df_num_processed = sp.csr_matrix(df_num_processed.values)

# Concatenate the sparse matrix and the sparse categorical matrix horizontally
combined_sparse = sp.hstack((df_num_processed, df_cat_processed), format='csr')


In [27]:
combined_sparse

<84529x229148 sparse matrix of type '<class 'numpy.float64'>'
	with 3634746 stored elements in Compressed Sparse Row format>

In [28]:
sp.save_npz('data/combined_sparse.npz', combined_sparse)

# Pipeline for target