# Preprocessing Pipelines:

## Importing the data

In [205]:
import pandas as pd
import scipy.sparse as sp
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [206]:
# import df_clean
df = pd.read_csv('data/df_clean.csv')

In [207]:
df.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
POSTED_SPEED_LIMIT,25,20,30,30,30
NUM_UNITS,2,2,1,2,2
INJURIES_TOTAL,0.0,0.0,0.0,0.0,0.0
INJURIES_FATAL,0.0,0.0,0.0,0.0,0.0
INJURIES_INCAPACITATING,0.0,0.0,0.0,0.0,0.0
INJURIES_NON_INCAPACITATING,0.0,0.0,0.0,0.0,0.0
INJURIES_REPORTED_NOT_EVIDENT,0.0,0.0,0.0,0.0,0.0
CRASH_HOUR,17,11,19,10,15
CRASH_DAY_OF_WEEK,5,6,4,5,6


In [208]:
# # Extract the feature names
# feature_names = df.columns.tolist()

# # Create a dataframe to store the feature names
# feature_names_df = pd.DataFrame({'Feature Name': feature_names})

# # Export the feature names to a CSV file
# feature_names_df.to_csv('data/feature_names.csv', index=False)

## Split into categorical and numerical

In [209]:
# split df into categorical and numerical
df_cat = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

## Preprocessing Numericals

In [210]:
preprocessing_pipeline_num = Pipeline([
    ('scaler', StandardScaler())  # Scale the features using standardization
])

In [211]:
# put df_num through preprocessing_pipeline_num
df_num_processed = preprocessing_pipeline_num.fit_transform(df_num)

In [212]:
df_num_processed

array([[-1.73184592, -0.70409592, -0.26305371, ..., -0.34957702,
         0.38803084, -1.37668793],
       [-1.73143611, -1.59705079, -0.26305371, ..., -0.34957702,
         0.36391914, -0.17251336],
       [-1.7310263 ,  0.18885895, -1.96695641, ..., -0.34957702,
         0.36533748, -0.17251336],
       ...,
       [ 1.7310263 ,  0.18885895, -0.26305371, ...,  0.99359324,
         0.39228585,  1.03166122],
       [ 1.73143611,  0.18885895, -0.26305371, ..., -0.34957702,
         0.36533748, -1.37668793],
       [ 1.73184592,  0.18885895, -0.26305371, ..., -0.34957702,
         0.38377584, -0.17251336]])

In [213]:
# turn df_num_processed back into a dataframe
df_num_processed = pd.DataFrame(df_num_processed, columns=df_num.columns)

In [214]:
df_num_processed.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8443,8444,8445,8446,8447,8448,8449,8450,8451,8452
Unnamed: 0,-1.731846,-1.731436,-1.731026,-1.730616,-1.730207,-1.729797,-1.729387,-1.728977,-1.728567,-1.728158,...,1.728158,1.728567,1.728977,1.729387,1.729797,1.730207,1.730616,1.731026,1.731436,1.731846
POSTED_SPEED_LIMIT,-0.704096,-1.597051,0.188859,0.188859,0.188859,0.188859,0.188859,0.188859,0.188859,1.081814,...,0.188859,0.188859,0.188859,0.188859,0.188859,0.188859,0.188859,0.188859,0.188859,0.188859
NUM_UNITS,-0.263054,-0.263054,-1.966956,-0.263054,-0.263054,-0.263054,-0.263054,-0.263054,-0.263054,-0.263054,...,-0.263054,-0.263054,-0.263054,1.440849,-0.263054,-0.263054,-0.263054,-0.263054,-0.263054,-0.263054
INJURIES_TOTAL,-0.370209,-0.370209,-0.370209,-0.370209,-0.370209,-0.370209,-0.370209,-0.370209,-0.370209,-0.370209,...,-0.370209,1.14817,-0.370209,-0.370209,-0.370209,-0.370209,1.14817,-0.370209,-0.370209,-0.370209
INJURIES_FATAL,-0.04073,-0.04073,-0.04073,-0.04073,-0.04073,-0.04073,-0.04073,-0.04073,-0.04073,-0.04073,...,-0.04073,-0.04073,-0.04073,-0.04073,-0.04073,-0.04073,-0.04073,-0.04073,-0.04073,-0.04073
INJURIES_INCAPACITATING,-0.130038,-0.130038,-0.130038,-0.130038,-0.130038,-0.130038,-0.130038,-0.130038,-0.130038,-0.130038,...,-0.130038,-0.130038,-0.130038,-0.130038,-0.130038,-0.130038,-0.130038,-0.130038,-0.130038,-0.130038
INJURIES_NON_INCAPACITATING,-0.284624,-0.284624,-0.284624,-0.284624,-0.284624,-0.284624,-0.284624,-0.284624,-0.284624,-0.284624,...,-0.284624,-0.284624,-0.284624,-0.284624,-0.284624,-0.284624,1.718644,-0.284624,-0.284624,-0.284624
INJURIES_REPORTED_NOT_EVIDENT,-0.208845,-0.208845,-0.208845,-0.208845,-0.208845,-0.208845,-0.208845,-0.208845,-0.208845,-0.208845,...,-0.208845,2.494629,-0.208845,-0.208845,-0.208845,-0.208845,-0.208845,-0.208845,-0.208845,-0.208845
CRASH_HOUR,0.677569,-0.412793,1.041023,-0.59452,0.314115,-0.412793,0.859296,-0.049339,-1.139701,-0.049339,...,-0.957974,1.767931,0.495842,-0.957974,1.404477,-0.412793,-1.139701,0.859296,0.314115,-2.048336
CRASH_DAY_OF_WEEK,0.449562,0.956857,-0.057733,0.449562,0.956857,1.464153,-0.565028,-1.579619,0.449562,-0.057733,...,0.449562,-1.072324,0.956857,0.449562,-1.072324,-0.565028,-0.057733,0.449562,0.956857,-0.565028


## Preprocessing Categoricals

In [215]:
# Create the pipeline with OneHotEncoder
preprocessing_pipeline_categorical = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

# Fit and transform the data
df_cat_processed = preprocessing_pipeline_categorical.fit_transform(df_cat)

# Convert the transformed data back to a DataFrame
# df_cat_processed = pd.DataFrame(df_cat_processed)

# # Apply one-hot encoding using pandas get_dummies
# df_cat_processed = pd.get_dummies(df_cat)

# # Print the encoded DataFrame
# print(df_cat_processed)

In [216]:
df_cat_processed

<8453x97 sparse matrix of type '<class 'numpy.float64'>'
	with 76077 stored elements in Compressed Sparse Row format>

In [217]:
# Extract feature names
encoder = preprocessing_pipeline_categorical.named_steps['encoder']
categories = encoder.categories_
feature_names = [f'{col}={value}' for col, values in zip(df_cat.columns, categories) for value in values]

# Print the feature names
print(feature_names)

['MANEUVER=AVOIDING VEHICLES/OBJECTS', 'MANEUVER=BACKING', 'MANEUVER=CHANGING LANES', 'MANEUVER=DIVERGING', 'MANEUVER=DRIVING WRONG WAY', 'MANEUVER=ENTER FROM DRIVE/ALLEY', 'MANEUVER=ENTERING TRAFFIC LANE FROM PARKING', 'MANEUVER=LEAVING TRAFFIC LANE TO PARK', 'MANEUVER=MERGING', 'MANEUVER=NEGOTIATING A CURVE', 'MANEUVER=OTHER', 'MANEUVER=PARKED', 'MANEUVER=PARKED IN TRAFFIC LANE', 'MANEUVER=PASSING/OVERTAKING', 'MANEUVER=SKIDDING/CONTROL LOSS', 'MANEUVER=SLOW/STOP - LEFT TURN', 'MANEUVER=SLOW/STOP - LOAD/UNLOAD', 'MANEUVER=SLOW/STOP - RIGHT TURN', 'MANEUVER=SLOW/STOP IN TRAFFIC', 'MANEUVER=STARTING IN TRAFFIC', 'MANEUVER=STRAIGHT AHEAD', 'MANEUVER=TURNING LEFT', 'MANEUVER=TURNING ON RED', 'MANEUVER=TURNING RIGHT', 'MANEUVER=U-TURN', 'MANEUVER=UNKNOWN/NA', 'MOST_SEVERE_INJURY=FATAL', 'MOST_SEVERE_INJURY=INCAPACITATING INJURY', 'MOST_SEVERE_INJURY=NO INDICATION OF INJURY', 'MOST_SEVERE_INJURY=NONINCAPACITATING INJURY', 'MOST_SEVERE_INJURY=REPORTED, NOT EVIDENT', 'MOST_SEVERE_INJURY=UNKN

In [218]:
# Create a dataframe to store the feature names
feature_names_df = pd.DataFrame({'Feature Name': feature_names})

# Export the feature names to a CSV file
feature_names_df.to_csv('data/feature_names.csv', index=False)

In [219]:
len(feature_names)

97

# Put it all back together

In [220]:
import scipy.sparse as sp

# Assuming df_numerical is your preprocessed numerical DataFrame
# Assuming sparse_matrix is your sparse matrix obtained from one-hot encoding

# Convert the numerical DataFrame to a sparse matrix
df_num_processed = sp.csr_matrix(df_num_processed.values)

# Concatenate the sparse matrix and the sparse categorical matrix horizontally
combined_sparse = sp.hstack((df_num_processed, df_cat_processed), format='csr')


In [221]:
combined_sparse

<8453x111 sparse matrix of type '<class 'numpy.float64'>'
	with 194418 stored elements in Compressed Sparse Row format>

In [222]:
# Give me the shape of the combined_sparse matrix
combined_sparse.shape

(8453, 111)

In [223]:
sp.save_npz('data/combined_sparse.npz', combined_sparse)

# Pipeline for target