In [76]:
# Standard libraries
import numpy as np
import pandas as pd
import warnings
import zipfile, io

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick

# Statistical libraries
import scipy.stats as ss
from scipy import stats
from scipy.stats import f_oneway
from scipy.sparse import csr_matrix

# Scikit-learn preprocessing and model selection
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Scikit-learn feature selection
from sklearn.feature_selection import f_classif, SelectKBest, mutual_info_classif, RFE, RFECV

# Scikit-learn models
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Scikit-learn metrics
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, classification_report

# Pickle for import and export of datasets
import pickle

# Seaborn settings: apply seaborn's default plot theme to all matplotlib figures
sns.set()

# Ignore warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including scikit-learn / pandas warnings that can point at real data issues.
# Consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

# Enable inline plotting for Jupyter notebooks
%matplotlib inline

# 3. Data Scaling

* Due to the different types of information that Categorical features and Numerical features provide, each requires unique data scaling methods. Therefore, before normalizing features, we must split them into numeric and categorical sets.
* Regarding the numerical variables, the objective is to place them on the same scale, from 0 to 1, where 1 corresponds to the maximum value of the feature and 0 to its minimum value. One reason for normalizing / scaling the data is to prevent the model from giving more importance to variables with larger absolute values, since we do not want to make any assumptions about the level of importance of each feature. Note that the scaler is fitted using only information obtained from the training data; the minimum and maximum values learned from the training dataset for each variable are then used to scale the validation dataset.
* Concerning categorical features, these are encoded to be readable by the models, as models do not typically process non-numeric data directly. To do so, we use the One-Hot Encoder method, which replaces each categorical variable with one binary indicator column per category (the first category of each feature is dropped as the reference level).


In [77]:
# Load the feature-engineering CSVs for every split (train, validation, test).
# The first column of each file is used as the row index.
X_train_DS, X_val_DS, y_train_DS, y_val_DS, X_test_DS = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'datasets/feature_engineering_train_delivery1.csv',
        'datasets/feature_engineering_val_delivery1.csv',
        'datasets/feature_engineering_y_train_delivery1.csv',
        'datasets/feature_engineering_y_val_delivery1.csv',
        'datasets/feature_engineering_test_delivery1.csv',
    )
)

In [78]:
# Sanity check: features and target of each split should have the same number of rows
X_train_DS.shape, y_train_DS.shape, X_val_DS.shape,  y_val_DS.shape

((418486, 32), (418486, 1), (101383, 32), (101383, 1))

In [79]:
# List the available feature columns before grouping them by type
X_train_DS.columns

Index(['Age at Injury', 'Alternative Dispute Resolution',
       'Attorney/Representative', 'Average Weekly Wage', 'Birth Year',
       'C-2 Date', 'C-3 Date', 'Carrier Name', 'Carrier Type',
       'County of Injury', 'COVID-19 Indicator', 'District Name', 'Gender',
       'Industry Code', 'Industry Code Description', 'Medical Fee Region',
       'WCIO Cause of Injury Code', 'WCIO Cause of Injury Description',
       'WCIO Nature of Injury Code', 'WCIO Nature of Injury Description',
       'WCIO Part Of Body Code', 'WCIO Part Of Body Description', 'Zip Code',
       'Number of Dependents', 'Accident Year', 'Accident Month',
       'Accident on Weekday', 'Assembly Year', 'Assembly Month', 'Age Group',
       'Frequent Injury Cause', 'Broad Body Part'],
      dtype='object')

In [80]:
# Group the columns by the preprocessing they need.

# Continuous / count-like columns: rescaled to the [0, 1] range
numeric_features = [
    'Age at Injury',
    'Average Weekly Wage',
    'Birth Year',
    'Number of Dependents',
    'Accident Year',
    'Accident Month',
    'Assembly Year',
    'Assembly Month',
]

# Multi-level categorical columns: encoded before modelling
categorical_features = [
    'Carrier Name',
    'Carrier Type',
    'County of Injury',
    'District Name',
    'Industry Code',
    'Industry Code Description',
    'Medical Fee Region',
    'WCIO Cause of Injury Code',
    'WCIO Cause of Injury Description',
    'WCIO Nature of Injury Code',
    'WCIO Nature of Injury Description',
    'Age Group',
    'WCIO Part Of Body Code',
    'WCIO Part Of Body Description',
    'Zip Code',
    'Broad Body Part',
]

# Binary columns: passed through unchanged
binary_features = [
    'Alternative Dispute Resolution',
    'Attorney/Representative',
    'C-2 Date',
    'C-3 Date',
    'COVID-19 Indicator',
    'Gender',
    'Accident on Weekday',
    'Frequent Injury Cause',
]

In [81]:
# Slice each split into its numeric, categorical and binary column groups so
# that every group can receive its own preprocessing.
X_train_numeric, X_train_categorical, X_train_binary = (
    X_train_DS[group]
    for group in (numeric_features, categorical_features, binary_features)
)

X_val_numeric, X_val_categorical, X_val_binary = (
    X_val_DS[group]
    for group in (numeric_features, categorical_features, binary_features)
)

## 3.1 Data Scaling for Numerical

In [82]:
# Learn each numeric feature's minimum and maximum from the training split only
scaler = MinMaxScaler().fit(X_train_numeric)

# Rescale the training split with those statistics (the result is a NumPy array)
X_train_numeric_scaled = scaler.transform(X_train_numeric)

# Check results
X_train_numeric_scaled

array([[0.36538462, 0.53426069, 0.67346939, ..., 0.63636364, 1.        ,
        0.81818182],
       [0.80769231, 0.        , 0.16326531, ..., 0.27272727, 0.        ,
        0.63636364],
       [0.55769231, 0.29581144, 0.48979592, ..., 0.63636364, 1.        ,
        0.63636364],
       ...,
       [1.        , 0.        , 0.        , ..., 0.90909091, 0.5       ,
        0.90909091],
       [0.78846154, 0.25283443, 0.16326531, ..., 0.72727273, 0.        ,
        0.72727273],
       [0.38461538, 0.81328697, 0.63265306, ..., 0.63636364, 0.        ,
        0.72727273]])

In [83]:
# Wrap the scaled array in a DataFrame, restoring the original column names and row index
X_train_numeric_scaled = pd.DataFrame(
    data=X_train_numeric_scaled,
    index=X_train_numeric.index,
    columns=X_train_numeric.columns,
)
X_train_numeric_scaled.head()

Unnamed: 0,Age at Injury,Average Weekly Wage,Birth Year,Number of Dependents,Accident Year,Accident Month,Assembly Year,Assembly Month
557080,0.365385,0.534261,0.673469,0.5,1.0,0.636364,1.0,0.818182
112493,0.807692,0.0,0.163265,0.333333,0.953488,0.272727,0.0,0.636364
524424,0.557692,0.295811,0.489796,0.333333,1.0,0.636364,1.0,0.636364
266382,0.846154,0.0,0.163265,1.0,0.976744,0.454545,0.5,0.454545
568843,0.403846,0.0,0.653061,1.0,1.0,0.909091,1.0,0.909091


In [84]:
# Rescale the validation split with the scaler fitted on the training split,
# then rebuild a DataFrame carrying the validation column names and row index
X_val_numeric_scaled = pd.DataFrame(
    data=scaler.transform(X_val_numeric),
    index=X_val_numeric.index,
    columns=X_val_numeric.columns,
)

## 3.2 Encoding Categorical Features

In [85]:
# Categorical columns that are candidates for encoding
X_train_categorical.columns

Index(['Carrier Name', 'Carrier Type', 'County of Injury', 'District Name',
       'Industry Code', 'Industry Code Description', 'Medical Fee Region',
       'WCIO Cause of Injury Code', 'WCIO Cause of Injury Description',
       'WCIO Nature of Injury Code', 'WCIO Nature of Injury Description',
       'Age Group', 'WCIO Part Of Body Code', 'WCIO Part Of Body Description',
       'Zip Code', 'Broad Body Part'],
      dtype='object')

In [86]:
# Preview the first rows of the raw categorical values
X_train_categorical.head()

Unnamed: 0,Carrier Name,Carrier Type,County of Injury,District Name,Industry Code,Industry Code Description,Medical Fee Region,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,Age Group,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Broad Body Part
557080,"POLICE, FIRE, SANITATION",3A. SELF PUBLIC,KINGS,NYC,92.0,PUBLIC ADMINISTRATION,IV,81.0,"STRUCK OR INJURED, NOC",10.0,CONTUSION,Adult,35.0,HAND,11203,Other
112493,MEMORIAL SLOAN KETTERING,4A. SELF PRIVATE,ROCKLAND,NYC,62.0,HEALTH CARE AND SOCIAL ASSISTANCE,III,83.0,PANDEMIC,83.0,COVID-19,Old,44.0,CHEST,10923,Other
524424,ARCH INDEMNITY INSURANCE CO.,1A. PRIVATE,KINGS,NYC,62.0,HEALTH CARE AND SOCIAL ASSISTANCE,IV,33.0,ON STAIRS,28.0,FRACTURE,Adult,38.0,SHOULDER(S),11208,Other
266382,STATE INSURANCE FUND,2A. SIF,ERIE,BUFFALO,62.0,HEALTH CARE AND SOCIAL ASSISTANCE,II,29.0,ON SAME LEVEL,52.0,STRAIN OR TEAR,Old,42.0,LOWER BACK AREA,11236,Other
568843,INDEMNITY INS. OF N AMERICA,1A. PRIVATE,GENESEE,ROCHESTER,56.0,ADMINISTRATIVE AND SUPPORT AND WASTE MANAGEMEN...,I,28.0,INTO OPENINGS,52.0,STRAIN OR TEAR,Adult,33.0,LOWER ARM,14416,Other


In [87]:
# Several categorical features have a very large number of distinct values,
# which prevents the One-Hot Encoder from running properly.
# We assess the problem here and then decide which redundant features to drop.

# Check cardinality of categorical features
cardinality = X_train_categorical.nunique()
cardinality

Carrier Name                         1934
Carrier Type                            8
County of Injury                       63
District Name                           8
Industry Code                          24
Industry Code Description              20
Medical Fee Region                      5
WCIO Cause of Injury Code              77
WCIO Cause of Injury Description       74
WCIO Nature of Injury Code             56
WCIO Nature of Injury Description      56
Age Group                               4
WCIO Part Of Body Code                 57
WCIO Part Of Body Description          54
Zip Code                             6046
Broad Body Part                         6
dtype: int64

We need to fix the high cardinality issue before encoding the categorical features. We will use the following strategy:
  - Analyse the cardinality of each feature and check whether we can drop it
    - Feature 1 with high cardinality: Carrier Name
      - As we saw in the EDA, the Carrier Name feature is highly correlated with Carrier Type. Given that, we are dropping it.
    - Feature 2 with high cardinality: Zip Code
      - As we saw in the EDA, the Zip Code feature is highly correlated with County of Injury and Medical Fee Region. Given that, we are dropping it.

In [88]:
# Drop the two highest-cardinality features from both splits; they are treated
# as redundant with lower-cardinality features that are kept
high_cardinality_features = ['Carrier Name', 'Zip Code']

X_train_categorical = X_train_categorical.drop(columns=high_cardinality_features)
X_val_categorical = X_val_categorical.drop(columns=high_cardinality_features)

# Re-check the cardinality of the remaining categorical features
X_train_categorical.nunique()

Carrier Type                          8
County of Injury                     63
District Name                         8
Industry Code                        24
Industry Code Description            20
Medical Fee Region                    5
WCIO Cause of Injury Code            77
WCIO Cause of Injury Description     74
WCIO Nature of Injury Code           56
WCIO Nature of Injury Description    56
Age Group                             4
WCIO Part Of Body Code               57
WCIO Part Of Body Description        54
Broad Body Part                       6
dtype: int64

We can also see that some features come as a code paired with a description. Since both carry the same information, we will drop the descriptions and keep the codes, at the cost of some model interpretability. The features we are dropping are:
- Industry Code Description 
- WCIO Cause of Injury Description
- WCIO Nature of Injury Description
- WCIO Part Of Body Description

In [89]:
# Drop the description columns from both splits: each one has a matching
# code column, and only the codes are kept
description_features = [
    'Industry Code Description',
    'WCIO Cause of Injury Description',
    'WCIO Nature of Injury Description',
    'WCIO Part Of Body Description',
]

X_train_categorical = X_train_categorical.drop(columns=description_features)
X_val_categorical = X_val_categorical.drop(columns=description_features)

# Re-check the cardinality of the remaining categorical features
X_train_categorical.nunique()

Carrier Type                   8
County of Injury              63
District Name                  8
Industry Code                 24
Medical Fee Region             5
WCIO Cause of Injury Code     77
WCIO Nature of Injury Code    56
Age Group                      4
WCIO Part Of Body Code        57
Broad Body Part                6
dtype: int64

In [90]:
# One-hot encode the categorical features into a dense array. The first category
# of every feature is dropped, and categories not seen during fit are encoded as
# all zeros instead of raising an error.
# NOTE(review): with drop="first", an unknown category becomes indistinguishable
# from the dropped (reference) category. Confirm this is acceptable.
ohc = OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False)

# Learn the categories from the training split only, then encode it
ohc.fit(X_train_categorical)
ohc_train_feat = ohc.transform(X_train_categorical)
ohc_train_feat_names = ohc.get_feature_names_out()

# Encode the validation split with the categories learned from training
ohc_val_feat = ohc.transform(X_val_categorical)
ohc_val_feat_names = ohc.get_feature_names_out()

In [91]:
# Rebuild a labelled DataFrame from the encoded array. The encoder returns a bare
# NumPy array, so the original row index is reattached here; the later column-wise
# concat with the other feature groups aligns rows on this index.
ohc_train_df = pd.DataFrame(
    data=ohc_train_feat,
    columns=ohc_train_feat_names,
    index=X_train_categorical.index,
)
ohc_train_df

Unnamed: 0,Carrier Type_2A. SIF,Carrier Type_3A. SELF PUBLIC,Carrier Type_4A. SELF PRIVATE,Carrier Type_5A. SPECIAL FUND - CONS. COMM. (SECT. 25-A),Carrier Type_5C. SPECIAL FUND - POI CARRIER WCB MENANDS,Carrier Type_5D. SPECIAL FUND - UNKNOWN,Carrier Type_UNKNOWN,County of Injury_ALLEGANY,County of Injury_BRONX,County of Injury_BROOME,...,WCIO Part Of Body Code_65.0,WCIO Part Of Body Code_66.0,WCIO Part Of Body Code_90.0,WCIO Part Of Body Code_91.0,WCIO Part Of Body Code_99.0,Broad Body Part_Lower Limbs,Broad Body Part_Multiple,Broad Body Part_Other,Broad Body Part_Trunk,Broad Body Part_Upper Limbs
557080,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
112493,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
524424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
266382,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
568843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
259178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
365838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
131932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [92]:
# Rebuild a labelled DataFrame from the encoded validation array. The encoder
# returns a bare NumPy array, so the validation row index is reattached here; the
# later column-wise concat with the other feature groups aligns rows on this index.
ohc_val_df = pd.DataFrame(
    data=ohc_val_feat,
    columns=ohc_val_feat_names,
    index=X_val_categorical.index,
)
ohc_val_df

Unnamed: 0,Carrier Type_2A. SIF,Carrier Type_3A. SELF PUBLIC,Carrier Type_4A. SELF PRIVATE,Carrier Type_5A. SPECIAL FUND - CONS. COMM. (SECT. 25-A),Carrier Type_5C. SPECIAL FUND - POI CARRIER WCB MENANDS,Carrier Type_5D. SPECIAL FUND - UNKNOWN,Carrier Type_UNKNOWN,County of Injury_ALLEGANY,County of Injury_BRONX,County of Injury_BROOME,...,WCIO Part Of Body Code_65.0,WCIO Part Of Body Code_66.0,WCIO Part Of Body Code_90.0,WCIO Part Of Body Code_91.0,WCIO Part Of Body Code_99.0,Broad Body Part_Lower Limbs,Broad Body Part_Multiple,Broad Body Part_Other,Broad Body Part_Trunk,Broad Body Part_Upper Limbs
215853,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
155026,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
450000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
552978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509976,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
60883,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
317217,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
444340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## 3.3 Encoding the Target

In [93]:
# Initialize the encoder
le = LabelEncoder()

# Fit and transform the target variable in the training set

y_train_encoded = le.fit_transform(y_train_DS)
y_train_encoded_df = pd.DataFrame(y_train_encoded, columns=['Encoded Target'], index=y_train_DS.index)

In [94]:
# Inspect the encoded training target
y_train_encoded_df

Unnamed: 0,Encoded Target
557080,2
112493,1
524424,3
266382,1
568843,1
...,...
110268,1
259178,3
365838,2
131932,4


In [95]:
# Encode the validation target with the mapping learned on the training target.
# The single-column DataFrame is flattened to the 1-D array LabelEncoder expects,
# instead of relying on scikit-learn's implicit column-vector conversion.
y_val_encoded = le.transform(y_val_DS.to_numpy().ravel())
y_val_encoded_df = pd.DataFrame(y_val_encoded, columns=['Encoded Target'], index=y_val_DS.index)

In [96]:
# Inspect the encoded validation target
y_val_encoded_df

Unnamed: 0,Encoded Target
215853,4
155026,3
23902,1
450000,3
552978,1
...,...
509976,1
60883,2
317217,1
444340,1


In [97]:
# Reassemble the final feature matrices by joining the scaled numeric, one-hot
# encoded and binary groups column-wise (rows are aligned on the shared index).
# NOTE: this rebinds X_train_DS / X_val_DS to the processed frames, so the raw
# inputs are no longer available under these names after this cell runs.
X_train_DS = pd.concat(
    objs=[X_train_numeric_scaled, ohc_train_df, X_train_binary],
    axis="columns",
)
X_val_DS = pd.concat(
    objs=[X_val_numeric_scaled, ohc_val_df, X_val_binary],
    axis="columns",
)

In [98]:
# Sanity check: the processed training features and encoded target must have the same number of rows
X_train_DS.shape, y_train_encoded_df.shape

((418486, 314), (418486, 1))

In [99]:
# Apply the preprocessing fitted on the training split to the test split, so the
# exported test file has the same layout as the train / validation files.
# Previously the raw test frame was written under the "scaled" filename without
# being scaled or encoded at all.
# NOTE(review): assumes the test CSV contains the same feature columns as the
# training CSV. Confirm against the feature-engineering notebook.
numeric_cols = list(scaler.feature_names_in_)   # columns the scaler was fitted on
encoded_cols = list(ohc.feature_names_in_)      # columns the encoder was fitted on

X_test_numeric_scaled = pd.DataFrame(
    scaler.transform(X_test_DS[numeric_cols]),
    columns=numeric_cols,
    index=X_test_DS.index,
)
ohc_test_df = pd.DataFrame(
    ohc.transform(X_test_DS[encoded_cols]),
    columns=ohc.get_feature_names_out(),
    index=X_test_DS.index,
)
# A new name is used (rather than rebinding X_test_DS) so this cell can be re-run safely
X_test_scaled = pd.concat([X_test_numeric_scaled, ohc_test_df, X_test_DS[binary_features]], axis=1)

# Guard against silently exporting a test matrix whose layout differs from training
assert list(X_test_scaled.columns) == list(X_train_DS.columns), "test columns do not match train columns"

# Persist the processed splits for the downstream notebooks
X_train_DS.to_csv('datasets/scaled_data_train_delivery1.csv')
y_train_encoded_df.to_csv('datasets/scaled_target_train_delivery1.csv')
X_val_DS.to_csv('datasets/scaled_data_val_delivery1.csv')
y_val_encoded_df.to_csv('datasets/scaled_target_val_delivery1.csv')
X_test_scaled.to_csv('datasets/scaled_data_test_delivery1.csv')