In [1]:
# Standard libraries
import numpy as np
import pandas as pd
import warnings
import zipfile, io

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick

# Statistical libraries
import scipy.stats as ss
from scipy import stats
from scipy.stats import f_oneway
from scipy.sparse import csr_matrix

# Scikit-learn preprocessing and model selection
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Scikit-learn feature selection
from sklearn.feature_selection import f_classif, SelectKBest, mutual_info_classif, RFE, RFECV

# Scikit-learn models
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Scikit-learn metrics
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, classification_report

# Pickle for import and export of datasets
import pickle

# Apply seaborn's default styling to the figures drawn later in the notebook
sns.set()

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by pandas or
# scikit-learn in later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Enable inline plotting for Jupyter notebooks
%matplotlib inline

# 3. Data Scaling

* Due to the different types of information that Categorical features and Numerical features provide, each requires unique data scaling methods. Therefore, before normalizing features, we must split them into numeric and categorical sets.
* Regarding the numerical variables, the objective is to place them on the same scale, from 0 to 1, where 1 corresponds to the maximum value of the feature and 0 to its minimum value. One reason for normalizing / scaling the data is to prevent the model from giving more importance to variables with larger absolute values, since we do not want to make any assumptions about the level of importance of each feature. It is relevant to mention that the scaler is fitted using only information obtained from the training data. Only afterwards do we use the minimum and maximum values of the training dataset for each variable to scale the validation dataset.
* Concerning categorical features, these are encoded to be readable by the models, as these do not typically process non-numeric data directly. To do so, we use One-Hot Encoding for the low-cardinality categorical variables and Frequency Encoding (each category is replaced by its relative frequency in the training data) for the high-cardinality ones. The target variable is encoded with a Label Encoder, which assigns each class a unique integer.


In [2]:
# Load the feature-engineered splits: train / validation features and targets,
# followed by the test features. The first CSV column becomes the row index.
X_train_DS, X_val_DS, y_train_DS, y_val_DS, X_test_DS = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'datasets/feature_engineering_train_delivery1.csv',
        'datasets/feature_engineering_val_delivery1.csv',
        'datasets/feature_engineering_y_train_delivery1.csv',
        'datasets/feature_engineering_y_val_delivery1.csv',
        'datasets/feature_engineering_test_delivery1.csv',
    )
)

In [3]:
# Sanity check: (rows, columns) of each labelled split, features then target
tuple(frame.shape for frame in (X_train_DS, y_train_DS, X_val_DS, y_val_DS))

((418738, 32), (418738, 1), (101186, 32), (101186, 1))

In [4]:
# List the feature columns available in the training split
X_train_DS.columns

Index(['Age at Injury', 'Alternative Dispute Resolution',
       'Attorney/Representative', 'Average Weekly Wage', 'Birth Year',
       'C-2 Date', 'C-3 Date', 'Carrier Name', 'Carrier Type',
       'County of Injury', 'COVID-19 Indicator', 'District Name', 'Gender',
       'Industry Code', 'Industry Code Description', 'Medical Fee Region',
       'WCIO Cause of Injury Code', 'WCIO Cause of Injury Description',
       'WCIO Nature of Injury Code', 'WCIO Nature of Injury Description',
       'WCIO Part Of Body Code', 'WCIO Part Of Body Description', 'Zip Code',
       'Number of Dependents', 'Accident Year', 'Accident Month',
       'Accident on Weekday', 'Assembly Year', 'Assembly Month', 'Age Group',
       'Frequent Injury Cause', 'Broad Body Part'],
      dtype='object')

In [5]:
# Recode the boolean 'Accident on Weekday' flag as 1 / 0 in the train and validation splits.
# Series.map leaves NaN for any value that is not a key of the dictionary.
weekday_flag_to_int = {True: 1, False: 0}
for split_frame in (X_train_DS, X_val_DS):
    split_frame['Accident on Weekday'] = split_frame['Accident on Weekday'].map(weekday_flag_to_int)

In [6]:
# List the distinct, non-missing 'Medical Fee Region' labels found in the training split
print(X_train_DS['Medical Fee Region'].dropna().unique())  # dropna() excludes NaN explicitly


['IV' 'III' 'II' 'UK' 'I']


In [7]:
# Translate the Roman-numeral region labels into integers; 'UK' is assigned 0.
# Series.map yields NaN for any label that is not a key of `mapping`, so this cell
# must only be run once on the freshly loaded data.
mapping = {'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'UK': 0}
for split_frame in (X_train_DS, X_val_DS):
    split_frame['Medical Fee Region'] = split_frame['Medical Fee Region'].map(mapping)

In [8]:
# Features that are min-max scaled
numeric_features = [
    'Age at Injury',
    'Average Weekly Wage',
    'Birth Year',
    'Number of Dependents',
    'Accident Year',
    'Accident Month',
    'Assembly Year',
    'Assembly Month',
    'Industry Code',
    'WCIO Cause of Injury Code',
    'WCIO Nature of Injury Code',
    'WCIO Part Of Body Code',
    'Medical Fee Region',
]

# Features that need an encoding step before modelling
categorical_features = [
    'Carrier Name',
    'Carrier Type',
    'County of Injury',
    'District Name',
    'Industry Code Description',
    'WCIO Cause of Injury Description',
    'WCIO Nature of Injury Description',
    'Age Group',
    'WCIO Part Of Body Description',
    'Zip Code',
    'Broad Body Part',
]

# Features passed through unchanged.
# NOTE(review): presumably these were already converted to 0/1 flags upstream — confirm.
binary_features = [
    'Alternative Dispute Resolution',
    'Attorney/Representative',
    'C-2 Date',
    'C-3 Date',
    'COVID-19 Indicator',
    'Gender',
    'Accident on Weekday',
    'Frequent Injury Cause',
]

In [9]:
# Slice each split into its numeric, categorical and binary column groups
feature_groups = (numeric_features, categorical_features, binary_features)

X_train_numeric, X_train_categorical, X_train_binary = (
    X_train_DS[group] for group in feature_groups
)
X_val_numeric, X_val_categorical, X_val_binary = (
    X_val_DS[group] for group in feature_groups
)

## 3.1 Data Scaling for Numerical

In [10]:
# Learn each numeric feature's minimum and maximum from the training split only
# (fit returns the fitted scaler itself)
scaler = MinMaxScaler().fit(X_train_numeric)

# Rescale the training features with the learned ranges
X_train_numeric_scaled = scaler.transform(X_train_numeric)

# Inspect the resulting array
X_train_numeric_scaled

array([[0.67307692, 0.36787758, 0.32653061, ..., 0.64444444, 0.47222222,
        1.        ],
       [0.32692308, 0.57220622, 0.46938776, ..., 0.        , 0.        ,
        1.        ],
       [0.57692308, 0.        , 0.42857143, ..., 0.56666667, 0.60185185,
        1.        ],
       ...,
       [0.82692308, 0.39261338, 0.18367347, ..., 0.56666667, 0.57407407,
        0.25      ],
       [0.21153846, 0.        , 0.79591837, ..., 0.56666667, 0.60185185,
        1.        ],
       [0.15384615, 0.        , 0.91836735, ..., 0.1       , 0.60185185,
        1.        ]])

In [11]:
# Wrap the scaled array in a DataFrame that reuses the original column labels and row index
X_train_numeric_scaled = pd.DataFrame(
    data=X_train_numeric_scaled,
    index=X_train_numeric.index,
    columns=X_train_numeric.columns,
)
X_train_numeric_scaled.head()

Unnamed: 0_level_0,Age at Injury,Average Weekly Wage,Birth Year,Number of Dependents,Accident Year,Accident Month,Assembly Year,Assembly Month,Industry Code,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Medical Fee Region
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
5785935,0.673077,0.367878,0.326531,1.0,0.976744,0.636364,0.5,0.636364,0.246914,0.285714,0.644444,0.472222,1.0
5980545,0.326923,0.572206,0.469388,0.333333,1.0,0.272727,1.0,0.363636,0.62963,1.0,0.0,0.0,1.0
5552635,0.576923,0.0,0.428571,0.5,0.953488,0.727273,0.0,0.727273,0.469136,0.806122,0.566667,0.601852,1.0
5758039,0.211538,0.0,0.836735,0.166667,0.976744,0.454545,0.5,0.545455,0.407407,0.683673,0.1,0.185185,1.0
5951382,0.519231,0.123605,0.510204,0.666667,1.0,0.0,1.0,0.181818,0.45679,0.44898,0.533333,0.0,1.0


In [12]:
# Check the dtype of every numeric validation column before scaling
print(X_val_numeric.dtypes)


Age at Injury                 float64
Average Weekly Wage           float64
Birth Year                    float64
Number of Dependents          float64
Accident Year                   int64
Accident Month                  int64
Assembly Year                   int64
Assembly Month                  int64
Industry Code                 float64
WCIO Cause of Injury Code     float64
WCIO Nature of Injury Code    float64
WCIO Part Of Body Code        float64
Medical Fee Region              int64
dtype: object


In [13]:
# Rescale the validation features with the scaler fitted on the training split and
# keep the original column labels and row index
X_val_numeric_scaled = pd.DataFrame(
    data=scaler.transform(X_val_numeric),
    index=X_val_numeric.index,
    columns=X_val_numeric.columns,
)

## 3.2 Encoding and Scaling for Categorical

In [14]:
# List the categorical columns selected for encoding
X_train_categorical.columns

Index(['Carrier Name', 'Carrier Type', 'County of Injury', 'District Name',
       'Industry Code Description', 'WCIO Cause of Injury Description',
       'WCIO Nature of Injury Description', 'Age Group',
       'WCIO Part Of Body Description', 'Zip Code', 'Broad Body Part'],
      dtype='object')

In [15]:
# Preview the first rows of the raw categorical features
X_train_categorical.head()

Unnamed: 0_level_0,Carrier Name,Carrier Type,County of Injury,District Name,Industry Code Description,WCIO Cause of Injury Description,WCIO Nature of Injury Description,Age Group,WCIO Part Of Body Description,Zip Code,Broad Body Part
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5785935,PROPERTY AND CASUALTY,1A. PRIVATE,QUEENS,NYC,MANUFACTURING,ON SAME LEVEL,"ALL OTHER SPECIFIC INJURIES, NOC",Adult,LOWER BACK AREA,11432,Other
5980545,HEALTH & HOSPITAL CORP.,3A. SELF PUBLIC,BRONX,NYC,HEALTH CARE AND SOCIAL ASSISTANCE,"OTHER - MISCELLANEOUS, NOC",NO PHYSICAL INJURY,Adult,MULTIPLE,10451,Other
5552635,AMERICAN ZURICH INSURANCE CO,1A. PRIVATE,KINGS,NYC,TRANSPORTATION AND WAREHOUSING,OBJECT HANDLED BY OTHERS,STRAIN OR TEAR,Adult,FOOT,11203,Other
5758039,"NORDSTROM, INC.",4A. SELF PRIVATE,KINGS,NYC,RETAIL TRADE,STATIONARY OBJECT,CONTUSION,Adult,SKULL,11237,Upper Limbs
5951382,NEW YORK BLACK CAR OPERATORS',4A. SELF PRIVATE,NASSAU,NYC,TRANSPORTATION AND WAREHOUSING,COLLISION OR SIDESWIPE WITH ANOTHER VEHICLE,SPRAIN OR TEAR,Adult,MULTIPLE,11003,Other


In [16]:
# Several categorical features have so many distinct values that the One-Hot Encoder
# cannot run properly on them.
# We assess that problem here and then decide which redundant features to drop.

# Number of distinct categories per categorical feature (training split)
cardinality = X_train_categorical.nunique()
cardinality

Carrier Name                         1921
Carrier Type                            8
County of Injury                       63
District Name                           8
Industry Code Description              20
WCIO Cause of Injury Description       74
WCIO Nature of Injury Description      56
Age Group                               4
WCIO Part Of Body Description          54
Zip Code                             6018
Broad Body Part                         6
dtype: int64

We need to fix the high-cardinality issue before encoding the categorical features. We will use the following strategy:
  - Analyse the cardinality of each feature and check whether we can drop it
    - Feature 1 with high cardinality: Carrier Name
      - As we have seen in the EDA, the Carrier Name feature is highly correlated with Carrier Type. Given that, we are dropping it
    - Feature 2 with high cardinality: Zip Code
      - As we have seen in the EDA, the Zip Code feature is highly correlated with County of Injury and Medical Fee Region. Given that, we are dropping it

In [17]:
# Drop the two very-high-cardinality columns, 'Carrier Name' and 'Zip Code', from the
# categorical feature frames (train and validation).
very_high_cardinality_cols = ['Carrier Name', 'Zip Code']

# errors='ignore' keeps the cell re-runnable: on a second run the columns are already
# gone and the drop becomes a no-op instead of raising a KeyError.
X_train_categorical = X_train_categorical.drop(columns=very_high_cardinality_cols, errors='ignore')
X_val_categorical = X_val_categorical.drop(columns=very_high_cardinality_cols, errors='ignore')

# Re-check the cardinality of the remaining categorical features
X_train_categorical.nunique()

Carrier Type                          8
County of Injury                     63
District Name                         8
Industry Code Description            20
WCIO Cause of Injury Description     74
WCIO Nature of Injury Description    56
Age Group                             4
WCIO Part Of Body Description        54
Broad Body Part                       6
dtype: int64

We can also see that some features come as a pair: a code and its corresponding description. At the cost of some model interpretability, we will drop the descriptions and keep the codes. The features we are dropping are:
- Industry Code Description 
- WCIO Cause of Injury Description
- WCIO Nature of Injury Description
- WCIO Part Of Body Description

In [18]:
# Drop the description columns from the categorical feature frames (train and
# validation); the matching code columns are kept among the numeric features.
description_cols = [
    'Industry Code Description',
    'WCIO Cause of Injury Description',
    'WCIO Nature of Injury Description',
    'WCIO Part Of Body Description',
]

# errors='ignore' keeps the cell re-runnable: on a second run the columns are already
# gone and the drop becomes a no-op instead of raising a KeyError.
X_train_categorical = X_train_categorical.drop(columns=description_cols, errors='ignore')
X_val_categorical = X_val_categorical.drop(columns=description_cols, errors='ignore')

# Re-check the cardinality of the remaining categorical features
X_train_categorical.nunique()

Carrier Type         8
County of Injury    63
District Name        8
Age Group            4
Broad Body Part      6
dtype: int64

In [19]:
# OneHotEncoder, MinMaxScaler and pandas are already imported in the first cell of the
# notebook, so they are not re-imported here.

# Columns with at most this many distinct training categories are one-hot encoded;
# the remaining ones go into `high_cardinality_cols`.
low_cardinality_threshold = 10

# Count the distinct categories once, then split the columns around the threshold
category_counts = X_train_categorical.nunique()
low_cardinality_cols = [col for col in X_train_categorical.columns if category_counts[col] <= low_cardinality_threshold]
high_cardinality_cols = [col for col in X_train_categorical.columns if category_counts[col] > low_cardinality_threshold]

# Apply One-Hot Encoding to Low Cardinality Columns.
# drop="first" removes one dummy column per feature.
# handle_unknown="ignore" encodes a category unseen during fit as all zeros, which is
# the same encoding as the dropped first category of that feature.
ohc = OneHotEncoder(sparse_output=False, drop="first", handle_unknown="ignore")

# Fit on the training split only; the validation split reuses the learned categories
ohc_train_feat = ohc.fit_transform(X_train_categorical[low_cardinality_cols])
ohc_val_feat = ohc.transform(X_val_categorical[low_cardinality_cols])


In [20]:
# Wrap the one-hot arrays in DataFrames, labelling the columns with the encoder's
# generated feature names and reusing each split's row index
ohc_feature_names = ohc.get_feature_names_out(low_cardinality_cols)

ohc_train_df = pd.DataFrame(ohc_train_feat, columns=ohc_feature_names, index=X_train_categorical.index)
ohc_val_df = pd.DataFrame(ohc_val_feat, columns=ohc_feature_names, index=X_val_categorical.index)

In [21]:
# Apply Frequency Encoding to High Cardinality Columns.
# Each category is replaced by its relative frequency in the TRAINING split.
# The encoded values are written to new frames instead of overwriting
# X_train_categorical / X_val_categorical, so re-running this cell gives the same result.
freq_train_df = X_train_categorical[high_cardinality_cols].copy()
freq_val_df = X_val_categorical[high_cardinality_cols].copy()

# One frequency table per column, kept so the same encoding can be applied to other data
freq_maps = {}
for col in high_cardinality_cols:
    freq_map = X_train_categorical[col].value_counts(normalize=True)
    freq_maps[col] = freq_map
    freq_train_df[col] = X_train_categorical[col].map(freq_map)
    # A validation category that never occurs in training has no entry in freq_map and
    # would become NaN; its training frequency is 0, so encode it as 0.0.
    freq_val_df[col] = X_val_categorical[col].map(freq_map).fillna(0.0)


In [22]:
# Apply Min-Max Scaling to Frequency Encoded Features.
# A dedicated name is used so that `scaler`, the scaler fitted on the numeric
# features, is not overwritten and stays available for later reuse.
freq_scaler = MinMaxScaler()
freq_train_scaled = freq_scaler.fit_transform(freq_train_df)  # ranges learned from training data only
freq_val_scaled = freq_scaler.transform(freq_val_df)

# Convert scaled frequency features back to DataFrames
freq_train_scaled_df = pd.DataFrame(freq_train_scaled, index=freq_train_df.index, columns=freq_train_df.columns)
freq_val_scaled_df = pd.DataFrame(freq_val_scaled, index=freq_val_df.index, columns=freq_val_df.columns)

In [23]:
# Put the one-hot columns and the scaled frequency columns side by side for each split
X_train_processed = pd.concat(
    [ohc_train_df, freq_train_scaled_df],
    axis="columns",
)
X_val_processed = pd.concat(
    [ohc_val_df, freq_val_scaled_df],
    axis="columns",
)

# Report the resulting dimensions of both splits
print(f"Processed training data shape: {X_train_processed.shape}")
print(f"Processed validation data shape: {X_val_processed.shape}")

# Preview the first rows of both processed frames
X_train_processed.head(), X_val_processed.head()

Processed training data shape: (418738, 23)
Processed validation data shape: (101186, 23)


(                  Carrier Type_2A. SIF  Carrier Type_3A. SELF PUBLIC  \
 Claim Identifier                                                       
 5785935                            0.0                           0.0   
 5980545                            0.0                           1.0   
 5552635                            0.0                           0.0   
 5758039                            0.0                           0.0   
 5951382                            0.0                           0.0   
 
                   Carrier Type_4A. SELF PRIVATE  \
 Claim Identifier                                  
 5785935                                     0.0   
 5980545                                     0.0   
 5552635                                     0.0   
 5758039                                     1.0   
 5951382                                     1.0   
 
                   Carrier Type_5A. SPECIAL FUND - CONS. COMM. (SECT. 25-A)  \
 Claim Identifier                         

## 3.3 Label Encoding for Target

In [24]:
# Initialize the encoder
le = LabelEncoder()

# Fit and transform the target variable in the training set.
# LabelEncoder expects a 1-D array of labels, so the single target column is passed
# as a Series instead of the one-column DataFrame.
y_train_encoded = le.fit_transform(y_train_DS.iloc[:, 0])
y_train_encoded_df = pd.DataFrame(y_train_encoded, columns=['Encoded Target'], index=y_train_DS.index)

In [25]:
# Display the encoded training target
y_train_encoded_df

Unnamed: 0_level_0,Encoded Target
Claim Identifier,Unnamed: 1_level_1
5785935,3
5980545,2
5552635,1
5758039,1
5951382,2
...,...
5837651,3
5781926,3
5890060,3
5539380,1


In [26]:
# Encode the validation target with the classes learned from the training target.
# LabelEncoder expects a 1-D array of labels, so the single target column is passed
# as a Series instead of the one-column DataFrame.
y_val_encoded = le.transform(y_val_DS.iloc[:, 0])
y_val_encoded_df = pd.DataFrame(y_val_encoded, columns=['Encoded Target'], index=y_val_DS.index)

In [27]:
# Display the encoded validation target
y_val_encoded_df

Unnamed: 0_level_0,Encoded Target
Claim Identifier,Unnamed: 1_level_1
5730729,1
6038049,1
6053557,3
5953832,1
5488869,3
...,...
5731119,3
5750072,1
5974275,2
5425640,3


In [28]:
# Assemble the final feature matrices: scaled numeric columns, encoded categorical
# columns and the binary columns, side by side.
# Note that this rebinds X_train_DS / X_val_DS, replacing the frames loaded from CSV.
train_parts = [X_train_numeric_scaled, X_train_processed, X_train_binary]
val_parts = [X_val_numeric_scaled, X_val_processed, X_val_binary]

X_train_DS = pd.concat(train_parts, axis="columns")
X_val_DS = pd.concat(val_parts, axis="columns")

In [31]:
# Sanity check: (rows, columns) of the final feature matrices and encoded targets
tuple(frame.shape for frame in (X_train_DS, y_train_encoded_df, X_val_DS, y_val_encoded_df))

((418738, 44), (418738, 1), (101186, 44), (101186, 1))

In [30]:
# Write every split to the datasets/ folder as CSV (row index included).
# NOTE(review): in the cells shown here X_test_DS is only loaded and then written back —
# none of the mapping, scaling or encoding steps above are applied to it, even though it
# is saved under a "scaled" file name. Confirm this is intended.
csv_outputs = [
    (X_train_DS, 'datasets/scaled_data_train_delivery1.csv'),
    (y_train_encoded_df, 'datasets/scaled_target_train_delivery1.csv'),
    (X_val_DS, 'datasets/scaled_data_val_delivery1.csv'),
    (y_val_encoded_df, 'datasets/scaled_target_val_delivery1.csv'),
    (X_test_DS, 'datasets/scaled_data_test_delivery1.csv'),
]
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path)