In [112]:
# Standard libraries
import numpy as np
import pandas as pd
import warnings
import zipfile, io

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick

# Statistical libraries
import scipy.stats as ss
from scipy import stats
from scipy.stats import f_oneway
from scipy.sparse import csr_matrix

# Scikit-learn preprocessing and model selection
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Scikit-learn feature selection
from sklearn.feature_selection import f_classif, SelectKBest, mutual_info_classif, RFE, RFECV

# Scikit-learn models
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Scikit-learn metrics
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, classification_report

# Pickle for import and export of datasets
import pickle

# Seaborn settings
sns.set()

# Ignore warnings
warnings.filterwarnings('ignore')

# Enable inline plotting for Jupyter notebooks
%matplotlib inline

## **1.2 Importing the datasets**

In [113]:
# load the data (train and test)
X_train_FE = pd.read_csv('datasets/preprocessed_train_delivery1.csv', index_col=0)
X_val_FE = pd.read_csv('datasets/preprocessed_val_delivery1.csv', index_col=0)
y_train_FE = pd.read_csv('datasets/preprocessed_y_train_delivery1.csv', index_col=0)
y_val_FE = pd.read_csv('datasets/preprocessed_y_val_delivery1.csv', index_col=0)

X_test_FE = pd.read_csv('datasets/test_delivery1.csv', index_col=0)

In [114]:
X_train_FE.shape, y_train_FE.shape, X_val_FE.shape,  y_val_FE.shape

((418738, 29), (418738, 1), (101186, 29), (101186, 1))

In [115]:
X_train_FE.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,County of Injury,...,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents,Accident Year,Accident Month,Accident on Weekday,Assembly Year,Assembly Month
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,49.0,0,1,744.06,1971.0,0,0,PROPERTY AND CASUALTY,1A. PRIVATE,QUEENS,...,"ALL OTHER SPECIFIC INJURIES, NOC",42.0,LOWER BACK AREA,11432,6.0,2021,8,True,2021,8
5980545,31.0,0,0,1157.33,1978.0,0,1,HEALTH & HOSPITAL CORP.,3A. SELF PUBLIC,BRONX,...,NO PHYSICAL INJURY,-9.0,MULTIPLE,10451,2.0,2022,4,True,2022,5
5552635,44.0,0,0,0.0,1976.0,0,1,AMERICAN ZURICH INSURANCE CO,1A. PRIVATE,KINGS,...,STRAIN OR TEAR,56.0,FOOT,11203,3.0,2020,9,True,2020,9
5758039,25.0,0,0,0.0,1996.0,0,1,"NORDSTROM, INC.",4A. SELF PRIVATE,KINGS,...,CONTUSION,11.0,SKULL,11237,1.0,2021,6,True,2021,7
5951382,41.0,0,1,250.0,1980.0,0,0,NEW YORK BLACK CAR OPERATORS',4A. SELF PRIVATE,NASSAU,...,SPRAIN OR TEAR,-9.0,MULTIPLE,11003,4.0,2022,1,True,2022,3


# New Features

In [116]:
# Define a new column 'Age Group' based on 'Age at Injury'
def categorize_age(age):
    if 0 <= age < 14:
        return 'Child'
    elif 14 <= age < 25:
        return 'Young'
    elif 25 <= age < 55:
        return 'Adult'
    elif 55 <= age <= 65:
        return 'Old'
    else:
        return 'Other'  # For ages outside the defined ranges

# Apply the function to create the new 'Age Group' feature in train, validation and test datasets
X_train_FE['Age Group'] = X_train_FE['Age at Injury'].apply(categorize_age)
X_val_FE['Age Group'] = X_val_FE['Age at Injury'].apply(categorize_age)
X_test_FE['Age Group'] = X_test_FE['Age at Injury'].apply(categorize_age)

# Check the first few records to ensure the new feature was created correctly
print(X_train_FE[['Age at Injury', 'Age Group']].head())

                  Age at Injury Age Group
Claim Identifier                         
5785935                    49.0     Adult
5980545                    31.0     Adult
5552635                    44.0     Adult
5758039                    25.0     Adult
5951382                    41.0     Adult


In [117]:
# Count the frequency of each type of injury in 'WCIO Cause of Injury Description' or 'WCIO Nature of Injury Description'
common_injuries = X_train_FE['WCIO Cause of Injury Description'].value_counts().nlargest(5).index


print("Most common injury causes:", common_injuries)

# We defined a function to create the binary indicator
def injury_indicator(injury_type):
    return 1 if injury_type in common_injuries else 0

# Apply the function to create a new column 'Frequent Injury Cause' in train, validation and test datasets
X_train_FE['Frequent Injury Cause'] = X_train_FE['WCIO Cause of Injury Description'].apply(injury_indicator)
X_val_FE['Frequent Injury Cause'] = X_val_FE['WCIO Cause of Injury Description'].apply(injury_indicator)
X_test_FE['Frequent Injury Cause'] = X_test_FE['WCIO Cause of Injury Description'].apply(injury_indicator)

# Display the first few rows to verify the new feature
print(X_train_FE[['WCIO Cause of Injury Description', 'Frequent Injury Cause']].head())

Most common injury causes: Index(['LIFTING', 'FELLOW WORKER, PATIENT OR OTHER PERSON',
       'STRAIN OR INJURY BY, NOC', 'FALL, SLIP OR TRIP, NOC', 'ON SAME LEVEL'],
      dtype='object', name='WCIO Cause of Injury Description')
                             WCIO Cause of Injury Description  \
Claim Identifier                                                
5785935                                         ON SAME LEVEL   
5980545                            OTHER - MISCELLANEOUS, NOC   
5552635                              OBJECT HANDLED BY OTHERS   
5758039                                     STATIONARY OBJECT   
5951382           COLLISION OR SIDESWIPE WITH ANOTHER VEHICLE   

                  Frequent Injury Cause  
Claim Identifier                         
5785935                               1  
5980545                               0  
5552635                               0  
5758039                               0  
5951382                               0  


In [118]:
# Define a mapping function for broader body part categories
def categorize_body_part(body_part_code):
    if body_part_code in [10, 11, 12, 13]:  # Example codes for upper limbs
        return 'Upper Limbs'
    elif body_part_code in [20, 21, 22, 23]:  # Example codes for lower limbs
        return 'Lower Limbs'
    elif body_part_code in [30, 31, 32]:  # Example codes for trunk
        return 'Trunk'
    elif body_part_code in [40, 41]:  # Example codes for head and neck
        return 'Head/Neck'
    elif body_part_code in [50, 51]:  # Example codes for multiple body parts
        return 'Multiple'
    else:
        return 'Other'

# Apply the mapping function to create a new 'Broad Body Part' feature
X_train_FE['Broad Body Part'] = X_train_FE['WCIO Part Of Body Code'].apply(categorize_body_part)
X_val_FE['Broad Body Part'] = X_val_FE['WCIO Part Of Body Code'].apply(categorize_body_part)
X_test_FE['Broad Body Part'] = X_test_FE['WCIO Part Of Body Code'].apply(categorize_body_part)

# Display the first few rows to verify the new feature
print(X_train_FE[['WCIO Part Of Body Code', 'Broad Body Part']].head())

                  WCIO Part Of Body Code Broad Body Part
Claim Identifier                                        
5785935                             42.0           Other
5980545                             -9.0           Other
5552635                             56.0           Other
5758039                             11.0     Upper Limbs
5951382                             -9.0           Other


In [119]:
X_train_FE.shape, y_train_FE.shape, X_val_FE.shape,  y_val_FE.shape

((418738, 32), (418738, 1), (101186, 32), (101186, 1))

In [120]:
X_test_FE.shape

(387975, 32)

In [121]:
X_train_FE.to_csv('datasets/feature_engineering_train_delivery1.csv')
X_val_FE.to_csv('datasets/feature_engineering_val_delivery1.csv')
y_train_FE.to_csv('datasets/feature_engineering_y_train_delivery1.csv')
y_val_FE.to_csv('datasets/feature_engineering_y_val_delivery1.csv')

X_test_FE.to_csv('datasets/feature_engineering_test_delivery1.csv')