In [89]:
# Standard libraries
import numpy as np
import pandas as pd
import warnings
import zipfile, io

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick

# Statistical libraries
import scipy.stats as ss
from scipy import stats
from scipy.stats import f_oneway
from scipy.sparse import csr_matrix

# Scikit-learn preprocessing and model selection
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Scikit-learn feature selection
from sklearn.feature_selection import f_classif, SelectKBest, mutual_info_classif, RFE, RFECV

# Scikit-learn models
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Scikit-learn metrics
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, classification_report

# Pickle for import and export of datasets
import pickle

# Seaborn settings
sns.set()

# Ignore warnings
warnings.filterwarnings('ignore')

# Enable inline plotting for Jupyter notebooks
%matplotlib inline

## **1.2 Importing the datasets**

In [90]:
# load the data (train and test)
X_train_FE = pd.read_csv('datasets/preprocessed_train_delivery1.csv', index_col=0)
X_val_FE = pd.read_csv('datasets/preprocessed_val_delivery1.csv', index_col=0)
y_train_FE = pd.read_csv('datasets/preprocessed_y_train_delivery1.csv', index_col=0)
y_val_FE = pd.read_csv('datasets/preprocessed_y_val_delivery1.csv', index_col=0)

X_test_FE = pd.read_csv('datasets/test_delivery1.csv', index_col=0)

In [91]:
X_train_FE.shape, y_train_FE.shape, X_val_FE.shape,  y_val_FE.shape

((418486, 29), (418486, 1), (101383, 29), (101383, 1))

In [92]:
X_train_FE.head()

Unnamed: 0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,County of Injury,...,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents,Accident Year,Accident Month,Accident on Weekday,Assembly Year,Assembly Month
557080,33.0,0,1,1037.0,1988.0,0,0,"POLICE, FIRE, SANITATION",3A. SELF PUBLIC,KINGS,...,CONTUSION,35.0,HAND,11203,3.0,2022,8,True,2022,10
112493,56.0,0,0,0.0,1963.0,0,1,MEMORIAL SLOAN KETTERING,4A. SELF PRIVATE,ROCKLAND,...,COVID-19,44.0,CHEST,10923,2.0,2020,4,True,2020,8
524424,43.0,0,1,574.17,1979.0,0,1,ARCH INDEMNITY INSURANCE CO.,1A. PRIVATE,KINGS,...,FRACTURE,38.0,SHOULDER(S),11208,2.0,2022,8,True,2022,8
266382,58.0,0,0,0.0,1963.0,0,1,STATE INSURANCE FUND,2A. SIF,ERIE,...,STRAIN OR TEAR,42.0,LOWER BACK AREA,11236,6.0,2021,6,True,2021,6
568843,35.0,0,0,0.0,1987.0,0,1,INDEMNITY INS. OF N AMERICA,1A. PRIVATE,GENESEE,...,STRAIN OR TEAR,33.0,LOWER ARM,14416,6.0,2022,11,True,2022,11


# New Features

In [93]:
# Define a new column 'Age Group' based on 'Age at Injury'
def categorize_age(age):
    if 0 <= age < 14:
        return 'Child'
    elif 14 <= age < 25:
        return 'Young'
    elif 25 <= age < 55:
        return 'Adult'
    elif 55 <= age <= 65:
        return 'Old'
    else:
        return 'Other'  # For ages outside the defined ranges

# Apply the function to create the new 'Age Group' feature in train, validation and test datasets
X_train_FE['Age Group'] = X_train_FE['Age at Injury'].apply(categorize_age)
X_val_FE['Age Group'] = X_val_FE['Age at Injury'].apply(categorize_age)
X_test_FE['Age Group'] = X_test_FE['Age at Injury'].apply(categorize_age)

# Check the first few records to ensure the new feature was created correctly
print(X_train_FE[['Age at Injury', 'Age Group']].head())

        Age at Injury Age Group
557080           33.0     Adult
112493           56.0       Old
524424           43.0     Adult
266382           58.0       Old
568843           35.0     Adult


In [94]:
# Count the frequency of each type of injury in 'WCIO Cause of Injury Description' or 'WCIO Nature of Injury Description'
common_injuries = X_train_FE['WCIO Cause of Injury Description'].value_counts().nlargest(5).index


print("Most common injury causes:", common_injuries)

# We defined a function to create the binary indicator
def injury_indicator(injury_type):
    return 1 if injury_type in common_injuries else 0

# Apply the function to create a new column 'Frequent Injury Cause' in train, validation and test datasets
X_train_FE['Frequent Injury Cause'] = X_train_FE['WCIO Cause of Injury Description'].apply(injury_indicator)
X_val_FE['Frequent Injury Cause'] = X_val_FE['WCIO Cause of Injury Description'].apply(injury_indicator)
X_test_FE['Frequent Injury Cause'] = X_test_FE['WCIO Cause of Injury Description'].apply(injury_indicator)

# Display the first few rows to verify the new feature
print(X_train_FE[['WCIO Cause of Injury Description', 'Frequent Injury Cause']].head())

Most common injury causes: Index(['LIFTING', 'FELLOW WORKER, PATIENT OR OTHER PERSON',
       'STRAIN OR INJURY BY, NOC', 'FALL, SLIP OR TRIP, NOC', 'ON SAME LEVEL'],
      dtype='object', name='WCIO Cause of Injury Description')
       WCIO Cause of Injury Description  Frequent Injury Cause
557080           STRUCK OR INJURED, NOC                      0
112493                         PANDEMIC                      0
524424                        ON STAIRS                      0
266382                    ON SAME LEVEL                      1
568843                    INTO OPENINGS                      0


In [95]:
# Define a mapping function for broader body part categories
def categorize_body_part(body_part_code):
    if body_part_code in [10, 11, 12, 13]:  # Example codes for upper limbs
        return 'Upper Limbs'
    elif body_part_code in [20, 21, 22, 23]:  # Example codes for lower limbs
        return 'Lower Limbs'
    elif body_part_code in [30, 31, 32]:  # Example codes for trunk
        return 'Trunk'
    elif body_part_code in [40, 41]:  # Example codes for head and neck
        return 'Head/Neck'
    elif body_part_code in [50, 51]:  # Example codes for multiple body parts
        return 'Multiple'
    else:
        return 'Other'

# Apply the mapping function to create a new 'Broad Body Part' feature
X_train_FE['Broad Body Part'] = X_train_FE['WCIO Part Of Body Code'].apply(categorize_body_part)
X_val_FE['Broad Body Part'] = X_val_FE['WCIO Part Of Body Code'].apply(categorize_body_part)
X_test_FE['Broad Body Part'] = X_test_FE['WCIO Part Of Body Code'].apply(categorize_body_part)

# Display the first few rows to verify the new feature
print(X_train_FE[['WCIO Part Of Body Code', 'Broad Body Part']].head())

        WCIO Part Of Body Code Broad Body Part
557080                    35.0           Other
112493                    44.0           Other
524424                    38.0           Other
266382                    42.0           Other
568843                    33.0           Other


In [96]:
X_train_FE.shape, y_train_FE.shape, X_val_FE.shape,  y_val_FE.shape

((418486, 32), (418486, 1), (101383, 32), (101383, 1))

In [97]:
X_train_FE.to_csv('datasets/feature_engineering_train_delivery1.csv')
X_val_FE.to_csv('datasets/feature_engineering_val_delivery1.csv')
y_train_FE.to_csv('datasets/feature_engineering_y_train_delivery1.csv')
y_val_FE.to_csv('datasets/feature_engineering_y_val_delivery1.csv')

X_test_FE.to_csv('datasets/feature_engineering_test_delivery1.csv')