In [9]:
# Standard libraries
import numpy as np
import pandas as pd
import warnings
import zipfile, io

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick

# Statistical libraries
import scipy.stats as ss
from scipy import stats
from scipy.stats import f_oneway
from scipy.sparse import csr_matrix

# Scikit-learn preprocessing and model selection
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Scikit-learn feature selection
from sklearn.feature_selection import f_classif, SelectKBest, mutual_info_classif, RFE, RFECV

# Scikit-learn models
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Scikit-learn metrics
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, classification_report

# Pickle for import and export of datasets
import pickle

# Seaborn settings
sns.set()

# Ignore warnings
warnings.filterwarnings('ignore')

# Enable inline plotting for Jupyter notebooks
%matplotlib inline

## **1.2 Importing the datasets**

In [10]:
# load the data (train and test)
X_train = pd.read_csv('datasets/preprocessed_train_delivery1.csv', index_col=0)
X_test = pd.read_csv('datasets/preprocessed_test_delivery1.csv', index_col=0)

In [11]:
X_train.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,...,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents,Accident Year,Accident Month,Accident on Weekday,Assembly Year,Assembly Month
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5393875,31.0,0,0,0.0,1988.0,0,1,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,2. NON-COMP,...,CONTUSION,62.0,BUTTOCKS,13662,1.0,2019,12,True,2020,1
5393091,46.0,0,1,1745.93,1973.0,0,0,ZURICH AMERICAN INSURANCE CO,1A. PRIVATE,4. TEMPORARY,...,SPRAIN OR TEAR,38.0,SHOULDER(S),14569,4.0,2019,8,True,2020,1
5393889,40.0,0,0,1434.8,1979.0,0,1,INDEMNITY INSURANCE CO OF,1A. PRIVATE,4. TEMPORARY,...,CONCUSSION,10.0,MULTIPLE HEAD INJURY,12589,6.0,2019,12,True,2020,1
5393887,61.0,0,0,491.088321,1958.0,0,1,STATE INSURANCE FUND,2A. SIF,2. NON-COMP,...,PUNCTURE,36.0,FINGER(S),12603,1.0,2019,12,True,2020,1
5393848,48.0,0,0,0.0,1971.0,0,1,LM INSURANCE CORP,1A. PRIVATE,2. NON-COMP,...,LACERATION,36.0,FINGER(S),13029,1.0,2019,12,False,2020,1


# New Features

In [12]:
# Define a new column 'Age Group' based on 'Age at Injury'
def categorize_age(age):
    if 0 <= age < 14:
        return 'Child'
    elif 14 <= age < 25:
        return 'Young'
    elif 25 <= age < 55:
        return 'Adult'
    elif 55 <= age <= 65:
        return 'Old'
    else:
        return 'Other'  # For ages outside the defined ranges

# Apply the function to create the new 'Age Group' feature in both train and test datasets
X_train['Age Group'] = X_train['Age at Injury'].apply(categorize_age)
X_test['Age Group'] = X_test['Age at Injury'].apply(categorize_age)

# Check the first few records to ensure the new feature was created correctly
print(X_train[['Age at Injury', 'Age Group']].head())

                  Age at Injury Age Group
Claim Identifier                         
5393875                    31.0     Adult
5393091                    46.0     Adult
5393889                    40.0     Adult
5393887                    61.0       Old
5393848                    48.0     Adult


In [13]:
# Count the frequency of each type of injury in 'WCIO Cause of Injury Description' or 'WCIO Nature of Injury Description'
common_injuries = X_train['WCIO Cause of Injury Description'].value_counts().nlargest(5).index


print("Most common injury causes:", common_injuries)

# We defined a function to create the binary indicator
def injury_indicator(injury_type):
    return 1 if injury_type in common_injuries else 0

# Apply the function to create a new column 'Frequent Injury Cause' in both train and test datasets
X_train['Frequent Injury Cause'] = X_train['WCIO Cause of Injury Description'].apply(injury_indicator)
X_test['Frequent Injury Cause'] = X_test['WCIO Cause of Injury Description'].apply(injury_indicator)

# Display the first few rows to verify the new feature
print(X_train[['WCIO Cause of Injury Description', 'Frequent Injury Cause']].head())

Most common injury causes: Index(['LIFTING', 'FELLOW WORKER, PATIENT OR OTHER PERSON',
       'STRAIN OR INJURY BY, NOC', 'FALL, SLIP OR TRIP, NOC', 'ON SAME LEVEL'],
      dtype='object', name='WCIO Cause of Injury Description')
                 WCIO Cause of Injury Description  Frequent Injury Cause
Claim Identifier                                                        
5393875              FROM LIQUID OR GREASE SPILLS                      0
5393091                         REPETITIVE MOTION                      0
5393889            OBJECT BEING LIFTED OR HANDLED                      0
5393887           HAND TOOL, UTENSIL; NOT POWERED                      0
5393848                CUT, PUNCTURE, SCRAPE, NOC                      0


In [14]:
# Define a mapping function for broader body part categories
def categorize_body_part(body_part_code):
    if body_part_code in [10, 11, 12, 13]:  # Example codes for upper limbs
        return 'Upper Limbs'
    elif body_part_code in [20, 21, 22, 23]:  # Example codes for lower limbs
        return 'Lower Limbs'
    elif body_part_code in [30, 31, 32]:  # Example codes for trunk
        return 'Trunk'
    elif body_part_code in [40, 41]:  # Example codes for head and neck
        return 'Head/Neck'
    elif body_part_code in [50, 51]:  # Example codes for multiple body parts
        return 'Multiple'
    else:
        return 'Other'

# Apply the mapping function to create a new 'Broad Body Part' feature
X_train['Broad Body Part'] = X_train['WCIO Part Of Body Code'].apply(categorize_body_part)
X_test['Broad Body Part'] = X_test['WCIO Part Of Body Code'].apply(categorize_body_part)

# Display the first few rows to verify the new feature
print(X_train[['WCIO Part Of Body Code', 'Broad Body Part']].head())

                  WCIO Part Of Body Code Broad Body Part
Claim Identifier                                        
5393875                             62.0           Other
5393091                             38.0           Other
5393889                             10.0     Upper Limbs
5393887                             36.0           Other
5393848                             36.0           Other


In [15]:
X_train.to_csv('datasets/feature_engineering_train_delivery1.csv')
X_test.to_csv('datasets/feature_engineering_test_delivery1.csv')