In [1]:
# Standard libraries
import numpy as np
import pandas as pd
import warnings
import zipfile, io

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick

# Statistical libraries
import scipy.stats as ss
from scipy import stats
from scipy.stats import f_oneway
from scipy.sparse import csr_matrix

# Scikit-learn preprocessing and model selection
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Scikit-learn feature selection
from sklearn.feature_selection import f_classif, SelectKBest, mutual_info_classif, RFE, RFECV

# Scikit-learn models
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Scikit-learn metrics
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, classification_report

# Pickle for import and export of datasets
import pickle

# Seaborn settings
sns.set()

# Ignore warnings
warnings.filterwarnings('ignore')

# Enable inline plotting for Jupyter notebooks
%matplotlib inline

## **1.2 Importing the datasets**

In [2]:
# load the data (train and test)
X_train = pd.read_csv('datasets/X_train_cleaned.csv', index_col=0)
X_val = pd.read_csv('datasets/X_val_cleaned.csv', index_col=0)
y_train = pd.read_csv('datasets/y_train_cleaned.csv', index_col=0)
y_val = pd.read_csv('datasets/y_val_cleaned.csv', index_col=0)
X_test = pd.read_csv('datasets/X_test_cleaned.csv', index_col=0)

In [3]:
X_train.columns

Index(['Accident Date', 'Age at Injury', 'Alternative Dispute Resolution',
       'Assembly Date', 'Attorney/Representative', 'Average Weekly Wage',
       'Birth Year', 'C-2 Date', 'C-3 Date', 'Carrier Name', 'Carrier Type',
       'County of Injury', 'COVID-19 Indicator', 'District Name',
       'First Hearing Date', 'Gender', 'IME-4 Count', 'Industry Code',
       'Industry Code Description', 'Medical Fee Region',
       'WCIO Cause of Injury Code', 'WCIO Cause of Injury Description',
       'WCIO Nature of Injury Code', 'WCIO Nature of Injury Description',
       'WCIO Part Of Body Code', 'WCIO Part Of Body Description', 'Zip Code',
       'Number of Dependents'],
      dtype='object')

In [4]:
X_train.head(100)

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,Industry Code Description,Medical Fee Region,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5840724,2021-10-20,49.0,0.0,2021-10-22,1,1500.000000,1971.0,2021-10-22,1,STATE INSURANCE FUND,...,ADMINISTRATIVE AND SUPPORT AND WASTE MANAGEMEN...,IV,25.0,FROM DIFFERENT LEVEL (ELEVATION),1.0,NO PHYSICAL INJURY,42.0,LOWER BACK AREA,10304.0,1.0
5828518,2021-10-04,45.0,0.0,2021-10-06,0,1265.750000,1976.0,2021-10-05,0,STATE INSURANCE FUND,...,HEALTH CARE AND SOCIAL ASSISTANCE,IV,2.0,HOT OBJECTS OR SUBSTANCES,4.0,BURN,35.0,HAND,11520.0,3.0
5623558,2020-11-19,33.0,0.0,2020-12-22,1,1973.950000,1986.0,2020-12-22,1,TRAVELERS INDEMNITY CO OF CONN,...,FINANCE AND INSURANCE,IV,97.0,REPETITIVE MOTION,78.0,CARPAL TUNNEL SYNDROME,34.0,WRIST,11358.0,4.0
5630561,2020-12-09,31.0,0.0,2021-01-06,1,750.923333,1989.0,2021-01-15,1,AMERICAN ZURICH INSURANCE CO,...,HEALTH CARE AND SOCIAL ASSISTANCE,IV,83.0,PANDEMIC,83.0,COVID-19,90.0,MULTIPLE BODY PARTS (INCLUDING BODY,11236.0,2.0
5708079,2021-03-10,57.0,0.0,2021-04-26,0,1370.240000,1963.0,2021-04-23,0,AMERICAN CASUALTY CO OF,...,CONSTRUCTION,I,98.0,"CUMULATIVE, NOC",52.0,STRAIN OR TEAR,35.0,HAND,14850.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5857484,2021-11-09,59.0,0.0,2021-11-15,1,1825.000000,1962.0,2021-11-15,1,MEMIC INDEMNITY COMPANY,...,HEALTH CARE AND SOCIAL ASSISTANCE,IV,56.0,LIFTING,52.0,STRAIN OR TEAR,53.0,KNEE,11372.0,3.0
6016034,2021-11-13,32.0,0.0,2022-06-17,1,750.923333,1989.0,2022-06-21,1,ALLMERICA FINANCIAL BENEFIT,...,MANUFACTURING,II,56.0,LIFTING,52.0,STRAIN OR TEAR,42.0,LOWER BACK AREA,14622.0,2.0
5728500,2021-05-17,61.0,0.0,2021-05-21,0,990.853333,1960.0,2021-05-21,0,ARCH INDEMNITY INSURANCE CO.,...,HEALTH CARE AND SOCIAL ASSISTANCE,IV,56.0,LIFTING,52.0,STRAIN OR TEAR,42.0,LOWER BACK AREA,11570.0,5.0
5948662,2022-03-17,57.0,0.0,2022-03-21,1,720.430000,1965.0,2022-03-21,1,STATE INSURANCE FUND,...,"ARTS, ENTERTAINMENT, AND RECREATION",IV,31.0,"FALL, SLIP OR TRIP, NOC",10.0,CONTUSION,35.0,HAND,10452.0,2.0


In [5]:
#Check missing values for X_test

X_test.isnull().sum()

Accident Date                        0
Age at Injury                        0
Alternative Dispute Resolution       0
Assembly Date                        0
Attorney/Representative              0
Average Weekly Wage                  0
Birth Year                           0
C-2 Date                             0
C-3 Date                             0
Carrier Name                         0
Carrier Type                         0
County of Injury                     0
COVID-19 Indicator                   0
District Name                        0
First Hearing Date                   0
Gender                               0
IME-4 Count                          0
Industry Code                        0
Industry Code Description            0
Medical Fee Region                   0
WCIO Cause of Injury Code            0
WCIO Cause of Injury Description     0
WCIO Nature of Injury Code           0
WCIO Nature of Injury Description    0
WCIO Part Of Body Code               0
WCIO Part Of Body Descrip

In [6]:
X_train.columns

Index(['Accident Date', 'Age at Injury', 'Alternative Dispute Resolution',
       'Assembly Date', 'Attorney/Representative', 'Average Weekly Wage',
       'Birth Year', 'C-2 Date', 'C-3 Date', 'Carrier Name', 'Carrier Type',
       'County of Injury', 'COVID-19 Indicator', 'District Name',
       'First Hearing Date', 'Gender', 'IME-4 Count', 'Industry Code',
       'Industry Code Description', 'Medical Fee Region',
       'WCIO Cause of Injury Code', 'WCIO Cause of Injury Description',
       'WCIO Nature of Injury Code', 'WCIO Nature of Injury Description',
       'WCIO Part Of Body Code', 'WCIO Part Of Body Description', 'Zip Code',
       'Number of Dependents'],
      dtype='object')

In [7]:
#TODO Remove this cell when preprocess is run again. Joao 17-12 12:04

u_indexes_train = X_train[X_train['Gender'] == 'U'].index
x_indexes_train = X_train[X_train['Gender'] == 'X'].index

u_indexes_val = X_val[X_val['Gender'] == 'U'].index
x_indexes_val = X_val[X_val['Gender'] == 'X'].index

X_test = X_test[~X_test.index.isin(u_indexes_val)]
X_test = X_test[~X_test.index.isin(x_indexes_val)]






In [8]:
#Check 'Gender' value counts

print(X_train['Gender'].value_counts())

Gender
1    258145
0    183659
Name: count, dtype: int64


In [9]:
#check for 'Gender' missing values on x_test

print(X_test['Gender'].isnull().sum())

0


In [10]:
# Check the rows where 'Gender' has missing values on test

X_test[X_test['Gender'].isna()]

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,Industry Code Description,Medical Fee Region,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


# New Features

## Dates derivatives

In [11]:
#Check if 'Accident Date' has NaT values

X_test['Accident Date'].isnull().sum()

0

In [12]:
# Ensure 'Assembly Date' and 'Accident Date' are in datetime format
X_train['Assembly Date'] = pd.to_datetime(X_train['Assembly Date'], errors='coerce')
X_train['Accident Date'] = pd.to_datetime(X_train['Accident Date'], errors='coerce')
X_train['C-2 Date'] = pd.to_datetime(X_train['C-2 Date'], errors='coerce')
X_val['Assembly Date'] = pd.to_datetime(X_val['Assembly Date'], errors='coerce')
X_val['Accident Date'] = pd.to_datetime(X_val['Accident Date'], errors='coerce')
X_val['C-2 Date'] = pd.to_datetime(X_val['C-2 Date'], errors='coerce')
X_test['Assembly Date'] = pd.to_datetime(X_test['Assembly Date'], errors='coerce')
X_test['Accident Date'] = pd.to_datetime(X_test['Accident Date'], errors='coerce')
X_test['C-2 Date'] = pd.to_datetime(X_test['C-2 Date'], errors='coerce')

# Extract year, month and day from 'Assembly Date', 'Accident Date' and C-2 Date

X_train['Assembly Year'] = X_train['Assembly Date'].dt.year
X_train['Accident Year'] = X_train['Accident Date'].dt.year
X_train['C-2 Date Year'] = X_train['C-2 Date'].dt.year
X_train['Assembly Month'] = X_train['Assembly Date'].dt.month
X_train['Accident Month'] = X_train['Accident Date'].dt.month
X_train['C-2 Date Month'] = X_train['C-2 Date'].dt.month


X_val['Assembly Year'] = X_val['Assembly Date'].dt.year
X_val['Accident Year'] = X_val['Accident Date'].dt.year
X_val['C-2 Date Year'] = X_val['C-2 Date'].dt.year
X_val['Assembly Month'] = X_val['Assembly Date'].dt.month
X_val['Accident Month'] = X_val['Accident Date'].dt.month
X_val['C-2 Date Month'] = X_val['C-2 Date'].dt.month

X_test['Assembly Year'] = X_test['Assembly Date'].dt.year
X_test['Accident Year'] = X_test['Accident Date'].dt.year
X_test['C-2 Date Year'] = X_test['C-2 Date'].dt.year
X_test['Assembly Month'] = X_test['Assembly Date'].dt.month
X_test['Accident Month'] = X_test['Accident Date'].dt.month
X_test['C-2 Date Month'] = X_test['C-2 Date'].dt.month

## Age Group

The ‘Age group’ can provide valuable information on the relationship between age and the type of injury claims that occur. There are types of injuries that are more likely to occur in a certain age group due to the nature of physical development, activity level and age-related health risks. 

For example, adults between the ages of 25 and 55 are the most physically active and may have injuries related to incidents in the workplace, thus giving rise to specific accident patterns such as repetitive strain injuries or back problems, but older adults between the ages of 55 and 65 may suffer injuries due to ageing, such as slips and falls, resulting in claims for fractures or joint injuries.

In [13]:
# Define a function to categorize ages into age groups
def categorize_age(age):
    if 0 <= age < 14:
        return 'Child'
    elif 14 <= age < 25:
        return 'Young'
    elif 25 <= age < 55:
        return 'Adult'
    elif 55 <= age <= 65:
        return 'Old'
    else:
        return 'Other'  # For ages outside the defined ranges

# Apply the function to create the new 'Age Group' feature in all datasets
X_train['Age Group'] = X_train['Age at Injury'].apply(categorize_age)
X_val['Age Group'] = X_val['Age at Injury'].apply(categorize_age)
X_test['Age Group'] = X_test['Age at Injury'].apply(categorize_age)

# Verify the new feature creation for all datasets
print("X_train: Age Group feature creation")
print(X_train[['Age at Injury', 'Age Group']].head())

print("\nX_val: Age Group feature creation")
print(X_val[['Age at Injury', 'Age Group']].head())

print("\nX_test: Age Group feature creation")
print(X_test[['Age at Injury', 'Age Group']].head())


X_train: Age Group feature creation
                  Age at Injury Age Group
Claim Identifier                         
5840724                    49.0     Adult
5828518                    45.0     Adult
5623558                    33.0     Adult
5630561                    31.0     Adult
5708079                    57.0       Old

X_val: Age Group feature creation
                  Age at Injury Age Group
Claim Identifier                         
6003273                    28.0     Adult
5659715                    36.0     Adult
6070148                    65.0       Old
6108041                    53.0     Adult
5519272                    31.0     Adult

X_test: Age Group feature creation
                  Age at Injury Age Group
Claim Identifier                         
6165911                      19     Young
6166141                      19     Young
6165907                      59       Old
6166047                      55       Old
6166102                      25     Adult


## Frequent Injury Cause

This variable identifies whether the cause of the injury is among the five most common types of workplace injuries. The reason for creating this binary indicator is that workplace injuries often follow specific patterns, with certain causes of injury being more common than others. By highlighting these frequent causes, the feature allows the model to focus on the most representative and predictable injury categories.


In [14]:
# Count the frequency of each type of injury in 'WCIO Cause of Injury Description'
common_injuries = X_train['WCIO Cause of Injury Description'].value_counts().nlargest(5).index

print("Most common injury causes:", common_injuries)

# Define a function to create the binary indicator
def injury_indicator(injury_type):
    return 1 if injury_type in common_injuries else 0

# Apply the function to create a new column 'Frequent Injury Cause' in all datasets
X_train['Frequent Injury Cause'] = X_train['WCIO Cause of Injury Description'].apply(injury_indicator)
X_val['Frequent Injury Cause'] = X_val['WCIO Cause of Injury Description'].apply(injury_indicator)
X_test['Frequent Injury Cause'] = X_test['WCIO Cause of Injury Description'].apply(injury_indicator)

# Display the first few rows to verify the new feature
print("X_train: Frequent Injury Cause")
print(X_train[['WCIO Cause of Injury Description', 'Frequent Injury Cause']].head())

print("\nX_val: Frequent Injury Cause")
print(X_val[['WCIO Cause of Injury Description', 'Frequent Injury Cause']].head())

print("\nX_test: Frequent Injury Cause")
print(X_test[['WCIO Cause of Injury Description', 'Frequent Injury Cause']].head())


Most common injury causes: Index(['LIFTING', 'FELLOW WORKER, PATIENT OR OTHER PERSON',
       'STRAIN OR INJURY BY, NOC', 'FALL, SLIP OR TRIP, NOC', 'ON SAME LEVEL'],
      dtype='object', name='WCIO Cause of Injury Description')
X_train: Frequent Injury Cause
                  WCIO Cause of Injury Description  Frequent Injury Cause
Claim Identifier                                                         
5840724           FROM DIFFERENT LEVEL (ELEVATION)                      0
5828518                  HOT OBJECTS OR SUBSTANCES                      0
5623558                          REPETITIVE MOTION                      0
5630561                                   PANDEMIC                      0
5708079                            CUMULATIVE, NOC                      0

X_val: Frequent Injury Cause
                 WCIO Cause of Injury Description  Frequent Injury Cause
Claim Identifier                                                        
6003273                                  TWIS

## Broad Body Part

The ‘Broad Body Part’ function groups the detailed body part codes from the body part code column of the WCIO into broader, more easily interpretable categories such as ‘Head and Neck’, ‘Upper Extremities’, ‘Trunk’, etc. This categorisation simplifies the detailed information into higher level groupings that are easier to analyse and align with injury patterns. The grouping helps to identify trends that may not be evident in the individual codes. The function includes general categories such as ‘Unclassified’ and ‘Unknown’, which ensure that records with incomplete or unusual codes are still classified, reducing the impact of missing or unusual data in the model.




In [15]:
# Drop duplicates to ensure a one to one mapping of descriptions to codes
body_part_mapping = (
    X_train[['WCIO Part Of Body Description', 'WCIO Part Of Body Code']]
    .drop_duplicates()
    .sort_values(by='WCIO Part Of Body Description') 
    .set_index('WCIO Part Of Body Description')['WCIO Part Of Body Code']
    .to_dict()
)

# Verify that it is correct
for description, code in body_part_mapping.items():
    print(f"Description: {description}, Code: {code}")


Description: ABDOMEN INCLUDING GROIN, Code: 61.0
Description: ANKLE, Code: 55.0
Description: ARTIFICIAL APPLIANCE, Code: 64.0
Description: BODY SYSTEMS AND MULTIPLE BODY SYSTEMS, Code: 91.0
Description: BRAIN, Code: 12.0
Description: BUTTOCKS, Code: 62.0
Description: CHEST, Code: 44.0
Description: DISC, Code: 22.0
Description: EAR(S), Code: 13.0
Description: ELBOW, Code: 32.0
Description: EYE(S), Code: 14.0
Description: FACIAL BONES, Code: 19.0
Description: FINGER(S), Code: 36.0
Description: FOOT, Code: 56.0
Description: GREAT TOE, Code: 58.0
Description: HAND, Code: 35.0
Description: HEART, Code: 49.0
Description: HIP, Code: 51.0
Description: INSUFFICIENT INFO TO PROPERLY IDENTIFY - UNCLASSIFIED, Code: 65.0
Description: INTERNAL ORGANS, Code: 48.0
Description: KNEE, Code: 53.0
Description: LARYNX, Code: 24.0
Description: LOWER ARM, Code: 33.0
Description: LOWER BACK AREA, Code: 42.0
Description: LOWER LEG, Code: 54.0
Description: LUMBAR & OR SACRAL VERTEBRAE (VERTEBRA, Code: 63.0
Desc

In [16]:
# Function to categorize body part codes into broader categories
def categorize_body_part(body_part_code):
    if body_part_code in [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 24, 26]:
        return 'Head and Neck'
    elif body_part_code in [30, 31, 32, 33, 34, 35, 36, 37, 38, 39]:
        return 'Upper Extremities'
    elif body_part_code in [50, 51, 52, 53, 54, 55, 56, 57, 58]:
        return 'Lower Extremities'
    elif body_part_code in [40, 41, 42, 43, 44, 45, 46, 61, 62]:
        return 'Trunk'
    elif body_part_code in [21, 23, 43, 48, 49, 60, 63]:
        return 'Internal Organs'
    elif body_part_code in [90, 91]:
        return 'Multiple Body Parts'
    elif body_part_code in [66, 99]:
        return 'Whole Body'
    elif body_part_code in [65]:
        return 'Unclassified'
    else:
        return 'Unknown'


In [17]:
# Assign broad categories to the body part codes in the training, validation, and test datasets.
# Apply the categorization function to create a new 'Broad Body Part' feature
X_train['Broad Body Part'] = X_train['WCIO Part Of Body Code'].apply(categorize_body_part)
X_val['Broad Body Part'] = X_val['WCIO Part Of Body Code'].apply(categorize_body_part)
X_test['Broad Body Part'] = X_test['WCIO Part Of Body Code'].apply(categorize_body_part)


In [18]:
# Verify the new feature in training, validation, and test datasets
print("X_train: Broad Body Part feature")
print(X_train[['WCIO Part Of Body Code', 'Broad Body Part']].head())

print("\nX_val: Broad Body Part feature")
print(X_val[['WCIO Part Of Body Code', 'Broad Body Part']].head())

print("\nX_test: Broad Body Part feature")
print(X_test[['WCIO Part Of Body Code', 'Broad Body Part']].head())


X_train: Broad Body Part feature
                  WCIO Part Of Body Code      Broad Body Part
Claim Identifier                                             
5840724                             42.0                Trunk
5828518                             35.0    Upper Extremities
5623558                             34.0    Upper Extremities
5630561                             90.0  Multiple Body Parts
5708079                             35.0    Upper Extremities

X_val: Broad Body Part feature
                  WCIO Part Of Body Code    Broad Body Part
Claim Identifier                                           
6003273                             55.0  Lower Extremities
5659715                             25.0            Unknown
6070148                             55.0  Lower Extremities
6108041                             43.0              Trunk
5519272                             52.0  Lower Extremities

X_test: Broad Body Part feature
                  WCIO Part Of Body Code    Broa

In [19]:
# Count the number of 'Unknown' values in the new feature
unknown_train = X_train['Broad Body Part'].value_counts().get('Unknown', 0)
unknown_val = X_val['Broad Body Part'].value_counts().get('Unknown', 0)
unknown_test = X_test['Broad Body Part'].value_counts().get('Unknown', 0)

# Print the results
print(f"Number of unknown values in X_train: {unknown_train}")
print(f"Number of unknown values in X_val: {unknown_val}")
print(f"Number of unknown values in X_test: {unknown_test}")


Number of unknown values in X_train: 5812
Number of unknown values in X_val: 1459
Number of unknown values in X_test: 70433


## Dependency-to-Income Ratio

The ‘Dependency to Income Ratio’ measures the financial burden of dependants on a person's income. It is calculated by dividing the number of dependents by the average weekly wage, representing how much each dollar of weekly income must support dependent family members. Those with a high dependency-to-income ratio may have unique injury patterns related to their socioeconomic status. For example, they may work longer hours or in higher-risk roles to support their dependents.

In [20]:
# Create a new feature: Dependency-to-Income Ratio for all datasets
for dataset in [X_train, X_val, X_test]:
    dataset['Dependency-to-Income Ratio'] = dataset['Number of Dependents'] / (dataset['Average Weekly Wage'] + 1e-9)

    # Handle potential division by zero or missing values
    dataset['Dependency-to-Income Ratio'] = dataset['Dependency-to-Income Ratio'].fillna(0).replace([float('inf'), -float('inf')], 0)

# Display the new feature for verification
print("X_train: Dependency-to-Income Ratio feature")
print(X_train[['Number of Dependents', 'Average Weekly Wage', 'Dependency-to-Income Ratio']].head())

print("\nX_val: Dependency-to-Income Ratio feature")
print(X_val[['Number of Dependents', 'Average Weekly Wage', 'Dependency-to-Income Ratio']].head())

print("\nX_test: Dependency-to-Income Ratio feature")
print(X_test[['Number of Dependents', 'Average Weekly Wage', 'Dependency-to-Income Ratio']].head())


X_train: Dependency-to-Income Ratio feature
                  Number of Dependents  Average Weekly Wage  \
Claim Identifier                                              
5840724                            1.0          1500.000000   
5828518                            3.0          1265.750000   
5623558                            4.0          1973.950000   
5630561                            2.0           750.923333   
5708079                            3.0          1370.240000   

                  Dependency-to-Income Ratio  
Claim Identifier                              
5840724                             0.000667  
5828518                             0.002370  
5623558                             0.002026  
5630561                             0.002663  
5708079                             0.002189  

X_val: Dependency-to-Income Ratio feature
                  Number of Dependents  Average Weekly Wage  \
Claim Identifier                                              
6003273         

## Injury-Location Pair

The feature ‘Injury-location pair’ combines two important attributes: the WCIO code Body part (representing the body part affected) and the WCIO description Nature of injury (describing the type of injury). Some injuries are closely associated with specific parts of the body. A ‘fracture’ is more closely related to the ‘lower extremities’ than to the ‘internal organs’. At the same time, a ‘strain’ may occur predominantly in the ‘back’ or in the ‘upper extremities’. Combining these two pieces of information into a single attribute will allow our models to learn patterns more efficiently, without the need to explicitly calculate interactions between features. This will be especially beneficial for tree-based models, such as random forests, gradient boosting...

In [21]:
# Combine 'WCIO Part Of Body Code' and 'WCIO Nature of Injury Description' into a single feature for all datasets
for dataset in [X_train, X_val, X_test]:
    dataset['Injury-Location Pair'] = (
        dataset['WCIO Part Of Body Code'].astype(str) + " - " + dataset['WCIO Nature of Injury Description'].astype(str)
    )

# Display the new feature for verification
print("X_train: Injury-Location Pair feature")
print(X_train[['WCIO Part Of Body Code', 'WCIO Nature of Injury Description', 'Injury-Location Pair']].head())

print("\nX_val: Injury-Location Pair feature")
print(X_val[['WCIO Part Of Body Code', 'WCIO Nature of Injury Description', 'Injury-Location Pair']].head())

print("\nX_test: Injury-Location Pair feature")
print(X_test[['WCIO Part Of Body Code', 'WCIO Nature of Injury Description', 'Injury-Location Pair']].head())


X_train: Injury-Location Pair feature
                  WCIO Part Of Body Code WCIO Nature of Injury Description  \
Claim Identifier                                                             
5840724                             42.0                NO PHYSICAL INJURY   
5828518                             35.0                              BURN   
5623558                             34.0            CARPAL TUNNEL SYNDROME   
5630561                             90.0                          COVID-19   
5708079                             35.0                    STRAIN OR TEAR   

                           Injury-Location Pair  
Claim Identifier                                 
5840724               42.0 - NO PHYSICAL INJURY  
5828518                             35.0 - BURN  
5623558           34.0 - CARPAL TUNNEL SYNDROME  
5630561                         90.0 - COVID-19  
5708079                   35.0 - STRAIN OR TEAR  

X_val: Injury-Location Pair feature
                  WCIO Part 

## Time Difference Between Events

The ‘Time between events’ feature calculates the difference in years between the year of assembly (which possibly indicates when a claim or event was processed or recorded) and the year of the accident (which indicates when the workplace injury occurred). This characteristic captures the time gap between the occurrence of the injury and its subsequent treatment or processing. A longer time lag may suggest delays in reporting or processing claims, which may correlate with specific types of workplace injuries or practices, and shorter time gaps may correlate with high-priority injuries that require immediate action.

In [22]:
# Calculate the time difference between events for all datasets
for dataset in [X_train, X_val, X_test]:
    dataset['Time Between Events'] = dataset['Assembly Year'] - dataset['Accident Year']

# Display the new feature for verification
print("X_train: Time Between Events feature")
print(X_train[['Accident Year', 'Assembly Year', 'Time Between Events']].head())

print("\nX_val: Time Between Events feature")
print(X_val[['Accident Year', 'Assembly Year', 'Time Between Events']].head())

print("\nX_test: Time Between Events feature")
print(X_test[['Accident Year', 'Assembly Year', 'Time Between Events']].head())


X_train: Time Between Events feature
                  Accident Year  Assembly Year  Time Between Events
Claim Identifier                                                   
5840724                    2021           2021                    0
5828518                    2021           2021                    0
5623558                    2020           2020                    0
5630561                    2020           2021                    1
5708079                    2021           2021                    0

X_val: Time Between Events feature
                  Accident Year  Assembly Year  Time Between Events
Claim Identifier                                                   
6003273                    2022           2022                    0
5659715                    2021           2021                    0
6070148                    2022           2022                    0
6108041                    2022           2022                    0
5519272                    2020           2

## Accident on Weekday

Create a binary variable that indicates whether the accident occurred on a weekday. This feature is based on the assumption that workplace injuries are more likely to occur during the workweek. This feature will hopefuly help the model to identify patterns in the data that are specific to work-related injuries.

In [23]:
for dataset in [X_train, X_val, X_test]:
    dataset['Accident on Weekday'] = dataset['Accident Date'].dt.dayofweek < 5

# Display the new feature for verification
print("X_train: Accident on Weekday feature")
print(X_train[['Accident Date', 'Accident on Weekday']].head())

print("\nX_val: Accident on Weekday feature")
print(X_val[['Accident Date', 'Accident on Weekday']].head())

print("\nX_test: Accident on Weekday feature")
print(X_test[['Accident Date', 'Accident on Weekday']].head())

X_train: Accident on Weekday feature
                 Accident Date  Accident on Weekday
Claim Identifier                                   
5840724             2021-10-20                 True
5828518             2021-10-04                 True
5623558             2020-11-19                 True
5630561             2020-12-09                 True
5708079             2021-03-10                 True

X_val: Accident on Weekday feature
                 Accident Date  Accident on Weekday
Claim Identifier                                   
6003273             2022-05-19                 True
5659715             2021-01-15                 True
6070148             2022-08-16                 True
6108041             2022-09-29                 True
5519272             2020-07-07                 True

X_test: Accident on Weekday feature
                 Accident Date  Accident on Weekday
Claim Identifier                                   
6165911             2022-12-24                False
616614

## Injury Complexity

The characteristic ‘Injury complexity’ is calculated as the product of the WCIO nature of injury code and the number of dependants plus one. This characteristic combines two factors, the nature of injury code, indicating the severity or type of injury, and the number of dependents. By incorporating these two aspects, the feature represents an estimate of how complex an injury claim can be in terms of severity and impact on dependants.

In [24]:
# Calculate injury complexity for all datasets
for dataset in [X_train, X_val, X_test]:
    dataset['Injury Complexity'] = dataset['WCIO Nature of Injury Code'] * (dataset['Number of Dependents'] + 1)

# Display the new feature for verification
print("X_train: Injury Complexity feature")
print(X_train[['WCIO Nature of Injury Code', 'Number of Dependents', 'Injury Complexity']].head())

print("\nX_val: Injury Complexity feature")
print(X_val[['WCIO Nature of Injury Code', 'Number of Dependents', 'Injury Complexity']].head())

print("\nX_test: Injury Complexity feature")
print(X_test[['WCIO Nature of Injury Code', 'Number of Dependents', 'Injury Complexity']].head())


X_train: Injury Complexity feature
                  WCIO Nature of Injury Code  Number of Dependents  \
Claim Identifier                                                     
5840724                                  1.0                   1.0   
5828518                                  4.0                   3.0   
5623558                                 78.0                   4.0   
5630561                                 83.0                   2.0   
5708079                                 52.0                   3.0   

                  Injury Complexity  
Claim Identifier                     
5840724                         2.0  
5828518                        16.0  
5623558                       390.0  
5630561                       249.0  
5708079                       208.0  

X_val: Injury Complexity feature
                  WCIO Nature of Injury Code  Number of Dependents  \
Claim Identifier                                                     
6003273                           

## Accident density by carrier

The ‘Carrier Accident Density’ feature measures the proportion of accidents associated with each carrier relative to the total number of records in the training dataset. This feature provides insight into whether the frequency of claims varies significantly by carrier, which could influence the likelihood of certain types of injury claims occurring. For example, carriers with a higher accident density might show different casualty patterns, such as a higher proportion of specific injury types or severity. Through normalising accident counts relative to the size of the training dataset, the function allows for better comparison across carriers and facilitates generalisation to validation and test datasets.

In [25]:
# Calculate accident counts per carrier in the training dataset
carrier_accident_counts = X_train['Carrier Name'].value_counts()

# Create the Carrier Accident Density feature for all datasets
for dataset in [X_train, X_val, X_test]:
    dataset['Carrier Accident Density'] = dataset['Carrier Name'].map(carrier_accident_counts) / len(X_train)

# Fill NaN values with the mean accident density from X_train
default_density = carrier_accident_counts.mean() / len(X_train)
for dataset in [X_val, X_test]:
    dataset['Carrier Accident Density'].fillna(default_density, inplace=True)

# Display the feature for verification
print("X_train: Carrier Accident Density feature")
print(X_train[['Carrier Name', 'Carrier Accident Density']].head())

print("\nX_val: Carrier Accident Density feature")
print(X_val[['Carrier Name', 'Carrier Accident Density']].head())

print("\nX_test: Carrier Accident Density feature")
print(X_test[['Carrier Name', 'Carrier Accident Density']].head())


X_train: Carrier Accident Density feature
                                    Carrier Name  Carrier Accident Density
Claim Identifier                                                          
5840724                     STATE INSURANCE FUND                  0.195872
5828518                     STATE INSURANCE FUND                  0.195872
5623558           TRAVELERS INDEMNITY CO OF CONN                  0.004475
5630561             AMERICAN ZURICH INSURANCE CO                  0.030767
5708079                  AMERICAN CASUALTY CO OF                  0.002562

X_val: Carrier Accident Density feature
                                  Carrier Name  Carrier Accident Density
Claim Identifier                                                        
6003273                A I U INSURANCE COMPANY                  0.022345
5659715                   STATE INSURANCE FUND                  0.195872
6070148           COSTCO WHOLESALE CORPORATION                  0.001876
6108041              WAL-MA

## Injury Season

This feature categorizes the month of an accident into one of four seasons: Winter, Spring, Summer, and Autumn. This can help capture seasonal patterns in workplace injuries.

In [26]:
def injury_season(month):
    if month in [12, 1, 2]: return 'Winter'
    elif month in [3, 4, 5]: return 'Spring'
    elif month in [6, 7, 8]: return 'Summer'
    else: return 'Autumn'

for dataset in [X_train, X_val, X_test]:
    dataset['Season of Accident'] = dataset['Accident Month'].apply(injury_season)


In [27]:
print("X_train: Season of Accident feature")
print(X_train[['Accident Month', 'Season of Accident']].head())

print("\nX_val: Season of Accident feature")
print(X_val[['Accident Month', 'Season of Accident']].head())

print("\nX_test: Season of Accident feature")
print(X_test[['Accident Month', 'Season of Accident']].head())


X_train: Season of Accident feature
                  Accident Month Season of Accident
Claim Identifier                                   
5840724                       10             Autumn
5828518                       10             Autumn
5623558                       11             Autumn
5630561                       12             Winter
5708079                        3             Spring

X_val: Season of Accident feature
                  Accident Month Season of Accident
Claim Identifier                                   
6003273                        5             Spring
5659715                        1             Winter
6070148                        8             Summer
6108041                        9             Autumn
5519272                        7             Summer

X_test: Season of Accident feature
                  Accident Month Season of Accident
Claim Identifier                                   
6165911                     12.0             Winter
6166141  

## Region Risk Level

This feature calculates the relative risk level of each county or zip code based on the frequency of claims in the training dataset. The counties or zip codes are then grouped into categories such as Low Risk, Medium Risk, and High Risk.

In [28]:
county_claim_counts = X_train['County of Injury'].value_counts(normalize=True)

In [29]:
#Verify the new column
X_train['Region Risk Percentage'] = X_train['County of Injury'].map(county_claim_counts)

In [30]:
for dataset in [X_val, X_test]:
    dataset['Region Risk Percentage'] = dataset['County of Injury'].map(county_claim_counts)

print(X_train['Region Risk Percentage'].describe())

count    441804.000000
mean          0.054564
std           0.036141
min           0.000240
25%           0.020453
50%           0.051550
75%           0.093012
max           0.103496
Name: Region Risk Percentage, dtype: float64


In [31]:
# Based on the statistics from describe(), we define the thresholds for risk levels
def assign_risk_level(percentage):
    if percentage <= 0.0205:  # Bottom 25%
        return 'Low Risk'
    elif percentage <= 0.0930:  # Between 25% and 75%
        return 'Medium Risk'
    else:  # Top 25%
        return 'High Risk'

# Apply the risk level assignment to all datasets
for dataset in [X_train, X_val, X_test]:
    dataset['Region Risk Level'] = dataset['Region Risk Percentage'].apply(assign_risk_level)

# Risk level distribution in the training set
print("Risk Level Distribution in Training Set:")
print(X_train['Region Risk Level'].value_counts(normalize=True))

# Sample data with risk percentages and levels for verification
print("Sample of Risk Percentages with Risk Levels:")
print(X_train[['County of Injury', 'Region Risk Percentage', 'Region Risk Level']].drop_duplicates().sort_values(by='Region Risk Percentage', ascending=False))

Risk Level Distribution in Training Set:
Region Risk Level
Medium Risk    0.443287
High Risk      0.298096
Low Risk       0.258617
Name: proportion, dtype: float64
Sample of Risk Percentages with Risk Levels:
                 County of Injury  Region Risk Percentage Region Risk Level
Claim Identifier                                                           
6117254                   SUFFOLK                0.103496         High Risk
5623558                    QUEENS                0.101588         High Risk
5630561                     KINGS                0.093012         High Risk
5828518                    NASSAU                0.074535       Medium Risk
5794346                     BRONX                0.069952       Medium Risk
...                           ...                     ...               ...
6119140                   WYOMING                0.001385          Low Risk
5945123                MONTGOMERY                0.001082          Low Risk
6093921                 SCHOHAR

## Industry Risk Index

This feature calculates the frequency of claims for each industry in the training dataset and assigns industries a Low Risk, Medium Risk, or High Risk category based on the frequency percentile.

In [32]:
# Calculate claim frequencies for each industry in the training dataset
industry_claim_counts = X_train['Industry Code'].value_counts(normalize=True)

# Map claim frequencies to the training dataset
X_train['Industry Claim Percentage'] = X_train['Industry Code'].map(industry_claim_counts)

In [33]:
# Get statistics for 'Industry Claim Percentage'
print(X_train['Industry Claim Percentage'].describe())

count    441804.000000
mean          0.097457
std           0.070553
min           0.000593
25%           0.031869
50%           0.078474
75%           0.160601
max           0.204579
Name: Industry Claim Percentage, dtype: float64


In [34]:
# Define thresholds for industry risk levels based on the statistics
def assign_industry_risk(percentage):
    if percentage <= 0.0318:  # Bottom 25%
        return 'Low Risk'
    elif percentage <= 0.1606:  # Between 25% and 75%
        return 'Medium Risk'
    else:  # Top 25%
        return 'High Risk'

# Apply the thresholds to all datasets
for dataset in [X_train, X_val, X_test]:
    # Map the industry claim percentages
    dataset['Industry Claim Percentage'] = dataset['Industry Code'].map(industry_claim_counts)
    # Assign industry risk levels
    dataset['Industry Risk Level'] = dataset['Industry Claim Percentage'].apply(assign_industry_risk)

In [35]:
print("X_train: Industry Risk Level feature")
print(X_train[['Industry Code', 'Industry Claim Percentage', 'Industry Risk Level']].head())

print("\nX_val: Industry Risk Level feature")
print(X_val[['Industry Code', 'Industry Claim Percentage', 'Industry Risk Level']].head())

print("\nX_test: Industry Risk Level feature")
print(X_test[['Industry Code', 'Industry Claim Percentage', 'Industry Risk Level']].head())

X_train: Industry Risk Level feature
                  Industry Code  Industry Claim Percentage Industry Risk Level
Claim Identifier                                                              
5840724                    56.0                   0.036985         Medium Risk
5828518                    62.0                   0.204579           High Risk
5623558                    52.0                   0.014332            Low Risk
5630561                    62.0                   0.204579           High Risk
5708079                    23.0                   0.059311         Medium Risk

X_val: Industry Risk Level feature
                  Industry Code  Industry Claim Percentage Industry Risk Level
Claim Identifier                                                              
6003273                    45.0                   0.031869         Medium Risk
5659715                    92.0                   0.160601           High Risk
6070148                    45.0                   0.031869

## C2 and C3 Indicators

This feature set represents the presence (1) or absence (0) of the C-2 and C-3 forms, with their combinations. They may help represent the effect of claims once one or both forms had been submitted

* Both C2 and C3: Indicates claims where both forms are submitted.
* Only C2: Indicates claims where only the C-2 form is submitted.
* Only C3: Indicates claims where only the C-3 form is submitted.
* No C2 or C3: Indicates claims where neither form is submitted.


In [36]:
for dataset in [X_train, X_val, X_test]:
    # Both forms are present
    dataset['Both C2 and C3'] = ((dataset['C-2 Date'] == 1) & (dataset['C-3 Date'] == 1)).astype(int)

    # Only C2 is present
    dataset['Only C2'] = ((dataset['C-2 Date'] == 1) & (dataset['C-3 Date'] == 0)).astype(int)

    # Only C3 is present
    dataset['Only C3'] = ((dataset['C-3 Date'] == 1) & (dataset['C-2 Date'] == 0)).astype(int)

    # Neither form is present
    dataset['No C2 or C3'] = ((dataset['C-2 Date'] == 0) & (dataset['C-3 Date'] == 0)).astype(int)


In [37]:
print("X_train: C2 and C3 Indicators")
print(X_train[['C-2 Date', 'C-3 Date', 'Both C2 and C3', 'Only C2', 'Only C3', 'No C2 or C3']].head())

print("\nX_val: C2 and C3 Indicators")
print(X_val[['C-2 Date', 'C-3 Date', 'Both C2 and C3', 'Only C2', 'Only C3', 'No C2 or C3']].head())

print("\nX_test: C2 and C3 Indicators")
print(X_test[['C-2 Date', 'C-3 Date', 'Both C2 and C3', 'Only C2', 'Only C3', 'No C2 or C3']].head())


X_train: C2 and C3 Indicators
                   C-2 Date  C-3 Date  Both C2 and C3  Only C2  Only C3  \
Claim Identifier                                                          
5840724          2021-10-22         1               0        0        0   
5828518          2021-10-05         0               0        0        0   
5623558          2020-12-22         1               0        0        0   
5630561          2021-01-15         1               0        0        0   
5708079          2021-04-23         0               0        0        0   

                  No C2 or C3  
Claim Identifier               
5840724                     0  
5828518                     0  
5623558                     0  
5630561                     0  
5708079                     0  

X_val: C2 and C3 Indicators
                   C-2 Date  C-3 Date  Both C2 and C3  Only C2  Only C3  \
Claim Identifier                                                          
6003273          2022-05-31         0    

In [38]:
'''# Check unique values in C-2 Date
unique_c2_test = X_test['C-2 Date'].unique()
print(f"Unique values in 'C-2 Date' in X_test: {unique_c2_test}")

# Check unique values in C-3 Date
unique_c3_test = X_test['C-3 Date'].unique()
print(f"Unique values in 'C-3 Date' in X_test: {unique_c3_test}")

'''

'# Check unique values in C-2 Date\nunique_c2_test = X_test[\'C-2 Date\'].unique()\nprint(f"Unique values in \'C-2 Date\' in X_test: {unique_c2_test}")\n\n# Check unique values in C-3 Date\nunique_c3_test = X_test[\'C-3 Date\'].unique()\nprint(f"Unique values in \'C-3 Date\' in X_test: {unique_c3_test}")\n\n'

## Geographic-Industry Risk Interaction

This feature will calculate the relative frequency of claims within each (County, Industry) pair, providing insight into high-risk areas and industries. It captures localized industry trends, such as certain industries being riskier in specific counties.

In [39]:
# Calculate claim frequencies for (County, Industry) pairs
geo_industry_claim_counts = (
    X_train.groupby(['County of Injury', 'Industry Code']).size()
    / len(X_train)
)
for dataset in [X_train, X_val, X_test]:
    dataset['Geo-Industry Risk'] = dataset.set_index(['County of Injury', 'Industry Code']).index.map(geo_industry_claim_counts)
for dataset in [X_train, X_val, X_test]:
    dataset['Geo-Industry Risk'].fillna(0, inplace=True)


In [40]:
print("X_train: Geo-Industry Risk feature")
print(X_train[['County of Injury', 'Industry Code', 'Geo-Industry Risk']].head())

print("\nX_val: Geo-Industry Risk feature")
print(X_val[['County of Injury', 'Industry Code', 'Geo-Industry Risk']].head())

print("\nX_test: Geo-Industry Risk feature")
print(X_test[['County of Injury', 'Industry Code', 'Geo-Industry Risk']].head())


X_train: Geo-Industry Risk feature
                 County of Injury  Industry Code  Geo-Industry Risk
Claim Identifier                                                   
5840724                  RICHMOND           56.0           0.000783
5828518                    NASSAU           62.0           0.015228
5623558                    QUEENS           52.0           0.001148
5630561                     KINGS           62.0           0.022573
5708079                  TOMPKINS           23.0           0.000091

X_val: Geo-Industry Risk feature
                 County of Injury  Industry Code  Geo-Industry Risk
Claim Identifier                                                   
6003273                  SARATOGA           45.0           0.000797
5659715                   SUFFOLK           92.0           0.016935
6070148                    MONROE           45.0           0.001478
6108041                 SCHOHARIE           45.0           0.000063
5519272                    QUEENS           23.

## Age at Injury

In [41]:
for dataset in [X_train, X_val, X_test]:
    dataset['Age at Injury'] = dataset['Accident Year'] - dataset['Birth Year']


In [42]:
for col in ['Birth Year', 'Accident Year']:
    missing_train = X_train[col].isna().sum()
    missing_val = X_val[col].isna().sum()
    missing_test = X_test[col].isna().sum()
    print(f"Missing values in '{col}':")
    print(f"  X_train: {missing_train}")
    print(f"  X_val: {missing_val}")
    print(f"  X_test: {missing_test}")


Missing values in 'Birth Year':
  X_train: 0
  X_val: 0
  X_test: 0
Missing values in 'Accident Year':
  X_train: 0
  X_val: 0
  X_test: 2439


## Dropping the date columns

In [43]:
#Drop 'Accident Date', 'Assembly Date' and 'C-2 Date' columns

X_train.drop(['Accident Date', 'Assembly Date', 'C-2 Date'], axis=1, inplace=True)
X_val.drop(['Accident Date', 'Assembly Date', 'C-2 Date'], axis=1, inplace=True)
X_test.drop(['Accident Date', 'Assembly Date', 'C-2 Date'], axis=1, inplace=True)

In [44]:
X_test.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,Carrier Name,Carrier Type,County of Injury,COVID-19 Indicator,...,Season of Accident,Region Risk Percentage,Region Risk Level,Industry Claim Percentage,Industry Risk Level,Both C2 and C3,Only C2,Only C3,No C2 or C3,Geo-Industry Risk
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6165911,19.0,0.0,0,892.62,2003.0,0,INDEMNITY INSURANCE CO OF,1A. PRIVATE,BRONX,0,...,Winter,0.069952,Medium Risk,0.064538,Medium Risk,0,0,0,0,0.006847
6166141,19.0,0.0,0,892.62,2003.0,0,A I U INSURANCE COMPANY,1A. PRIVATE,QUEENS,0,...,Autumn,0.101588,High Risk,0.031869,Medium Risk,0,0,0,0,0.001974
6165907,59.0,0.0,0,0.0,1963.0,0,AMGUARD INSURANCE COMPANY,1A. PRIVATE,WESTCHESTER,0,...,Winter,0.044465,Medium Risk,0.036985,Medium Risk,0,0,0,0,0.001618
6166047,2022.0,0.0,0,0.0,0.0,0,INDEMNITY INS. OF N AMERICA,1A. PRIVATE,QUEENS,0,...,Winter,0.101588,High Risk,0.064538,Medium Risk,0,0,0,0,0.014597
6166102,25.0,0.0,0,0.0,1997.0,0,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,KINGS,0,...,Winter,0.093012,High Risk,0.000593,Low Risk,0,0,0,0,3.4e-05


In [45]:
# Display the columns of X_train
print("Columns in X_train:")
print(X_train.columns)

# Display the columns of X_val
print("\nColumns in X_val:")
print(X_val.columns)

# Display the columns of X_test
print("\nColumns in X_test:")
print(X_test.columns)


Columns in X_train:
Index(['Age at Injury', 'Alternative Dispute Resolution',
       'Attorney/Representative', 'Average Weekly Wage', 'Birth Year',
       'C-3 Date', 'Carrier Name', 'Carrier Type', 'County of Injury',
       'COVID-19 Indicator', 'District Name', 'First Hearing Date', 'Gender',
       'IME-4 Count', 'Industry Code', 'Industry Code Description',
       'Medical Fee Region', 'WCIO Cause of Injury Code',
       'WCIO Cause of Injury Description', 'WCIO Nature of Injury Code',
       'WCIO Nature of Injury Description', 'WCIO Part Of Body Code',
       'WCIO Part Of Body Description', 'Zip Code', 'Number of Dependents',
       'Assembly Year', 'Accident Year', 'C-2 Date Year', 'Assembly Month',
       'Accident Month', 'C-2 Date Month', 'Age Group',
       'Frequent Injury Cause', 'Broad Body Part',
       'Dependency-to-Income Ratio', 'Injury-Location Pair',
       'Time Between Events', 'Accident on Weekday', 'Injury Complexity',
       'Carrier Accident Density', 'Sea

In [46]:
#Display the shape of the x_train, y_train, x_val, y_val and x_test
# Put X's shape side by side with y's shape

print(f"X_train shape: {X_train.shape} y_train shape: {y_train.shape}")

print(f"X_val shape: {X_val.shape} y_val shape: {y_val.shape}")

print(f"X_test shape: {X_test.shape}")

X_train shape: (441804, 50) y_train shape: (441804, 1)
X_val shape: (110586, 50) y_val shape: (110586, 1)
X_test shape: (382362, 50)


In [47]:
# Display the shapes
print(f"X_train shape: {X_train.shape} y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape} y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")

# Display the columns of each DataFrame
print("\nColumns in X_train:")
print(X_train.columns)

print("\nColumns in X_val:")
print(X_val.columns)

print("\nColumns in X_test:")
print(X_test.columns)


X_train shape: (441804, 50) y_train shape: (441804, 1)
X_val shape: (110586, 50) y_val shape: (110586, 1)
X_test shape: (382362, 50)

Columns in X_train:
Index(['Age at Injury', 'Alternative Dispute Resolution',
       'Attorney/Representative', 'Average Weekly Wage', 'Birth Year',
       'C-3 Date', 'Carrier Name', 'Carrier Type', 'County of Injury',
       'COVID-19 Indicator', 'District Name', 'First Hearing Date', 'Gender',
       'IME-4 Count', 'Industry Code', 'Industry Code Description',
       'Medical Fee Region', 'WCIO Cause of Injury Code',
       'WCIO Cause of Injury Description', 'WCIO Nature of Injury Code',
       'WCIO Nature of Injury Description', 'WCIO Part Of Body Code',
       'WCIO Part Of Body Description', 'Zip Code', 'Number of Dependents',
       'Assembly Year', 'Accident Year', 'C-2 Date Year', 'Assembly Month',
       'Accident Month', 'C-2 Date Month', 'Age Group',
       'Frequent Injury Cause', 'Broad Body Part',
       'Dependency-to-Income Ratio', 'In

In [48]:
# Check for NaN values in X_train, X_val, and X_test
print("NaN values in X_train:")
print(X_train.isna().sum()[X_train.isna().sum() > 0])

print("\nNaN values in X_val:")
print(X_val.isna().sum()[X_val.isna().sum() > 0])

print("\nNaN values in X_test:")
print(X_test.isna().sum()[X_test.isna().sum() > 0])

# Optionally, confirm if there are any NaNs at all
print("\nAre there any NaNs in X_train? ", X_train.isna().any().any())
print("Are there any NaNs in X_val? ", X_val.isna().any().any())
print("Are there any NaNs in X_test? ", X_test.isna().any().any())


NaN values in X_train:
Series([], dtype: int64)

NaN values in X_val:
Series([], dtype: int64)

NaN values in X_test:
Age at Injury          2439
Accident Year          2439
Accident Month         2439
Time Between Events    2439
dtype: int64

Are there any NaNs in X_train?  False
Are there any NaNs in X_val?  False
Are there any NaNs in X_test?  True


In [49]:
#check missing values for test

missing_values = X_test.isnull().sum()

missing_values = missing_values[missing_values > 0]

missing_values


Age at Injury          2439
Accident Year          2439
Accident Month         2439
Time Between Events    2439
dtype: int64

In [50]:
# # Calculate the mode of the 'Gender' column in X_test
# gender_mode = X_test['Gender'].mode()[0]

# # Fill missing values in the 'Gender' column with the mode
# X_test['Gender'].fillna(gender_mode, inplace=True)

# X_test = X_test



In [51]:
#Check X_test 'Gender column'

X_test['Gender'].head()

Claim Identifier
6165911    1
6166141    0
6165907    0
6166047    0
6166102    1
Name: Gender, dtype: int64

In [52]:
#check missing values for test

missing_values = X_test.isnull().sum()

missing_values = missing_values[missing_values > 0]

missing_values

Age at Injury          2439
Accident Year          2439
Accident Month         2439
Time Between Events    2439
dtype: int64

In [53]:
missing_values = X_train.isnull().sum()

missing_values = missing_values[missing_values > 0]

missing_values

Series([], dtype: int64)

In [54]:
missing_values = X_val.isnull().sum()

missing_values = missing_values[missing_values > 0]

missing_values

Series([], dtype: int64)

In [55]:
#Check the shape of the datasets

print(f"X_train shape: {X_train.shape} y_train shape: {y_train.shape}")

print(f"X_val shape: {X_val.shape} y_val shape: {y_val.shape}")

print(f"X_test shape: {X_test.shape}")

X_train shape: (441804, 50) y_train shape: (441804, 1)
X_val shape: (110586, 50) y_val shape: (110586, 1)
X_test shape: (382362, 50)


In [56]:
X_train.to_csv('datasets/feature_engineering_train_delivery1.csv')
X_val.to_csv('datasets/feature_engineering_val_delivery1.csv')
y_train.to_csv('datasets/feature_engineering_y_train_delivery1.csv')
y_val.to_csv('datasets/feature_engineering_y_val_delivery1.csv')
X_test.to_csv('datasets/feature_engineering_test_delivery1.csv')