# Libraries

In [1]:
# General Libraries
import json
import numpy as np
import pandas as pd
import os
import ast
import inspect
from collections.abc import Iterable

# Base Classes & Estimators
from sklearn.base import BaseEstimator, TransformerMixin

# Pipeline & Model Construction
from imblearn.pipeline import Pipeline as imPipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

# Preprocessing & Transformation
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import NeighborhoodComponentsAnalysis

# Feature Selection
from sklearn.feature_selection import RFE, SelectKBest

# Handling Imbalance
from imblearn.over_sampling import SMOTE

# Models
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Model Tuning & Cross-validation
from sklearn.model_selection import GridSearchCV, cross_validate, StratifiedKFold, train_test_split

# Model Evaluation & Scoring
from sklearn.metrics import classification_report, confusion_matrix, make_scorer


# Custom Classes

In [2]:
class OutlierClipper(BaseEstimator, TransformerMixin):
    """
    Clip every column to bounds learned from the training data.

    The bounds are either fixed percentiles of each column or the classic
    IQR fences (q1 - 1.5 * IQR, q3 + 1.5 * IQR), depending on `use_iqr`.
    """

    def __init__(self, lower_percentile=0.005, upper_percentile=0.995, use_iqr=False):
        """
        Initialize the OutlierClipper with options for percentile clipping or IQR-based clipping.

        Parameters:
        - lower_percentile: float, lower bound percentile for clipping (if percentiles are used)
        - upper_percentile: float, upper bound percentile for clipping (if percentiles are used)
        - use_iqr: bool, whether to use IQR method for determining bounds
        """
        self.lower_percentile = lower_percentile
        self.upper_percentile = upper_percentile
        self.use_iqr = use_iqr

    def fit(self, X, y=None):
        """
        Fit the clipping bounds based on the training dataset using the specified method (percentiles or IQR).

        Parameters:
        - X: numpy.ndarray or pandas.DataFrame, the dataset used for fitting
        - y: ignored, not used for fitting

        Returns:
        - self: fitted instance of the class

        Raises:
        - ValueError: if percentile clipping is requested and the percentiles
          do not satisfy 0 <= lower_percentile <= upper_percentile <= 1
        """
        # Validate here (not in __init__) so the estimator stays clone-friendly
        if not self.use_iqr and not (0 <= self.lower_percentile <= self.upper_percentile <= 1):
            raise ValueError(
                "lower_percentile and upper_percentile must satisfy "
                "0 <= lower_percentile <= upper_percentile <= 1"
            )

        # Convert to DataFrame if input is numpy array
        X = pd.DataFrame(X) if not isinstance(X, pd.DataFrame) else X

        # For each column in X, calculate the bounds using the specified method
        self.bounds_ = {}
        for column in X.columns:
            if self.use_iqr:
                q1 = X[column].quantile(0.25)  # 1st quartile
                q3 = X[column].quantile(0.75)  # 3rd quartile
                iqr = q3 - q1  # Interquartile range
                lower_bound = q1 - 1.5 * iqr
                upper_bound = q3 + 1.5 * iqr
            else:
                lower_bound = X[column].quantile(self.lower_percentile)
                upper_bound = X[column].quantile(self.upper_percentile)

            self.bounds_[column] = (lower_bound, upper_bound)

        return self

    def transform(self, X):
        """
        Apply clipping to the dataset based on the fitted bounds.

        The input is never modified: clipping is applied to a copy.

        Parameters:
        - X: numpy.ndarray or pandas.DataFrame, the dataset to transform

        Returns:
        - X: pandas.DataFrame, the transformed dataset with clipped values
        """
        # Always work on a copy so the caller's data is left untouched
        if isinstance(X, pd.DataFrame):
            X = X.copy()
        else:
            X = pd.DataFrame(X, copy=True)

        # Apply clipping for each column that was seen during fit
        for column, (lower_bound, upper_bound) in self.bounds_.items():
            X[column] = X[column].clip(lower=lower_bound, upper=upper_bound)

        return X

In [3]:
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """
    Encode each value by how often it occurred in the training data.

    Values that were not present in a column during `fit` have no entry in
    the frequency table and are therefore mapped to NaN by `transform`.
    """

    def fit(self, X, y=None):
        """
        Learn the per-column value counts.

        Parameters:
        - X: pandas.DataFrame or array-like, the dataset used for fitting
        - y: ignored, not used for fitting

        Returns:
        - self: fitted instance of the class
        """
        # Accept plain arrays as well, like the other custom transformer in this file
        X = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        # Compute the frequency counts for each column in the DataFrame
        self.freq_map = X.apply(pd.Series.value_counts)
        return self

    def transform(self, X):
        """
        Replace every value with the count learned for it in `fit`.

        Parameters:
        - X: pandas.DataFrame or array-like, the dataset to transform

        Returns:
        - pandas.DataFrame with the same columns, holding the frequencies
        """
        X = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        # Look each column up in its own column of the frequency table
        return X.apply(lambda col: col.map(self.freq_map[col.name]))

# Data Ingestion

In [4]:
# Data ingestion with dtype application directly
# (the column -> dtype table is stored next to the data as JSON)
with open('../data/dtypes.json', 'r', encoding='utf-8') as file:
    dtypes = json.load(file)
train = pd.read_csv('../data/preproc_train.csv', index_col=0, dtype=dtypes, low_memory=True)
test = pd.read_csv('../data/preproc_test.csv', index_col=0, dtype=dtypes, low_memory=True)

# Get unique values and create a mapping dictionary: label -> integer code.
# The code is the label's first character minus one, so this assumes every
# label starts with a single digit — TODO confirm against the raw labels.
unique_values = train['claim_injury_type'].unique()
mapping = {value: int(value[0]) - 1 for value in unique_values}

# Handle NaN and target variable creation
train = train.fillna(np.nan)
X = train.drop(columns=['claim_injury_type'])

# Encode the target values once, through `mapping`. The original cell then
# recomputed `y` with an equivalent lambda, which only repeated the work.
y = train['claim_injury_type'].map(mapping).astype(int)

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Numeric columns: these are the ones routed through OutlierClipper in the
# ColumnTransformer-based pipelines.
metric_features = [
    'age_at_injury', 'ime_4_count', 'average_weekly_wage', 'birth_year',
    'number_of_dependents', 'dd_asb_c2', 'dd_asb_c3', 'dd_c2_c3',
    # <event>_date_day / _month / _year for every date column
    *(
        f'{event}_date_{part}'
        for event in ('first_hearing', 'c_2', 'c_3', 'assembly', 'accident')
        for part in ('day', 'month', 'year')
    ),
    # avg_word_emb_dim_0..9 followed by var_word_emb_dim_0..9
    *(
        f'{stat}_word_emb_dim_{dim}'
        for stat in ('avg', 'var')
        for dim in range(10)
    ),
    'euclidean_norm',
]

# NOTE(review): no ColumnTransformer visible in this file references
# `binary_features`; ColumnTransformer drops unlisted columns by default
# (remainder='drop'), so these columns would not reach the models — confirm
# that this is intended.
binary_features = [
    'age_at_injury_zero', 'is_unionized', 'alternative_dispute_resolution',
    'attorney_representative', 'covid_19_indicator',
    # do_1 .. do_16 in lexicographic order (do_1, do_10, ..., do_16, do_2, ..., do_9)
    *sorted(f'do_{idx}' for idx in range(1, 17)),
    # one missing_<field> indicator per field
    *(
        f'missing_{field}'
        for field in (
            'accident_date',
            'age_at_injury',
            'average_weekly_wage',
            'birth_year',
            'c_2_date',
            'c_3_date',
            'first_hearing_date',
            'gender',
            'ime_4_count',
            'industry_code',
            'industry_code_description',
            'wcio_cause_of_injury_code',
            'wcio_cause_of_injury_description',
            'wcio_nature_of_injury_code',
            'wcio_nature_of_injury_description',
            'wcio_part_of_body_code',
            'wcio_part_of_body_description',
            'zip_code',
        )
    ),
]

# Categorical columns, grouped by the encoder applied to them in the pipelines
hot_columns = [
    "carrier_type",
    "part_of_body_group",
    "cause_of_injury_group",
    "medical_fee_region",
]
frequency_columns = [
    "industry_code",
]

In [None]:
# Support-vector machines are very expensive to train, so for that algorithm
# we start from this smaller set of columns.

custom_column_selection = [
    'attorney_representative', 'average_weekly_wage', 'ime_4_count', 'gender', 'age_at_injury',
    # hand-picked subset of the do_* indicators, in this exact order
    *(f'do_{idx}' for idx in (14, 3, 12, 6, 2, 10, 13)),
    'missing_c_3_date', 'missing_first_hearing_date', 'missing_ime_4_count',
    # avg_word_emb_dim_0..9 followed by var_word_emb_dim_0..9
    *(
        f'{stat}_word_emb_dim_{dim}'
        for stat in ('avg', 'var')
        for dim in range(10)
    ),
    'euclidean_norm',
]

# Pipes

Ensure that each model below is declared with the best hyperparameters found during tuning.

In [None]:
# KNN Pipeline
# Encode / clip -> standardize -> impute -> keep 30 features -> oversample -> classify.
knn_pipeline = imPipeline([
    (
        'column_transformer',
        ColumnTransformer([
            ('onehotencoder', OneHotEncoder(handle_unknown='ignore'), hot_columns),
            ('frequencyencoder', FrequencyEncoder(), frequency_columns),
            ('outlier_clipper', OutlierClipper(), metric_features),
        ]),
    ),
    # KNN is distance-based, so features are put on a common scale first
    ('scaler', StandardScaler()),
    # Fill remaining missing values from the 5 nearest rows, equally weighted
    ('KNNimputer', KNNImputer(n_neighbors=5, weights="uniform")),
    # Keep the 30 highest-scoring features (SelectKBest's default score function)
    ('select_kbest', SelectKBest(k=30)),
    # Oversample with synthetic rows built from 3 neighbours; fixed seed
    ('smote', SMOTE(sampling_strategy="auto", k_neighbors=3, random_state=42)),
    # Final estimator: 100 equally weighted neighbours, Minkowski distance with p=1
    ('knn', KNeighborsClassifier(n_neighbors=100, weights='uniform', p=1)),
])

# HistGradientBoosting Pipeline
hist_gradient_boosting_pipeline = imPipeline(steps=[
    ('column_transformer', ColumnTransformer(
        transformers=[
            # handle_unknown='ignore' keeps predict() from failing on categories
            # that were not present in the training data (same as the KNN pipeline)
            ('onehotencoder', OneHotEncoder(handle_unknown='ignore'), hot_columns),
            ('frequencyencoder', FrequencyEncoder(), frequency_columns),
            ('outlier_clipper', OutlierClipper(), metric_features)
        ]
    )),
    ('scaler', MinMaxScaler()),  # MinMaxScaler for scaling the features
    ('simpleimputer', SimpleImputer(strategy='median')),  # Handle missing values
    ('smote', SMOTE(sampling_strategy='auto', k_neighbors=3, random_state=42)),  # Handling class imbalance
    # Feature selection: RFE ranks features with ElasticNet and keeps the top 10.
    # NOTE(review): ElasticNet is a regressor fitted on the integer class codes — confirm this is intended.
    ('feature_selection', RFE(estimator=ElasticNet(alpha=0.001, l1_ratio=0.2), n_features_to_select=10)),
    ('hist_gradient_boosting', HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_iter=200,
        max_depth=2,
    ))  # Final model
])

# XGBoost Pipeline
xgboost_pipeline = imPipeline(steps=[
    ('column_transformer', ColumnTransformer(
        transformers=[
            # handle_unknown='ignore' keeps predict() from failing on categories
            # that were not present in the training data (same as the KNN pipeline)
            ('onehotencoder', OneHotEncoder(handle_unknown='ignore'), hot_columns),
            ('frequencyencoder', FrequencyEncoder(), frequency_columns),
            ('outlier_clipper', OutlierClipper(), metric_features)
        ]
    )),
    ('scaler', MinMaxScaler()),
    ('simpleimputer', SimpleImputer(strategy='most_frequent')),  # Imputation step
    ('rfe', RFE(estimator=LogisticRegression(max_iter=200, penalty='l2', solver='newton-cholesky'), n_features_to_select=50, step=2)),  # Feature selection
    ('smote', SMOTE(sampling_strategy='auto', k_neighbors=3, random_state=42)),  # Oversampling after RFE
    ('xgboost', XGBClassifier(
        n_estimators=100,            # Number of boosting rounds
        max_depth=3,                 # Depth of each tree
        learning_rate=0.05,          # Slower learning rate for smoother convergence
        gamma=6,                     # Stronger regularization to control complexity
        min_child_weight=3,          # Controls complexity
        subsample=0.7,               # Slightly reduced subsampling for more randomness
        colsample_bytree=0.6,        # Column subsampling to reduce overfitting
        booster='gbtree',            # Use gbtree as the tree-based boosting algorithm
        objective='multi:softmax',   # Multi-class classification
        num_class=8,                 # Number of classes
        eval_metric='mlogloss'       # Multi-class evaluation metric (log loss)
    ))  # XGBoost model as the final estimator
])

# Random Forest Pipeline
random_forest_pipeline = imPipeline(steps=[
    ('column_transformer', ColumnTransformer(
        transformers=[
            # handle_unknown='ignore' keeps predict() from failing on categories
            # that were not present in the training data (same as the KNN pipeline)
            ('onehotencoder', OneHotEncoder(handle_unknown='ignore'), hot_columns),
            ('frequencyencoder', FrequencyEncoder(), frequency_columns),
            ('outlier_clipper', OutlierClipper(), metric_features)
        ]
    )),
    ('scaler', StandardScaler()),
    ('simpleimputer', SimpleImputer(strategy='median')),  # Imputation step
    ('rfe', RFE(estimator=LogisticRegression(max_iter=200, penalty='l2', solver='newton-cholesky'), n_features_to_select=50, step=2)),  # Feature selection
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),  # Oversampling after RFE
    ('random_forest', RandomForestClassifier(
        n_estimators=150,                # Number of trees in the forest
        max_depth=4,                      # Maximum depth of the trees
        min_samples_split=8,              # Minimum number of samples required to split an internal node
        min_samples_leaf=4,               # Minimum number of samples required to be at a leaf node
        max_features='log2'               # Number of features to consider when looking for the best split
    ))  # Random forest as the final estimator
])

# MLP Pipeline
mlp_pipeline = imPipeline(steps=[
    ('column_transformer', ColumnTransformer(
        transformers=[
            # OneHotEncoder for categorical variables in 'hot_columns'; handle_unknown='ignore'
            # keeps predict() from failing on categories absent from the training data
            ('onehotencoder', OneHotEncoder(handle_unknown='ignore'), hot_columns),
            ('frequencyencoder', FrequencyEncoder(), frequency_columns),  # Frequency encoding for categorical variables in 'frequency_columns'
            ('outlier_clipper', OutlierClipper(), metric_features)  # Outlier clipping for 'metric_features'
        ]
    )),
    ('scaler', MinMaxScaler()),  # MinMaxScaler to normalize feature values between 0 and 1
    ('knnimputer', KNNImputer(n_neighbors=3, weights='uniform')),  # KNN imputation to fill missing values based on nearest neighbors (uniform weights)
    # Recursive Feature Elimination (RFE) keeping 50 features, dropping 2 per round, ranked by ElasticNet.
    # NOTE(review): ElasticNet is a regressor fitted on the integer class codes — confirm this is intended.
    ('feature_selection', RFE(estimator=ElasticNet(), n_features_to_select=50, step=2)),
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),  # SMOTE to handle class imbalance by generating synthetic samples
    ('mlp', MLPClassifier(
        solver='adam', 
        alpha=0.001, 
        hidden_layer_sizes=(128, 64), 
        learning_rate_init=0.05, 
        max_iter=300
    ))  # Multi-layer Perceptron classifier with 'adam' optimizer and a max of 300 iterations for training
])

# SVC Pipeline
# Unlike the other pipelines this one has no ColumnTransformer: every column it
# receives goes through the same steps, so it is fed a pre-selected column subset.
svc_pipeline = imPipeline(steps=[
    ('outlier_clipper', OutlierClipper()),  # Handle outliers
    ('scaler', StandardScaler()),  # Standardize features
    ('knnimputer', KNNImputer(n_neighbors=5)),  # Impute missing values using KNN
    ('pca', PCA(n_components=0.9)),  # Reduce dimensionality (90% variance)
    # SelectKBest with ANOVA F-test.
    # NOTE(review): k=20 presumes PCA keeps at least 20 components — confirm on the real data.
    ('feature_selection', SelectKBest(k=20)),
    # Handle class imbalance with SMOTE; seeded like the SMOTE step of every
    # other pipeline so the resampling is reproducible
    ('smote', SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42)),
    ('svc', SVC(
        kernel='rbf',  # Radial basis function kernel
        C=0.5,  # Regularization parameter
        class_weight='balanced'  # Handle class imbalance by adjusting weights
    ))  # Support Vector Classifier with RBF kernel
])

# Logistic Regression Pipeline
logistic_regression_pipeline = imPipeline(steps=[
    ('column_transformer', ColumnTransformer(
        transformers=[
            # OneHotEncoder for categorical variables in 'hot_columns'; handle_unknown='ignore'
            # keeps predict() from failing on categories absent from the training data
            ('onehotencoder', OneHotEncoder(handle_unknown='ignore'), hot_columns),
            ('frequencyencoder', FrequencyEncoder(), frequency_columns),  # Frequency encoding for categorical variables in 'frequency_columns'
            ('outlier_clipper', OutlierClipper(), metric_features)  # Outlier clipping for 'metric_features'
        ]
    )),
    ('scaler', MinMaxScaler()),  # MinMaxScaler for scaling the features
    ('simple_imputer', SimpleImputer(strategy='most_frequent')),  # SimpleImputer with most frequent strategy
    # PCA with default settings keeps every component, so this step only
    # decorrelates the features; the reduction itself is done by SelectKBest below.
    # NOTE(review): an earlier comment mentioned retaining 95% variance — set
    # n_components=0.95 if that was the tuned configuration.
    ('pca', PCA()),
    ('feature_selection', SelectKBest(k=30)),  # SelectKBest with ANOVA F-test
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),  # Handling class imbalance
    ('logistic_regression', LogisticRegression(
        C=100,
        class_weight='balanced',  # Handle class imbalance
        max_iter=1000,  # Allow up to 1000 iterations for convergence
        random_state=42  # Set random seed for reproducibility
    ))  # Logistic Regression as the final model
])


In [None]:
# Fit each base model on the training data (X, y)
xgboost_pipeline.fit(X, y)
logistic_regression_pipeline.fit(X, y)
# The SVC pipeline is trained on the reduced column set; a copy is passed so
# the pipeline never works on a slice of X
svc_pipeline.fit(X[custom_column_selection].copy(), y)
hist_gradient_boosting_pipeline.fit(X, y)
# The KNN pipeline is declared as `knn_pipeline`; the name previously used here
# (`kneighbors_pipeline`) is never defined and raised a NameError
knn_pipeline.fit(X, y)
mlp_pipeline.fit(X, y)
random_forest_pipeline.fit(X, y)

# Generate predictions for the test set (test): one column per base model,
# indexed like `test` so each row stays traceable to its claim
predictions = pd.DataFrame(index=test.index)

predictions['logistic_regression_pipeline'] = logistic_regression_pipeline.predict(test)
predictions['xgboost_pipeline'] = xgboost_pipeline.predict(test)
predictions['svc_pipeline'] = svc_pipeline.predict(test[custom_column_selection].copy())
predictions['hist_gradient_boosting_pipeline'] = hist_gradient_boosting_pipeline.predict(test)
# Column name kept as 'kneighbors_pipeline' so downstream code sees the same features
predictions['kneighbors_pipeline'] = knn_pipeline.predict(test)
predictions['mlp_pipeline'] = mlp_pipeline.predict(test)
predictions['random_forest_pipeline'] = random_forest_pipeline.predict(test)

In [None]:
# Helper that jitters values with uniform random noise
def add_random_perturbation(data, perturbation_range=0.1):
    """
    Return `data` plus uniform noise drawn from [-perturbation_range, perturbation_range).

    Parameters:
    - data: array-like exposing `.shape` (e.g. numpy.ndarray or pandas.Series)
    - perturbation_range: float, half-width of the noise interval

    Returns:
    - the element-wise sum of `data` and the noise; the noise comes from the
      global numpy random state, so seed it with `np.random.seed` for
      reproducible results
    """
    noise = np.random.uniform(
        low=-perturbation_range,
        high=perturbation_range,
        size=data.shape,
    )
    return data + noise

# Build the meta-model's training matrix from out-of-fold predictions on the
# TRAINING data. The meta-model was previously fitted on the base models'
# TEST-set predictions against the training labels `y`; those rows do not
# correspond to `y` (and normally differ in number), so that fit was invalid.
from sklearn.model_selection import cross_val_predict

# (meta-feature name, pipeline, training features). Names and order mirror the
# columns of the test-set `predictions` frame, so the meta-model sees the same
# features at fit and predict time.
base_models = [
    ('logistic_regression_pipeline', logistic_regression_pipeline, X),
    ('xgboost_pipeline', xgboost_pipeline, X),
    ('svc_pipeline', svc_pipeline, X[custom_column_selection].copy()),
    ('hist_gradient_boosting_pipeline', hist_gradient_boosting_pipeline, X),
    ('kneighbors_pipeline', knn_pipeline, X),
    ('mlp_pipeline', mlp_pipeline, X),
    ('random_forest_pipeline', random_forest_pipeline, X),
]

stacking_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

train_meta_features = pd.DataFrame(index=X.index)
for column, pipeline, features in base_models:
    # cross_val_predict works on clones, so the pipelines fitted on the full
    # training set are left intact. NOTE: every pipeline is refitted once per
    # fold, which is expensive (especially for the SVC pipeline).
    train_meta_features[column] = cross_val_predict(pipeline, features, y, cv=stacking_cv)

# Apply random perturbation to each of the columns of the training meta-features
perturbed_predictions = train_meta_features.copy()

for column in train_meta_features.columns:
    perturbed_predictions[column] = add_random_perturbation(train_meta_features[column])

# Meta-model: learns how to combine the base models' predicted class codes
meta_model = LogisticRegression()
meta_model.fit(perturbed_predictions, y)

In [None]:
# Get the final predictions for the test set by feeding the base-model
# predictions (the `predictions` frame, one column per pipeline) to the
# meta-model. A separate name is used so re-running this cell does not
# overwrite the base-model predictions.
final_predictions = meta_model.predict(predictions)

# Decode the predictions using the inverse of the label -> code `mapping`
inverse_mapping = {code: label for label, code in mapping.items()}
decoded_predictions = pd.Series(
    final_predictions,
    index=test.index,  # keep the test set's index so each row stays identifiable
    name='claim_injury_type',  # NOTE(review): adjust if the consumer expects another header
).map(inverse_mapping)

# Convert the index back to a column
predictions_with_index = decoded_predictions.reset_index()

# Export predictions with index to CSV
predictions_with_index.to_csv('predictions_with_index.csv', index=False)