# Libraries

In [9]:
# General Libraries
import json
import numpy as np
import pandas as pd
import os
import ast
import inspect
from collections.abc import Iterable

# Base Classes & Estimators
from sklearn.base import BaseEstimator, TransformerMixin

# Pipeline & Model Construction
from imblearn.pipeline import Pipeline as imPipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Preprocessing & Transformation
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA

# Feature Selection
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest

# Model Selection & Estimators
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Model Evaluation & Scoring
from sklearn.metrics import classification_report, confusion_matrix, make_scorer

# Handling Imbalance
from imblearn.over_sampling import SMOTE

# Model Tuning & Cross-validation
from sklearn.model_selection import GridSearchCV, cross_validate, StratifiedKFold


# Custom Classes

In [2]:
class OutlierClipper(BaseEstimator, TransformerMixin):
    """
    Clip every column to bounds learned during `fit`.

    The bounds are either fixed percentiles of the training data or the
    classic 1.5 * IQR fences, depending on `use_iqr`.
    """

    def __init__(self, lower_percentile=0.005, upper_percentile=0.995, use_iqr=False):
        """
        Initialize the OutlierClipper with options for percentile clipping or IQR-based clipping.

        Parameters:
        - lower_percentile: float, lower bound percentile for clipping (if percentiles are used)
        - upper_percentile: float, upper bound percentile for clipping (if percentiles are used)
        - use_iqr: bool, whether to use IQR method for determining bounds
        """
        self.lower_percentile = lower_percentile
        self.upper_percentile = upper_percentile
        self.use_iqr = use_iqr

    def fit(self, X, y=None):
        """
        Fit the clipping bounds based on the training dataset using the specified method (percentiles or IQR).

        Parameters:
        - X: numpy.ndarray or pandas.DataFrame, the dataset used for fitting
        - y: ignored, not used for fitting

        Returns:
        - self: fitted instance of the class
        """
        # Convert to DataFrame if input is numpy array
        X = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        # For each column in X, calculate the bounds using the specified method
        self.bounds_ = {}
        for column in X.columns:
            if self.use_iqr:
                q1 = X[column].quantile(0.25)  # 1st quartile
                q3 = X[column].quantile(0.75)  # 3rd quartile
                iqr = q3 - q1  # Interquartile range
                lower_bound = q1 - 1.5 * iqr
                upper_bound = q3 + 1.5 * iqr
            else:
                lower_bound = X[column].quantile(self.lower_percentile)
                upper_bound = X[column].quantile(self.upper_percentile)

            self.bounds_[column] = (lower_bound, upper_bound)

        return self

    def transform(self, X):
        """
        Apply clipping to the dataset based on the fitted bounds.

        The input is never modified: clipping is applied to a copy.

        Parameters:
        - X: numpy.ndarray or pandas.DataFrame, the dataset to transform

        Returns:
        - X: pandas.DataFrame, the transformed dataset with clipped values
        """
        # Convert to DataFrame if input is numpy array
        X = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        # Work on a copy so the caller's data (or the array backing the
        # frame) is not overwritten by the column assignments below
        X = X.copy()

        # Apply clipping for each column that was seen during fit
        for column, (lower_bound, upper_bound) in self.bounds_.items():
            X[column] = X[column].clip(lower=lower_bound, upper=upper_bound)

        return X

    def set_output(self, transform="default"):
        """
        Enable compatibility with scikit-learn's `set_output` functionality.

        The requested format is only recorded in `output_format`; it is not
        read anywhere in this class, and `transform` returns a DataFrame
        regardless of the value.

        Parameters:
        - transform: str, the output format ("default" or "pandas").

        Returns:
        - self: the same instance, to allow chaining
        """
        self.output_format = transform
        return self


In [3]:
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """
    Replace each value by the number of times it occurred in the fitting data.

    Counts are computed independently for every column.
    """

    def fit(self, X, y=None):
        """
        Learn the per-column value counts.

        Parameters:
        - X: numpy.ndarray or pandas.DataFrame, the dataset used for fitting
        - y: ignored, not used for fitting

        Returns:
        - self: fitted instance of the class
        """
        # Convert to DataFrame if input is numpy array
        X = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        # Compute the frequency counts for each column in the DataFrame.
        # The result is indexed by the union of all observed values, with
        # NaN where a value does not occur in a given column.
        self.freq_map = X.apply(pd.Series.value_counts)
        return self

    def transform(self, X):
        """
        Map every value to its count from the fitting data.

        Values that were not seen in the corresponding column during `fit`
        are mapped to NaN.

        Parameters:
        - X: numpy.ndarray or pandas.DataFrame, the dataset to transform

        Returns:
        - pandas.DataFrame with the same columns as X, holding the counts
        """
        # Convert to DataFrame if input is numpy array
        X = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        # Apply the frequency counts to transform the data
        return X.apply(lambda col: col.map(self.freq_map[col.name]))

# Data Ingestion

In [4]:
# The dtype specification is shared by both files, so it is parsed only once
with open('../data/dtypes.json', 'r') as file:
    dtypes = json.load(file)

# Data is ingested from the working directory, with the index set to col_0 and dtypes applied directly
train = pd.read_csv('../data/preproc_train.csv', index_col=0, dtype=dtypes, low_memory=True)
test = pd.read_csv('../data/preproc_test.csv', index_col=0, dtype=dtypes, low_memory=True)

# Convert all NaN values to np.nan
train = train.fillna(np.nan)

# Separate features (X) and target
X = train.drop(columns=['claim_injury_type']).copy()

# Drop missing targets BEFORE parsing them: indexing a missing value with
# [0] would raise, so a dropna() after the mapping could never take effect
target = train['claim_injury_type'].dropna()

# Map y to the first character of target, shifted to start at zero
y = target.map(lambda label: int(label[0]) - 1).astype(int)

# Align X and y based on y's indices
X = X.loc[y.index]

# Total number of rows wanted in the working sample
SAMPLE_SIZE = 10000

# Ensure all rows where y is in [6, 7] (the two highest encoded classes kept
# in full) are included
rows_with_78 = y[y.isin([6, 7])].index

# Extract these rows
X_with_78 = X.loc[rows_with_78]
y_with_78 = y.loc[rows_with_78]

# Exclude rows with y values [6, 7] from the rest of the dataset
X_remaining = X.drop(index=rows_with_78)
y_remaining = y.drop(index=rows_with_78)

# Calculate the remaining sample size, kept within what can actually be
# drawn: never negative and never more than the rows that are left
remaining_sample_size = min(max(SAMPLE_SIZE - len(X_with_78), 0), len(X_remaining))

# Sample the required number of rows from the remaining dataset
X_sampled = X_remaining.sample(remaining_sample_size, random_state=42)
y_sampled = y_remaining.loc[X_sampled.index]

# Combine the rows for y values [6, 7] with the sampled rows
X = pd.concat([X_with_78, X_sampled])
y = pd.concat([y_with_78, y_sampled])

In [5]:
# Numeric features. The date parts and the word-embedding statistics follow
# regular naming schemes, so they are generated instead of being spelled out.
metric_features = [
    'age_at_injury',
    'ime_4_count',
    'average_weekly_wage',
    'birth_year',
    'number_of_dependents',
    'dd_asb_c2',
    'dd_asb_c3',
    'dd_c2_c3',
    # <date>_day, <date>_month, <date>_year for every date column
    *[
        f'{date_column}_{part}'
        for date_column in (
            'first_hearing_date',
            'c_2_date',
            'c_3_date',
            'assembly_date',
            'accident_date',
        )
        for part in ('day', 'month', 'year')
    ],
    # Mean and variance of the 10 word-embedding dimensions
    *[f'avg_word_emb_dim_{dim}' for dim in range(10)],
    *[f'var_word_emb_dim_{dim}' for dim in range(10)],
    'euclidean_norm',
]

# 0/1 indicator features
binary_features = [
    'age_at_injury_zero',
    'is_unionized',
    'alternative_dispute_resolution',
    'attorney_representative',
    'covid_19_indicator',
    # do_1 ... do_16 in lexicographic order (do_1, do_10, ..., do_16, do_2, ...)
    *sorted(f'do_{number}' for number in range(1, 17)),
    # Missing-value flags
    *[
        f'missing_{field}'
        for field in (
            'accident_date',
            'age_at_injury',
            'average_weekly_wage',
            'birth_year',
            'c_2_date',
            'c_3_date',
            'first_hearing_date',
            'gender',
            'ime_4_count',
            'industry_code',
            'industry_code_description',
            'wcio_cause_of_injury_code',
            'wcio_cause_of_injury_description',
            'wcio_nature_of_injury_code',
            'wcio_nature_of_injury_description',
            'wcio_part_of_body_code',
            'wcio_part_of_body_description',
            'zip_code',
        )
    ],
]

# Categorical columns that are one-hot encoded
hot_columns = ["carrier_type", "part_of_body_group", "cause_of_injury_group", "medical_fee_region"]

# Categorical columns that are frequency encoded
frequency_columns = ["industry_code"]

# Pipeline

In [11]:
# Full modelling pipeline: encode/clip -> scale -> impute -> select -> resample -> classify.
#
# NOTE(review): `binary_features` is not listed in the ColumnTransformer, so
# with the default remainder='drop' those columns never reach the model —
# confirm that this is intended.
# NOTE(review): RFE ranks features with ElasticNet, a regressor that is fitted
# here on the integer class labels — confirm this is the intended selector.
pipeline = imPipeline(steps=[
    ('column_transformer', ColumnTransformer(
        transformers=[
            # handle_unknown='ignore': a category that is absent from a
            # training fold but present in the validation fold is encoded as
            # all zeros instead of raising during cross-validation
            ('onehotencoder', OneHotEncoder(handle_unknown='ignore'), hot_columns),
            ('frequencyencoder', FrequencyEncoder(), frequency_columns),
            ('outlier_clipper', OutlierClipper(), metric_features)
        ],
        # Always stack to a dense array: the one-hot block is sparse and
        # MinMaxScaler does not accept sparse input
        sparse_threshold=0
    )),
    ('scaler', MinMaxScaler()),  # MinMaxScaler for scaling the features
    ('knnimputer', KNNImputer()),  # Handle missing values
    ('feature_selection', RFE(estimator=ElasticNet(), step=2)),  # Feature selection using RFE
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),  # Handling class imbalance
    ('mlp', MLPClassifier())  # Multi-layer Perceptron Classifier
])


In [12]:
# Search space grouped by pipeline step; the step name is prepended to every
# parameter below using scikit-learn's `<step>__<parameter>` convention.
step_grids = {
    # RFE: how many features survive the elimination
    'feature_selection': {
        'n_features_to_select': [25, 50],
    },
    # KNNImputer: neighbourhood size and neighbour weighting
    'knnimputer': {
        'n_neighbors': [3, 5],
        'weights': ['uniform'],
    },
    # MLPClassifier: architecture, optimizer, iterations, learning rate, L2 penalty
    'mlp': {
        'hidden_layer_sizes': [(64, 32), (128, 64), (256, 128, 64)],
        'solver': ['adam'],
        'max_iter': [300],
        'learning_rate_init': [0.05, 0.1],
        'alpha': [0.001, 0.01],
    },
}

param_grid = {
    f'{step}__{parameter}': candidates
    for step, parameters in step_grids.items()
    for parameter, candidates in parameters.items()
}


# Fit

In [None]:
# Nested cross-validation: the inner 3-fold split tunes the hyperparameters,
# the outer 5-fold split scores the tuned model. Both splitters are
# stratified, shuffled and use the same seed.
inner_cv, outer_cv = (
    StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    for n_folds in (3, 5)
)

# Inner loop: exhaustive grid search, refitted on the best candidate
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=inner_cv,
    scoring='f1_macro',
    refit=True,
    n_jobs=1,
    verbose=2,
)

# Outer loop: every outer fold runs the complete grid search on its training
# part; the fitted searches are kept alongside the train and test scores
cv_results = cross_validate(
    grid_search,
    X,
    y,
    cv=outer_cv,
    scoring="f1_macro",
    return_estimator=True,
    return_train_score=True,
    n_jobs=1,
    verbose=2,
)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__alpha=0.001, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.05, mlp__max_iter=300, mlp__solver=adam; total time=   3.3s
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__alpha=0.001, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.05, mlp__max_iter=300, mlp__solver=adam; total time=   3.7s
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__alpha=0.001, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.05, mlp__max_iter=300, mlp__solver=adam; total time=   3.3s
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__alpha=0.001, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.1, mlp__ma

  arr = np.array(param_list)


[CV] END .................................................... total time=15.4min
Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__alpha=0.001, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.05, mlp__max_iter=300, mlp__solver=adam; total time=   4.0s
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__alpha=0.001, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.05, mlp__max_iter=300, mlp__solver=adam; total time=   4.2s
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__alpha=0.001, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.05, mlp__max_iter=300, mlp__solver=adam; total time=   3.3s
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__al

  arr = np.array(param_list)


[CV] END .................................................... total time=15.8min
Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__alpha=0.001, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.05, mlp__max_iter=300, mlp__solver=adam; total time=   4.0s
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__alpha=0.001, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.05, mlp__max_iter=300, mlp__solver=adam; total time=   5.9s
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__alpha=0.001, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.05, mlp__max_iter=300, mlp__solver=adam; total time=   3.7s
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__al

  arr = np.array(param_list)


[CV] END .................................................... total time=15.1min
Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__alpha=0.001, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.05, mlp__max_iter=300, mlp__solver=adam; total time=   4.4s
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__alpha=0.001, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.05, mlp__max_iter=300, mlp__solver=adam; total time=   5.4s
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__alpha=0.001, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.05, mlp__max_iter=300, mlp__solver=adam; total time=   4.9s
[CV] END feature_selection__n_features_to_select=25, knnimputer__n_neighbors=3, knnimputer__weights=uniform, mlp__al

# Results

In [None]:
# Convert cv_results to a pandas DataFrame
cv_results_df = pd.DataFrame(cv_results)

# Specify the file path (in the current working directory)
file_path = 'cv_results.csv'

# The header is written only when the file is new or still empty, so repeated
# runs append rows without repeating the column names. The check uses os.path
# rather than pandas' internal pd.io.common helper, which is not public API.
write_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

# Append the DataFrame to the CSV file (if it exists) or create a new one
cv_results_df.to_csv(file_path, mode='a', header=write_header, index=False)

print(f"Cross-validation results saved to {file_path}")

In [None]:
# Play an alert sound (presumably to signal that the run above has finished)
from playsound import playsound

sound_file = '../miscellanious/alert.mp3'  # location of the sound file
playsound(sound_file)