In [37]:
# Standard libraries
import numpy as np
import pandas as pd
import warnings
import zipfile, io

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick

# Statistical libraries
import scipy.stats as ss
from scipy import stats
from scipy.stats import f_oneway
from scipy.sparse import csr_matrix

# Scikit-learn preprocessing and model selection
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Scikit-learn feature selection
from sklearn.feature_selection import f_classif, SelectKBest, mutual_info_classif, RFE, RFECV

# Scikit-learn models
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Scikit-learn metrics
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, classification_report

# Pickle for import and export of datasets
import pickle

# Seaborn settings
sns.set()

# Ignore warnings
warnings.filterwarnings('ignore')

# Enable inline plotting for Jupyter notebooks
%matplotlib inline

In [38]:
X_train = pd.read_csv('datasets/final_data_train_delivery1.csv', index_col=0)
y_train = pd.read_csv('datasets/scaled_target_train_delivery1.csv', index_col=0)
X_test = pd.read_csv('datasets/final_data_test_delivery1.csv', index_col=0)

# 4. Modelling

**Random Forest**

In [40]:
# Initialize and fit the model
rf_model = RandomForestClassifier(class_weight="balanced", random_state=42)

In [41]:
scorers = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

# Perform cross-validation for each metric
for metric_name, scorer in scorers.items():
    cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring=scorer)
    print(f"Cross-validation {metric_name.capitalize()} for Random Forest: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


Cross-validation Accuracy for Random Forest: 0.7573 ± 0.0014


KeyboardInterrupt: 

In [42]:
rf_model.fit(X_train, y_train)

In [43]:
# Obtain predictions for test set
pred_RF = rf_model.predict(X_test)

**Gradient Boosting**

In [None]:
# Initialize and fit the model
#gb_model = GradientBoostingClassifier(random_state=42)
#gb_model.fit(X_train, y_train)

In [None]:
'''
# Perform cross-validation for each metric
for metric_name, scorer in scorers.items():
    cv_scores = cross_val_score(gb_model, X_train, y_train, cv=5, scoring=scorer)
    print(f"Cross-validation {metric_name.capitalize()} for Gradient Boosting: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
'''

'\n# Perform cross-validation for each metric\nfor metric_name, scorer in scorers.items():\n    cv_scores = cross_val_score(gb_model, X_train, y_train, cv=5, scoring=scorer)\n    print(f"Cross-validation {metric_name.capitalize()} for Gradient Boosting: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")\n'

In [None]:
#pred_GB = gb_model.predict(X_test)

In [None]:
from sklearn.neural_network import MLPClassifier


model_complex = MLPClassifier(hidden_layer_sizes=(100,100,100,100))

In [None]:
model_complex.fit(X_train, y_train)

In [None]:
predictNN = model_complex.predict(X_train, y_train)

**Kaggle Submissions**

In [47]:
# Reset index to include 'Claim Identifier' as a column if it's the index
X_test = X_test.reset_index()

label_mapping = {
    0: "1. CANCELLED",
    1: "2. NON-COMP",
    2: "3. MED ONLY",
    3: "4. TEMPORARY",
    4: "5. PPD SCH LOSS",
    5: "6. PPD NSL",
    6: "7. PTD",
    7: "8. DEATH",
    8: "Unknown"
}

predictions = [label_mapping[label] for label in pred_RF]

# Create the submission DataFrame
submission_df = pd.DataFrame({
    'Claim Identifier': X_test['Claim Identifier'],  # Ensure this column exists
    'Claim Injury Type': predictions                     # Replace 'pred_RF' with your predictions
})

# Save to CSV
submission_df.to_csv('submission.csv', index=False)

