In [1]:
# Standard libraries
import numpy as np
import pandas as pd
import warnings
import zipfile, io

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick

# Statistical libraries
import scipy.stats as ss
from scipy import stats
from scipy.stats import f_oneway
from scipy.sparse import csr_matrix

# Scikit-learn preprocessing and model selection
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Scikit-learn feature selection
from sklearn.feature_selection import f_classif, SelectKBest, mutual_info_classif, RFE, RFECV

# Scikit-learn models
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Scikit-learn metrics
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, classification_report

# Pickle for import and export of datasets
import pickle

# Seaborn settings
sns.set()

# Ignore warnings
warnings.filterwarnings('ignore')

# Enable inline plotting for Jupyter notebooks
%matplotlib inline

In [2]:
X_train = pd.read_csv('datasets/final_data_train_delivery1.csv', index_col=0)
y_train = pd.read_csv('datasets/scaled_target_train_delivery1.csv', index_col=0)
X_test = pd.read_csv('datasets/final_data_test_delivery1.csv', index_col=0)

# NOTE FOR NUNO 

## I am making predictions for both X_train and X_val ##

Predictions on X_train:
Why?
To evaluate how well the model has learned the training data. This can help detect:

Underfitting: If the training accuracy or F1 score is low, the model may not be complex enough to learn the patterns in the data.

Overfitting: If the training accuracy is much higher than the validation accuracy, the model may have memorized the training data rather than generalizing.

Predictions on X_val:
Why?

To evaluate how well the model generalizes to unseen data (validation set). This is the primary metric for model performance and gives an indication of how the model will perform on real-world data.



# 4. Modelling

**Logistic Regression**

In [None]:
#NOTE FOR NUNO: TO MAKE SURE IT RUNS;
#Ensure that y_train and y_val are encoded as numeric labels
#Ensure X_train, X_val, y_train, and y_val are defined and properly preprocessed.
#Need to be scaled

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Initialize the Logistic Regression model
logistic_model = LogisticRegression()
# Fit the model on the training data
logistic_model.fit(X_train, y_train)

In [None]:
# Make predictions on the training set
train_pred = logistic_model.predict(X_train)

# Make predictions on the validation set
val_pred = logistic_model.predict(X_val)

In [None]:
# Evaluate performance on the training data (optional, for overfitting detection)
print("Training Data Evaluation:")
print(f"Accuracy (Train): {accuracy_score(y_train, train_pred)}")
print(f"F1 Score (Train): {f1_score(y_train, train_pred, average='weighted')}")

# Evaluate performance on the validation data
print("\nValidation Data Evaluation:")
print(f"Accuracy (Validation): {accuracy_score(y_val, val_pred)}")
print(f"F1 Score (Validation): {f1_score(y_val, val_pred, average='weighted')}")

# Generate classification report for validation
print("\nClassification Report (Validation):")
print(classification_report(y_val, val_pred))

In [None]:
# Generate  the confusion matrix
conf_matrix = confusion_matrix(y_val, val_pred)

# Convert the confusion matrix into a DataFrame
conf_matrix_df = pd.DataFrame(
    conf_matrix, 
    index=[f"Actual_{label}" for label in logistic_model.classes_], 
    columns=[f"Predicted_{label}" for label in logistic_model.classes_]
)
print("\nConfusion Matrix as DataFrame:")
print(conf_matrix_df)

# Visualize the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=logistic_model.classes_, yticklabels=logistic_model.classes_)
plt.title("Confusion Matrix (Validation)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
#ADD HERE THE SCORE FROM KAGGLE= 

**Random Forest**

In [None]:
# tenho 1 meu, tenho de correr primeiro e depois checkar este

In [None]:
'''Requirements for this to run
X_train, y_train: Training feature set and target labels.
X_val, y_val: Validation feature set and target labels.

Define n_estimators_range with reasonable values.Larger values of n_estimators will increase model complexity and runtime.


In [None]:
# Define the function to find the best number of trees (from the earlier example)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def find_best_n_estimators(X_train, y_train, X_val, y_val, n_estimators_range):
    results = []

    for n in n_estimators_range:
        rf_model = RandomForestClassifier(n_estimators=n, random_state=42, class_weight='balanced')
        rf_model.fit(X_train, y_train)
        val_pred = rf_model.predict(X_val)
        f1 = f1_score(y_val, val_pred, average='weighted')
        results.append({'n_estimators': n, 'F1 Score': f1})
        print(f"n_estimators: {n}, F1 Score: {f1}")

    results_df = pd.DataFrame(results)
    best_n_estimators = results_df.loc[results_df['F1 Score'].idxmax(), 'n_estimators']
    print(f"\nBest n_estimators: {best_n_estimators} with F1 Score: {results_df['F1 Score'].max()}")

    plt.figure(figsize=(10, 6))
    plt.plot(results_df['n_estimators'], results_df['F1 Score'], marker='o')
    plt.title('F1 Score vs. Number of Trees (n_estimators)')
    plt.xlabel('Number of Trees (n_estimators)')
    plt.ylabel('F1 Score')
    plt.grid()
    plt.show()

    return best_n_estimators, results_df

In [None]:
# Step 1: Define the range of n_estimators
n_estimators_range = range(10, 201, 10)

In [None]:
# Step 2: Find the best number of trees
best_n, results_df = find_best_n_estimators(X_train, y_train, X_val, y_val, n_estimators_range)

In [None]:
# Step 3: Train the final Random Forest model with the best number of trees
random_forest_model = RandomForestClassifier(
    n_estimators=best_n, 
    random_state=42, 
    class_weight='balanced'
)

random_forest_model.fit(X_train, y_train)

# Step 4: Make predictions and evaluate the final model
train_pred = random_forest_model.predict(X_train)
val_pred = random_forest_model.predict(X_val)

In [None]:
print("\nTraining Data Evaluation:")
print(f"Accuracy (Train): {accuracy_score(y_train, train_pred)}")
print(f"F1 Score (Train): {f1_score(y_train, train_pred, average='weighted')}")

print("\nValidation Data Evaluation:")
print(f"Accuracy (Validation): {accuracy_score(y_val, val_pred)}")
print(f"F1 Score (Validation): {f1_score(y_val, val_pred, average='weighted')}")

print("\nClassification Report (Validation):")
print(classification_report(y_val, val_pred))

In [None]:
conf_matrix = confusion_matrix(y_val, val_pred)
conf_matrix_df = pd.DataFrame(
    conf_matrix,
    index=[f"Actual_{label}" for label in random_forest_model.classes_],
    columns=[f"Predicted_{label}" for label in random_forest_model.classes_]
)

print("\nConfusion Matrix as DataFrame:")
print(conf_matrix_df)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=random_forest_model.classes_, yticklabels=random_forest_model.classes_)
plt.title("Confusion Matrix (Validation)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
#ADD HERE THE SCORE FROM KAGGLE= 

**Gradient Boosting**

In [None]:
# tenho 1 meu, tenho de correr primeiro e depois checkar este

In [16]:
'''Requirements for this to run
Training and Validation Data:
X_train, y_train: Training features and target labels.
X_val, y_val: Validation features and target labels.
Preprocessing:
Feature data (X_train, X_val) must be numeric.
Target labels (y_train, y_val) should be categorical or binary for classification tasks.
Handle missing values, encoding, and scaling before using the data.

Parameter Grid:

You need to specify a parameter grid to explore combinations of:
n_estimators: Number of boosting stages.
learning_rate: Controls the contribution of each tree.
max_depth: Depth of the trees.
subsample: Proportion of samples used for training each base learner.


SyntaxError: incomplete input (2584167092.py, line 1)

In [14]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Define the model
gb_model = GradientBoostingClassifier(random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],         # Number of boosting stages
    'learning_rate': [0.01, 0.1, 0.2],     # Learning rate
    'max_depth': [3, 5, 7],                # Maximum depth of the individual estimators
    'subsample': [0.8, 1.0],               # Fraction of samples for fitting base learners
}

In [None]:
# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=gb_model, 
    param_grid=param_grid, 
    scoring='f1_weighted',  # Use F1-weighted score as the metric
    cv=3,                   # 3-fold cross-validation
    verbose=2,              # Verbosity for updates
    n_jobs=-1               # Use all processors
)

In [None]:
# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Retrieve the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("\nBest Parameters:")
print(best_params)


In [None]:
# Make predictions with the best model
train_pred = best_model.predict(X_train)
val_pred = best_model.predict(X_val)

# Evaluate performance on the training data
print("Training Data Evaluation:")
print(f"Accuracy (Train): {accuracy_score(y_train, train_pred)}")
print(f"F1 Score (Train): {f1_score(y_train, train_pred, average='weighted')}")

# Evaluate performance on the validation data
print("\nValidation Data Evaluation:")
print(f"Accuracy (Validation): {accuracy_score(y_val, val_pred)}")
print(f"F1 Score (Validation): {f1_score(y_val, val_pred, average='weighted')}")

# Generate classification report for validation data
print("\nClassification Report (Validation):")
print(classification_report(y_val, val_pred))

In [None]:
# Generate confusion matrix for validation data
conf_matrix = confusion_matrix(y_val, val_pred)

# Convert confusion matrix to a DataFrame
conf_matrix_df = pd.DataFrame(
    conf_matrix,
    index=[f"Actual_{label}" for label in best_model.classes_],
    columns=[f"Predicted_{label}" for label in best_model.classes_]
)

# Display the confusion matrix as a DataFrame
print("\nConfusion Matrix as DataFrame:")
print(conf_matrix_df)

# Visualize the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=best_model.classes_, yticklabels=best_model.classes_)
plt.title("Confusion Matrix (Validation)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
#ADD HERE THE SCORE FROM KAGGLE= 

In [None]:
'''from sklearn.neural_network import MLPClassifier


model_complex = MLPClassifier(hidden_layer_sizes=(100,100,100,100))

In [None]:
'''model_complex.fit(X_train, y_train)

In [None]:
'''predictNN = model_complex.predict(X_train, y_train)

**Naives Bayes**

Requirements
2. Input Data
Training and Validation Data:
X_train, y_train: Training features and target labels.
X_val, y_val: Validation features and target labels.
Preprocessing:
Numeric Data:
GaussianNB assumes numeric feature values. Ensure all features in X_train and X_val are numeric.
Categorical Labels:
The target labels (y_train, y_val) should be categorical or binary for classification tasks.
No Missing Values:
Handle missing values before fitting the model.

3. GaussianNB Specific Requirements
Feature Distribution:
GaussianNB assumes that features follow a normal (Gaussian) distribution. While not strictly enforced, this assumption impacts performance.
Variance Smoothing:
The parameter var_smoothing=1e-9 prevents division by zero by adding a small value to variances. Adjust if necessary.

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Initialize the Gaussian Naive Bayes model with variance smoothing
gnb_model = GaussianNB(var_smoothing=1e-9)

# Fit the model on the training data
gnb_model.fit(X_train, y_train)

In [None]:
# Make predictions on the training set
train_pred = gnb_model.predict(X_train)

# Make predictions on the validation set
val_pred = gnb_model.predict(X_val)


In [None]:
# Evaluate performance on the training data
print("Training Data Evaluation:")
print(f"Accuracy (Train): {accuracy_score(y_train, train_pred)}")
print(f"F1 Score (Train): {f1_score(y_train, train_pred, average='weighted')}")

# Evaluate performance on the validation data
print("\nValidation Data Evaluation:")
print(f"Accuracy (Validation): {accuracy_score(y_val, val_pred)}")
print(f"F1 Score (Validation): {f1_score(y_val, val_pred, average='weighted')}")

# Generate classification report for validation
print("\nClassification Report (Validation):")
print(classification_report(y_val, val_pred))


In [None]:
# Generate and visualize the confusion matrix
conf_matrix = confusion_matrix(y_val, val_pred)

# Convert the confusion matrix into a DataFrame
conf_matrix_df = pd.DataFrame(
    conf_matrix, 
    index=[f"Actual_{label}" for label in gnb_model.classes_], 
    columns=[f"Predicted_{label}" for label in gnb_model.classes_]
)

# Display the confusion matrix as a DataFrame
print("\nConfusion Matrix as DataFrame:")
print(conf_matrix_df)

# Visualize the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=gnb_model.classes_, yticklabels=gnb_model.classes_)
plt.title("Confusion Matrix (Validation)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

## KNN ##

In [None]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
import pandas as pd

def find_best_k(X_train, y_train, X_val, y_val, ks):
    results = []  # Correct indentation for results initialization

    for k in ks:
        # Initialize the KNN model with the current k
        knn_model = KNeighborsClassifier(n_neighbors=k)

        # Fit the model to the training data
        knn_model.fit(X_train, y_train)

        # Predict on the validation set
        val_pred = knn_model.predict(X_val)

        # Calculate the F1 Macro score
        f1_macro = f1_score(y_val, val_pred, average='macro')
        results.append({'k': k, 'F1 Macro': f1_macro})

        # Print results for each k
        print(f"k: {k}, F1 Macro: {f1_macro}")

    # Convert results to a DataFrame
    results_df = pd.DataFrame(results)

    # Find the best k
    best_k = results_df.loc[results_df['F1 Macro'].idxmax(), 'k']
    print(f"\nBest k: {best_k} with F1 Macro: {results_df['F1 Macro'].max()}")

    # Plot the F1 Macro scores
    plt.figure(figsize=(10, 6))
    plt.plot(results_df['k'], results_df['F1 Macro'], marker='o')
    plt.title('F1 Macro vs. Number of Neighbors (k)')
    plt.xlabel('Number of Neighbors (k)')
    plt.ylabel('F1 Macro Score')
    plt.grid()
    plt.show()

    return best_k, results_df


In [None]:
best_k = 5  # Example

# Initialize the KNN model with the best k
knn_model = KNeighborsClassifier(n_neighbors=best_k)

# Fit the model on the training data
knn_model.fit(X_train, y_train)

# Make predictions on the training set
train_pred = knn_model.predict(X_train)

# Make predictions on the validation set
val_pred = knn_model.predict(X_val)

In [None]:
# Evaluate performance on the training data
print("Training Data Evaluation:")
print(f"Accuracy (Train): {accuracy_score(y_train, train_pred)}")
print(f"F1 Score (Train): {f1_score(y_train, train_pred, average='weighted')}")

# Evaluate performance on the validation data
print("\nValidation Data Evaluation:")
print(f"Accuracy (Validation): {accuracy_score(y_val, val_pred)}")
print(f"F1 Score (Validation): {f1_score(y_val, val_pred, average='weighted')}")

# Generate classification report for validation data
print("\nClassification Report (Validation):")
print(classification_report(y_val, val_pred))


In [None]:
# Generate confusion matrix for validation data
conf_matrix = confusion_matrix(y_val, val_pred)

# Convert confusion matrix to a DataFrame
conf_matrix_df = pd.DataFrame(
    conf_matrix,
    index=[f"Actual_{label}" for label in knn_model.classes_],
    columns=[f"Predicted_{label}" for label in knn_model.classes_]
)

# Display the confusion matrix as a DataFrame
print("\nConfusion Matrix as DataFrame:")
print(conf_matrix_df)

# Visualize the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=knn_model.classes_, yticklabels=knn_model.classes_)
plt.title("Confusion Matrix (Validation)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

## Embedded Mehtods ##

**Voting Classifier**

In [None]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Initialize the individual models
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
random_forest_model = RandomForestClassifier(random_state=42, n_estimators=100, class_weight='balanced')
gradient_boosting_model = GradientBoostingClassifier(random_state=42, n_estimators=100)

# Combine models in a VotingClassifier
voting_clf = VotingClassifier(
    estimators=[
        ('lr', logistic_model),
        ('rf', random_forest_model),
        ('gb', gradient_boosting_model)
    ],
    voting='soft'  # Soft voting for probability-based aggregation
)

In [None]:
# Train the ensemble model
voting_clf.fit(X_train, y_train)

In [None]:
# Make predictions on the training set
train_pred = voting_clf.predict(X_train)

# Make predictions on the validation set
val_pred = voting_clf.predict(X_val)

In [None]:
# Evaluate the ensemble on the training set
print("Training Data Evaluation:")
print(f"Accuracy (Train): {accuracy_score(y_train, train_pred):.4f}")
print(f"F1 Score (Train): {f1_score(y_train, train_pred, average='weighted'):.4f}")

# Evaluate the ensemble on the validation set
print("\nValidation Data Evaluation:")
print(f"Accuracy (Validation): {accuracy_score(y_val, val_pred):.4f}")
print(f"F1 Score (Validation): {f1_score(y_val, val_pred, average='weighted'):.4f}")

# Generate classification report for validation data
print("\nClassification Report (Validation):")
print(classification_report(y_val, val_pred))

In [None]:
# Generate confusion matrix for validation data
conf_matrix = confusion_matrix(y_val, val_pred)

# Visualize the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", 
            xticklabels=voting_clf.classes_, yticklabels=voting_clf.classes_)
plt.title("Confusion Matrix - Voting Classifier")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


**Kaggle Submissions**

In [47]:
'''# Reset index to include 'Claim Identifier' as a column if it's the index
X_test = X_test.reset_index()

label_mapping = {
    0: "1. CANCELLED",
    1: "2. NON-COMP",
    2: "3. MED ONLY",
    3: "4. TEMPORARY",
    4: "5. PPD SCH LOSS",
    5: "6. PPD NSL",
    6: "7. PTD",
    7: "8. DEATH",
    8: "Unknown"
}

predictions = [label_mapping[label] for label in pred_RF]

# Create the submission DataFrame
submission_df = pd.DataFrame({
    'Claim Identifier': X_test['Claim Identifier'],  # Ensure this column exists
    'Claim Injury Type': predictions                     # Replace 'pred_RF' with your predictions
})

# Save to CSV
submission_df.to_csv('submission.csv', index=False)

