#  Baseline Model

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import log
import warnings
warnings.filterwarnings("ignore")
# import libraries
from sklearn.model_selection import train_test_split
import os
import sys


In [22]:
# Locations of the pre-cleaned train / validation / test CSV files.
train_path = "../Data/clean_train_data_mok.csv"
valid_path = "../Data/clean_valid_data_mok.csv"
test_path = "../Data/clean_test_data_mok.csv"

# TODO: 'Column29' is a placeholder - replace it with the actual name of the
# column that has mixed types so pandas reads that column as text.
dtype = {'Column29': 'str'}

# Read the three splits with identical options; every frame is indexed by
# the 'Claim Identifier' column.
train, valid, test = (
    pd.read_csv(csv_path, index_col='Claim Identifier',
                dtype=dtype, low_memory=False)
    for csv_path in (train_path, valid_path, test_path)
)

In [23]:
def _separate_target(frame, target='Claim Injury Type'):
    """Split *frame* into features and target.

    Rows whose target value is missing are dropped first. Returns a tuple
    ``(features, labels, labels_as_str)`` where ``features`` is the frame
    without the target column and ``labels_as_str`` is the target cast to
    ``str``.
    """
    labelled = frame.dropna(subset=[target])
    labels = labelled[target]
    return labelled.drop(columns=[target]), labels, labels.astype(str)


# Apply the same split to the training and validation frames.
train, y_train_int, y_train_str = _separate_target(train)
valid, y_valid_int, y_valid_str = _separate_target(valid)


## random forest

In [None]:
# import random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Baseline: a 100-tree random forest fitted on the numeric columns only.
num_features = train.select_dtypes(include=[np.number]).columns

model_rf = RandomForestClassifier(random_state=0, n_estimators=100)
model_rf.fit(train[num_features], y_train_str)

# Predictions for the training split and for the validation split.
y_pred_rf_train = model_rf.predict(train[num_features])
y_pred_rf = model_rf.predict(valid[num_features])


In [28]:
# Create a string
# Save the model
import pickle

def report_and_save(model, model_name, y_pred, y, model_filepath='../Models',
                    report_filepath='../Reports'):
    """Pickle a fitted model and write a plain-text evaluation report.

    The model is written to ``<model_filepath>/<model_name>.sav`` and the
    report to ``<report_filepath>/<model_name>.txt``. The report contains the
    model's hyper-parameters (one ``name: value`` pair per line), accuracy,
    weighted precision, weighted recall and the confusion matrix.

    Both writes are best-effort: if one fails, the error is printed and the
    function carries on instead of raising.

    Parameters
    ----------
    model : estimator exposing ``get_params()``
        The fitted model to persist.
    model_name : str
        Base name (without extension) of the two output files.
    y_pred : array-like
        Predicted labels.
    y : array-like
        True labels, aligned with ``y_pred``.
    model_filepath : str
        Directory for the pickled model; created if missing.
    report_filepath : str
        Directory for the text report; created if missing.

    Returns
    -------
    None
    """
    separator = "\n________________________\n"

    model_parameters = model.get_params()
    full_model_filename_os = os.path.join(model_filepath, model_name + '.sav')
    full_report_filename_os = os.path.join(report_filepath, model_name + '.txt')

    # Broad catch is deliberate: a failed save must not abort the evaluation.
    try:
        if model_filepath:
            # Without this, a missing output directory fails every save.
            os.makedirs(model_filepath, exist_ok=True)
        with open(full_model_filename_os, 'wb') as file:
            pickle.dump(model, file)
    except Exception as e:
        print(f"An error occurred while saving the model: {e}")

    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    cm = confusion_matrix(y, y_pred)

    # One parameter per line. Replacing "," in str(dict) would also split
    # tuple values such as hidden_layer_sizes=(100, 100) across lines.
    parameters_text = "\n".join(
        f"{name}: {value!r}" for name, value in model_parameters.items()
    )

    report = "".join([
        model_name, separator,
        'model_parameters: \n', parameters_text, separator,
        'Accuracy: ', str(accuracy), separator,
        'Precision: ', str(precision), separator,
        'Recall: ', str(recall), separator,
        'Confusion Matrix:\n', str(cm), separator,
    ])

    # Save the report into a text file (best-effort, like the model above).
    try:
        if report_filepath:
            os.makedirs(report_filepath, exist_ok=True)
        with open(full_report_filename_os, "w", encoding="utf-8") as file:
            file.write(report)
    except Exception as e:
        print(f"An error occurred while writing the report: {e}")
    


In [29]:
# Persist the baseline random forest and its validation-set report.
report_and_save(
    model=model_rf,
    model_name='RandomForest',
    y_pred=y_pred_rf,
    y=y_valid_str,
)

In [None]:
# Preview the path built for the random-forest pickle file.
filepath = '../Models'
model_name = "RandomForest"
filename = f"{model_name}.sav"
filename_os = os.path.join(filepath, filename)
filename_os

'../Models/Random Forest.sav'

## try multiple models and choose the best one


In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination (RFE) driven by the random forest.
X_train_num = train[num_features]
X_valid_num = valid[num_features]

# Run a single elimination pass down to one feature. With step=1 this drops
# one feature per round, so ranking_ records the full elimination order and
# the features with rank <= k are exactly the ones RFE would keep for
# n_features_to_select=k. (RFE clones the estimator, and random_state is
# fixed, so every pass follows the same path.) This avoids re-running RFE
# from scratch for every subset size.
rfe = RFE(estimator=model_rf, n_features_to_select=1, step=1)
rfe.fit(X_train_num, y_train_str)
ranking = rfe.ranking_

# Train the model with the 4 most important features
indices = [i for i, rank in enumerate(ranking) if rank <= 4]
model_rf.fit(X_train_num.iloc[:, indices], y_train_str)
y_pred_rf = model_rf.predict(X_valid_num.iloc[:, indices])
y_pred_rf_train = model_rf.predict(X_train_num.iloc[:, indices])

report_and_save(model_rf, 'RandomForest_RFE', y_pred_rf, y_valid_str)

# Sweep the number of kept features and remember the best weighted F1.
feature_counts = range(1, 10)
precisions_rfe = []
f1_rfe = []
best_set_of_features_rfe = []
best_score_rfe = 0
for n_features in feature_counts:
    indices = [i for i, rank in enumerate(ranking) if rank <= n_features]
    model_rf.fit(X_train_num.iloc[:, indices], y_train_str)
    y_pred_rf = model_rf.predict(X_valid_num.iloc[:, indices])
    y_pred_rf_train = model_rf.predict(X_train_num.iloc[:, indices])
    report_and_save(model_rf, f'RandomForest_RFE_{n_features}', y_pred_rf, y_valid_str)

    precisions_rfe.append(precision_score(y_valid_str, y_pred_rf, average='weighted'))
    f1_model = f1_score(y_valid_str, y_pred_rf, average='weighted')
    f1_rfe.append(f1_model)
    # Compare the computed score (not the f1_score function) with the
    # running best that was initialised above.
    if f1_model > best_score_rfe:
        best_score_rfe = f1_model
        best_set_of_features_rfe = indices

plt.plot(feature_counts, precisions_rfe, label='Precision')
plt.plot(feature_counts, f1_rfe, label='F1')
plt.xlabel('Number of Features')
plt.ylabel('Score')
plt.legend()
plt.show()
    
    

## multi-layer perceptron

In [None]:
# multiple layer perceptron
from sklearn.neural_network import MLPClassifier

def _mlp_input_importance(estimator):
    """Return one importance score per input feature of a fitted MLP.

    The score is the summed absolute weight leaving each input unit
    (rows of ``coefs_[0]``). This is a heuristic: RFE needs some per-feature
    importance, and MLPClassifier provides neither ``coef_`` nor
    ``feature_importances_``.
    """
    return np.abs(estimator.coefs_[0]).sum(axis=1)


X_train_num = train[num_features]
X_valid_num = valid[num_features]

# Baseline MLP on all numeric features.
model_mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=0)
model_mlp.fit(X_train_num, y_train_str)
y_pred_mlp = model_mlp.predict(X_valid_num)
y_pred_mlp_train = model_mlp.predict(X_train_num)

report_and_save(model_mlp, 'MLP', y_pred_mlp, y_valid_str)

# Run a single elimination pass down to one feature. With step=1, ranking_
# holds the full elimination order, so the features with rank <= k are the
# ones RFE would keep for n_features_to_select=k. The explicit
# importance_getter is required: the default 'auto' mode raises for
# MLPClassifier.
rfe = RFE(estimator=model_mlp, n_features_to_select=1, step=1,
          importance_getter=_mlp_input_importance)
rfe.fit(X_train_num, y_train_str)
ranking = rfe.ranking_

# Sweep the number of kept features and remember the best weighted F1.
feature_counts = range(1, 10)
precisions_mpl = []
f1_mlp = []
best_set_of_features_mlp = []
best_score_mlp = 0
for n_features in feature_counts:
    indices = [i for i, rank in enumerate(ranking) if rank <= n_features]
    model_mlp.fit(X_train_num.iloc[:, indices], y_train_str)
    y_pred_mlp = model_mlp.predict(X_valid_num.iloc[:, indices])
    y_pred_mlp_train = model_mlp.predict(X_train_num.iloc[:, indices])
    report_and_save(model_mlp, f'MLP_RFE_{n_features}', y_pred_mlp, y_valid_str)

    precisions_mpl.append(precision_score(y_valid_str, y_pred_mlp, average='weighted'))
    f1_model = f1_score(y_valid_str, y_pred_mlp, average='weighted')
    f1_mlp.append(f1_model)
    # Compare the computed score (not the f1_score function) with the
    # running best that was initialised above.
    if f1_model > best_score_mlp:
        best_score_mlp = f1_model
        best_set_of_features_mlp = indices

plt.plot(feature_counts, precisions_mpl, label='Precision')
plt.plot(feature_counts, f1_mlp, label='F1')
plt.xlabel('Number of Features')
plt.ylabel('Score')

plt.legend()
plt.show()

IndentationError: unexpected indent (872194366.py, line 16)