# Model Exploration
The objective of this project is to evaluate three approaches to accurately analyzing real-world data: a naive approach, a non-deep-learning approach, and a neural-network-based deep learning approach.

In [1]:
# Imports
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Read the three pre-split CSV files (train / validation / test) in one pass;
# the tuple order below fixes which frame each path is bound to.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/processed/train.csv',
        './data/processed/val.csv',
        './data/processed/test.csv',
    )
)

# Naive Approach
Predicts the most common medical condition in the training set for every test sample

In [None]:
# Pull the target column out of the training and test frames.
# NOTE(review): the target is read from a column named 'id' — confirm it holds
# the class label and not a row identifier.
y_train, y_test = train_df['id'], test_df['id']

In [None]:
# Majority-class baseline: count how often each label occurs in the training
# targets and keep the label with the highest count.
label_counts = y_train.value_counts()
most_common_class = label_counts.idxmax()

# Emit that single label once per test sample, ignoring the test features.
y_pred = [most_common_class for _ in range(len(y_test))]

In [9]:
# Precision, recall and F1 are averaged across classes weighted by support.
# zero_division=0 scores undefined (0/0) per-class metrics as 0 instead of warning.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_kwargs)
recall = recall_score(y_test, y_pred, **weighted_kwargs)
f1 = f1_score(y_test, y_pred, **weighted_kwargs)

# Report every metric rounded to four decimal places.
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, round(metric_value, 4))

Accuracy: 0.585
Precision: 0.3422
Recall: 0.585
F1 Score: 0.4318


# Classical Machine Learning Approach
Uses TF-IDF features of the symptoms text to train a logistic regression classifier

# Neural Network-based Deep Learning Approach
Fine-tunes a BERT text classification model to map symptoms to conditions