# Text Classification with Logistic Regression, Linear SVM, and Naive Bayes
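This notebook reads the training, validation, and test splits from CSV files, converts the text to TF-IDF vectors, and trains one binary classifier per label (`abdominal`, `creatinine`, `major_diabetes`) using Logistic Regression, a linear SVM, and Multinomial Naive Bayes. The validation split is merged into the training set before fitting. For each model it reports the macro and micro F1 scores, accuracy, per-label precision and recall, and a confusion matrix pooled over all labels.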

In [48]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [49]:
# Read the pre-split train / validation / test CSV files for part 1
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Dataset/preprocessed/part1/train.csv',
        'Dataset/preprocessed/part1/val.csv',
        'Dataset/preprocessed/part1/test.csv',
    )
)

In [50]:
# Row counts of the train, validation and test splits, in that order
tuple(len(split) for split in (train_df, val_df, test_df))

(161, 20, 21)

In [51]:
def numberize_labels(df, labels):
    """Convert the given label columns of ``df`` to binary integers.

    A cell becomes 1 when it holds the string ``'met'`` (or is already the
    number 1 left by an earlier call) and 0 otherwise. Accepting an
    already-converted 1 makes the function idempotent: re-running the
    notebook cell no longer silently turns every positive label into 0.

    Args:
        df: DataFrame containing the label columns. It is modified in place.
        labels: Iterable of column names to convert.

    Returns:
        The same ``df`` object, with each listed column replaced by 0/1 values.
    """
    def _to_binary(value):
        # 'met' is the raw positive marker; 1 is a value converted earlier.
        return 1 if value == 'met' or value == 1 else 0

    for label in labels:
        df[label] = df[label].apply(_to_binary)
    return df

# Encode the three label columns as 0/1 in every split
train_df, test_df, val_df = (
    numberize_labels(split, labels=['abdominal', 'creatinine', 'major_diabetes'])
    for split in (train_df, test_df, val_df)
)

In [52]:
# Merge the validation rows into the training set; this part does not use a
# separate validation split. The index is renumbered from 0.
train_df = pd.concat((train_df, val_df), ignore_index=True)
train_df.shape[0]

181

In [53]:
# Function to preprocess and vectorize text data
def preprocess_data(train_df, test_df, text_column):
    """Turn the text column of both frames into TF-IDF feature matrices.

    The vectorizer is fitted on the training text only and then applied to
    the test text.

    Args:
        train_df: DataFrame holding the training documents.
        test_df: DataFrame holding the test documents.
        text_column: Name of the column that contains the raw text.

    Returns:
        A tuple ``(X_train, X_test, vectorizer)``: the two transformed
        matrices and the fitted ``TfidfVectorizer``.
    """
    tfidf = TfidfVectorizer()
    train_text = train_df[text_column]
    test_text = test_df[text_column]
    train_features = tfidf.fit_transform(train_text)
    test_features = tfidf.transform(test_text)
    return train_features, test_features, tfidf

# Build the TF-IDF matrices from the 'text' column of the train and test frames
X_train, X_test, vectorizer = preprocess_data(
    train_df=train_df,
    test_df=test_df,
    text_column='text',
)

In [54]:
# Function to train and evaluate model
def train_evaluate_model(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training data and predict the test data.

    Args:
        X_train: Training features passed to ``model.fit``.
        X_test: Test features passed to ``model.predict``.
        y_train: Training targets passed to ``model.fit``.
        y_test: Test targets. Accepted but not used; no scoring happens here.
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        Whatever ``model.predict(X_test)`` returns.
    """
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [55]:
# Train one binary classifier per label for each model family and store the
# test-set predictions column by column (one column per label).
labels = ['abdominal', 'creatinine', 'major_diabetes']
pred_shape = (len(test_df), len(labels))
preds_lr, preds_svm, preds_nb = (np.zeros(pred_shape) for _ in range(3))

for i, label in enumerate(labels):
    y_train, y_test = train_df[label], test_df[label]

    # Fresh, unfitted estimators for this label
    lr_model = LogisticRegression(max_iter=1000)
    svm_model = SVC(kernel='linear')
    nb_model = MultinomialNB()

    # Same order as before: logistic regression, then SVM, then naive Bayes
    for model, preds in ((lr_model, preds_lr), (svm_model, preds_svm), (nb_model, preds_nb)):
        preds[:, i] = train_evaluate_model(X_train, X_test, y_train, y_test, model)
    

In [63]:
def compute_metrics(preds, labels):
    """Compute multi-label classification metrics from 0/1 predictions.

    The number of labels is taken from the input shape rather than being
    fixed at three, and both arguments may be any 2-D array-like (ndarray,
    DataFrame, nested list).

    Args:
        preds: Array-like of shape (n_samples, n_labels) with 0/1 predictions.
        labels: Array-like of the same shape with the 0/1 ground truth.

    Returns:
        dict with the keys
        ``'accuracy'`` (accuracy over all sample/label cells, flattened),
        ``'precision'`` and ``'recall'`` (arrays with one value per label),
        ``'micro_f1'`` (F1 from counts summed over all labels),
        ``'macro_f1'`` (mean of the per-label F1 scores) and
        ``'confusion_matrix'`` (confusion matrix over the flattened cells).

    Raises:
        ValueError: If the inputs are not 2-D or their shapes differ.
    """
    # np.asarray accepts DataFrames as well as plain arrays and lists
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    print(labels.shape, preds.shape)

    if preds.ndim != 2 or preds.shape != labels.shape:
        raise ValueError(
            f"preds and labels must be 2-D with identical shapes, "
            f"got {preds.shape} and {labels.shape}"
        )

    # Small constant that keeps the divisions defined when a count is zero
    eps = 1e-10

    predicted_pos = preds == 1
    predicted_neg = preds == 0
    actual_pos = labels == 1
    actual_neg = labels == 0

    # Per-label counts: summing over axis 0 gives one value per label column
    tp = np.sum(predicted_pos & actual_pos, axis=0)
    fp = np.sum(predicted_pos & actual_neg, axis=0)
    fn = np.sum(predicted_neg & actual_pos, axis=0)

    # Calculate metrics
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    micro_f1 = 2 * np.sum(tp) / (2 * np.sum(tp) + np.sum(fp) + np.sum(fn) + eps)
    per_label_f1 = 2 * (precision * recall) / (precision + recall + eps)

    return {
        'accuracy': accuracy_score(labels.flatten(), preds.flatten()),
        'precision': precision,
        'recall': recall,
        'micro_f1': micro_f1,
        'macro_f1': np.mean(per_label_f1),
        'confusion_matrix': confusion_matrix(labels.flatten(), preds.flatten())
    }

# Report the metrics of each model family against the test labels
for heading, model_preds in (
    ("Logistic Regression Evaluation", preds_lr),
    ("SVM Evaluation", preds_svm),
    ("Naive Bayes Evaluation", preds_nb),
):
    print(heading)
    print(compute_metrics(model_preds, test_df[labels]))

Logistic Regression Evaluation
(21, 3) (21, 3)
{'accuracy': 0.6507936507936508, 'precision': array([0.        , 1.        , 0.73684211]), 'recall': array([0.        , 0.22222222, 1.        ]), 'micro_f1': 0.5925925925914952, 'macro_f1': 0.4040404040102847, 'confusion_matrix': array([[25,  6],
       [16, 16]], dtype=int64)}
SVM Evaluation
(21, 3) (21, 3)
{'accuracy': 0.7142857142857143, 'precision': array([0.75, 1.  , 0.75]), 'recall': array([0.66666667, 0.22222222, 0.85714286]), 'micro_f1': 0.689655172412604, 'macro_f1': 0.6231729054759779, 'confusion_matrix': array([[25,  6],
       [12, 20]], dtype=int64)}
Naive Bayes Evaluation
(21, 3) (21, 3)
{'accuracy': 0.6031746031746031, 'precision': array([0.        , 0.        , 0.66666667]), 'recall': array([0., 0., 1.]), 'micro_f1': 0.5283018867914561, 'macro_f1': 0.2666666666491429, 'confusion_matrix': array([[24,  7],
       [18, 14]], dtype=int64)}
