# Text Classification with Logistic Regression, Linear SVM, and Naive Bayes
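This notebook reads training, validation, and test data from CSV files, converts the text to TF-IDF vectors, and classifies each label independently. The title names Logistic Regression, Linear SVM, and Naive Bayes; in the cells below only the linear SVM is actually trained and evaluated, with its `C` parameter chosen by 5-fold cross-validation. It reports the macro and micro F1 scores, accuracy, precision, recall, and confusion matrix.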

In [35]:
!pip install numpy
!pip install pandas
!pip install scikit-learn



In [36]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [37]:
# Load the train / validation / test CSV files of part 1 into DataFrames
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Dataset/preprocessed/part1/train.csv',
        'Dataset/preprocessed/part1/val.csv',
        'Dataset/preprocessed/part1/test.csv',
    )
)

In [38]:
# Number of rows in the train, validation and test frames
tuple(len(frame) for frame in (train_df, val_df, test_df))

(161, 20, 21)

In [39]:
def numberize_labels(df, labels):
    """Encode the given label columns of `df` as integers 0/1, in place.

    A value becomes 1 when it is the string 'met' or is already equal to 1;
    every other value becomes 0. Accepting an existing 1 makes the function
    idempotent: running the notebook cell a second time on an already encoded
    frame no longer turns every label into 0.

    Args:
        df: DataFrame that contains every column named in `labels`.
        labels: iterable of column names to encode.

    Returns:
        The same DataFrame object that was passed in, with the listed
        columns overwritten.
    """
    def _to_binary(value):
        # `value == 1` keeps positives that were encoded by an earlier call.
        return 1 if (value == 'met' or value == 1) else 0

    for label in labels:
        df[label] = df[label].apply(_to_binary)
    return df

# Run numberize_labels over the three label columns of every split
train_df, test_df, val_df = (
    numberize_labels(split_df, labels=['abdominal', 'creatinine', 'major_diabetes'])
    for split_df in (train_df, test_df, val_df)
)

In [40]:
# Append the validation rows to the training frame for this part.
# NOTE: train_df is rebound to the merged frame, so re-running this cell
# appends val_df a second time.
train_df = pd.concat((train_df, val_df), ignore_index=True)
train_df.shape[0]

181

In [41]:
# TF-IDF vectorization helper
def preprocess_data(train_df, test_df, text_column):
    """Convert the text column of both frames into TF-IDF matrices.

    The vectorizer is fitted on the training text only and then applied,
    unchanged, to the test text.

    Args:
        train_df: frame whose `text_column` is used to fit the vectorizer.
        test_df: frame whose `text_column` is transformed with the fitted vectorizer.
        text_column: name of the column holding the documents.

    Returns:
        Tuple (X_train, X_test, vectorizer).
    """
    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(train_df[text_column])
    test_matrix = tfidf.transform(test_df[text_column])
    return train_matrix, test_matrix, tfidf

# Preprocess data
# X_train, X_test, vectorizer = preprocess_data(train_df, test_df, 'text')

In [42]:
# Fit a model and predict on the evaluation features
def train_evaluate_model(X_train, X_test, y_train, y_test, model):
    """Fit `model` on the training data and return its predictions for `X_test`.

    Args:
        X_train: training features passed to `model.fit`.
        X_test: features passed to `model.predict`.
        y_train: training targets passed to `model.fit`.
        y_test: accepted for signature compatibility; not used here.
        model: object exposing `fit(X, y)` and `predict(X)`.

    Returns:
        Whatever `model.predict(X_test)` returns.
    """
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [50]:
def compute_metrics(preds, labels):
    """Compute multi-label classification metrics from 0/1 predictions.

    Works for any number of label columns (taken from the input shape rather
    than being fixed at three). The confusion matrix is built from the same
    TP/TN/FP/FN counts as the other metrics, so it is always 2x2 even when
    only one class occurs in the data; this keeps per-fold matrices addable.

    Args:
        preds: array-like of shape (n_samples, n_labels) with predictions
            coded as 0 / 1 (ints or floats).
        labels: DataFrame or array-like of the same shape with the ground
            truth coded as 0 / 1.

    Returns:
        dict with keys
        'accuracy'         - fraction of matching (sample, label) cells,
        'precision'        - per-label precision, array of length n_labels,
        'recall'           - per-label recall, array of length n_labels,
        'micro_f1'         - F1 computed from counts pooled over all labels,
        'macro_f1'         - mean of the per-label F1 scores,
        'confusion_matrix' - int array [[TN, FP], [FN, TP]] pooled over all labels.

    Raises:
        ValueError: if the inputs are not two-dimensional, are empty, or
            differ in shape.
    """
    preds = np.asarray(preds)
    labels = labels.to_numpy() if hasattr(labels, 'to_numpy') else np.asarray(labels)
    if preds.ndim != 2 or preds.shape != labels.shape:
        raise ValueError(
            f"preds and labels must be 2-D with identical shapes, "
            f"got {preds.shape} and {labels.shape}"
        )
    if preds.size == 0:
        raise ValueError("preds and labels must not be empty")

    # Small constant that keeps every ratio defined when a denominator is zero.
    eps = 1e-10

    pred_pos, pred_neg = preds == 1, preds == 0
    true_pos, true_neg = labels == 1, labels == 0

    # Per-label counts (one entry per column).
    tp = np.sum(pred_pos & true_pos, axis=0).astype(float)
    tn = np.sum(pred_neg & true_neg, axis=0).astype(float)
    fp = np.sum(pred_pos & true_neg, axis=0).astype(float)
    fn = np.sum(pred_neg & true_pos, axis=0).astype(float)

    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    micro_f1 = 2 * np.sum(tp) / (2 * np.sum(tp) + np.sum(fp) + np.sum(fn) + eps)
    per_label_f1 = 2 * (precision * recall) / (precision + recall + eps)

    # Rows are the true class, columns the predicted class (0 first, then 1).
    pooled_confusion = np.array(
        [[tn.sum(), fp.sum()],
         [fn.sum(), tp.sum()]],
        dtype=np.int64,
    )

    return {
        'accuracy': float(np.mean(preds == labels)),
        'precision': precision,
        'recall': recall,
        'micro_f1': micro_f1,
        'macro_f1': np.mean(per_label_f1),
        'confusion_matrix': pooled_confusion,
    }


In [51]:
# Train one classifier per label column and evaluate the stacked predictions

labels = ['abdominal', 'creatinine', 'major_diabetes']

def create_models_and_eval(train_df, test_df, C=1.0):
    """Train a linear-kernel SVM for each label and score it on `test_df`.

    The text of both frames is vectorized once with `preprocess_data`
    (column 'text'); then, for every name in the module-level `labels` list,
    a separate `SVC(kernel='linear', C=C)` is fitted on `train_df` and used
    to predict `test_df`. Only the SVM is trained here.

    Args:
        train_df: frame with a 'text' column and one column per entry of `labels`.
        test_df: frame with the same columns, used for prediction and scoring.
        C: value passed to `SVC` as its `C` parameter.

    Returns:
        The result of `compute_metrics` on the (n_samples, n_labels)
        prediction matrix and `test_df[labels]`.
    """
    X_train, X_test, _ = preprocess_data(train_df, test_df, 'text')

    # One column of predictions per label, filled in below.
    preds_svm = np.zeros((len(test_df), len(labels)))
    for i, label in enumerate(labels):
        svm_model = SVC(kernel='linear', C=C)
        preds_svm[:, i] = train_evaluate_model(
            X_train, X_test, train_df[label], test_df[label], svm_model
        )

    return compute_metrics(preds_svm, test_df[labels])
        

In [62]:
# k-fold cross-validation over candidate values of the SVM parameter C.
# Folds are contiguous blocks of train_df in its current row order (no shuffling).
k = 5

# Split row positions instead of the DataFrame itself: np.array_split on a
# DataFrame goes through DataFrame.swapaxes, which recent pandas deprecates.
fold_positions = np.array_split(np.arange(len(train_df)), k)
folds = [train_df.iloc[positions] for positions in fold_positions]

for C in [0.1, 0.5, 5, 10]:
    metrics = {}
    for i in range(k):
        fold_train_df = pd.concat(
            [fold for j, fold in enumerate(folds) if j != i], ignore_index=True
        )
        # Use a dedicated name so the notebook-level `val_df` is not overwritten.
        fold_val_df = folds[i]
        metrics_i = create_models_and_eval(fold_train_df, fold_val_df, C=C)
        for key, value in metrics_i.items():
            metrics[key] = metrics[key] + value if key in metrics else value

    # Average every metric over the folds; the confusion matrix stays a sum.
    for key in metrics:
        if key != 'confusion_matrix':
            metrics[key] = metrics[key] / k

    print("SVM Evaluation. C = ", C)
    print(metrics)
        
    

SVM Evaluation. C =  0.1
{'accuracy': 0.5894394394394394, 'precision': array([0.        , 0.        , 0.54744745]), 'recall': array([0., 0., 1.]), 'micro_f1': 0.47012422823174854, 'macro_f1': 0.23499118164184885, 'confusion_matrix': array([[221,  82],
       [141,  99]], dtype=int64)}
SVM Evaluation. C =  0.5
{'accuracy': 0.5931431431431432, 'precision': array([0.        , 0.        , 0.55529058]), 'recall': array([0., 0., 1.]), 'micro_f1': 0.4725484706559856, 'macro_f1': 0.23683026210157224, 'confusion_matrix': array([[223,  80],
       [141,  99]], dtype=int64)}
SVM Evaluation. C =  5
{'accuracy': 0.6778278278278277, 'precision': array([0.55656566, 0.68686869, 0.67180576]), 'recall': array([0.42846908, 0.50371785, 0.84980392]), 'micro_f1': 0.628745794328068, 'macro_f1': 0.5958786890458901, 'confusion_matrix': array([[220,  83],
       [ 92, 148]], dtype=int64)}
SVM Evaluation. C =  10
{'accuracy': 0.6778278278278277, 'precision': array([0.55656566, 0.68686869, 0.67180576]), 'recall':

In [63]:
# C = 5 is the value picked from the cross-validation results above.
# Fit on the combined train+val frame and score on the test split.

metrics = create_models_and_eval(train_df=train_df, test_df=test_df, C=5)
print('SVM Evaluation on Test set on C=5', metrics, sep='\n')

SVM Evaluation on Test set on C=5
{'accuracy': 0.7301587301587301, 'precision': array([0.58333333, 1.        , 0.73333333]), 'recall': array([0.77777778, 0.66666667, 0.78571429]), 'micro_f1': 0.7384615384604023, 'macro_f1': 0.7417624520508904, 'confusion_matrix': array([[22,  9],
       [ 8, 24]], dtype=int64)}
