# Text Classification with Logistic Regression, Linear SVM, and Naive Bayes
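This notebook loads pre-split training, validation, and test sets from CSV files, merges the validation set into the training set, converts the text to TF-IDF vectors, and trains a separate binary classifier for each of three labels (`abdominal`, `creatinine`, `major_diabetes`) using Logistic Regression, a linear-kernel SVM, and Multinomial Naive Bayes. For every model it reports accuracy, macro-averaged precision and recall, macro and micro F1 scores, the confusion matrix, and a per-class classification report.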

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [23]:
# Load the pre-split train / validation / test sets (one CSV file per split).
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        'Dataset/preprocessed/part1/train.csv',
        'Dataset/preprocessed/part1/val.csv',
        'Dataset/preprocessed/part1/test.csv',
    ),
)

In [24]:
# Row counts of the train, validation and test splits, in that order.
tuple(len(split_df) for split_df in (train_df, val_df, test_df))

(161, 20, 21)

In [25]:
def numberize_labels(df, labels):
    """Return a copy of ``df`` with the given label columns encoded as 0/1.

    Every value equal to the string ``'met'`` becomes 1; every other value
    (including NaN/None) becomes 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label columns. It is left unmodified.
    labels : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each listed column holds integer 0/1 values;
        all other columns are carried over unchanged.

    Raises
    ------
    KeyError
        If a name in ``labels`` is not a column of ``df``.

    Notes
    -----
    The encoding is not idempotent: applying it to an already-encoded
    column yields all zeros, because neither 0 nor 1 equals ``'met'``.
    """
    # Work on a copy so the caller's frame (and any earlier cell output that
    # displayed it) stays valid.
    encoded = df.copy()
    for label in labels:
        # Vectorised equivalent of `1 if x == 'met' else 0`.
        encoded[label] = encoded[label].eq('met').astype('int64')
    return encoded

# Encode the three criteria columns as 0/1 in every split (train, test, val order).
train_df, test_df, val_df = (
    numberize_labels(split_df, labels=['abdominal', 'creatinine', 'major_diabetes'])
    for split_df in (train_df, test_df, val_df)
)

In [26]:
# Fold the validation split into the training data for this part.
# NOTE: this rebinds train_df, so re-running the cell appends val_df a second time.
train_df = pd.concat(objs=[train_df, val_df], axis=0, ignore_index=True)
train_df.shape[0]

181

In [28]:
def preprocess_data(train_df, test_df, text_column):
    """Convert a text column into TF-IDF features for the train and test sets.

    The vectorizer is fitted on the training texts only and then applied
    to the test texts.

    Parameters
    ----------
    train_df, test_df
        Containers indexable by ``text_column`` that yield the documents.
    text_column : str
        Name of the column holding the raw text.

    Returns
    -------
    tuple
        ``(X_train, X_test, vectorizer)``: the transformed training texts,
        the transformed test texts, and the fitted vectorizer.
    """
    tfidf = TfidfVectorizer()
    train_features = tfidf.fit_transform(train_df[text_column])
    test_features = tfidf.transform(test_df[text_column])
    return (train_features, test_features, tfidf)

# Build TF-IDF features from the 'text' column of the training and test frames.
X_train, X_test, vectorizer = preprocess_data(
    train_df=train_df,
    test_df=test_df,
    text_column='text',
)

In [29]:
def train_evaluate_model(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training data and print its test-set metrics.

    Printed, in order: accuracy, macro precision, macro recall, macro F1,
    micro F1, the confusion matrix and the classification report.

    Parameters
    ----------
    X_train, X_test
        Feature matrices passed to ``model.fit`` and ``model.predict``.
    y_train, y_test
        Target labels for the training and test sets.
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    The same ``model`` object, after fitting.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # When a class is never predicted its precision is undefined. Passing
    # zero_division=0 scores it as 0 explicitly -- the same value the default
    # "warn" setting reports -- without emitting an UndefinedMetricWarning on
    # every metric call.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
    f1_micro = f1_score(y_test, y_pred, average='micro', zero_division=0)
    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"Accuracy: {accuracy}")
    print(f"Precision (Macro): {precision}")
    print(f"Recall (Macro): {recall}")
    print(f"F1 Score (Macro): {f1_macro}")
    print(f"F1 Score (Micro): {f1_micro}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    return model

In [31]:
# Train and evaluate the three classifiers separately for each label.
labels = ['abdominal', 'creatinine', 'major_diabetes']

for label in labels:
    print(f"Evaluating label: {label}")
    y_train = train_df[label]
    y_test = test_df[label]

    # Fresh, unfitted estimators for every label.
    lr_model = LogisticRegression(max_iter=1000)
    svm_model = SVC(kernel='linear')
    nb_model = MultinomialNB()

    for heading, estimator in (
        ("\nLogistic Regression:", lr_model),
        ("\nLinear SVM:", svm_model),
        ("\nNaive Bayes:", nb_model),
    ):
        print(heading)
        train_evaluate_model(X_train, X_test, y_train, y_test, estimator)
    print("\n" + "="*60 + "\n")

Evaluating label: abdominal

Logistic Regression:
Accuracy: 0.5238095238095238
Precision (Macro): 0.275
Recall (Macro): 0.4583333333333333
F1 Score (Macro): 0.34374999999999994
F1 Score (Micro): 0.5238095238095238
Confusion Matrix:
[[11  1]
 [ 9  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.92      0.69        12
           1       0.00      0.00      0.00         9

    accuracy                           0.52        21
   macro avg       0.28      0.46      0.34        21
weighted avg       0.31      0.52      0.39        21


Linear SVM:
Accuracy: 0.7619047619047619
Precision (Macro): 0.7596153846153846
Recall (Macro): 0.75
F1 Score (Macro): 0.7529411764705882
F1 Score (Micro): 0.7619047619047619
Confusion Matrix:
[[10  2]
 [ 3  6]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.83      0.80        12
           1       0.75      0.67      0.71         9



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.6666666666666666
Precision (Macro): 0.8157894736842105
Recall (Macro): 0.6111111111111112
F1 Score (Macro): 0.5689149560117301
F1 Score (Micro): 0.6666666666666666
Confusion Matrix:
[[12  0]
 [ 7  2]]
Classification Report:
              precision    recall  f1-score   support

           0       0.63      1.00      0.77        12
           1       1.00      0.22      0.36         9

    accuracy                           0.67        21
   macro avg       0.82      0.61      0.57        21
weighted avg       0.79      0.67      0.60        21


Naive Bayes:
Accuracy: 0.5714285714285714
Precision (Macro): 0.2857142857142857
Recall (Macro): 0.5
F1 Score (Macro): 0.36363636363636365
F1 Score (Micro): 0.5714285714285714
Confusion Matrix:
[[12  0]
 [ 9  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.57      1.00      0.73        12
           1       0.00      0.00      0.00         9

    accuracy                         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.7142857142857143
Precision (Macro): 0.675
Recall (Macro): 0.6428571428571428
F1 Score (Macro): 0.6499999999999999
F1 Score (Micro): 0.7142857142857143
Confusion Matrix:
[[ 3  4]
 [ 2 12]]
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.43      0.50         7
           1       0.75      0.86      0.80        14

    accuracy                           0.71        21
   macro avg       0.68      0.64      0.65        21
weighted avg       0.70      0.71      0.70        21


Naive Bayes:
Accuracy: 0.6666666666666666
Precision (Macro): 0.3333333333333333
Recall (Macro): 0.5
F1 Score (Macro): 0.4
F1 Score (Micro): 0.6666666666666666
Confusion Matrix:
[[ 0  7]
 [ 0 14]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.67      1.00      0.80        14

    accuracy                           0.67        21
   macro avg

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
