In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer

from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

seed = 42

In [2]:
file_path = "dataset_raw.csv"
data = pd.read_csv(file_path)

In [3]:
X = data['text']
y = data['category']

# Vectorizing the data
vectorizer_try = CountVectorizer(stop_words='english', min_df=0.0003, ngram_range=(1, 4))
X_encoded = vectorizer_try.fit_transform(X)

# Printing data shape
print('Data shape: ', X_encoded.shape)

# Splitting the data and renaming variables
X_train, X_temp, y_train, y_temp = train_test_split(X_encoded, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Displaying the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)
print("y_test shape:", y_test.shape)


Data shape:  (20000, 5009)
X_train shape: (12000, 5009)
X_val shape: (4000, 5009)
X_test shape: (4000, 5009)
y_train shape: (12000,)
y_val shape: (4000,)
y_test shape: (4000,)


# OverSampling

## 1. Logistic Regression without oversampling - Baseline

In [4]:
def logistic_classification_BASE(X_train, y_train, X_test, y_test):
    '''
    This function is designed to benchmark the performance of CountVectorizer()
    A rough result is all I need, therefore the hyperparameters of Logistic classifier is not important here.
    '''
    classifier = LogisticRegression(penalty='l1', solver='saga', multi_class='multinomial', C = 0.8, random_state=42, max_iter = 100)
    classifier.fit(X_train, y_train)

    # Compute and print the confusion matrix for test data
    test_predictions = classifier.predict(X_test)
    cm_test = confusion_matrix(y_test, test_predictions)
    print('\nConfusion Matrix (Test):')
    print(cm_test)

    # Compute and print F1 score on the test data
    test_f1 = f1_score(y_test, test_predictions, average='weighted')
    print('\nF1 Score (Test):', format(test_f1, '.2f'))

    return classifier

In [5]:
classifier_base = logistic_classification_BASE(X_train, y_train, X_test, y_test)


Confusion Matrix (Test):
[[1095   37    4   26   12    1]
 [  14 1297   47    5    6    4]
 [   6   44  270    3    2    0]
 [  33   17    1  483    9    0]
 [  19   10    2   16  388   13]
 [   5    7    0    1   29   94]]

F1 Score (Test): 0.91




## 2. Logistic Regression with Random Oversampling

In [6]:
def logistic_classification_RO(X_train, y_train, X_test, y_test):
    '''
    This function is designed to benchmark the performance of CountVectorizer()
    A rough result is all I need, therefore the hyperparameters of Logistic classifier is not important here.
    '''
    classifier = LogisticRegression(penalty='l1', solver='saga', multi_class='multinomial', C = 0.8, random_state=42, max_iter = 100)
    sampler = RandomOverSampler(sampling_strategy='not majority', random_state=seed)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    classifier.fit(X_resampled, y_resampled)

    # Compute and print the confusion matrix for test data
    test_predictions = classifier.predict(X_test)
    cm_test = confusion_matrix(y_test, test_predictions)
    print('\nConfusion Matrix (Test):')
    print(cm_test)

    # Compute and print F1 score on the test data
    test_f1 = f1_score(y_test, test_predictions, average='weighted')
    print('\nF1 Score (Test):', format(test_f1, '.2f'))

    return classifier

In [7]:
classifier_ro = logistic_classification_RO(X_train, y_train, X_test, y_test)


Confusion Matrix (Test):
[[1080   33    4   33   21    4]
 [  16 1247   83    6    8   13]
 [   1   14  306    2    2    0]
 [  20   10    2  500   10    1]
 [   9    5    2   17  391   24]
 [   1    1    0    1   11  122]]

F1 Score (Test): 0.91




## 3. Logistic Regression with SMOTE

In [8]:
def logistic_classification_SMOTE(X_train, y_train, X_test, y_test):
    '''
    This function is designed to benchmark the performance of CountVectorizer()
    A rough result is all I need, therefore the hyperparameters of Logistic classifier is not important here.
    '''
    classifier = LogisticRegression(penalty='l1', solver='saga', multi_class='multinomial', C = 0.8, random_state=42, max_iter = 100)
    sampler = SMOTE(sampling_strategy='not majority', random_state=seed)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    classifier.fit(X_resampled, y_resampled)

    # Compute and print the confusion matrix for test data
    test_predictions = classifier.predict(X_test)
    cm_test = confusion_matrix(y_test, test_predictions)
    print('\nConfusion Matrix (Test):')
    print(cm_test)

    # Compute and print F1 score on the test data
    test_f1 = f1_score(y_test, test_predictions, average='weighted')
    print('\nF1 Score (Test):', format(test_f1, '.2f'))

    return classifier

In [9]:
classifier_smote = logistic_classification_SMOTE(X_train, y_train, X_test, y_test)


Confusion Matrix (Test):
[[1080   17   15   33   23    7]
 [  16 1243   79    8   12   15]
 [   3   25  289    3    3    2]
 [  27    6    4  490   16    0]
 [   8    5    5   14  388   28]
 [   3    4    0    2   14  113]]

F1 Score (Test): 0.90




# UnderSampling

## 1. Logistic Regression with Random Undersampling

In [10]:
def logistic_classification_RU(X_train, y_train, X_test, y_test):
    '''
    This function is designed to benchmark the performance of CountVectorizer()
    A rough result is all I need, therefore the hyperparameters of Logistic classifier is not important here.
    '''
    classifier = LogisticRegression(penalty='l1', solver='saga', multi_class='multinomial', C = 0.8, random_state=42, max_iter = 100)
    sampler = RandomUnderSampler(sampling_strategy='not majority', random_state=seed)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    classifier.fit(X_resampled, y_resampled)

    # Compute and print the confusion matrix for test data
    test_predictions = classifier.predict(X_test)
    cm_test = confusion_matrix(y_test, test_predictions)
    print('\nConfusion Matrix (Test):')
    print(cm_test)

    # Compute and print F1 score on the test data
    test_f1 = f1_score(y_test, test_predictions, average='weighted')
    print('\nF1 Score (Test):', format(test_f1, '.2f'))

    return classifier

In [11]:
classifier_ru = logistic_classification_RU(X_train, y_train, X_test, y_test)


Confusion Matrix (Test):
[[ 721  384    9   35   19    7]
 [   4 1347   11    1    3    7]
 [   0   94  225    4    2    0]
 [  14   65    3  446   13    2]
 [  10   51    3    8  340   36]
 [   0    4    0    1    9  122]]

F1 Score (Test): 0.80




### Logistic Regression with Different Dataset

| Dataset | Weighted F1 Score |
| :-------- | -------- |
| Dataset without Oversampling (Baseline) | 0.91 |
| Dataset with Random Oversampling | 0.91 |
| Dataset with SMOTE | 0.90 |
| Dataset with Random Undersampling | 0.80 |
