# Logistic Classifier

In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [8]:
file_path = "../data_sets/dataset_raw.csv"
data = pd.read_csv(file_path)

In [9]:
X = data['text']
y = data['category']

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizing the data
vectorizer_try = CountVectorizer(stop_words='english', min_df=0.0003, ngram_range=(1, 4))
X_encoded = vectorizer_try.fit_transform(X)

# Printing data shape
print('Data shape: ', X_encoded.shape)


# Splitting the data and renaming variables
X_train, X_temp, y_train, y_temp = train_test_split(X_encoded, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Displaying the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)
print("y_test shape:", y_test.shape)

Data shape:  (20000, 5009)
X_train shape: (12000, 5009)
X_val shape: (4000, 5009)
X_test shape: (4000, 5009)
y_train shape: (12000,)
y_val shape: (4000,)
y_test shape: (4000,)


### Logistic classifier

In [13]:
# takes around 20 mins to run (30s per iteration)

def logistic_classification(X_train, y_train, X_val, y_val, X_test, y_test):
    # Define the parameter grid
    param_grid = {
        'penalty': ['l1'], # lasso
        'solver': ['saga'],
        'multi_class': ['multinomial'],
        'C': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1],
        'max_iter': [400, 500, 600]
    }

    # Create logistic regression classifier
    classifier = LogisticRegression(random_state=42)

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Get the best classifier from grid search
    best_classifier = grid_search.best_estimator_

    # Print the best parameters found by grid search
    print("Best Parameters:", grid_search.best_params_)

    # Fit the best classifier on the training data
    best_classifier.fit(X_train, y_train)

    # Compute and print accuracy on the validation data
    val_predictions = best_classifier.predict(X_val)
    val_accuracy = np.mean(val_predictions == y_val)
    print('\nValidation accuracy:', format(100 * val_accuracy, '.2f'))

    # Compute and print the confusion matrix for validation data
    cm_val = confusion_matrix(y_val, val_predictions)
    print('\nConfusion Matrix (Validation):')
    print(cm_val)

    # Compute and print F1 score on the validation data
    val_f1 = f1_score(y_val, val_predictions, average='weighted')
    print('\nF1 Score (Validation):', format(val_f1, '.2f'))

    # Compute and print accuracy on the test data
    test_predictions = best_classifier.predict(X_test)
    test_accuracy = np.mean(test_predictions == y_test)
    print('\nTesting accuracy:', format(100 * test_accuracy, '.2f'))

    # Compute and print the confusion matrix for test data
    cm_test = confusion_matrix(y_test, test_predictions)
    print('\nConfusion Matrix (Test):')
    print(cm_test)

    # Compute and print F1 score on the test data
    test_f1 = f1_score(y_test, test_predictions, average='weighted')
    print('\nF1 Score (Test):', format(test_f1, '.2f'))

    return best_classifier



Best Parameters: {'C': 1.0, 'max_iter': 500, 'multi_class': 'multinomial', 'penalty': 'l1', 'solver': 'saga'}

Validation accuracy: 89.62

Confusion Matrix (Validation):
[[1094   37    6   23   20    0]
 [  22 1222   50    5    8    5]
 [   6   57  283    3    0    0]
 [  25   26    3  475   21    1]
 [  21   12    0   11  394   16]
 [   5   10    0    4   18  117]]




NameError: name 'f1_score' is not defined

In [None]:
# Example usage
best_classifier = logistic_classification(X_train, y_train, X_val, y_val, X_test, y_test)

### the best logistic model
Best Parameters: {'C': 1.0, 'max_iter': 500, 'multi_class': 'multinomial', 'penalty': 'l1', 'solver': 'saga'}


In [20]:
best_classifier = LogisticRegression(penalty='l1', solver='saga', multi_class='multinomial', C = 1.0, random_state=42, max_iter = 500)
best_classifier.fit(X_train, y_train)



In [21]:
val_predictions = best_classifier.predict(X_val)
val_accuracy = np.mean(val_predictions == y_val)
print('\nValidation accuracy:', format(100 * val_accuracy, '.2f'))

# Compute and print the confusion matrix for validation data
cm_val = confusion_matrix(y_val, val_predictions)
print('\nConfusion Matrix (Validation):')
print(cm_val)

# Compute and print F1 score on the validation data
val_f1 = f1_score(y_val, val_predictions, average='weighted')
print('\nF1 Score (Validation):', format(val_f1, '.2f'))

# Compute and print accuracy on the test data
test_predictions = best_classifier.predict(X_test)
test_accuracy = np.mean(test_predictions == y_test)
print('\nTesting accuracy:', format(100 * test_accuracy, '.2f'))

# Compute and print the confusion matrix for test data
cm_test = confusion_matrix(y_test, test_predictions)
print('\nConfusion Matrix (Test):')
print(cm_test)

# Compute and print F1 score on the test data
test_f1 = f1_score(y_test, test_predictions, average='weighted')
print('\nF1 Score (Test):', format(test_f1, '.2f'))


Validation accuracy: 89.62

Confusion Matrix (Validation):
[[1094   37    6   23   20    0]
 [  22 1222   50    5    8    5]
 [   6   57  283    3    0    0]
 [  25   26    3  475   21    1]
 [  21   12    0   11  394   16]
 [   5   10    0    4   18  117]]

F1 Score (Validation): 0.90

Testing accuracy: 90.88

Confusion Matrix (Test):
[[1098   34    1   27   14    1]
 [  17 1290   49    5    8    4]
 [   6   45  270    2    2    0]
 [  28   15    1  492    7    0]
 [  20    7    2   17  390   12]
 [   4    6    0    1   30   95]]

F1 Score (Test): 0.91
