# Logistic Classifier

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import RandomOverSampler

In [2]:
file_path = "../data_sets/dataset_raw.csv"
data = pd.read_csv(file_path)

In [3]:
X = data['text'].to_numpy()
y = data['category'].to_numpy()

In [4]:
# Vectorizing the data
vectorizer_try = CountVectorizer(stop_words='english', min_df=0.0003, ngram_range=(1, 4))
X_encoded = vectorizer_try.fit_transform(X)

# Printing data shape
print('Data shape: ', X_encoded.shape)


# Splitting the data and renaming variables
X_train, X_temp, y_train, y_temp = train_test_split(X_encoded, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
              
# Displaying the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)
print("y_test shape:", y_test.shape)

Data shape:  (20000, 5009)
X_train shape: (12000, 5009)
X_val shape: (4000, 5009)
X_test shape: (4000, 5009)
y_train shape: (12000,)
y_val shape: (4000,)
y_test shape: (4000,)


### Logistic classifier

In [5]:
# takes around 20 mins to run (30s per iteration)

def logistic_classification(X_train, y_train, X_val, y_val, X_test, y_test):
    # Define the parameter grid
    param_grid = {
        'penalty': ['l1'], # lasso
        'solver': ['saga'],
        'multi_class': ['multinomial'],
        'C': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1],
        'max_iter': [400, 500, 600]
    }

    # Create logistic regression classifier
    classifier = LogisticRegression(random_state=42)

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='f1_weighted')
    grid_search.fit(X_train, y_train)

    # Get the best classifier from grid search
    best_classifier = grid_search.best_estimator_

    # Print the best parameters found by grid search
    print("Best Parameters:", grid_search.best_params_)

    # Fit the best classifier on the training data
    best_classifier.fit(X_train, y_train)

    # Compute and print accuracy on the validation data
    val_predictions = best_classifier.predict(X_val)
    val_accuracy = np.mean(val_predictions == y_val)
    print('\nValidation accuracy:', format(100 * val_accuracy, '.4f'))

    # Compute and print the confusion matrix for validation data
    cm_val = confusion_matrix(y_val, val_predictions)
    print('\nConfusion Matrix (Validation):')
    print(cm_val)

    # Compute and print F1 score on the validation data
    val_f1 = f1_score(y_val, val_predictions, average='weighted')
    print('\nF1 Score (Validation):', format(val_f1, '.4f'))

    # Compute and print accuracy on the test data
    test_predictions = best_classifier.predict(X_test)
    test_accuracy = np.mean(test_predictions == y_test)
    print('\nTesting accuracy:', format(100 * test_accuracy, '.4f'))

    # Compute and print the confusion matrix for test data
    cm_test = confusion_matrix(y_test, test_predictions)
    print('\nConfusion Matrix (Test):')
    print(cm_test)

    # Compute and print F1 score on the test data
    test_f1 = f1_score(y_test, test_predictions, average='weighted')
    print('\nF1 Score (Test):', format(test_f1, '.4f'))

    return best_classifier

In [6]:
# Example usage
best_classifier = logistic_classification(X_train, y_train, X_val, y_val, X_test, y_test)







Best Parameters: {'C': 0.9, 'max_iter': 400, 'multi_class': 'multinomial', 'penalty': 'l1', 'solver': 'saga'}

Validation accuracy: 89.5750

Confusion Matrix (Validation):
[[1094   38    6   23   19    0]
 [  19 1225   50    5    8    5]
 [   6   57  283    3    0    0]
 [  25   28    3  473   21    1]
 [  22   13    0   12  391   16]
 [   5   10    0    4   18  117]]

F1 Score (Validation): 0.8954

Testing accuracy: 90.7750

Confusion Matrix (Test):
[[1097   36    2   25   14    1]
 [  16 1290   50    5    8    4]
 [   6   45  270    2    2    0]
 [  29   17    1  488    8    0]
 [  19    8    2   15  392   12]
 [   4    7    0    1   30   94]]

F1 Score (Test): 0.9073




### the best logistic model
Best Parameters: {'C': 1.0, 'max_iter': 500, 'multi_class': 'multinomial', 'penalty': 'l1', 'solver': 'saga'}


In [7]:
best_classifier = LogisticRegression(penalty='l1', solver='saga', multi_class='multinomial', C = 1.0, random_state=42, max_iter = 500)
best_classifier.fit(X_train, y_train)



In [9]:
val_predictions = best_classifier.predict(X_val)
val_accuracy = np.mean(val_predictions == y_val)
print('\nValidation accuracy:', format(100 * val_accuracy, '.4f'))

# Compute and print the confusion matrix for validation data
cm_val = confusion_matrix(y_val, val_predictions)
print('\nConfusion Matrix (Validation):')
print(cm_val)

# Compute and print F1 score on the validation data
val_f1 = f1_score(y_val, val_predictions, average='weighted')
print('\nF1 Score (Validation):', format(val_f1, '.4f'))

# Compute and print accuracy on the test data
test_predictions = best_classifier.predict(X_test)
test_accuracy = np.mean(test_predictions == y_test)
print('\nTesting accuracy:', format(100 * test_accuracy, '.4f'))

# Compute and print the confusion matrix for test data
cm_test = confusion_matrix(y_test, test_predictions)
print('\nConfusion Matrix (Test):')
print(cm_test)

# Compute and print F1 score on the test data
test_f1 = f1_score(y_test, test_predictions, average='weighted')
print('\nF1 Score (Test):', format(test_f1, '.4f'))


Validation accuracy: 89.6250

Confusion Matrix (Validation):
[[1094   37    6   23   20    0]
 [  22 1222   50    5    8    5]
 [   6   57  283    3    0    0]
 [  25   26    3  475   21    1]
 [  21   12    0   11  394   16]
 [   5   10    0    4   18  117]]

F1 Score (Validation): 0.8959

Testing accuracy: 90.8750

Confusion Matrix (Test):
[[1098   34    1   27   14    1]
 [  17 1290   49    5    8    4]
 [   6   45  270    2    2    0]
 [  28   15    1  492    7    0]
 [  20    7    2   17  390   12]
 [   4    6    0    1   30   95]]

F1 Score (Test): 0.9083


### N-gram Balanced


In [10]:
def logistic_classification(X_train, y_train, X_val, y_val, X_test, y_test):
    # Define the parameter grid
    param_grid = {
        'penalty': ['l1'], # lasso
        'solver': ['saga'],
        'multi_class': ['multinomial'],
        'C': [0.5, 1.0, 1.5]
    }

    # Create logistic regression classifier
    classifier = LogisticRegression(random_state=42)

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='f1_weighted')
    grid_search.fit(X_train, y_train)

    # Get the best classifier from grid search
    best_classifier = grid_search.best_estimator_

    # Print the best parameters found by grid search
    print("Best Parameters:", grid_search.best_params_)

    # Fit the best classifier on the training data
    best_classifier.fit(X_train, y_train)

    # Compute and print accuracy on the validation data
    val_predictions = best_classifier.predict(X_val)
    val_accuracy = np.mean(val_predictions == y_val)
    print('\nValidation accuracy:', format(100 * val_accuracy, '.4f'))

    # Compute and print the confusion matrix for validation data
    cm_val = confusion_matrix(y_val, val_predictions)
    print('\nConfusion Matrix (Validation):')
    print(cm_val)

    # Compute and print F1 score on the validation data
    val_f1 = f1_score(y_val, val_predictions, average='weighted')
    print('\nF1 Score (Validation):', format(val_f1, '.4f'))

    # Compute and print accuracy on the test data
    test_predictions = best_classifier.predict(X_test)
    test_accuracy = np.mean(test_predictions == y_test)
    print('\nTesting accuracy:', format(100 * test_accuracy, '.4f'))

    # Compute and print the confusion matrix for test data
    cm_test = confusion_matrix(y_test, test_predictions)
    print('\nConfusion Matrix (Test):')
    print(cm_test)

    # Compute and print F1 score on the test data
    test_f1 = f1_score(y_test, test_predictions, average='weighted')
    print('\nF1 Score (Test):', format(test_f1, '.4f'))

    return best_classifier

In [11]:
ros = RandomOverSampler(sampling_strategy='not majority', random_state=42)
X_train, y_train = ros.fit_resample(X_train, y_train)

# Displaying the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)
print("y_test shape:", y_test.shape)

X_train shape: (24456, 5009)
X_val shape: (4000, 5009)
X_test shape: (4000, 5009)
y_train shape: (24456,)
y_val shape: (4000,)
y_test shape: (4000,)


In [12]:
np.bincount(y_train)

array([4076, 4076, 4076, 4076, 4076, 4076])

In [13]:
best_classifier = logistic_classification(X_train, y_train, X_val, y_val, X_test, y_test)



Best Parameters: {'C': 1.5, 'multi_class': 'multinomial', 'penalty': 'l1', 'solver': 'saga'}

Validation accuracy: 89.4500

Confusion Matrix (Validation):
[[1085   32    8   29   26    0]
 [  26 1182   80    6   11    7]
 [   5   37  302    4    1    0]
 [  20   18    6  486   19    2]
 [  11    9    2   14  388   30]
 [   1    8    0    2    8  135]]

F1 Score (Validation): 0.8954

Testing accuracy: 91.0000

Confusion Matrix (Test):
[[1087   26    4   35   18    5]
 [  17 1254   74    8    8   12]
 [   1   19  300    3    2    0]
 [  24   12    1  494   11    1]
 [  11    4    4   16  390   23]
 [   1    6    0    1   13  115]]

F1 Score (Test): 0.9110




In [14]:
best_classifier

In [15]:
def most_significant_terms(classifier, vectorizer, K):
    # cycle through the positive weights, in the order of largest weight first and print out
    # K lines where each line contains
    # (a) the term corresponding to the weight (a string)
    # (b) the weight value itself (a scalar printed to 3 decimal places)
    
    ### YOUR CODE STARTS HERE ###

    # extract coefficients from trained model and sort based on coefficient
    args = np.argsort(classifier.coef_[0])
    top_args = args[-K:][::-1]
    low_args = args[:K]
    vocab = {v : k for k, v in vectorizer.vocabulary_.items()}
    
    # extract topK positive terms
    topK_pos_terms = [ vocab[index] for index in top_args ]
    
    # extract corresponding weights
    topK_pos_weights = np.round(classifier.coef_[0][top_args], decimals=3)
    
    # extract topK negative terms
    topK_neg_terms = [ vocab[index] for index in low_args ]
    
    # extract corresponding weights
    topK_neg_weights = np.round(classifier.coef_[0][low_args], decimals=3)


    ###  YOUR CODE ENDS HERE  ###
    print('topK_pos_weights', topK_pos_weights)
    print('topK_pos_terms', topK_pos_terms)
    print('topK_neg_weights', topK_neg_weights)
    print('topK_neg_terms',topK_neg_terms)
    
    return (topK_pos_weights, topK_neg_weights, topK_pos_terms, topK_neg_terms)

In [16]:
output = most_significant_terms(best_classifier, vectorizer_try, K=10)

topK_pos_weights [5.463 5.457 5.378 5.363 5.299 5.231 5.118 5.114 5.087 5.036]
topK_pos_terms ['disturbed', 'heartbroken', 'doomed', 'messy', 'punished', 'troubled', 'dumb', 'gloomy', 'homesick', 'abused']
topK_neg_weights [-1.856 -1.773 -1.739 -1.733 -1.679 -1.653 -1.653 -1.631 -1.598 -1.581]
topK_neg_terms ['restless', 'cranky', 'faithful', 'pissed', 'accepted', 'feel super', 'passionate', 'doesn', 'violent', 'act']


In [17]:
labels = ["sadness", "joy", "love", "anger", "fear", "surprise"]
counts = np.bincount(y)
percent = counts / len(y)
pd.DataFrame({
    'labels': labels,
    'counts': counts,
    'percent': percent
})

Unnamed: 0,labels,counts,percent
0,sadness,5797,0.28985
1,joy,6761,0.33805
2,love,1641,0.08205
3,anger,2709,0.13545
4,fear,2373,0.11865
5,surprise,719,0.03595
