# Modeling with Logistic Classification and XGBoost

In [11]:
# !pip install datasets
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# Fixed random seed; passed as `random_state` to the logistic regression model so fits are repeatable.
seed = 1234

In [2]:
# NOTE(review): trust_remote_code=True permits a loading script shipped with the
# dataset repository to run locally -- confirm it is still needed for the
# installed `datasets` version before keeping it.
dataset = load_dataset("dair-ai/emotion", trust_remote_code=True)

# Materialise every split exactly once; each slice is then indexed by column name.
train_split = dataset['train'][:]
val_split = dataset['validation'][:]
test_split = dataset['test'][:]

X_train, y_train = train_split['text'], train_split['label']
X_val, y_val = val_split['text'], val_split['label']
X_test, y_test = test_split['text'], test_split['label']

# Raw text keeps its own name so `X` only ever refers to the feature matrix.
texts_all = np.concatenate((X_train, X_val, X_test), axis=0)
y = np.concatenate((y_train, y_val, y_test), axis=0)

# Unigram bag-of-words with English stop words removed; a float `min_df` is a
# document-frequency proportion, so terms in fewer than 0.1% of documents are dropped.
# NOTE(review): the vocabulary is fit on train + validation + test together, so
# nothing scored on `X` below is a held-out estimate.
vec = CountVectorizer(stop_words='english', min_df=0.001, ngram_range=(1, 1))
X = vec.fit_transform(texts_all)

# One feature row per label, otherwise every downstream fit is misaligned.
assert X.shape[0] == len(y), 'feature rows and labels are out of sync'
print('Data shape: ', X.shape)

Data shape:  (20000, 1261)


### Logistic classifier

In [None]:
def logistic_classifier(X, y, scoring='f1_weighted', cv=5, param_grid=None):
    """Tune an L1-penalised logistic regression with a cross-validated grid search.

    Parameters
    ----------
    X : array-like or sparse matrix
        Feature matrix handed to ``GridSearchCV.fit``.
    y : array-like
        Class labels, one per row of ``X``.
    scoring : str, default 'f1_weighted'
        Metric used to rank candidates; the same value labels the printed
        summary, so the report cannot drift from what was optimised.
        Options: https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values
    cv : int, default 5
        Number of cross-validation folds.
    param_grid : dict or None, default None
        Grid to search. ``None`` selects the notebook's default grid over ``C``
        with an L1 penalty.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.

    Notes
    -----
    ``multi_class='multinomial'`` is no longer passed: the argument is
    deprecated since scikit-learn 1.5, and for three or more classes the
    default already fits a multinomial model with the ``saga`` solver.
    """
    if param_grid is None:
        param_grid = {
            'C': [.1, .5, 1, 1.5, 2],  # inverse regularisation strength (smaller = stronger penalty)
            'penalty': ['l1'],         # penalty term
        }

    # `saga` is used because it supports the L1 penalty; `seed` is the notebook-level constant.
    lr = LogisticRegression(solver='saga', max_iter=200, random_state=seed)
    search = GridSearchCV(lr, param_grid, scoring=scoring, cv=cv)  # grid search with cross validation
    search.fit(X, y)

    score = search.best_score_
    param = search.best_params_

    print(f'Best {scoring} score: {round(score, 2)}\n Best params: {param}')

    return search.best_estimator_

In [None]:
# Grid-search the logistic model on the full vectorised corpus and keep the best estimator it returns.
clf = logistic_classifier(X=X, y=y)



Best f1_weighted score: 0.88
 Best params: {'C': 0.5, 'penalty': 'l1'}




In [None]:
# Accuracy of the tuned model on X, y -- the same rows it was fit on,
# so this is a training-set figure rather than a held-out estimate.
overall_accuracy = clf.score(X, y)
print(f'Overall accuracy: {overall_accuracy}')

Overall accuracy: 0.90405


### XGBoost

In [8]:
def xgb_classifier(X, y, scoring='f1_weighted', cv=5, param_grid=None, device='cuda'):
    """Tune an XGBoost classifier with a cross-validated grid search.

    Parameters
    ----------
    X : array-like or sparse matrix
        Feature matrix handed to ``GridSearchCV.fit``.
    y : array-like
        Class labels, one per row of ``X``.
    scoring : str, default 'f1_weighted'
        Metric used to rank candidates; the same value labels the printed
        summary, so the report cannot drift from what was optimised.
        Options: https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values
    cv : int, default 5
        Number of cross-validation folds.
    param_grid : dict or None, default None
        Grid to search. ``None`` selects the notebook's default grid over
        ``eta``, ``max_depth`` and ``n_estimators``.
    device : str, default 'cuda'
        Forwarded to ``xgb.XGBClassifier``; pass ``'cpu'`` on machines without
        a usable GPU.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.
    """
    if param_grid is None:
        param_grid = {
            'eta': [0.1, 0.3, 0.5],          # step size shrinkage
            'max_depth': [4, 6, 8],          # maximum depth of each tree
            'n_estimators': [100, 300, 500], # number of boosting rounds
        }

    # Seeded with the notebook-level `seed`, matching the logistic model's setup.
    xgb_model = xgb.XGBClassifier(device=device, random_state=seed)
    search = GridSearchCV(xgb_model, param_grid, scoring=scoring, cv=cv)  # grid search with cross validation
    search.fit(X, y)

    score = search.best_score_
    param = search.best_params_

    print(f'Best {scoring} score: {round(score, 2)}\n Best params: {param}')

    return search.best_estimator_

In [9]:
# Grid-search XGBoost on the full vectorised corpus. `clf` is rebound here, so from
# this cell on it refers to the XGBoost model rather than the logistic one.
clf = xgb_classifier(X=X, y=y)
overall_accuracy = clf.score(X, y)
print(f'Overall accuracy: {overall_accuracy}')

Best f1_weighted score: 0.76
 Best params: {'eta': 0.5, 'max_depth': 4, 'n_estimators': 100}
Overall accuracy: 0.78145


In [10]:
# Repeats the accuracy report for whatever `clf` currently holds, scored on X, y.
overall_accuracy = clf.score(X, y)
print(f'Overall accuracy: {overall_accuracy}')

Overall accuracy: 0.78145
