In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import BaseEstimator, TransformerMixin

import numpy as np
import pandas as pd

from joblib import Parallel, delayed

import string

In [2]:
df = pd.read_parquet('../data/interim/cleaned_jokes.parquet')

In [3]:
stop_words = set(stopwords.words('english'))
wnl = WordNetLemmatizer()

# We can build our custom transformer to preprocess the text data
# This will allow us to use the sklearn pipeline API

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words, wnl):
        self.stop_words = stop_words
        self.wnl = wnl

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return Parallel(n_jobs=-1)(delayed(self.preprocess_text)(text) for text in X)

    def preprocess_text(self, text):
        """
        Preprocess text data by removing stop words, and lemmatizing words.
        """
        text = text.lower()
        tokens = word_tokenize(text)
        tokens = [t for t in tokens if t not in self.stop_words]
        tokens = [self.wnl.lemmatize(t) for t in tokens]
        text = ' '.join(tokens)
        return text
    
logreg_clf = Pipeline([
    ('preprocessor', TextPreprocessor(stop_words, wnl)),
    ('vectorizer', TfidfVectorizer(max_features=5000)),
    ('classifier', LogisticRegression(n_jobs=-1, random_state=42, max_iter=1000))
])

bayes_clf = Pipeline([
    ('preprocessor', TextPreprocessor(stop_words, wnl)),
    ('vectorizer', TfidfVectorizer(max_features=5000, ngram_range=(1, 3))),
    ('classifier', MultinomialNB(alpha=1))
])

In [4]:
X = df['cleaned_joke'].values
y = df['score_class'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)  # Stratify to ensure equal class distribution 

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(273869,) (68468,) (273869,) (68468,)


In [6]:
cross_val_score(logreg_clf, X_train, y_train, cv=5, scoring='accuracy').mean()

0.24654946907653189

In [5]:
# Make a sub sample of the data to speed up the grid search
X_train_sub, _, y_train_sub, _ = train_test_split(X_train, y_train, test_size=0.9, random_state=42, shuffle=True, stratify=y_train)

print(X_train_sub.shape, y_train_sub.shape)

(27386,) (27386,)


In [14]:
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['lbfgs']
}

# Uncomment to run GridSearchCV

grid_search = GridSearchCV(logreg_clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/home/magsam/miniconda3/envs/unsloth_llm/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/magsam/miniconda3/envs/unsloth_llm/lib/python3.10/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/magsam/miniconda3/envs/unsloth_llm/lib/python3.10/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/magsam/miniconda3/envs/uns

Best parameters: {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Best score: 0.2657219432558968


In [15]:
cross_val_score(bayes_clf, X_train, y_train, cv=5, scoring='accuracy').mean()

0.26020105269203864