In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
import string
from nltk.stem import WordNetLemmatizer
#import contractions
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

Import Dataset

In [None]:
# Read the interim CSV; its first column becomes the row index.
df = pd.read_csv(
    '../data/interim/ready_for_model.csv',
    index_col=0,
)

# Preview the first rows
df.head()

In [None]:
# Number of jokes per target class
df['score_class'].value_counts()

Preprocess the "joke_new" column

In [None]:
# Download the required NLTK data
nltk.download('punkt')
# Newer NLTK releases make word_tokenize() load its tables from the
# 'punkt_tab' resource instead of the pickled 'punkt' models; without it the
# tokenizer raises LookupError on a fresh environment.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Custom stop-word list (used instead of NLTK's English list, which is kept
# commented out below as an alternative)
stop_words = [
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at",
    "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
    "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over",
    "under", "again", "further", "then", "once", "here", "there", "when", "where", "why",
    "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "very", "s", "t", "can",
    "will", "just", "don", "should", "now"
]

# Alternative: NLTK's built-in English stop words
#stop_words = set(stopwords.words('english'))

# Stemmer used by preprocessing() when how != 'lem'
stem = PorterStemmer()

# Lemmatizer used by preprocessing() when how == 'lem'
lem = WordNetLemmatizer()

# Define the function to clean the text
def preprocessing(text, how = 'lem'):
    """Tokenize a text and normalize every token that is kept.

    The text is lower-cased and split with ``word_tokenize``. Tokens that
    appear in ``stop_words`` or are not purely alphanumeric are dropped; the
    remaining tokens are lemmatized or stemmed.

    Parameters
    ----------
    text : str or missing value
        Raw text. ``None`` and float NaN (e.g. an empty CSV cell loaded by
        pandas) produce an empty token list instead of raising.
    how : str, default 'lem'
        ``'lem'`` applies ``lem.lemmatize``; any other value applies
        ``stem.stem``.

    Returns
    -------
    list of str
        The normalized tokens, in their original order.
    """
    # Missing text has no tokens; without this guard ``text.lower()`` would
    # raise AttributeError. ``text != text`` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Pick the normalizer once instead of duplicating the comprehension
    normalize = lem.lemmatize if how == 'lem' else stem.stem

    return [
        normalize(word)
        for word in word_tokenize(text.lower())
        if word not in stop_words and word.isalnum()
    ]

In [None]:
# Tokenize and lemmatize every joke; extra keyword arguments given to
# Series.apply are forwarded to preprocessing().
# NOTE: the column is overwritten with token lists, so this cell expects
# 'joke_new' to still hold the raw strings (preprocessing calls text.lower()).
df['joke_new'] = df['joke_new'].apply(preprocessing, how='lem')

# Inspect the first rows to confirm the transformation
df.head()

Create Word2Vec embeddings from the "joke_new" column and use them to predict "score_class", i.e., the label of a joke

	1.	Train a Word2Vec model on your jokes dataset. --> potentially increase number of features
	2.	Generate sentence vectors for each joke.
	3.	Train a Naive Bayes classifier using the sentence vectors as input.

In [None]:
# Fit Word2Vec embeddings on the tokenized jokes and persist the model

from gensim.models import Word2Vec

# One token list per joke
sentences = df['joke_new']

# NOTE: vector_size has to equal the num_features used by avg_feature_vector
# (its default is 50).
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=50,
    window=10,
    min_count=1,
    workers=4,
    epochs=20,
)

# Write the trained model to disk for later reuse
word2vec_model.save("word2vec_model.model")

In [None]:
# Generate the sentence vectors
def avg_feature_vector(words, model, num_features = 50):
    """Average the embeddings of the tokens in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document.
    model : object
        Anything exposing ``model.wv[token]`` that raises ``KeyError`` for
        unknown tokens (e.g. a trained gensim ``Word2Vec``).
    num_features : int, default 50
        Length of the embedding vectors; must match the vectors stored in
        ``model.wv``.

    Returns
    -------
    numpy.ndarray
        Vector of length ``num_features`` holding the mean of the embeddings
        that were found. If no token has an embedding (empty input or only
        unknown tokens) an all-NaN float32 vector is returned, so the case
        stays visible to a later NaN check / imputation step.
    """
    feature_vec = np.zeros(num_features, dtype='float32')
    n_words = 0

    for word in words:
        try:
            vector = model.wv[word]
        except KeyError:
            # Token not in model: leave it out of both the sum and the count
            continue
        feature_vec = np.add(feature_vec, vector)
        # Count only tokens that actually contributed to the sum
        n_words += 1

    if n_words == 0:
        # Explicit NaN instead of a 0/0 division (which emits a RuntimeWarning)
        return np.full(num_features, np.nan, dtype='float32')

    return feature_vec / n_words

In [None]:
# Build one averaged embedding per tokenized joke with avg_feature_vector;
# Series.apply forwards the `model` keyword to the function.
df['joke_vector'] = df['joke_new'].apply(avg_feature_vector, model=word2vec_model)

In [None]:
# Show the vector of the first joke as a sanity check
df['joke_vector'].iat[0]

In [None]:
# Count the jokes whose vector contains at least one NaN entry
nan_count = (
    df['joke_vector']
    .map(lambda vec: np.isnan(vec).any())
    .sum()
)
print(f"Number of NaN values in joke_vector: {nan_count}")

In [None]:
# Replace NaN entries with the mean of their feature column
from sklearn.impute import SimpleImputer

# Stack the per-joke vectors into one array (2D when all vectors share a length)
X = np.asarray(df['joke_vector'].to_list())

# NOTE: the imputer is fitted on all rows, i.e. before the train/test split
imputer = SimpleImputer(strategy='mean')
imputer.fit(X)
X_imputed = imputer.transform(X)

In [None]:
# Fit a Gaussian Naive Bayes classifier on the joke vectors
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Target labels
y = df['score_class']

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained
nb_classifier = GaussianNB().fit(X_train, y_train)

# Predicted classes for the held-out rows
y_pred = nb_classifier.predict(X_test)

Validate accuracy of the model and create classification report

In [None]:
from sklearn.metrics import classification_report

# Share of held-out jokes whose predicted class equals the true class
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy of the model: {accuracy}")

# Per-class summary of the same predictions
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print(class_report)

# Recorded results from earlier runs:
# Accuracy with vector size 100: 0.23065
# Accuracy with vector size 300: 0.23077
# Accuracy with vector size 200: 0.229546
# Accuracy with vector size 100 (GridSearch Parameters): 0.235141
# Accuracy with vector size 50: 0.236414

**Gridsearch**

In [None]:
from gensim.models import Word2Vec
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Prepare data for Word2Vec
sentences = df['joke_new']

# Function to train Word2Vec and evaluate embeddings
def train_and_evaluate(params):
    """Train Word2Vec with ``params`` and score a Naive Bayes model built on it.

    Parameters
    ----------
    params : dict
        Must provide ``'vector_size'``, ``'window'``, ``'min_count'`` and
        ``'epochs'``; they are passed to ``Word2Vec``.

    Returns
    -------
    float
        Accuracy of a ``GaussianNB`` classifier on a 20% hold-out split
        (``random_state=42``) of the averaged joke vectors.

    Notes
    -----
    Reads the notebook-level ``sentences`` and ``df['score_class']``. The
    feature matrix is kept local: nothing is written back to ``df``, so the
    ``joke_vector`` column produced earlier in the notebook is not replaced
    by the vectors of whichever grid candidate ran last.
    Word2Vec is trained on all jokes, including those that end up in the
    test split.
    """
    model = Word2Vec(sentences, vector_size=params['vector_size'], window=params['window'],
                     min_count=params['min_count'], workers=4, epochs=params['epochs'])

    # Generate sentence vectors
    def vectorize_joke(joke):
        """Mean embedding of the in-vocabulary tokens; zeros if there are none."""
        vectors = [model.wv[word] for word in joke if word in model.wv]
        if len(vectors) == 0:
            return np.zeros(model.vector_size)
        return np.mean(vectors, axis=0)

    # vectorize_joke never yields NaN (empty jokes map to a zero vector), so
    # no imputation step is needed here.
    X = np.array([vectorize_joke(joke) for joke in sentences])

    # Prepare the target variable
    y = df['score_class']

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the Gaussian Naive Bayes classifier
    nb_classifier = GaussianNB()
    nb_classifier.fit(X_train, y_train)

    # Predict the test data
    y_pred = nb_classifier.predict(X_test)

    # Calculate accuracy
    return accuracy_score(y_test, y_pred)

# Define the parameter grid
param_grid = {
    'vector_size': [100, 200],
    'window': [10, 15, 20],
    'min_count': [1],
    'epochs': [10, 20]
}

# Initialize best parameters and best accuracy
best_params = None
best_accuracy = 0

# Perform grid search
# NOTE: every candidate is scored by train_and_evaluate on its hold-out split,
# so the best accuracy reported here is selected on that same split.
for params in ParameterGrid(param_grid):
    print(f"Training with parameters: {params}")
    # Use a dedicated name so the notebook-level `accuracy` of the baseline
    # model is not overwritten by the last grid candidate's score.
    candidate_accuracy = train_and_evaluate(params)
    print(f"Accuracy: {candidate_accuracy}")
    # Strict '>' keeps the first candidate when several tie
    if candidate_accuracy > best_accuracy:
        best_accuracy = candidate_accuracy
        best_params = params

print(f"Best parameters: {best_params}")
print(f"Best accuracy: {best_accuracy}")