# BOW, without stop word removal

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

dataset = pd.read_csv('dataset.csv')

train_set, temp_set = train_test_split(dataset, test_size=0.2, random_state=42)
test_set, val_set = train_test_split(temp_set, test_size=0.5, random_state=42)

vectorizer = CountVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(train_set['text'])
X_test = vectorizer.transform(test_set['text'])
X_val = vectorizer.transform(val_set['text'])
y_train = train_set['humor']
y_test = test_set['humor']
y_val = val_set['humor']

In [2]:
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


logreg_model = LogisticRegression(max_iter=10000, solver='lbfgs')
logreg_model.fit(X_train, y_train)

naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)

sgd_model = SGDClassifier(max_iter=1000, tol=1e-3)
sgd_model.fit(X_train, y_train)

ridge_model = RidgeClassifier()
ridge_model.fit(X_train, y_train)

logreg_preds = logreg_model.predict(X_val)
naive_bayes_preds = naive_bayes_model.predict(X_val)
sgd_preds = sgd_model.predict(X_val)
ridge_preds = ridge_model.predict(X_val)

logreg_accuracy = accuracy_score(y_val, logreg_preds)
naive_bayes_accuracy = accuracy_score(y_val, naive_bayes_preds)
sgd_accuracy = accuracy_score(y_val, sgd_preds)
ridge_accuracy = accuracy_score(y_val, ridge_preds)
print(f"Logistic Regression Accuracy: {logreg_accuracy}")
print(f"Naive Bayes Accuracy: {naive_bayes_accuracy}")
print(f"SGD Classifier Accuracy: {sgd_accuracy}")
print(f"Ridge Classifier Accuracy: {ridge_accuracy}\n")

# Calculate and print precision, recall, and F1-score for each model
logreg_precision = precision_score(y_val, logreg_preds)
naive_bayes_precision = precision_score(y_val, naive_bayes_preds)
sgd_precision = precision_score(y_val, sgd_preds)
ridge_precision = precision_score(y_val, ridge_preds)
print(f"Logistic Regression Precision: {logreg_precision}")
print(f"Naive Bayes Precision: {naive_bayes_precision}")
print(f"SGD Classifier Precision: {sgd_precision}")
print(f"Ridge Classifier Precision: {ridge_precision}\n")

logreg_recall = recall_score(y_val, logreg_preds)
naive_bayes_recall = recall_score(y_val, naive_bayes_preds)
sgd_recall = recall_score(y_val, sgd_preds)
ridge_recall = recall_score(y_val, ridge_preds)
print(f"Logistic Regression Recall: {logreg_recall}")
print(f"Naive Bayes Recall: {naive_bayes_recall}")
print(f"SGD Classifier Recall: {sgd_recall}")
print(f"Ridge Classifier Recall: {ridge_recall}\n")

logreg_f1_score = f1_score(y_val, logreg_preds)
naive_bayes_f1_score = f1_score(y_val, naive_bayes_preds)
sgd_f1_score = f1_score(y_val, sgd_preds)
ridge_f1_score = f1_score(y_val, ridge_preds)
print(f"Logistic Regression F1-Score: {logreg_f1_score}")
print(f"Naive Bayes F1-Score: {naive_bayes_f1_score}")
print(f"SGD Classifier F1-Score: {sgd_f1_score}")
print(f"Ridge Classifier F1-Score: {ridge_f1_score}\n")

Logistic Regression Accuracy: 0.93215
Naive Bayes Accuracy: 0.91575
SGD Classifier Accuracy: 0.92655
Ridge Classifier Accuracy: 0.92865

Logistic Regression Precision: 0.9347651545564031
Naive Bayes Precision: 0.899386738213875
SGD Classifier Precision: 0.9312222670431626
Ridge Classifier Precision: 0.9314257028112449

Logistic Regression Recall: 0.9294481588663805
Naive Bayes Recall: 0.9366330705518411
SGD Classifier Recall: 0.9214649236603133
Ridge Classifier Recall: 0.9257559125835745

Logistic Regression F1-Score: 0.9320990743057292
Naive Bayes F1-Score: 0.9176321063694579
SGD Classifier F1-Score: 0.9263179013893765
Ridge Classifier F1-Score: 0.9285821530453932



^^ BOW, without stop word removal

In [3]:
# Train a SVC model with RBF kernel
svc_model = SVC(kernel='rbf')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SGD Classifier Precision: {sgd_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SGD Classifier Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SGD Classifier F1-Score: {svc_f1_score}")

SVC with RBF Kernel Accuracy: 0.9408
SGD Classifier Precision: 0.9312222670431626
SGD Classifier Recall: 0.9391278315537371
SGD Classifier F1-Score: 0.940817754673598


^^ BOW, without stop word removal

In [4]:
# Train a SVC model with RBF kernel
svc_model = SVC(kernel='linear')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with linear kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with linear kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with linear kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with linear kernel F1-Score: {svc_f1_score}")

SVC with linear kernel Accuracy: 0.9284
SVC with linear kernel Precision: 0.9289781240635301
SVC with linear kernel Recall: 0.9280510927053188
SVC with linear kernel F1-Score: 0.9285143769968051


^^ BOW, without stop word removal

# TF-IDF, without stop word removal

In [5]:
%reset -f

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
dataset = pd.read_csv('dataset.csv')

# Split the dataset into training, testing, and validation sets
train_set, temp_set = train_test_split(dataset, test_size=0.2, random_state=42)
test_set, val_set = train_test_split(temp_set, test_size=0.5, random_state=42)

# Vectorize the text data using Bag-of-Words (BoW) with a size of 10,000
vectorizer = TfidfVectorizer(max_features=10_000)
X_train = vectorizer.fit_transform(train_set['text'])
X_test = vectorizer.transform(test_set['text'])
X_val = vectorizer.transform(val_set['text'])
y_train = train_set['humor']
y_test = test_set['humor']
y_val = val_set['humor']

In [7]:
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

logreg_model = LogisticRegression(max_iter=10_000, solver='lbfgs')
logreg_model.fit(X_train, y_train)
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)
sgd_model = SGDClassifier(max_iter=1000, tol=1e-3)
sgd_model.fit(X_train, y_train)
ridge_model = RidgeClassifier()
ridge_model.fit(X_train, y_train)

logreg_preds = logreg_model.predict(X_val)
naive_bayes_preds = naive_bayes_model.predict(X_val)
sgd_preds = sgd_model.predict(X_val)
ridge_preds = ridge_model.predict(X_val)

logreg_accuracy = accuracy_score(y_val, logreg_preds)
naive_bayes_accuracy = accuracy_score(y_val, naive_bayes_preds)
sgd_accuracy = accuracy_score(y_val, sgd_preds)
ridge_accuracy = accuracy_score(y_val, ridge_preds)
print(f"Logistic Regression Accuracy: {logreg_accuracy}")
print(f"Naive Bayes Accuracy: {naive_bayes_accuracy}")
print(f"SGD Classifier Accuracy: {sgd_accuracy}")
print(f"Ridge Classifier Accuracy: {ridge_accuracy}\n")

logreg_precision = precision_score(y_val, logreg_preds)
naive_bayes_precision = precision_score(y_val, naive_bayes_preds)
sgd_precision = precision_score(y_val, sgd_preds)
ridge_precision = precision_score(y_val, ridge_preds)
print(f"Logistic Regression Precision: {logreg_precision}")
print(f"Naive Bayes Precision: {naive_bayes_precision}")
print(f"SGD Classifier Precision: {sgd_precision}")
print(f"Ridge Classifier Precision: {ridge_precision}\n")

logreg_recall = recall_score(y_val, logreg_preds)
naive_bayes_recall = recall_score(y_val, naive_bayes_preds)
sgd_recall = recall_score(y_val, sgd_preds)
ridge_recall = recall_score(y_val, ridge_preds)
print(f"Logistic Regression Recall: {logreg_recall}")
print(f"Naive Bayes Recall: {naive_bayes_recall}")
print(f"SGD Classifier Recall: {sgd_recall}")
print(f"Ridge Classifier Recall: {ridge_recall}\n")

logreg_f1_score = f1_score(y_val, logreg_preds)
naive_bayes_f1_score = f1_score(y_val, naive_bayes_preds)
sgd_f1_score = f1_score(y_val, sgd_preds)
ridge_f1_score = f1_score(y_val, ridge_preds)
print(f"Logistic Regression F1-Score: {logreg_f1_score}")
print(f"Naive Bayes F1-Score: {naive_bayes_f1_score}")
print(f"SGD Classifier F1-Score: {sgd_f1_score}")
print(f"Ridge Classifier F1-Score: {ridge_f1_score}\n")

Logistic Regression Accuracy: 0.9275
Naive Bayes Accuracy: 0.9118
SGD Classifier Accuracy: 0.91385
Ridge Classifier Accuracy: 0.9277

Logistic Regression Precision: 0.9285928592859286
Naive Bayes Precision: 0.8976211114321487
SGD Classifier Precision: 0.916148445336008
Ridge Classifier Precision: 0.926829268292683

Logistic Regression Recall: 0.9265542361041812
Naive Bayes Recall: 0.9300469015068357
SGD Classifier Recall: 0.9114858796527293
Ridge Classifier Recall: 0.9290489971060772

Logistic Regression F1-Score: 0.9275724275724276
Naive Bayes F1-Score: 0.9135463634581454
SGD Classifier F1-Score: 0.913811215046771
Ridge Classifier F1-Score: 0.9279378052426991



^^ TF-IDF, without stop word removal

In [8]:
svc_model = SVC(kernel='rbf')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with RBF Kernel F1-Score: {svc_f1_score}")

SVC with RBF Kernel Accuracy: 0.9398
SVC with RBF Kernel Precision: 0.9390498954287422
SVC with RBF Kernel Recall: 0.9409240594751023
SVC with RBF Kernel F1-Score: 0.9399860432658758


^^ TF-IDF, without stop word removal

In [9]:
svc_model = SVC(kernel='linear')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with linear kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with linear kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with linear kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with linear kernel F1-Score: {svc_f1_score}")

SVC with linear kernel Accuracy: 0.92775
SVC with linear kernel Precision: 0.9260731319554849
SVC with linear kernel Recall: 0.9300469015068357
SVC with linear kernel F1-Score: 0.9280557630072193


^^ TF-IDF, without stop word removal

# BOW, with stop word removal

In [10]:
%reset -f

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

dataset = pd.read_csv('dataset.csv')

train_set, temp_set = train_test_split(dataset, test_size=0.2, random_state=42)
test_set, val_set = train_test_split(temp_set, test_size=0.5, random_state=42)

vectorizer = CountVectorizer(max_features=10_000, stop_words='english')
X_train = vectorizer.fit_transform(train_set['text'])
X_test = vectorizer.transform(test_set['text'])
X_val = vectorizer.transform(val_set['text'])
y_train = train_set['humor']
y_test = test_set['humor']
y_val = val_set['humor']

In [12]:
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

logreg_model = LogisticRegression(max_iter=10_000, solver='lbfgs')
logreg_model.fit(X_train, y_train)
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)
sgd_model = SGDClassifier(max_iter=1000, tol=1e-3)
sgd_model.fit(X_train, y_train)
ridge_model = RidgeClassifier()
ridge_model.fit(X_train, y_train)

logreg_preds = logreg_model.predict(X_val)
naive_bayes_preds = naive_bayes_model.predict(X_val)
sgd_preds = sgd_model.predict(X_val)
ridge_preds = ridge_model.predict(X_val)

logreg_accuracy = accuracy_score(y_val, logreg_preds)
naive_bayes_accuracy = accuracy_score(y_val, naive_bayes_preds)
sgd_accuracy = accuracy_score(y_val, sgd_preds)
ridge_accuracy = accuracy_score(y_val, ridge_preds)
print(f"Logistic Regression Accuracy: {logreg_accuracy}")
print(f"Naive Bayes Accuracy: {naive_bayes_accuracy}")
print(f"SGD Classifier Accuracy: {sgd_accuracy}")
print(f"Ridge Classifier Accuracy: {ridge_accuracy}\n")

logreg_precision = precision_score(y_val, logreg_preds)
naive_bayes_precision = precision_score(y_val, naive_bayes_preds)
sgd_precision = precision_score(y_val, sgd_preds)
ridge_precision = precision_score(y_val, ridge_preds)
print(f"Logistic Regression Precision: {logreg_precision}")
print(f"Naive Bayes Precision: {naive_bayes_precision}")
print(f"SGD Classifier Precision: {sgd_precision}")
print(f"Ridge Classifier Precision: {ridge_precision}\n")

logreg_recall = recall_score(y_val, logreg_preds)
naive_bayes_recall = recall_score(y_val, naive_bayes_preds)
sgd_recall = recall_score(y_val, sgd_preds)
ridge_recall = recall_score(y_val, ridge_preds)
print(f"Logistic Regression Recall: {logreg_recall}")
print(f"Naive Bayes Recall: {naive_bayes_recall}")
print(f"SGD Classifier Recall: {sgd_recall}")
print(f"Ridge Classifier Recall: {ridge_recall}\n")

logreg_f1_score = f1_score(y_val, logreg_preds)
naive_bayes_f1_score = f1_score(y_val, naive_bayes_preds)
sgd_f1_score = f1_score(y_val, sgd_preds)
ridge_f1_score = f1_score(y_val, ridge_preds)
print(f"Logistic Regression F1-Score: {logreg_f1_score}")
print(f"Naive Bayes F1-Score: {naive_bayes_f1_score}")
print(f"SGD Classifier F1-Score: {sgd_f1_score}")
print(f"Ridge Classifier F1-Score: {ridge_f1_score}\n")

Logistic Regression Accuracy: 0.8961
Naive Bayes Accuracy: 0.88895
SGD Classifier Accuracy: 0.8913
Ridge Classifier Accuracy: 0.89245

Logistic Regression Precision: 0.8924019365675329
Naive Bayes Precision: 0.884387936132466
SGD Classifier Precision: 0.8862853204686423
Ridge Classifier Precision: 0.8840523130977943

Logistic Regression Recall: 0.9013072547649935
Naive Bayes Recall: 0.8954196188005189
SGD Classifier Recall: 0.8983135415627183
Ridge Classifier Recall: 0.9039018062069654

Logistic Regression F1-Score: 0.8968324893257869
Naive Bayes F1-Score: 0.8898695889324144
SGD Classifier F1-Score: 0.8922588958271385
Ridge Classifier F1-Score: 0.8938668771895198



^^ BOW, with stop word removal

In [13]:
svc_model = SVC(kernel='rbf')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with RBF Kernel F1-Score: {svc_f1_score}")

SVC with RBF Kernel Accuracy: 0.9054
SVC with RBF Kernel Precision: 0.9013528191962081
SVC with RBF Kernel Recall: 0.9108871370122742
SVC with RBF Kernel F1-Score: 0.9060948977566011


^^ BOW, with stop word removal

In [14]:
svc_model = SVC(kernel='linear')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with linear kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with linear kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with linear kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with linear kernel F1-Score: {svc_f1_score}")

SVC with linear kernel Accuracy: 0.89385
SVC with linear kernel Precision: 0.8882989183874139
SVC with linear kernel Recall: 0.9015068356451452
SVC with linear kernel F1-Score: 0.894854142934971


^^ BOW, with stop word removal

# TF-IDF, with stop word removal

In [15]:
%reset -f

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
dataset = pd.read_csv('dataset.csv')

# Split the dataset into training, testing, and validation sets
train_set, temp_set = train_test_split(dataset, test_size=0.2, random_state=42)
test_set, val_set = train_test_split(temp_set, test_size=0.5, random_state=42)

# Vectorize the text data using Bag-of-Words (BoW) with a size of 10,000
vectorizer = TfidfVectorizer(max_features=10_000, stop_words='english')
X_train = vectorizer.fit_transform(train_set['text'])
X_test = vectorizer.transform(test_set['text'])
X_val = vectorizer.transform(val_set['text'])
y_train = train_set['humor']
y_test = test_set['humor']
y_val = val_set['humor']

In [17]:
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

logreg_model = LogisticRegression(max_iter=10_000, solver='lbfgs')
logreg_model.fit(X_train, y_train)
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)
sgd_model = SGDClassifier(max_iter=1000, tol=1e-3)
sgd_model.fit(X_train, y_train)
ridge_model = RidgeClassifier()
ridge_model.fit(X_train, y_train)

logreg_preds = logreg_model.predict(X_val)
naive_bayes_preds = naive_bayes_model.predict(X_val)
sgd_preds = sgd_model.predict(X_val)
ridge_preds = ridge_model.predict(X_val)

logreg_accuracy = accuracy_score(y_val, logreg_preds)
naive_bayes_accuracy = accuracy_score(y_val, naive_bayes_preds)
sgd_accuracy = accuracy_score(y_val, sgd_preds)
ridge_accuracy = accuracy_score(y_val, ridge_preds)
print(f"Logistic Regression Accuracy: {logreg_accuracy}")
print(f"Naive Bayes Accuracy: {naive_bayes_accuracy}")
print(f"SGD Classifier Accuracy: {sgd_accuracy}")
print(f"Ridge Classifier Accuracy: {ridge_accuracy}\n")

logreg_precision = precision_score(y_val, logreg_preds)
naive_bayes_precision = precision_score(y_val, naive_bayes_preds)
sgd_precision = precision_score(y_val, sgd_preds)
ridge_precision = precision_score(y_val, ridge_preds)
print(f"Logistic Regression Precision: {logreg_precision}")
print(f"Naive Bayes Precision: {naive_bayes_precision}")
print(f"SGD Classifier Precision: {sgd_precision}")
print(f"Ridge Classifier Precision: {ridge_precision}\n")

logreg_recall = recall_score(y_val, logreg_preds)
naive_bayes_recall = recall_score(y_val, naive_bayes_preds)
sgd_recall = recall_score(y_val, sgd_preds)
ridge_recall = recall_score(y_val, ridge_preds)
print(f"Logistic Regression Recall: {logreg_recall}")
print(f"Naive Bayes Recall: {naive_bayes_recall}")
print(f"SGD Classifier Recall: {sgd_recall}")
print(f"Ridge Classifier Recall: {ridge_recall}\n")

logreg_f1_score = f1_score(y_val, logreg_preds)
naive_bayes_f1_score = f1_score(y_val, naive_bayes_preds)
sgd_f1_score = f1_score(y_val, sgd_preds)
ridge_f1_score = f1_score(y_val, ridge_preds)
print(f"Logistic Regression F1-Score: {logreg_f1_score}")
print(f"Naive Bayes F1-Score: {naive_bayes_f1_score}")
print(f"SGD Classifier F1-Score: {sgd_f1_score}")
print(f"Ridge Classifier F1-Score: {ridge_f1_score}\n")

Logistic Regression Accuracy: 0.8938
Naive Bayes Accuracy: 0.887
SGD Classifier Accuracy: 0.88435
Ridge Classifier Accuracy: 0.8942

Logistic Regression Precision: 0.8941798941798942
Naive Bayes Precision: 0.8872368027142999
SGD Classifier Precision: 0.8893715902202465
Ridge Classifier Precision: 0.8909108891306498

Logistic Regression Recall: 0.8938229717593055
Naive Bayes Recall: 0.8872368027142999
SGD Classifier Recall: 0.878455243987626
Ridge Classifier Recall: 0.8989122842031734

Logistic Regression F1-Score: 0.8940013973450445
Naive Bayes F1-Score: 0.8872368027142999
SGD Classifier F1-Score: 0.8838797128369897
Ridge Classifier F1-Score: 0.8948937015696404



^^ TF-IDF, with stop word removal

In [18]:
svc_model = SVC(kernel='rbf')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with RBF Kernel F1-Score: {svc_f1_score}")

SVC with RBF Kernel Accuracy: 0.9071
SVC with RBF Kernel Precision: 0.9064834179862563
SVC with RBF Kernel Recall: 0.9082925855703023
SVC with RBF Kernel F1-Score: 0.9073870999900308


^^ TF-IDF, with stop word removal

In [19]:
svc_model = SVC(kernel='linear')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with linear kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with linear kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with linear kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with linear kernel F1-Score: {svc_f1_score}")

SVC with linear kernel Accuracy: 0.8962
SVC with linear kernel Precision: 0.8932778932778933
SVC with linear kernel Recall: 0.9004091408043109
SVC with linear kernel F1-Score: 0.8968293410197793


^^ TF-IDF, with stop word removal

# BOW, with POS-tags

In [20]:
%reset -f

In [21]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

global counter
counter = 0

def tokenize(text):
    doc = nlp(text)
    word_unigrams, pos_unigrams = [], []
    for i, token in enumerate(doc):
        word_unigrams.append(token.text)
        pos_unigrams.append(token.tag_)
        
    combined_n_grams = word_unigrams + pos_unigrams
    return combined_n_grams

# Load the dataset
dataset = pd.read_csv('dataset.csv')

# Split the dataset into training, testing, and validation sets
train_set, temp_set = train_test_split(dataset, test_size=0.2, random_state=42)
test_set, val_set = train_test_split(temp_set, test_size=0.5, random_state=42)

# Vectorize the text data using Bag-of-Words (BoW) with a size of 10,000
vectorizer = CountVectorizer(max_features=10_000, tokenizer=tokenize)
X_train = vectorizer.fit_transform(train_set['text'])
X_test = vectorizer.transform(test_set['text'])
X_val = vectorizer.transform(val_set['text'])
y_train = train_set['humor']
y_test = test_set['humor']
y_val = val_set['humor']

  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]


In [22]:
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

logreg_model = LogisticRegression(max_iter=10000, solver='lbfgs')
logreg_model.fit(X_train, y_train)
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)
sgd_model = SGDClassifier(max_iter=1000, tol=1e-3)
sgd_model.fit(X_train, y_train)
ridge_model = RidgeClassifier()
ridge_model.fit(X_train, y_train)

logreg_preds = logreg_model.predict(X_val)
naive_bayes_preds = naive_bayes_model.predict(X_val)
sgd_preds = sgd_model.predict(X_val)
ridge_preds = ridge_model.predict(X_val)

logreg_accuracy = accuracy_score(y_val, logreg_preds)
naive_bayes_accuracy = accuracy_score(y_val, naive_bayes_preds)
sgd_accuracy = accuracy_score(y_val, sgd_preds)
ridge_accuracy = accuracy_score(y_val, ridge_preds)
print(f"Logistic Regression Accuracy: {logreg_accuracy}")
print(f"Naive Bayes Accuracy: {naive_bayes_accuracy}")
print(f"SGD Classifier Accuracy: {sgd_accuracy}")
print(f"Ridge Classifier Accuracy: {ridge_accuracy}\n")

logreg_precision = precision_score(y_val, logreg_preds)
naive_bayes_precision = precision_score(y_val, naive_bayes_preds)
sgd_precision = precision_score(y_val, sgd_preds)
ridge_precision = precision_score(y_val, ridge_preds)
print(f"Logistic Regression Precision: {logreg_precision}")
print(f"Naive Bayes Precision: {naive_bayes_precision}")
print(f"SGD Classifier Precision: {sgd_precision}")
print(f"Ridge Classifier Precision: {ridge_precision}\n")

logreg_recall = recall_score(y_val, logreg_preds)
naive_bayes_recall = recall_score(y_val, naive_bayes_preds)
sgd_recall = recall_score(y_val, sgd_preds)
ridge_recall = recall_score(y_val, ridge_preds)
print(f"Logistic Regression Recall: {logreg_recall}")
print(f"Naive Bayes Recall: {naive_bayes_recall}")
print(f"SGD Classifier Recall: {sgd_recall}")
print(f"Ridge Classifier Recall: {ridge_recall}\n")

logreg_f1_score = f1_score(y_val, logreg_preds)
naive_bayes_f1_score = f1_score(y_val, naive_bayes_preds)
sgd_f1_score = f1_score(y_val, sgd_preds)
ridge_f1_score = f1_score(y_val, ridge_preds)
print(f"Logistic Regression F1-Score: {logreg_f1_score}")
print(f"Naive Bayes F1-Score: {naive_bayes_f1_score}")
print(f"SGD Classifier F1-Score: {sgd_f1_score}")
print(f"Ridge Classifier F1-Score: {ridge_f1_score}\n")

Logistic Regression Accuracy: 0.967
Naive Bayes Accuracy: 0.93905
SGD Classifier Accuracy: 0.96325
Ridge Classifier Accuracy: 0.9632

Logistic Regression Precision: 0.9700713066184594
Naive Bayes Precision: 0.9154238248064942
SGD Classifier Precision: 0.9581606473258338
Ridge Classifier Precision: 0.9720386375190646

Logistic Regression Recall: 0.9638758606925456
Naive Bayes Recall: 0.9677676878555035
SGD Classifier Recall: 0.9689651731364135
Ridge Classifier Recall: 0.9539966071250374

Logistic Regression F1-Score: 0.9669636600260286
Naive Bayes F1-Score: 0.9408682997817123
SGD Classifier F1-Score: 0.9635326221781195
Ridge Classifier F1-Score: 0.9629331184528606



^^ BOW, with POS-tags

In [23]:
svc_model = SVC(kernel='rbf')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with RBF Kernel F1-Score: {svc_f1_score}")

SVC with RBF Kernel Accuracy: 0.96995
SVC with RBF Kernel Precision: 0.9722277922598757
SVC with RBF Kernel Recall: 0.9676678974154276
SVC with RBF Kernel F1-Score: 0.9699424856214053


^^ BOW, with POS-tags

In [24]:
svc_model = SVC(kernel='linear')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with linear kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with linear kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with linear kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with linear kernel F1-Score: {svc_f1_score}")

SVC with linear kernel Accuracy: 0.9635
SVC with linear kernel Precision: 0.9646894068220466
SVC with linear kernel Recall: 0.9623790040914081
SVC with linear kernel F1-Score: 0.9635328204615847


^^ BOW, with POS-tags

# Bigrams

In [17]:
%reset -f

In [1]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    doc = nlp(text)
    
    # Extract unigrams and bigrams in a single loop
    word_unigrams, pos_unigrams, word_bigrams, pos_bigrams = [], [], [], []
    for i, token in enumerate(doc):
        word_unigrams.append(token.text)
        pos_unigrams.append(token.tag_)
        
        if i < len(doc) - 1:
            word_bigrams.append(token.text + '_' + doc[i + 1].text)
            pos_bigrams.append(token.tag_ + '_' + doc[i + 1].tag_)
    
    # Combine all lists
    combined_n_grams = word_unigrams + pos_unigrams + word_bigrams + pos_bigrams

    return combined_n_grams

# Load the dataset
dataset = pd.read_csv('dataset.csv')

# Split the dataset into training, testing, and validation sets
train_set, temp_set = train_test_split(dataset, test_size=0.2, random_state=42)
test_set, val_set = train_test_split(temp_set, test_size=0.5, random_state=42)

# Vectorize the text data using Bag-of-Words (BoW) with a size of 10,000
vectorizer = CountVectorizer(max_features=10_000, tokenizer=tokenize)
X_train = vectorizer.fit_transform(train_set['text'])
X_test = vectorizer.transform(test_set['text'])
X_val = vectorizer.transform(val_set['text'])
y_train = train_set['humor']
y_test = test_set['humor']
y_val = val_set['humor']

  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]


In [2]:
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

logreg_model = LogisticRegression(solver='lbfgs', max_iter=1_000_000)
logreg_model.fit(X_train, y_train)
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)

logreg_preds = logreg_model.predict(X_val)
naive_bayes_preds = naive_bayes_model.predict(X_val)

logreg_accuracy = accuracy_score(y_val, logreg_preds)
naive_bayes_accuracy = accuracy_score(y_val, naive_bayes_preds)
print(f"Logistic Regression Accuracy: {logreg_accuracy}")
print(f"Naive Bayes Accuracy: {naive_bayes_accuracy}\n")

logreg_precision = precision_score(y_val, logreg_preds)
naive_bayes_precision = precision_score(y_val, naive_bayes_preds)
print(f"Logistic Regression Precision: {logreg_precision}")
print(f"Naive Bayes Precision: {naive_bayes_precision}\n")

logreg_recall = recall_score(y_val, logreg_preds)
naive_bayes_recall = recall_score(y_val, naive_bayes_preds)
print(f"Logistic Regression Recall: {logreg_recall}")
print(f"Naive Bayes Recall: {naive_bayes_recall}\n")

logreg_f1_score = f1_score(y_val, logreg_preds)
naive_bayes_f1_score = f1_score(y_val, naive_bayes_preds)
print(f"Logistic Regression F1-Score: {logreg_f1_score}")
print(f"Naive Bayes F1-Score: {naive_bayes_f1_score}\n")

Logistic Regression Accuracy: 0.9742
Naive Bayes Accuracy: 0.9406

Logistic Regression Precision: 0.9784556528742575
Naive Bayes Precision: 0.9229148712055922

Logistic Regression Recall: 0.9698632870970961
Naive Bayes Recall: 0.961780261450953

Logistic Regression F1-Score: 0.9741405232033677
Naive Bayes F1-Score: 0.9419468334636435



In [None]:
SGD = 0.97045

In [3]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


logreg_model = LogisticRegression(solver='lbfgs', max_iter=1_000)
rbf_svc_model = SVC(kernel='rbf', probability=True, max_iter=1_000)
linear_svc_model = SVC(kernel='linear', probability=True, max_iter=1_000)
#ridge_model = RidgeClassifier()

# Create the ensemble classifier
ensemble_classifier = VotingClassifier(estimators=[
    ('logistic_regression', logreg_model),
    ('rbf_support_vector_classifier', rbf_svc_model),
	('linear_support_vector_classifier', linear_svc_model),
	#('ridge_classifier', ridge_model)
], voting='soft')  # You can use 'hard' or 'soft' voting, depending on your preference

# Fit the ensemble model on the training data
ensemble_classifier.fit(X_train, y_train)

# Predict using the ensemble model
ensemble_preds = ensemble_classifier.predict(X_val)

ensemble_accuracy = accuracy_score(y_val, ensemble_preds)
print(f"Ensemble Accuracy: {ensemble_accuracy}")

ensemble_precision = precision_score(y_val, ensemble_preds)
print(f"Ensemble Precision: {ensemble_precision}")

ensemble_recall = recall_score(y_val, ensemble_preds)
print(f"Ensemble Recall: {ensemble_recall}")

ensemble_f1_score = f1_score(y_val, ensemble_preds)
print(f"Ensemble F1-Score: {ensemble_f1_score}")



Ensemble Accuracy: 0.93975
Ensemble Precision: 0.910581222056632
Ensemble Recall: 0.975551342181419
Ensemble F1-Score: 0.9419472948884715


In [4]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define base classifiers
logreg_model = LogisticRegression(solver='lbfgs', max_iter=1_000)
rbf_svc_model = SVC(kernel='rbf', probability=True, max_iter=1_000)
linear_svc_model = SVC(kernel='linear', probability=True, max_iter=1_000)
#ridge_model = RidgeClassifier()

# Define final (meta) classifier
final_logreg_model = LogisticRegression(solver='lbfgs', max_iter=1_000)

# Create the stacking classifier
stacking_classifier = StackingClassifier(estimators=[
    ('logistic_regression', logreg_model),
    ('rbf_support_vector_classifier', rbf_svc_model),
	('linear_support_vector_classifier', linear_svc_model),
], final_estimator=final_logreg_model)

# Fit the stacking classifier
stacking_classifier.fit(X_train, y_train)

# Make predictions with the stacking classifier
stacked_preds = stacking_classifier.predict(X_val)

# Evaluate the stacking classifier
stacked_accuracy = accuracy_score(y_val, stacked_preds)
print(f"Stacked Ensemble Accuracy with Logistic Regression as Final Estimator: {stacked_accuracy}")

stacked_precision = precision_score(y_val, stacked_preds)
print(f"Stacked Precision: {stacked_precision}")

stacked_recall = recall_score(y_val, stacked_preds)
print(f"Stacked Recall: {stacked_recall}")

stacked_f1_score = f1_score(y_val, stacked_preds)
print(f"Stacked F1-Score: {stacked_f1_score}")



Stacked Ensemble Accuracy with Logistic Regression as Final Estimator: 0.9741
Stacked Precision: 0.9774896995276856
Stacked Recall: 0.9706616106177028
Stacked F1-Score: 0.9740636891648308


^^ Bigrams

In [28]:
svc_model = SVC(kernel='rbf')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with RBF Kernel F1-Score: {svc_f1_score}")

SVC with RBF Kernel Accuracy: 0.9717
SVC with RBF Kernel Precision: 0.9755557790966704
SVC with RBF Kernel Recall: 0.9677676878555035
SVC with RBF Kernel F1-Score: 0.9716461276425209


^^ Bigrams

In [5]:
svc_model = SVC(kernel='linear')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with linear kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with linear kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with linear kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with linear kernel F1-Score: {svc_f1_score}")

SVC with linear kernel Accuracy: 0.97065
SVC with linear kernel Precision: 0.9738798472975688
SVC with linear kernel Recall: 0.9673685260952001
SVC with linear kernel F1-Score: 0.970613266583229


SVC (linear) med max_iter=uendelig: 0.97065, 182min

# Trigrams

In [6]:
%reset -f

In [7]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    doc = nlp(text)

    tokens = []
    pos_tokens = []
    for i, token in enumerate(doc):
        tokens.append(token.text)
        pos_tokens.append(token.tag_)

        if i < len(doc) - 1:
            tokens.append(token.text + '_' + doc[i + 1].text)
            pos_tokens.append(token.tag_ + '_' + doc[i + 1].tag_)
        if i < len(doc) - 2:
            tokens.append(token.text + '_' + doc[i + 1].text + '_' + doc[i + 2].text)
            pos_tokens.append(token.tag_ + '_' + doc[i + 1].tag_ + '_' + doc[i + 2].tag_)
    
    return tokens + pos_tokens

dataset = pd.read_csv('dataset.csv')

train_set, temp_set = train_test_split(dataset, test_size=0.2, random_state=42)
test_set, val_set = train_test_split(temp_set, test_size=0.5, random_state=42)

vectorizer = CountVectorizer(max_features=20000, tokenizer=tokenize)
X_train = vectorizer.fit_transform(train_set['text'])
X_test = vectorizer.transform(test_set['text'])
X_val = vectorizer.transform(val_set['text'])
y_train = train_set['humor']
y_test = test_set['humor']
y_val = val_set['humor']



In [8]:
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

logreg_model = LogisticRegression(max_iter=10_000, solver='lbfgs')
logreg_model.fit(X_train, y_train)
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)
sgd_model = SGDClassifier(max_iter=1000, tol=1e-3)
sgd_model.fit(X_train, y_train)
ridge_model = RidgeClassifier()
ridge_model.fit(X_train, y_train)

logreg_preds = logreg_model.predict(X_val)
naive_bayes_preds = naive_bayes_model.predict(X_val)
sgd_preds = sgd_model.predict(X_val)
ridge_preds = ridge_model.predict(X_val)

logreg_accuracy = accuracy_score(y_val, logreg_preds)
naive_bayes_accuracy = accuracy_score(y_val, naive_bayes_preds)
sgd_accuracy = accuracy_score(y_val, sgd_preds)
ridge_accuracy = accuracy_score(y_val, ridge_preds)
print(f"Logistic Regression Accuracy: {logreg_accuracy}")
print(f"Naive Bayes Accuracy: {naive_bayes_accuracy}")
print(f"SGD Classifier Accuracy: {sgd_accuracy}")
print(f"Ridge Classifier Accuracy: {ridge_accuracy}\n")

logreg_precision = precision_score(y_val, logreg_preds)
naive_bayes_precision = precision_score(y_val, naive_bayes_preds)
sgd_precision = precision_score(y_val, sgd_preds)
ridge_precision = precision_score(y_val, ridge_preds)
print(f"Logistic Regression Precision: {logreg_precision}")
print(f"Naive Bayes Precision: {naive_bayes_precision}")
print(f"SGD Classifier Precision: {sgd_precision}")
print(f"Ridge Classifier Precision: {ridge_precision}\n")

logreg_recall = recall_score(y_val, logreg_preds)
naive_bayes_recall = recall_score(y_val, naive_bayes_preds)
sgd_recall = recall_score(y_val, sgd_preds)
ridge_recall = recall_score(y_val, ridge_preds)
print(f"Logistic Regression Recall: {logreg_recall}")
print(f"Naive Bayes Recall: {naive_bayes_recall}")
print(f"SGD Classifier Recall: {sgd_recall}")
print(f"Ridge Classifier Recall: {ridge_recall}\n")

logreg_f1_score = f1_score(y_val, logreg_preds)
naive_bayes_f1_score = f1_score(y_val, naive_bayes_preds)
sgd_f1_score = f1_score(y_val, sgd_preds)
ridge_f1_score = f1_score(y_val, ridge_preds)
print(f"Logistic Regression F1-Score: {logreg_f1_score}")
print(f"Naive Bayes F1-Score: {naive_bayes_f1_score}")
print(f"SGD Classifier F1-Score: {sgd_f1_score}")
print(f"Ridge Classifier F1-Score: {ridge_f1_score}\n")

Logistic Regression Accuracy: 0.97385
Naive Bayes Accuracy: 0.9396
SGD Classifier Accuracy: 0.97155
Ridge Classifier Accuracy: 0.9695

Logistic Regression Precision: 0.9778627490440732
Naive Bayes Precision: 0.9245592060892186
SGD Classifier Precision: 0.9723166100339796
Ridge Classifier Precision: 0.9778612775464609

Logistic Regression Recall: 0.9697634966570202
Naive Bayes Recall: 0.9575890629677677
SGD Classifier Recall: 0.9708611914978545
Ridge Classifier Recall: 0.9608821474902705

Logistic Regression F1-Score: 0.9737962823788767
Naive Bayes F1-Score: 0.9407843137254901
SGD Classifier F1-Score: 0.9715883557197784
Ridge Classifier F1-Score: 0.9692973625931147



In [9]:
svc_model = SVC(kernel='rbf')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with RBF Kernel F1-Score: {svc_f1_score}")

SVC with RBF Kernel Accuracy: 0.9718
SVC with RBF Kernel Precision: 0.9751783740327605
SVC with RBF Kernel Recall: 0.9683664304959585
SVC with RBF Kernel F1-Score: 0.9717604646505108


In [10]:
svc_model = SVC(kernel='linear')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with linear kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with linear kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with linear kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with linear kernel F1-Score: {svc_f1_score}")

SVC with linear kernel Accuracy: 0.9683
SVC with linear kernel Precision: 0.9716611395839614
SVC with linear kernel Recall: 0.964873765093304
SVC with linear kernel F1-Score: 0.9682555577808931


# Hypernyms

In [11]:
%reset -f

In [5]:
from nltk.corpus import wordnet as wn

def get_hypernyms(word):
    """Retrieve up to three levels of all possible hypernyms for a noun."""
    synsets = wn.synsets(word, pos=wn.NOUN)
    hypernyms = []
    
    if synsets:
        first_level_hypernyms = synsets[0].hypernyms()
        for hyper in first_level_hypernyms:
            hypernyms.append(hyper.lemma_names()[0])
            
            second_level_hypernyms = hyper.hypernyms()
            for higher_hyper in second_level_hypernyms:
                hypernyms.append(higher_hyper.lemma_names()[0])
                
                third_level_hypernyms = higher_hyper.hypernyms()
                for highest_hyper in third_level_hypernyms:
                    hypernyms.append(highest_hyper.lemma_names()[0])
                    
    return hypernyms

In [6]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    doc = nlp(text)
    
    tokens = []
    pos_tokens = []
    for i, token in enumerate(doc):
        tokens.append(token.text)
        pos_tokens.append(token.tag_)
        
        if token.pos_ == 'NOUN':
            tokens.extend(get_hypernyms(token.text))
            
        if i < len(doc) - 1:
            tokens.append(token.text + '_' + doc[i + 1].text)
            pos_tokens.append(token.tag_ + '_' + doc[i + 1].tag_)
    
    return tokens + pos_tokens

# Load the dataset
dataset = pd.read_csv('dataset.csv')

train_set, temp_set = train_test_split(dataset, test_size=0.2, random_state=42)
test_set, val_set = train_test_split(temp_set, test_size=0.5, random_state=42)

vectorizer = CountVectorizer(max_features=10_000, tokenizer=tokenize)
X_train = vectorizer.fit_transform(train_set['text'])
X_test = vectorizer.transform(test_set['text'])
X_val = vectorizer.transform(val_set['text'])
y_train = train_set['humor']
y_test = test_set['humor']
y_val = val_set['humor']



In [14]:
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

logreg_model = LogisticRegression(max_iter=10_000, solver='lbfgs')
logreg_model.fit(X_train, y_train)
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)

logreg_preds = logreg_model.predict(X_val)
naive_bayes_preds = naive_bayes_model.predict(X_val)

logreg_accuracy = accuracy_score(y_val, logreg_preds)
naive_bayes_accuracy = accuracy_score(y_val, naive_bayes_preds)
print(f"Logistic Regression Accuracy: {logreg_accuracy}")
print(f"Naive Bayes Accuracy: {naive_bayes_accuracy}\n")

logreg_precision = precision_score(y_val, logreg_preds)
naive_bayes_precision = precision_score(y_val, naive_bayes_preds)
print(f"Logistic Regression Precision: {logreg_precision}")
print(f"Naive Bayes Precision: {naive_bayes_precision}\n")

logreg_recall = recall_score(y_val, logreg_preds)
naive_bayes_recall = recall_score(y_val, naive_bayes_preds)
print(f"Logistic Regression Recall: {logreg_recall}")
print(f"Naive Bayes Recall: {naive_bayes_recall}\n")

logreg_f1_score = f1_score(y_val, logreg_preds)
naive_bayes_f1_score = f1_score(y_val, naive_bayes_preds)
print(f"Logistic Regression F1-Score: {logreg_f1_score}")
print(f"Naive Bayes F1-Score: {naive_bayes_f1_score}\n")

Logistic Regression Accuracy: 0.97375
Naive Bayes Accuracy: 0.9438
SGD Classifier Accuracy: 0.97245
Ridge Classifier Accuracy: 0.96975

Logistic Regression Precision: 0.9772818657016485
Naive Bayes Precision: 0.9276992596865686
SGD Classifier Precision: 0.9748295226634577
Ridge Classifier Precision: 0.9785525513315715

Logistic Regression Recall: 0.9701626584173236
Naive Bayes Recall: 0.9628779562917873
SGD Classifier Recall: 0.9700628679772478
Ridge Classifier Recall: 0.9606825666101187

Logistic Regression F1-Score: 0.9737092493364715
Naive Bayes F1-Score: 0.9449613162275977
SGD Classifier F1-Score: 0.9724403541239435
Ridge Classifier F1-Score: 0.9695352233244373



In [15]:
svc_model = SVC(kernel='rbf')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with RBF Kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with RBF Kernel F1-Score: {svc_f1_score}")

SVC with RBF Kernel Accuracy: 0.9727
SVC with RBF Kernel Precision: 0.976082805748166
SVC with RBF Kernel Recall: 0.969264544456641
SVC with RBF Kernel F1-Score: 0.9726617264169839


In [16]:
svc_model = SVC(kernel='linear')
svc_model.fit(X_train, y_train)

svc_preds = svc_model.predict(X_val)

svc_accuracy = accuracy_score(y_val, svc_preds)
print(f"SVC with linear kernel Accuracy: {svc_accuracy}")

svc_precision = precision_score(y_val, svc_preds)
print(f"SVC with linear kernel Precision: {svc_precision}")

svc_recall = recall_score(y_val, svc_preds)
print(f"SVC with linear kernel Recall: {svc_recall}")

svc_f1_score = f1_score(y_val, svc_preds)
print(f"SVC with linear kernel F1-Score: {svc_f1_score}")

SVC with linear kernel Accuracy: 0.96905
SVC with linear kernel Precision: 0.9711365003006615
SVC with linear kernel Recall: 0.9669693643348967
SVC with linear kernel F1-Score: 0.9690484524226212


In [7]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define base classifiers
logreg_model = LogisticRegression(solver='lbfgs', max_iter=1_000)
rbf_svc_model = SVC(kernel='rbf', probability=True, max_iter=1_000)
linear_svc_model = SVC(kernel='linear', probability=True, max_iter=1_000)

# Define final (meta) classifier
final_logreg_model = LogisticRegression(solver='lbfgs', max_iter=1_000)

# Create the stacking classifier
stacking_classifier = StackingClassifier(estimators=[
    ('logistic_regression', logreg_model),
    ('rbf_support_vector_classifier', rbf_svc_model),
    ('linear_support_vector_classifier', linear_svc_model)
], final_estimator=final_logreg_model)

# Fit the stacking classifier
stacking_classifier.fit(X_train, y_train)

# Make predictions with the stacking classifier
stacked_preds = stacking_classifier.predict(X_val)

# Evaluate the stacking classifier
stacked_accuracy = accuracy_score(y_val, stacked_preds)
print(f"Stacked Ensemble Accuracy: {stacked_accuracy}")

stacked_precision = precision_score(y_val, stacked_preds)
print(f"Stacked Precision: {stacked_precision}")

stacked_recall = recall_score(y_val, stacked_preds)
print(f"Stacked Recall: {stacked_recall}")

stacked_f1_score = f1_score(y_val, stacked_preds)
print(f"Stacked F1-Score: {stacked_f1_score}")



Stacked Ensemble Accuracy: 0.97245
Stacked Precision: 0.9744488977955912
Stacked Recall: 0.9704620297375511
Stacked F1-Score: 0.9724513774311284
