# Humor Detection

In [33]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.corpus import wordnet
from nltk.tokenize import PunktSentenceTokenizer # unsupervised machine learning sentence tokenizer , can be trained, but we use the default
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Importing dataset
dataset = pd.read_csv('files/train.csv', nrows=1000)
print(dataset.shape)
print(dataset.head())


(1000, 6)
   id                                               text  is_humor  \
0   1  TENNESSEE: We're the best state. Nobody even c...         1   
1   2  A man inserted an advertisement in the classif...         1   
2   3  How many men does it take to open a can of bee...         1   
3   4  Told my mom I hit 1200 Twitter followers. She ...         1   
4   5  Roses are dead. Love is fake. Weddings are bas...         1   

   humor_rating  humor_controversy  offense_rating  
0          2.42                1.0             0.2  
1          2.50                1.0             1.1  
2          1.95                0.0             2.4  
3          2.11                1.0             0.0  
4          2.78                0.0             0.1  


## Exploratory Data Analysis

In [32]:

# selecting important columns only
# dropping 'offense_rating'
dataset = dataset[['id', 'text', 'is_humor', 'humor_rating', 'humor_controversy']]
print(dataset.shape)
print(dataset.head())

print("\n----- Describe ------")
print(dataset.describe())

(1000, 5)
   id                                               text  is_humor  \
0   1  TENNESSEE: We're the best state. Nobody even c...         1   
1   2  A man inserted an advertisement in the classif...         1   
2   3  How many men does it take to open a can of bee...         1   
3   4  Told my mom I hit 1200 Twitter followers. She ...         1   
4   5  Roses are dead. Love is fake. Weddings are bas...         1   

   humor_rating  humor_controversy  
0          2.42                1.0  
1          2.50                1.0  
2          1.95                0.0  
3          2.11                1.0  
4          2.78                0.0  

----- Describe ------
                id     is_humor  humor_rating  humor_controversy
count  1000.000000  1000.000000    616.000000         616.000000
mean    500.500000     0.616000      2.263393           0.517857
std     288.819436     0.486601      0.571492           0.500087
min       1.000000     0.000000      0.100000           0.000000

## Data preprocessing

In [30]:


data_list =[]
data_list2 =[]

raw_text =""
stop_words = tuple(set(stopwords.words("english")))

def lemmatization(token_sentence):
    res = []
    for word in token_sentence:
        lemmatizer.lemmatize(word)
        res.append(word)
    return res


def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


# Simple test
def input_test(classifier):
    sample_text = input("Enter joke: ")
    sample_text = re.sub('[^a-zA-Z]', ' ', sample_text).lower().split()
    sample_text = ' '.join([lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in sample_text])
    X = vectorizer.transform([sample_text]).toarray()

    print(X.shape)
    print(X)

    if(classifier.predict(X) == [1]):
        print("It's a joke! (+)")
    else:
        print("It's not a joke! (-)")


# range = dataset range
for i in range(0,1000):
    
   humor_sentence = dataset['text'][i]

   #Tokenization by words or sentences
   humor_sentence = word_tokenize(humor_sentence)
   #remove stopwords and lemmatization with pos_tag
   humor_sentence = " ".join([lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in humor_sentence if word not in set(stopwords.words('english'))])  
  
      
   # remove alpha chars/not alphabetical caracters 
   humor_sentence2 = re.sub('[^a-zA-Z]', ' ', dataset['text'][i])
   
   humor_sentence2 = humor_sentence2.lower()
   humor_sentence2 = word_tokenize(humor_sentence2)
   humor_sentence2 = " ".join([lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in humor_sentence2 if word not in set(stopwords.words('english'))])  
      
   #raw_text += aux_reader+ "\n"
   
    
   # aggregate all the rows of the dataset in one corpus ('data_list')
   data_list.append(humor_sentence)
    
    # how data_list2 is different from data_list?
   data_list2.append(humor_sentence2) 


## Extract features from data

In [29]:

# Create bag-of-words model
from sklearn.feature_extraction.text import CountVectorizer


vectorizer = CountVectorizer(max_features = 8000)
X = vectorizer.fit_transform(data_list).toarray() # no. of features per phrase (phrase=X[row])
y = dataset.iloc[:,2].values


print(vectorizer.get_feature_names())
print(X.shape, y.shape)

['000', '10', '100', '1000', '100ft', '106', '11th', '12', '1200', '13', '1313', '14', '14gm', '15', '150', '16', '17', '1759', '18', '1800', '1861', '1872', '19', '1930s', '1940', '1945', '1948', '1950s', '1964', '1984', '1995', '1998', '1st', '20', '200', '2000', '2001', '2003', '2013', '2014', '2017', '2018', '2019', '2020', '2052', '21', '2125', '22', '23', '24', '242', '25', '26', '28', '29', '30', '300k', '30pm', '31', '33', '35', '3629', '365', '37', '37th', '3d', '3rd', '40', '40th', '429', '43', '45', '48', '50', '500', '52', '57', '5sos', '60', '63', '666', '6x', '70', '72', '75', '80', '85', '90999', '911', '99', '999', 'aaaaafddasfrwe', 'abc', 'abdul', 'ability', 'able', 'aboard', 'about', 'abraham', 'abroad', 'absent', 'absolutely', 'absorption', 'abulia', 'abuse', 'abuzz', 'academy', 'accent', 'accept', 'acceptance', 'access', 'accessible', 'accident', 'accidentally', 'accord', 'account', 'accountable', 'accuse', 'accuses', 'achievement', 'ackles', 'acne', 'acquire', 'acr

## Split dataset into training and test sets

In [23]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(800, 3973) (800,)
(200, 3973) (200,)


## Generate metrics

In [None]:

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def print_metrics(y_test, y_pred):
    # confusion matrix
    print(confusion_matrix(y_test, y_pred))

    # accuracy
    print('Accuracy: ', accuracy_score(y_test, y_pred))

    # precision
    print('Precision: ', precision_score(y_test, y_pred))

    # recall
    print('Recall: ', recall_score(y_test, y_pred))

    # f1
    print('F1: ', f1_score(y_test, y_pred))

## Text Classification

In [44]:

# Fit Naive Bayes to the training set

from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict test set results
y_pred = classifier.predict(X_test)

print(y_pred)
print_metrics(y_test, y_pred)


[1 0 0 1 1 0 1 0 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 1 1 0 0 1 1 0 0 1 1 0 1 0
 1 0 1 1 1 0 0 1 1 0 1 0 1 0 1 0 1 1 1 0 1 0 1 0 0 1 0 1 1 1 0 0 1 0 1 0 1
 1 1 0 1 1 1 1 1 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1 0 0 1
 0 1 1 0 0 1 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 1 1 1 0 0 1 0
 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 1 0 1 1 0 1 1 1 0 0
 1 1 0 0 0 1 0 1 0 1 0 0 0 0 1]
[[58 22]
 [40 80]]
Accuracy:  0.69
Precision:  0.7843137254901961
Recall:  0.6666666666666666
F1:  0.7207207207207207


In [6]:
input_test(classifier)


Enter joke: Hi
(1, 8000)
[[0 0 0 ... 0 0 0]]
It's not a joke!


In [39]:
# SVM

from sklearn.svm import SVC

classifier = SVC()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print_metrics(y_test, y_pred)

[[ 26  54]
 [  3 117]]
Accuracy:  0.715
Precision:  0.6842105263157895
Recall:  0.975
F1:  0.8041237113402062


In [25]:
input_test(classifier)

Enter joke: I once toasted the bride and groom at a Pakistani wedding. All I did was push the button on the drone
(1, 3973)
[[0 0 0 ... 0 0 0]]
It's a joke!


In [17]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print_metrics(y_test, y_pred)

[[ 66  53]
 [ 18 163]]
Accuracy:  0.7633333333333333
Precision:  0.7546296296296297
Recall:  0.9005524861878453
F1:  0.8211586901763224


In [18]:
input_test(classifier)

Enter joke: I once toasted the bride and groom at a Pakistani wedding. All I did was push the button on the drone
(1, 3973)
[[0 0 0 ... 0 0 0]]
It's a joke!


In [19]:
from sklearn.neural_network import MLPClassifier

classifier = MLPClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print_metrics(y_test, y_pred)

[[ 79  40]
 [ 33 148]]
Accuracy:  0.7566666666666667
Precision:  0.7872340425531915
Recall:  0.8176795580110497
F1:  0.8021680216802167


In [20]:
input_test(classifier)

Enter joke: I once toasted the bride and groom at a Pakistani wedding. All I did was push the button on the drone
(1, 3973)
[[0 0 0 ... 0 0 0]]
It's a joke!


In [42]:
from sklearn.neural_network import BernoulliRBM
lassifier = BernoulliRBM()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print_metrics(y_test, y_pred)

[[ 26  54]
 [  3 117]]
Accuracy:  0.715
Precision:  0.6842105263157895
Recall:  0.975
F1:  0.8041237113402062


In [31]:
input_test(classifier)

Enter joke: I once toasted the bride and groom at a Pakistani wedding. All I did was push the button on the drone
(1, 3973)
[[0 0 0 ... 0 0 0]]
It's a joke! (+)
