In [874]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
import os
import codecs
import html2text
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from textblob import TextBlob
from nltk.corpus import stopwords
from wordcloud import WordCloud
import seaborn
from nltk import word_tokenize, pos_tag
import ast
from operator import itemgetter
from gensim.models import LdaModel
from scipy.sparse.linalg import svds
from gensim.models.phrases import Phrases, Phraser
from gensim.corpora import Dictionary
from gensim.models.nmf import Nmf
from gensim.models.coherencemodel import CoherenceModel
from sklearn.decomposition import NMF


In [744]:
dataset1 = pd.read_excel("jester-data-1.xls")
dataset2 = pd.read_excel("jester-data-2.xls")
dataset3 = pd.read_excel("jester-data-3.xls")

def insert_return(frame):
    ret_lst = []
    for index,row in frame.iterrows():
        ret_lst.append(list(row))
    
    return ret_lst

def combine_dataframe(frame1, frame2, frame3):
    joke_lst = ["Number of jokes rated"]
    for i in range(100):
        joke_lst.append(f"joke-{i}")

    rating_lst = []

    rating_lst.extend(insert_return(frame1))
    rating_lst.extend(insert_return(frame2))
    rating_lst.extend(insert_return(frame3))

    return pd.DataFrame(data=rating_lst,columns=joke_lst)

In [745]:
ratings = combine_dataframe(dataset1, dataset2,dataset3)

In [746]:
ratings.replace(99.0, np.nan, inplace=True)
ratings.head()


Unnamed: 0,Number of jokes rated,joke-0,joke-1,joke-2,joke-3,joke-4,joke-5,joke-6,joke-7,joke-8,...,joke-90,joke-91,joke-92,joke-93,joke-94,joke-95,joke-96,joke-97,joke-98,joke-99
0,100.0,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
1,49.0,,,,,9.03,9.27,9.03,9.27,,...,,,,9.08,,,,,,
2,48.0,,8.35,,,1.8,8.16,-2.82,6.21,,...,,,,0.53,,,,,,
3,91.0,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6
4,100.0,-6.17,-3.54,0.44,-8.5,-7.09,-4.32,-8.69,-0.87,-6.65,...,-3.54,-6.89,-0.68,-2.96,-2.18,-3.35,0.05,-9.08,-5.05,-3.45


In [747]:
user_item_matrix = ratings.drop(columns=["Number of jokes rated"]).to_numpy()

user_item_matrix = np.nan_to_num(user_item_matrix)

mask_matrix = np.where(user_item_matrix != 0, 1, 0)

U, S, Vt = svds(user_item_matrix, k=50)

predicted_matrix = np.dot(np.dot(U, np.diag(S)), Vt)

predicted_matrix = np.clip(predicted_matrix, -10, 10)

mse = np.sum((predicted_matrix - user_item_matrix)**2) / np.sum(mask_matrix)

print(f'Mean Squared Error: {mse}')

Mean Squared Error: 5.298360058720308


In [748]:
CF_matrix = predicted_matrix

In [749]:
jokes_dataframe = pd.DataFrame()

In [750]:
def load_clean_joke():

    ret_jokes = []

    for i in range(1,101):
        file = 'init'+str(i)+'.html'
        data = codecs.open('jokes/' + file, 'r', encoding="cp1252")
        joke_html = data.read()

        # Extracting joke
        joke = html2text.html2text(joke_html)
        # Extracting joke_id
        joke_id = int(file.split('init')[1].split('.html')[0])
        cleaned_string = re.sub(r'[\|]+|[-]+', '', joke)

        ret_jokes.append(cleaned_string.strip())
    
    return ret_jokes


In [751]:
Jokes = load_clean_joke()

In [890]:
Jokes

['A man visits the doctor. The doctor says "I have bad news for you.You have\ncancer and Alzheimer\'s disease".\n\nThe man replies "Well,thank God I don\'t have cancer!"',
 'This couple had an excellent relationship going until one day he came home\nfrom work to find his girlfriend packing. He asked her why she was leaving him\nand she told him that she had heard awful things about him.\n\n"What could they possibly have said to make you move out?"\n\n"They told me that you were a pedophile."\n\nHe replied, "That\'s an awfully big word for a ten year old."',
 "Q. What's 200 feet long and has 4 teeth?\n\nA. The front row at a Willie Nelson Concert.",
 "Q. What's the difference between a man and a toilet?\n\nA. A toilet doesn't follow you around after you use it.",
 "Q. What's O. J. Simpson's Internet address?\n\nA. Slash, slash, backslash, slash, slash, escape.",
 "Bill & Hillary are on a trip back to Arkansas. They're almost out of gas, so\nBill pulls into a service station on the outsk

In [752]:
jokes_dataframe["Original Jokes"] = Jokes

In [753]:
from nltk.corpus import stopwords

def remove_tags_puntuatuions_tags(Joke):
    special_char_patterns = r'[^a-zA-Z0-9\s]'
    urlPatterns = r'http\S+|www\S+'

    text = Joke

    text = re.sub(special_char_patterns, ' ', text)
    text = re.sub(urlPatterns, ' ', text)
    text = re.sub(r'\n', ' ', text)

    return text

def remove_stop_words_Tokenization(Joke):
    
    stopwords_ = set(stopwords.words('english'))
    Tokenize_Joke = []
    for word in Joke.split():
        if word not in stopwords_:
            Tokenize_Joke.append(word)

    return Tokenize_Joke

def lower(Joke):
    ret_lst = [word.lower() for word in Joke]
    return ret_lst

def preprocess_clean_jokes(uncleaned_Jokes):

    Cleaned_Jokes = []

    for Joke in uncleaned_Jokes:
        Joke = remove_tags_puntuatuions_tags(Joke)
        Joke = TextBlob(Joke)
        Joke = remove_stop_words_Tokenization(Joke)

        Cleaned_Jokes.append(lower(Joke))

    return Cleaned_Jokes



In [754]:
preprocessed_jokes = preprocess_clean_jokes(Jokes)

In [755]:
jokes_dataframe["PreProcessed Jokes"] = preprocessed_jokes

In [756]:
from nltk.stem import PorterStemmer

def stem_tokens(tokenized_text):
    stemmer = PorterStemmer()
    stemmed_tokens = []
    for preprocessed_joke in tokenized_text:
        stemmed_tokens.append([stemmer.stem(token) for token in preprocessed_joke])
    
    return stemmed_tokens

In [757]:
import nltk
from nltk.stem import WordNetLemmatizer

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
        
    lemmatized_text = " ".join([lemmatizer.lemmatize(word) for word in text])
    
    return lemmatized_text

In [758]:
lemmitized_jokes = [lemmatize_text(text) for text in preprocessed_jokes]

In [759]:
jokes_dataframe["lemmitized_jokes"] = lemmitized_jokes 

In [760]:
from sklearn.feature_extraction.text import TfidfVectorizer

def feature_Extraction_TF_IDF(corpus):
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    feature_names = tfidf_vectorizer.get_feature_names_out()
    tfidf_array = tfidf_matrix.toarray()

    return tfidf_array


In [761]:
TF_IDF_lemma = feature_Extraction_TF_IDF(lemmitized_jokes)

In [762]:
jokes_dataframe["TF_IDF_Lemma"] = [i for i in TF_IDF_lemma] 

In [763]:
cleaned_list = list(jokes_dataframe["PreProcessed Jokes"])

merged_lst = []

for joke in cleaned_list:
    merged_lst.append(" ".join(joke))



In [764]:
jokes_dataframe["merged cleaned"] = merged_lst

In [765]:
jokes_dataframe["TF_IDF_Lemma"][0]

array([0., 0., 0., ..., 0., 0., 0.])

In [766]:
import nltk

def posTagging(text):
    return nltk.pos_tag(text, tagset='universal')

jokes_dataframe['pos_tags'] = jokes_dataframe['PreProcessed Jokes'].apply(lambda x: posTagging(x))
jokes_dataframe.head(2)

Unnamed: 0,Original Jokes,PreProcessed Jokes,lemmitized_jokes,TF_IDF_Lemma,merged cleaned,pos_tags
0,"A man visits the doctor. The doctor says ""I ha...","[a, man, visits, doctor, the, doctor, says, i,...",a man visit doctor the doctor say i bad news y...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",a man visits doctor the doctor says i bad news...,"[(a, DET), (man, NOUN), (visits, VERB), (docto..."
1,This couple had an excellent relationship goin...,"[this, couple, excellent, relationship, going,...",this couple excellent relationship going one d...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",this couple excellent relationship going one d...,"[(this, DET), (couple, ADJ), (excellent, NOUN)..."


In [767]:
pd.options.mode.chained_assignment = None

In [768]:
jokes_dataframe["pos_tags"][0]

[('a', 'DET'),
 ('man', 'NOUN'),
 ('visits', 'VERB'),
 ('doctor', 'VERB'),
 ('the', 'DET'),
 ('doctor', 'NOUN'),
 ('says', 'VERB'),
 ('i', 'NOUN'),
 ('bad', 'ADJ'),
 ('news', 'NOUN'),
 ('you', 'PRON'),
 ('cancer', 'NOUN'),
 ('alzheimer', 'VERB'),
 ('disease', 'ADP'),
 ('the', 'DET'),
 ('man', 'NOUN'),
 ('replies', 'VERB'),
 ('well', 'ADV'),
 ('thank', 'ADJ'),
 ('god', 'NOUN'),
 ('i', 'NOUN'),
 ('cancer', 'NOUN')]

In [769]:
jokes_dataframe['ADJ'] = pd.Series(dtype=str)
jokes_dataframe['ADP'] = pd.Series(dtype=str)
jokes_dataframe['ADV'] = pd.Series(dtype=str)
jokes_dataframe['CONJ'] = pd.Series(dtype=str)
jokes_dataframe['DET'] = pd.Series(dtype=str)
jokes_dataframe['NOUN'] = pd.Series(dtype=str)
jokes_dataframe['NUM'] = pd.Series(dtype=str)
jokes_dataframe['PRT'] = pd.Series(dtype=str)
jokes_dataframe['PRON'] = pd.Series(dtype=str)
jokes_dataframe['PRT'] = pd.Series(dtype=str)
jokes_dataframe['PRON'] = pd.Series(dtype=str)
jokes_dataframe['VERB'] = pd.Series(dtype=str)
jokes_dataframe['PUNC'] = pd.Series(dtype=str)
jokes_dataframe['OTHERS'] = pd.Series(dtype=str)

In [770]:
def aggregate_tags(col_tags, tag_columns = {
    'ADJ': 'ADJ',
    'ADP': 'ADP',
    'ADV': 'ADV',
    'CONJ': 'CONJ',
    'DET': 'DET',
    'NOUN': 'NOUN',
    'NUM': 'NUM',
    'PRT': 'PRT',
    'PRON': 'PRON',
    'VERB': 'VERB',
    '.': '.',
    'X': 'X'}):
    aggregated_tags = {col: [] for col in tag_columns.values()}
    
    ret_frame = pd.DataFrame(columns=aggregated_tags)

    for i,joke_tags in enumerate(col_tags):
        temp_dict = {val:[] for val in aggregated_tags.keys()}
        for tag in joke_tags:
            temp_dict[tag[1]].append(tag[0])

        ret_frame = pd.concat([ret_frame, pd.DataFrame([temp_dict])], ignore_index=True)

    return ret_frame

aggregate_tags_frame = aggregate_tags(jokes_dataframe["pos_tags"])


In [771]:
aggregate_tags_frame

Unnamed: 0,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRT,PRON,VERB,.,X
0,"[bad, thank]",[disease],[well],[],"[a, the, the]","[man, doctor, i, news, cancer, man, god, i, ca...",[],[],[you],"[visits, doctor, says, alzheimer, replies]",[],[]
1,"[couple, girlfriend, awful, big, old]",[that],"[told, possibly, awfully]",[],[this],"[excellent, relationship, day, home, work, pac...","[one, ten]",[],"[he, what, they, he]","[going, came, find, asked, leaving, heard, cou...",[],[]
2,"[long, nelson]","[teeth, willie]",[],[],"[a, the]","[feet, front, row, concert]","[200, 4]",[],[what],[q],[],[]
3,[],[around],[],[],"[a, a]","[difference, man, toilet, follow, use]",[],[],[what],"[q, toilet]",[],[]
4,[slash],[o],[],[],[a],"[j, simpson, internet, address, slash, backsla...",[],[],[what],[q],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...
95,[shrugged],"[eat, told]","[diner, then, quite]",[],"[the, the]","[attorneys, drinks, sandwiches, briefcases, ow...","[two, two]",[],[you],"[went, ordered, produced, started, became, con...",[],[]
96,"[different, positive, negative, negative, posi...","[in, in]","[differently, together]",[but],[a],"[teacher, class, languages, negatives, languag...","[two, two, one]",[],[she],"[explaining, use, says, followed, followed, ma...",[],[]
97,"[hot, exotic, beautiful, free]","[between, like, between, like, between, like, ...","[fully, breathtakingly, still]",[],[],"[age, ages, africa, virgin, ages, asia, ages, ...","[1, 13, 18, 2, 19, 35, 3, 36, 45, 4, 46, 56, 5...",[],"[she, she, she, she]","[womanhood, unexplored, explored, exhausted, p...",[],[]
98,[],[on],[],[],"[a, a]","[bus, station, bus, train, station, train, sto...",[],[],[],[stops],[],[]


pos tagging: https://212digital.medium.com/an-introduction-to-part-of-speech-tagging-what-it-is-and-how-you-can-use-it-in-natural-language-9723f4696f78

sentiment analysis — By identifying words with positive or negative connotations, POS tagging can be used to calculate the overall sentiment of a piece of text.

topic identification — By looking at which words are most commonly used together, POS tagging can help automatically identify the main topics of a document.

In [814]:
def filterByPOS(tags):
    try:
        tags = ast.literal_eval(tags)
    except ValueError as e:
        print("Error during literal_eval:", e)
        return None
    
    
    txt = []
    
    for word, pos in tags:
        if pos in ['ADJ', 'NOUN'] and len(word) > 1:
            txt.append(word)
            
    return ' '.join(txt)

helper = [filterByPOS(str(tags)) for tags in jokes_dataframe["pos_tags"]]

Applying topic modelling


In [887]:
from gensim.models.phrases import Phrases, Phraser
from gensim.corpora import Dictionary

docs = [d.split() for d in helper if d]

bigram_model = Phrases(docs, min_count=5, threshold=15)
trigram_model = Phrases(bigram_model[docs], min_count=5, threshold=15)

docs_with_ngrams = trigram_model[bigram_model[docs]]

dictionary = Dictionary(docs_with_ngrams)
corpus = [dictionary.doc2bow(doc) for doc in docs_with_ngrams]


In [888]:
temp = dictionary[0]
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=500,
    alpha='auto',
    eta='auto',
    iterations=700,
    num_topics=10,
    passes=15,
    eval_every=None
    ,random_state=42
)

In [889]:
top_topics = model.top_topics(corpus)
top_topics

[([(0.018022142, 'difference'),
   (0.013626469, 'polish'),
   (0.013626469, 'builder'),
   (0.013626469, 'beer'),
   (0.013626468, 'guy'),
   (0.013626464, 'station'),
   (0.00923078, 'stone'),
   (0.00923078, 'hands'),
   (0.00923078, 'bare'),
   (0.00923078, 'look'),
   (0.00923078, 'road'),
   (0.00923078, 'macgregor'),
   (0.0092307795, 'lad'),
   (0.0092307795, 'pier'),
   (0.0092307795, 'sips'),
   (0.0092307795, 'wall'),
   (0.0092307795, 'partner'),
   (0.0092307795, 'hell'),
   (0.0092307795, 'call'),
   (0.009230778, 'shot')],
  -13.10242067488885),
 ([(0.025902715, 'woman'),
   (0.021656344, 'man'),
   (0.017409975, 'car'),
   (0.013163602, 'bear'),
   (0.013163601, 'bmw'),
   (0.013163601, 'door'),
   (0.013163599, 'lawyer'),
   (0.0089172255, 'bartender'),
   (0.008917225, 'hurt'),
   (0.008917225, 'officer'),
   (0.008917225, 'arm'),
   (0.008917225, 'look'),
   (0.008917224, 'panda'),
   (0.008917219, 'change'),
   (0.008917216, 'many'),
   (0.0046708216, 'orders'),
   

This above code extracts the top topics from the trained LDA model using the top_topics method. Each topic is 
represented as a list of tuples, where each tuple contains a word and its associated weight in the topic.

In [826]:
topic_proportions_matrix = np.zeros((len(corpus), 10)) 

for i, doc_bow in enumerate(corpus):
    topic_distribution = model[doc_bow]
    
    for topic, proportion in topic_distribution:
        topic_proportions_matrix[i, topic] = proportion

In [827]:
topic_word_dists = model.get_topics()
doc_topic_dists = model.get_document_topics(corpus)

most_dominant_topics = [max(doc, key=lambda x: x[1])[0] for doc in doc_topic_dists]


Features extraction for applying algorithm

In [881]:
from string import punctuation
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def text_analysis(text):

    blob = TextBlob(text)

    sentiment_polarity = blob.sentiment.polarity
    sentiment_subjectivity = blob.sentiment.subjectivity

    sid = SentimentIntensityAnalyzer()

    sentiment_scores = sid.polarity_scores(text)

    text_length = len(text)

    words = word_tokenize(text)

    punctuation_count = sum(1 for char in words if char in punctuation)

    stop_words = set(stopwords.words('english'))
    words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]

    word_count = len(words)

    unique_words = len(set(words))

    pos_tags = pos_tag(words)

    verb_count = sum(1 for word, pos in pos_tags if pos.startswith('VB'))
    noun_count = sum(1 for word, pos in pos_tags if pos.startswith('NN'))

    return {
        'text_length': text_length,
        'punctuation_count': punctuation_count,
        'word_count': word_count,
        'unique_words': unique_words,
        'verb_count': verb_count,
        'noun_count': noun_count,
        'joke_sentiment_neg': sentiment_scores['neg'],
        'joke_sentiment_neu': sentiment_scores['neu'],
        'joke_sentiment_pos': sentiment_scores['pos']
        }

def extract_Featrues(raw_jokes, topic_proportion, num_topics):

    columns=['text_length', 'punctuation_count', 'word_count',
                                      'unique_words','verb_count','noun_count', 'joke_sentiment_neg', 
                                       'joke_sentiment_neu', 'joke_sentiment_pos'
                                    ]

    lst = [f'topic-{i+1}' for i in range(num_topics)]

    columns.extend(lst)
    ret_frame = pd.DataFrame(columns=columns)
    

    for id,joke in enumerate(raw_jokes):
        joke = text_analysis(joke)

        for topic,proportion in zip(lst, topic_proportion[id]):
            joke[topic] = proportion

        ret_frame = pd.concat([ret_frame, pd.DataFrame([joke])], ignore_index=True)

    return ret_frame

In [882]:
features_jokes = extract_Featrues(jokes_dataframe["Original Jokes"], topic_proportions_matrix, 10)

In [884]:
features_jokes['joke_id'] = [i for i in range(1,101)]

In [885]:
features_jokes

Unnamed: 0,text_length,punctuation_count,word_count,unique_words,verb_count,noun_count,joke_sentiment_neg,joke_sentiment_neu,joke_sentiment_pos,topic-1,topic-2,topic-3,topic-4,topic-5,topic-6,topic-7,topic-8,topic-9,topic-10,joke_id
0,162,4,16,13,3,10,0.246,0.691,0.063,0.963938,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,1
1,378,6,32,31,9,12,0.041,0.909,0.050,0.000000,0.978481,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,2
2,86,4,9,9,0,4,0.000,1.000,0.000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.944745,3
3,109,4,8,7,1,6,0.000,1.000,0.000,0.937186,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,4
4,94,9,10,7,0,10,0.440,0.471,0.089,0.000000,0.000000,0.000000,0.962037,0.0,0.0,0.000000,0.0,0.000000,0.000000,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,334,8,26,21,10,10,0.000,1.000,0.000,0.967408,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,96
96,421,13,39,25,11,13,0.151,0.654,0.195,0.000000,0.984921,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,97
97,545,14,29,22,4,13,0.017,0.859,0.124,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.981878,0.000000,98
98,118,3,10,5,2,8,0.167,0.833,0.000,0.000000,0.000000,0.965849,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,99


In [871]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features_jokes.drop(columns=["joke_id"]))

In [873]:
features_scaled.shape

(100, 19)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def transform_joke(joke):
    preprocessed_joke = preprocess_clean_jokes(joke)
    tokenized_joke = preprocessed_joke.split()

    ngram_joke = trigram_model[bigram_model[tokenized_joke]]

    joke_bow = dictionary.doc2bow(ngram_joke)

    return joke_bow

def find_closest_vectors(target_vector, list_of_vectors, top_n=10):
    target_vector = target_vector.reshape(1, -1)

    similarities = cosine_similarity(target_vector, list_of_vectors)

    top_indices = np.argsort(similarities[0])[-top_n:][::-1]

    return top_indices

def testing(test_jokes, train_jokes_frame, train_scaler, topic_model, num_topics, user_item_imputed):
    preprocess_test = preprocess_clean_jokes(test_jokes)

    testing_ratings = []

    for joke in preprocess_test:

        lemma_test = lemmatize_text(joke)
        tf_idf_test = feature_Extraction_TF_IDF(lemma_test)
        pos_tag_test = posTagging(lemma_test)
        filter_Pos = filterByPOS(pos_tag_test)

        bow_joke = transform_joke(filter_Pos)

        topic_distribution = topic_model.get_document_topics(bow_joke)
        
        features_extracted = extract_Featrues([joke], topic_distribution, num_topics)

        scaled_test = train_scaler.transform(features_extracted)

        top_i_closest = find_closest_vectors(scaled_test, train_jokes_frame.to_numpy())

        # Vectorized operation to get user ratings for the top N closest jokes
        user_ratings = user_item_imputed.iloc[:, top_i_closest].values

        # Calculate the average rating for each user across the top N closest jokes
        average_ratings = np.nanmean(user_ratings, axis=1)

        testing_ratings.append(average_ratings.tolist())

    return testing_ratings
