In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Dataset Loading

In [20]:
jokes=pd.read_excel("Dataset3JokeSet.xlsx",header=None)
ratings = pd.read_excel("FINAL jester 2006-15.xlsx", header=None)

# PreProcessing

In [21]:
joke_columns = [f'{i}' for i in range(1, ratings.shape[1])]
ratings.columns = ['Total_Ratings'] + joke_columns

ratings.insert(0, 'User_ID', range(1, len(ratings) + 1))

jokes.columns = ['Joke_Text']                                  # Rename the single column
jokes.insert(0, 'Joke_ID', range(1, len(jokes) + 1))           # Add Joke_ID as a sequential index

ratings.iloc[:, 2:] = ratings.iloc[:, 2:].replace(99, np.nan)

outdated_jokes = [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 20, 27, 31, 43, 51, 52, 61, 73, 80, 100, 116]
ratings.drop(columns=[f'{i}' for i in outdated_jokes if f'{i}' in ratings.columns], inplace=True)

ratings['User_ID'] = ratings['User_ID'].astype(int)
ratings['Total_Ratings'] = ratings['Total_Ratings'].astype(int)

jokes = jokes.drop(jokes[jokes['Joke_ID'].isin(outdated_jokes)].index)
jokes = jokes.reset_index(drop=True)

if 'index' in jokes.columns:                                     # Drop the unwanted 'index' column
    jokes = jokes.drop(columns=['index'])

scaler = MinMaxScaler(feature_range=(0, 1))
ratings.iloc[:, 2:] = scaler.fit_transform(ratings.iloc[:, 2:])  # Skip User_ID and Total_Ratings

ratings = ratings.drop_duplicates()
ratings.reset_index(drop=True, inplace=True)



1       NaN
2       NaN
3       NaN
4       NaN
         ..
54900   NaN
54901   NaN
54902   NaN
54903   NaN
54904   NaN
Name: 1, Length: 54905, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  ratings.iloc[:, 2:] = ratings.iloc[:, 2:].replace(99, np.nan)
1       NaN
2       NaN
3       NaN
4       NaN
         ..
54900   NaN
54901   NaN
54902   NaN
54903   NaN
54904   NaN
Name: 2, Length: 54905, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  ratings.iloc[:, 2:] = ratings.iloc[:, 2:].replace(99, np.nan)
1       NaN
2       NaN
3       NaN
4       NaN
         ..
54900   NaN
54901   NaN
54902   NaN
54903   NaN
54904   NaN
Name: 3, Length: 54905, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  ratings.iloc[:, 2:] = ratings.iloc[:, 2:].replace(99, np.nan)
1       NaN
2       NaN
3       NaN
4       NaN
         ..
54900  

# Similarity Scores

In [22]:
'''To ensure the reliability of results, keep only the users who have rated more than 50 jokes and 
the jokes which are rated by more than 100 users '''

ratings = ratings[ratings['Total_Ratings'] > 50]  
popular_jokes = ratings.drop(columns=['Total_Ratings', 'User_ID']).count(axis=0) >= 100

ratings = ratings[ ['User_ID'] + ['Total_Ratings'] + popular_jokes.index[popular_jokes].tolist()]

ratings_long = ratings.melt(
    id_vars=['User_ID', 'Total_Ratings'],  # Columns to keep
    var_name='Joke_ID',                    # Name for the new joke column
    value_name='Rating'                    # Name for the new ratings column
)

ratings_long = ratings_long.drop(columns=['Total_Ratings'])

ratings_long['Joke_ID'] = ratings_long['Joke_ID'].astype(int)

pt = ratings_long.pivot(
    index='Joke_ID',                         # New index
    columns='User_ID',                       # New columns
    values='Rating'                          # Values to fill the table
)

pt.fillna(0,inplace=True)
similarity_scores=cosine_similarity(pt)

# Jokes Recommender

In [23]:
def recommendJokes(joke):
    joke = joke.strip().lower()
    index = jokes[jokes['Joke_Text'].str.strip().str.lower() == joke].index

    if not index.empty:
        index = index[0]                                 # Get the first index if there are multiple
        similar_jokes = sorted(
            list(enumerate(similarity_scores[index])),   # Access similarity scores for the found index
            key=lambda x: x[1],
            reverse=True
        )[1:6]                                           # Exclude the joke itself, get top 5 most similar jokes

        for i in similar_jokes:
            joke_text = jokes.loc[i[0], 'Joke_Text']     # Access the Joke_Text at the given index
            print(f"Similarity-Score({i[1]}) ---> {joke_text}\n")
    else:
        print("Joke not found.")


recommendJokes("Q. Did you hear about the dyslexic devil worshiper?   A. He sold his soul to Santa.") #pass a joke from the dataset

Similarity-Score(0.8537143283112506) ---> A man arrives at the gates of heaven. St. Peter asks, "Religion?"  The man says, "Methodist." St. Peter looks down his list, and says,  "Go to room 24, but be very quiet as you pass room 8."   Another man arrives at the gates of heaven. "Religion?" "Baptist." "Go to room 18, but be very quiet as you pass room 8."   A third man arrives at the gates. "Religion?" "Jewish." "Go to room 11, but be very quiet as you pass room 8."  The man says, "I can understand there being different rooms for different religions, but why must I be quiet when I pass room 8?" St. Peter tells him, "Well the Catholics are in room 8,  and they think they're the only ones here.

Similarity-Score(0.85353854909306) ---> They asked the Japanese visitor if they have elections in his country.   "Every Morning" he answers.

Similarity-Score(0.8507990286854228) ---> Q: If a person who speaks three languages is called "tri-lingual," and a person who speaks two languages is called