In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm  # For progress tracking


# Dataset Loading

In [20]:
jokes=pd.read_excel("Dataset3JokeSet.xlsx",header=None)
ratings = pd.read_excel("FINAL jester 2006-15.xlsx", header=None)

# PreProcessing

In [21]:
joke_columns = [f'{i}' for i in range(1, ratings.shape[1])]
ratings.columns = ['Total_Ratings'] + joke_columns

ratings.insert(0, 'User_ID', range(1, len(ratings) + 1))

jokes.columns = ['Joke_Text']                                  # Rename the single column
jokes.insert(0, 'Joke_ID', range(1, len(jokes) + 1))           # Add Joke_ID as a sequential index

ratings.iloc[:, 2:] = ratings.iloc[:, 2:].replace(99, np.nan)

outdated_jokes = [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 20, 27, 31, 43, 51, 52, 61, 73, 80, 100, 116]
ratings.drop(columns=[f'{i}' for i in outdated_jokes if f'{i}' in ratings.columns], inplace=True)

ratings['User_ID'] = ratings['User_ID'].astype(int)
ratings['Total_Ratings'] = ratings['Total_Ratings'].astype(int)

jokes = jokes.drop(jokes[jokes['Joke_ID'].isin(outdated_jokes)].index)
jokes = jokes.reset_index(drop=True)

if 'index' in jokes.columns:                                     # Drop the unwanted 'index' column
    jokes = jokes.drop(columns=['index'])

scaler = MinMaxScaler(feature_range=(0, 1))
ratings.iloc[:, 2:] = scaler.fit_transform(ratings.iloc[:, 2:])  # Skip User_ID and Total_Ratings

ratings = ratings.drop_duplicates()
ratings.reset_index(drop=True, inplace=True)



1       NaN
2       NaN
3       NaN
4       NaN
         ..
54900   NaN
54901   NaN
54902   NaN
54903   NaN
54904   NaN
Name: 1, Length: 54905, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  ratings.iloc[:, 2:] = ratings.iloc[:, 2:].replace(99, np.nan)
1       NaN
2       NaN
3       NaN
4       NaN
         ..
54900   NaN
54901   NaN
54902   NaN
54903   NaN
54904   NaN
Name: 2, Length: 54905, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  ratings.iloc[:, 2:] = ratings.iloc[:, 2:].replace(99, np.nan)
1       NaN
2       NaN
3       NaN
4       NaN
         ..
54900   NaN
54901   NaN
54902   NaN
54903   NaN
54904   NaN
Name: 3, Length: 54905, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  ratings.iloc[:, 2:] = ratings.iloc[:, 2:].replace(99, np.nan)
1       NaN
2       NaN
3       NaN
4       NaN
         ..
54900  

In [22]:
'''To ensure the reliability of results, keep only the users who have rated more than 50 jokes and 
the jokes which are rated by more than 100 users '''

ratings = ratings[ratings['Total_Ratings'] > 50]
popular_jokes = ratings.drop(columns=['Total_Ratings', 'User_ID']).count(axis=0) >= 100

ratings = ratings[ ['User_ID'] + ['Total_Ratings'] + popular_jokes.index[popular_jokes].tolist()]

ratings_long = ratings.melt(
    id_vars=['User_ID', 'Total_Ratings'],  # Columns to keep
    var_name='Joke_ID',                    # Name for the new joke column
    value_name='Rating'                    # Name for the new ratings column
)

ratings_long = ratings_long.drop(columns=['Total_Ratings'])

ratings_long['Joke_ID'] = ratings_long['Joke_ID'].astype(int)


# Jokes Ratings Predictions

In [47]:
ratings_matrix = ratings.drop(columns=['User_ID', 'Total_Ratings'])
ratings_user_ids = ratings['User_ID']                # Retain User_IDs for mapping

user_similarity = cosine_similarity(ratings_matrix.fillna(0))  # Fill NaN with 0
user_similarity_df = pd.DataFrame(user_similarity,
                                  index=ratings_user_ids,
                                  columns=ratings_user_ids)


## Top 2000 user with similarity measures (Based on Rating Patterns)

In [60]:
#Calculates the sum of similarity scores for each user across all other users.
#A higher sum indicates users whose rating patterns are similar to many other users.

user_similarity_scores = user_similarity_df.sum(axis=1)

top_2000_users = user_similarity_scores.sort_values(ascending=False).head(2000).index

top_2000_similarity_df = user_similarity_df.loc[top_2000_users, top_2000_users]

print("Similarity Measures for Top 2000 Users (Based on Rating Patterns):")
print(top_2000_similarity_df)

Similarity Measures for Top 2000 Users (Based on Rating Patterns):
User_ID     45144     29324     48286     31010     23834     50635     3015   \
User_ID                                                                         
45144    1.000000  0.958284  0.978979  0.978482  0.966559  0.972936  0.962993   
29324    0.958284  1.000000  0.947500  0.951578  0.952693  0.946463  0.946942   
48286    0.978979  0.947500  1.000000  0.981211  0.960839  0.977158  0.963699   
31010    0.978482  0.951578  0.981211  1.000000  0.961221  0.985536  0.970710   
23834    0.966559  0.952693  0.960839  0.961221  1.000000  0.954739  0.955527   
...           ...       ...       ...       ...       ...       ...       ...   
41118    0.946244  0.922006  0.955630  0.947858  0.937869  0.945369  0.939011   
11813    0.919642  0.903233  0.917918  0.930211  0.924459  0.931238  0.917005   
30642    0.920728  0.918808  0.921687  0.925164  0.916187  0.911982  0.908635   
22788    0.929580  0.903090  0.922365  0.9

In [50]:
def random_recommender(user_id, joke_id, k=5):

    random_users = np.random.choice(ratings_user_ids, k)

    ratings_for_joke = ratings.loc[ratings['User_ID'].isin(random_users), str(joke_id)]

    ratings_for_joke = ratings_for_joke.dropna()

    # Fallback: If no valid ratings, return global average for the joke
    if ratings_for_joke.empty:
        return ratings[str(joke_id)].mean()

    return ratings_for_joke.mean()


In [51]:
def collaborative_recommender(user_id, joke_id, k=5):
    if user_id not in user_similarity_df.index:
        return ratings[str(joke_id)].mean()  # Fallback for unseen users

    user_similarities = user_similarity_df[user_id]

    similar_users = user_similarities.drop(user_id).sort_values(ascending=False).head(k).index

    ratings_for_joke = ratings.loc[ratings['User_ID'].isin(similar_users), ['User_ID', str(joke_id)]]

    valid_ratings = ratings_for_joke.dropna(subset=[str(joke_id)])

    if valid_ratings.empty:
        return ratings[str(joke_id)].mean()

    # Map ratings to their similarity scores
    valid_similarities = user_similarities.loc[valid_ratings['User_ID']]
    weighted_ratings = valid_ratings[str(joke_id)].values * valid_similarities.values

    # Calculate the weighted average rating
    denominator = valid_similarities.values.sum()
    if denominator == 0:
        return ratings[str(joke_id)].mean()  # Fallback to global average

    return weighted_ratings.sum() / denominator


## Evaluation

In [58]:
train, test = train_test_split(ratings_long, test_size=0.2, random_state=42)

random_predictions = []
collaborative_predictions = []
test_instances = []

for _, row in tqdm(test.iterrows(), total=len(test), desc="Predicting ratings", unit="row"):
    user_id = int(row['User_ID'])
    joke_id = int(row['Joke_ID'])
    actual_rating = row['Rating']

    # Predict ratings
    random_rating = random_recommender(user_id, joke_id, k=1200)
    collaborative_rating = collaborative_recommender(user_id, joke_id, k=12000)

    # Append predictions
    random_predictions.append(random_rating)
    collaborative_predictions.append(collaborative_rating)
    test_instances.append({
        'User_ID': user_id,
        'Joke_ID': joke_id,
        'Actual_Rating': actual_rating,
        'Random_Predicted_Rating': random_rating,
        'Collaborative_Predicted_Rating': collaborative_rating
    })


test_results_df = pd.DataFrame(test_instances)

Predicting ratings: 100%|██████████| 277607/277607 [34:47<00:00, 132.97row/s]


In [59]:
valid_results = test_results_df.dropna()

random_mae = mean_absolute_error(valid_results['Actual_Rating'], valid_results['Random_Predicted_Rating'])
collaborative_mae = mean_absolute_error(valid_results['Actual_Rating'], valid_results['Collaborative_Predicted_Rating'])


print(f"\nRandom Recommender MAE: {random_mae}")
print(f"Collaborative Recommender MAE: {collaborative_mae}")
print("Valid Test Data Instances (Original vs. Predicted Ratings):")
valid_results


Random Recommender MAE: 0.20082080341525665
Collaborative Recommender MAE: 0.1997522166335504
Valid Test Data Instances (Original vs. Predicted Ratings):


Unnamed: 0,User_ID,Joke_ID,Actual_Rating,Random_Predicted_Rating,Collaborative_Predicted_Rating
0,49324,124,0.334375,0.374346,0.387157
1,47411,67,0.821875,0.503128,0.506710
3,26257,38,0.753125,0.584951,0.600076
4,41853,94,0.548438,0.597882,0.595922
7,42017,81,0.865625,0.596203,0.601745
...,...,...,...,...,...
277602,32461,37,0.987500,0.526837,0.518662
277603,3727,18,0.751563,0.505822,0.511399
277604,30450,32,0.623437,0.678791,0.685751
277605,12918,34,0.751563,0.554665,0.557471
