In [3]:
import pandas as pd              ###importing the necessary library files 
import numpy as np

In [4]:
df = pd.read_csv("/content/jokes-data.csv")  ### reading the csv file 

In [5]:
df.head()

Unnamed: 0,id,user_id,joke_id,Rating
0,31030_110,31030,110,2.75
1,16144_109,16144,109,5.094
2,23098_6,23098,6,-6.438
3,14273_86,14273,86,4.406
4,18419_134,18419,134,9.375


In [6]:
df.shape

(265294, 4)

In [7]:
df['id'].nunique()

265294

In [8]:
df['user_id'].nunique()

39478

In [9]:
df['joke_id'].nunique()

139

In [10]:
df = df.drop('id', axis = 1)

In [11]:
!pip install surprise       ##installing the python package surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3095492 sha256=bc7054347097f0164e804212d0d543ab5117f3b12bce64f250fe80eb45a25be9
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.

In [12]:
df.isnull().sum()


user_id    0
joke_id    0
Rating     1
dtype: int64

In [13]:
from surprise import Reader, Dataset,SVD  
### imports the necessary classes from the Surprise library for building a collaborative recommendation system using Singular Value Decomposition (SVD).

In [None]:
## define a reader object
## the reader object helps in parsing the file or dataframe containing ratings

In [14]:
reader = Reader()  ## creating the reader object

In [15]:
data = Dataset.load_from_df(df, reader)  ###### create the dataset to be used for building the filter

In [16]:
data

<surprise.dataset.DatasetAutoFolds at 0x7fe5865ea9b0>

In [28]:
algo = SVD()

In [18]:
from surprise.model_selection import cross_validate  
 ###perform cross-validation to evaluate the performance of the collaborative filtering algorithm using the Surprise library.

In [19]:
cross_validate(algo, data, measures = ["RMSE"], cv = 5, verbose = True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    6.1612  6.1882  6.1463  nan     6.1915  nan     nan     
Fit time          5.69    4.78    5.68    4.83    5.19    5.23    0.39    
Test time         0.74    0.75    0.70    0.93    0.67    0.76    0.09    


{'test_rmse': array([6.16120599, 6.188179  , 6.14630239,        nan, 6.19152756]),
 'fit_time': (5.68955135345459,
  4.779522180557251,
  5.675756454467773,
  4.832005977630615,
  5.187804460525513),
 'test_time': (0.7371354103088379,
  0.7453827857971191,
  0.6987929344177246,
  0.9282615184783936,
  0.6727051734924316)}

In [20]:
from surprise.model_selection import train_test_split
from surprise import accuracy


In [21]:
x_train, x_test = train_test_split(data, test_size=0.2, random_state=42)        # Split the data into training and testing sets


In [22]:
algo.fit(x_train)                             # Train the algorithm on the training set

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fe5865d63e0>

In [23]:
predictions = algo.test(x_test)                                          # Predict ratings for the test set
accuracy.rmse(predictions)                                               # Evaluate the accuracy of the model

RMSE: 6.1909


6.1908917087337105

In [24]:
# Make recommendations for a specific user
user_id = 1
num_recommendations = 5

In [25]:
joke_ids = df['joke_id'].unique()       ### Get the list of all joke IDs
# Remove the jokes already rated by the user
joke_ids_unrated = [joke_id for joke_id in joke_ids if joke_id not in df[df['user_id'] == user_id]['joke_id']]

In [26]:
###Predict ratings for the unrated jokes by the user
unrated_joke_predictions = [algo.predict(user_id, joke_id) for joke_id in joke_ids_unrated]

###Sort the predictions by predicted ratings (in descending order)
sorted_predictions = sorted(unrated_joke_predictions, key=lambda x: x.est, reverse=True)

In [27]:
###Get the top-5 recommended joke IDs
top_k_joke_ids = [prediction.iid for prediction in sorted_predictions[:num_recommendations]]

# Print the top-5 recommended jokes
recommended_jokes = df[df['joke_id'].isin(top_k_joke_ids)]['joke_id'].unique()
for joke in recommended_jokes:
    print(joke)


110
109
6
86
134
