In [40]:
import pandas as pd
import numpy as np
import cornac
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED
from pymongo import MongoClient

In [41]:

# Never truncate long cell values when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# Load the shows table and keep an untouched copy of the raw 'Fans' string in
# 'Original_Fans', because 'Fans' itself is split and renamed further down.
data = pd.read_csv("Filtered_Fanclub.Shows.csv")
data = data.assign(Original_Fans=data['Fans'])

In [42]:
# Explode the 'Fans' column into separate rows (one row per fan/show pair)
def _split_fans(fans_cell):
    """Split a ', '-delimited fan string into a list; missing values become an empty list."""
    if pd.notna(fans_cell):
        return fans_cell.split(', ')
    return []


data = (
    data
    .assign(Fans=data['Fans'].apply(_split_fans))
    .explode('Fans')
    # Rename columns to match expected format
    .rename(columns={'Fans': 'userID', 'Show Name': 'itemID'})
)

In [43]:
# Prepare the data for Cornac by keeping only necessary columns for training.
# A show with no fans explodes to a single row whose userID is NaN; such rows are
# not real interactions, so drop them before they reach the model as a spurious user.
train_data = (
    data[['userID', 'itemID']]
    .dropna(subset=['userID', 'itemID'])
    .drop_duplicates()
    .assign(rating=1)  # Assign a default rating of 1 (implicit feedback)
)

# Retain _id, Show URL, and Original_Fans columns for display purposes
original_data = data[['userID', 'itemID', 'Show URL', '_id', 'Original_Fans']].drop_duplicates()

# 75% of the interactions go to train, the rest to test; the seed is passed
# explicitly so the split is tied to the same SEED as the rest of the notebook.
train, test = python_random_split(train_data, ratio=0.75, seed=SEED)

# Build the Cornac dataset from (user, item, rating) tuples of the training split.
train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)

print('Number of users: {}'.format(train_set.num_users))
print('Number of items: {}'.format(train_set.num_items))

Number of users: 34755
Number of items: 885


In [44]:
# Hyperparameters for the BPR model, plus the ranking cutoff used for evaluation
NUM_FACTORS = 10
NUM_EPOCHS = 100
TOP_K = 10

# Initialize the BPR model
bpr_settings = {
    "k": NUM_FACTORS,
    "max_iter": NUM_EPOCHS,
    "learning_rate": 0.01,
    "lambda_reg": 0.001,
    "verbose": True,
    "seed": SEED,
}
bpr = cornac.models.BPR(**bpr_settings)

# Train the model, timing the fit
with Timer() as t:
    bpr.fit(train_set)
print(f"Took {t} seconds for training.")

# Predict rankings for all users; items already seen in `train` are removed
with Timer() as t:
    all_predictions = predict_ranking(
        bpr,
        train,
        usercol='userID',
        itemcol='itemID',
        remove_seen=True,
    )
print(f"Took {t} seconds for prediction.")

  0%|          | 0/100 [00:00<?, ?it/s]

Optimization finished!
Took 0.9154 seconds for training.
Took 31.4943 seconds for prediction.


In [45]:
# Evaluate the model on the held-out split at cutoff TOP_K.
# Note: `map` is the recommenders metric imported at the top of the notebook,
# which shadows Python's builtin `map` in this namespace.
k = TOP_K
metric_args = {"col_prediction": 'prediction', "k": k}

eval_map = map(test, all_predictions, **metric_args)
eval_ndcg = ndcg_at_k(test, all_predictions, **metric_args)
eval_precision = precision_at_k(test, all_predictions, **metric_args)
eval_recall = recall_at_k(test, all_predictions, **metric_args)

report_lines = [
    "Evaluation Metrics:",
    f"MAP:\t{eval_map}",
    f"NDCG:\t{eval_ndcg}",
    f"Precision@K:\t{eval_precision}",
    f"Recall@K:\t{eval_recall}",
]
print("\n".join(report_lines))

  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


Evaluation Metrics:
MAP:	0.13021540902832962
NDCG:	0.1088285069550086
Precision@K:	0.04627633270695283
Recall@K:	0.2701291150401115


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


In [46]:
# item_factors = bpr.i_factors

# from gensim.matutils import SparseMatrixSimilarity

# item_factors_sparse = SparseMatrixSimilarity(item_factors, num_features=NUM_FACTORS)

# similarity_matrix = item_factors_sparse.similarity_matrix()

# print("Item similarity matrix:\n", similarity_matrix)

In [47]:
# Get item factors (embeddings) from the trained model.
# The cells below treat this as a 2-D array with one row per item
# (`item_factors.shape[0]` is used as the number of items).
item_factors = bpr.i_factors

# Calculate cosine similarities between item embeddings
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``, or ``0.0`` when either
        vector has zero norm. Without that guard the result would be NaN
        (0/0), and NaN entries break any later sorting of similarity scores.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # Direction is undefined for a zero vector; treat it as "not similar".
        return 0.0
    return np.dot(vec1, vec2) / denominator

# Calculate similarity for all item pairs
num_items = item_factors.shape[0]

# Compute similarity matrix.
# Scale every embedding to unit length once; a single matrix product of the
# unit vectors then gives the cosine similarity of every item pair. This avoids
# recomputing both norms for each pair inside a nested Python loop.
factors = np.asarray(item_factors, dtype=np.float64)
norms = np.linalg.norm(factors, axis=1, keepdims=True)
# Rows with zero norm stay all-zero (similarity 0) instead of producing NaN.
unit_factors = np.divide(factors, norms, out=np.zeros_like(factors), where=norms > 0)
similarity_matrix = unit_factors @ unit_factors.T  # shape: (num_items, num_items)

print("Item similarity matrix:\n", similarity_matrix)

Item similarity matrix:
 [[ 1.         -0.2432669   0.05017046 ... -0.100008    0.16820298
  -0.20833756]
 [-0.2432669   1.         -0.35387969 ... -0.52771491 -0.50397301
   0.73318303]
 [ 0.05017046 -0.35387969  1.         ...  0.50328314  0.20821877
  -0.58717948]
 ...
 [-0.100008   -0.52771491  0.50328314 ...  1.          0.14041783
  -0.45340845]
 [ 0.16820298 -0.50397301  0.20821877 ...  0.14041783  1.
  -0.79074639]
 [-0.20833756  0.73318303 -0.58717948 ... -0.45340845 -0.79074639
   1.        ]]


In [48]:
# Sanity check of the embedding matrix dimensions; expected to be
# (number of items, NUM_FACTORS) since the model was built with k=NUM_FACTORS.
item_factors.shape

(885, 10)

In [49]:
# Pair every item's latent vector with its show name: [f_1, ..., f_k, show name].
# NOTE(review): this assumes iterating `train_set.iid_map` yields names in the same
# order as the rows of `bpr.i_factors` — confirm against the Cornac docs.
# NOTE: `item_mapping` is a plain list of names here; a later cell rebinds the same
# name to the name -> index mapping itself.
item_factors = bpr.i_factors
item_mapping = list(train_set.iid_map)
item_vectors_list = [
    [*factor_row, raw_name]
    for factor_row, raw_name in zip(item_factors, item_mapping)
]

In [50]:
# Display the show names in the order obtained by iterating `train_set.iid_map`
# (`item_mapping` is the list built from it in the previous cell).
item_mapping

['Rangrasiya',
 'Bade Achhe Lagte Hain 2',
 'Iss Pyaar Ko Kya Naam Doon',
 'Madhubala - Ek Ishq Ek Junoon',
 'Ek Hasina Thi',
 'Dil Dostii Dance',
 'Geet Hui Sabse Parayi',
 'Ek Hazaaron Mein Meri Behna Hain',
 'Shaurya Aur Anokhi Ki Kahani',
 'Dastaan-E-Mohabbat: Salim Anarkali',
 'Zindagi Ki Mehek',
 'Gulaal',
 'Chhanchhan',
 'Ghum Hai Kisikey Pyaar Meiin',
 'Dill Mill Gayye',
 'Saath Nibhana Saathiya',
 'Iss Pyaar Ko Kya Naam Doon?',
 'Bigg Boss OTT',
 'Bade Acche Lagte Hai 3',
 'Dekha Ek Khwaab',
 'Ishqbaaaz',
 'Kuch Rang Pyar Ke Aise Bhi 3',
 'Diya Aur Baati Hum',
 'Udaan',
 'Yeh Rishta Kya Kehlata Hai',
 'Naagin 6',
 'Jhalak Dikhhlaa Jaa Season 6',
 'Bharat Ka Veer Putra - Maharana Pratap',
 'Gustakh Dil',
 'Na Bole Tum Na Maine Kuch Kaha Season 2',
 'Kalash - Ek Vishwaas',
 'Naamkarann',
 'Ek Duje Ke Vaaste',
 'Bigg Boss 8',
 'Khoob Ladi Mardaani - Jhansi Ki Rani',
 'The Buddy Project Season 2 - Now in College',
 'Punar Vivah',
 'Bigg Boss Season 5',
 'Pardes Mein Hai Mera Dil',

In [51]:
# Function to get show details
def get_show_details(show_names, original_data):
    """Look up the display columns for a collection of shows.

    Args:
        show_names: Iterable of show names to match against the 'itemID' column.
        original_data: DataFrame containing at least the columns
            '_id', 'itemID', 'Show URL' and 'Original_Fans'.

    Returns:
        A DataFrame with those four columns, restricted to rows whose 'itemID'
        is in ``show_names``, with exact duplicate rows removed. Rows keep the
        order (and index labels) they have in ``original_data``.
    """
    detail_columns = ['_id', 'itemID', 'Show URL', 'Original_Fans']
    is_requested = original_data['itemID'].isin(show_names)
    return original_data.loc[is_requested, detail_columns].drop_duplicates()

In [52]:
# Function to get top similar shows
def get_top_similar_shows(show_name, similarity_matrix, item_mapping, reverse_item_mapping, original_data, top_k=100):
    """Return details of the ``top_k`` shows most similar to ``show_name``.

    Args:
        show_name: Name of the show to find neighbours for.
        similarity_matrix: 2-D NumPy array where row ``i`` holds the similarity of
            item ``i`` to every item (indexed by internal item index).
        item_mapping: Mapping from show name to internal item index.
        reverse_item_mapping: Mapping from internal item index to show name.
        original_data: DataFrame handed to ``get_show_details`` to fetch the
            display columns of the similar shows.
        top_k: Maximum number of similar shows to return.

    Returns:
        A DataFrame of show details with an extra 'Similarity' column, sorted by
        descending similarity. If ``show_name`` is not in ``item_mapping`` the
        string ``"Show '<name>' not found."`` is returned instead.
    """
    if show_name not in item_mapping:
        return f"Show '{show_name}' not found."

    item_id = item_mapping[show_name]
    similarity_scores = similarity_matrix[item_id]

    # Rank all items from most to least similar, then drop the queried show by
    # its index. Skipping "position 0" is not safe: another item can tie with
    # (or, through rounding, exceed) the show's similarity to itself.
    ranked_indices = np.argsort(similarity_scores)[::-1]
    similar_indices = ranked_indices[ranked_indices != item_id][:top_k]

    similar_shows = [(reverse_item_mapping[idx], similarity_scores[idx]) for idx in similar_indices]

    print(f"Similarity scores for '{show_name}': {[score for _, score in similar_shows]}")
    print(f"Top {top_k} similar shows: {[name for name, _ in similar_shows]}")

    # Attach each score by show name. The detail rows come back in the order of
    # `original_data`, not in ranking order, so assigning the score list
    # positionally would pair shows with the wrong scores (and raise when the
    # number of detail rows differs from the number of similar shows).
    score_by_show = dict(similar_shows)
    show_details = get_show_details(list(score_by_show), original_data)
    show_details = show_details.assign(Similarity=show_details['itemID'].map(score_by_show))

    show_details = show_details.sort_values(by='Similarity', ascending=False)

    return show_details

In [53]:
# Create a mapping from itemID to internal Cornac ID (and the inverse lookup)
item_mapping = train_set.iid_map
reverse_item_mapping = {v: k for k, v in item_mapping.items()}

# Target collection for the precomputed similarity lists
client = MongoClient('mongodb://localhost:27017/')
db = client['show_database']
collection = db['show_similarity']

# One document per show: {"show": <name>, "similar_shows": [{"show", "score"}, ...]}
mongo_data = []

for show_name in item_mapping.keys():
    similar_shows = get_top_similar_shows(show_name, similarity_matrix, item_mapping, reverse_item_mapping, original_data, top_k=100)
    show_entry = {
        "show": show_name,
        "similar_shows": [
            # float() guarantees a plain Python float, which BSON can always encode
            {"show": similar_name, "score": float(score)}
            for similar_name, score in zip(similar_shows['itemID'], similar_shows['Similarity'])
        ],
    }
    mongo_data.append(show_entry)

# Upsert by show name instead of a blind insert_many: re-running the notebook
# then refreshes the existing documents rather than piling up duplicates, and an
# empty `mongo_data` is a no-op instead of an error.
for show_entry in mongo_data:
    collection.replace_one({"show": show_entry["show"]}, show_entry, upsert=True)

print("Data inserted into MongoDB successfully.")

Similarity scores for 'Rangrasiya': [0.8619073033332825, 0.7721839547157288, 0.7500821948051453, 0.7375171184539795, 0.7258913516998291, 0.7189131379127502, 0.7163254022598267, 0.7133265733718872, 0.7131973505020142, 0.707282543182373, 0.7000098824501038, 0.6844213008880615, 0.682195782661438, 0.6753880381584167, 0.672102689743042, 0.670469343662262, 0.6675044298171997, 0.6649879217147827, 0.6612014174461365, 0.6499958634376526, 0.6480908989906311, 0.6376188397407532, 0.6143849492073059, 0.612026572227478, 0.6115580201148987, 0.611026406288147, 0.6095231771469116, 0.6089919209480286, 0.605070948600769, 0.5972722172737122, 0.5958003401756287, 0.5934168696403503, 0.5931841135025024, 0.5903091430664062, 0.5899513959884644, 0.5887773633003235, 0.5804776549339294, 0.5734094977378845, 0.573131263256073, 0.571570098400116, 0.5639011263847351, 0.5607897043228149, 0.5577995777130127, 0.556542158126831, 0.5543293952941895, 0.5538288950920105, 0.5529865026473999, 0.5450232625007629, 0.53402745723