In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import ast
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [2]:
ratings = pd.read_csv(r'ratings_small_filtered_2.csv', index_col=0)
ratings['rating'] = ratings['rating'].apply(lambda x: 1 if x > 3.5 else 0)
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,0,1260759144
1,1,1061,0,1260759182
2,1,1129,0,1260759185
3,1,1172,1,1260759205
4,1,1263,0,1260759151
...,...,...,...,...
81725,671,6212,0,1065149436
81726,671,6268,0,1065579370
81727,671,6365,1,1070940363
81728,671,6385,0,1070979663


In [3]:
All_parts_objects = pd.read_csv(r'All_parts_objects.csv', index_col=2)
columns_to_keep = [str(i) for i in range(80)]
movies_features = All_parts_objects[columns_to_keep]
movies_features

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,194,0,1,1,3,0,0,1,0,0,...,0,0,1,33,0,1,0,5,0,1
2,629,0,43,0,3,0,0,5,0,2,...,0,0,0,9,16,14,4,0,0,3
3,709,0,11,0,0,0,0,2,26,0,...,0,0,0,0,0,2,0,0,0,0
4,1141,0,22,0,0,0,0,8,0,1,...,0,0,0,0,2,0,0,0,0,0
6,1049,0,115,0,12,5,1,11,0,9,...,0,0,2,0,0,0,1,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130682,282,0,0,0,0,0,0,0,0,0,...,0,4,0,0,0,1,2,0,0,0
130970,1731,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,402
130980,170,0,2,5,210,0,0,1,101,0,...,0,0,0,1,5,27,0,0,0,0
131013,1484,0,149,2,1,1,0,18,5,0,...,0,0,28,5,0,19,0,0,0,0


In [4]:
movies_metadata = pd.read_csv(r'movies_metadata_BERT_on_normal_tags_and_whisper.csv')

In [5]:
movies_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6319 entries, 0 to 6318
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   movieId         6319 non-null   int64 
 1   tags            6319 non-null   object
 2   bert_embedding  6319 non-null   object
dtypes: int64(1), object(2)
memory usage: 148.2+ KB


In [6]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81730 entries, 0 to 81729
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   userId     81730 non-null  int64
 1   movieId    81730 non-null  int64
 2   rating     81730 non-null  int64
 3   timestamp  81730 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [7]:
movies_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6347 entries, 1 to 131168
Data columns (total 80 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       6347 non-null   int64
 1   1       6347 non-null   int64
 2   2       6347 non-null   int64
 3   3       6347 non-null   int64
 4   4       6347 non-null   int64
 5   5       6347 non-null   int64
 6   6       6347 non-null   int64
 7   7       6347 non-null   int64
 8   8       6347 non-null   int64
 9   9       6347 non-null   int64
 10  10      6347 non-null   int64
 11  11      6347 non-null   int64
 12  12      6347 non-null   int64
 13  13      6347 non-null   int64
 14  14      6347 non-null   int64
 15  15      6347 non-null   int64
 16  16      6347 non-null   int64
 17  17      6347 non-null   int64
 18  18      6347 non-null   int64
 19  19      6347 non-null   int64
 20  20      6347 non-null   int64
 21  21      6347 non-null   int64
 22  22      6347 non-null   int64
 23  23      6347 non

In [8]:
unique_movieids_ratings = ratings['movieId'].unique()
len(unique_movieids_ratings)

6347

In [9]:
unique_movieids_movies_metadata = movies_metadata['movieId'].unique()
len(unique_movieids_movies_metadata)

6319

In [10]:
unique_movieids_movies_features = movies_features.index.unique()
len(unique_movieids_movies_features)

6347

In [11]:
common_movieids = list(set(unique_movieids_movies_metadata) & set(unique_movieids_ratings) & set(unique_movieids_movies_features))
len(common_movieids)

6319

In [12]:
ratings = ratings[ratings['movieId'].isin(common_movieids)]
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81337 entries, 0 to 81729
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   userId     81337 non-null  int64
 1   movieId    81337 non-null  int64
 2   rating     81337 non-null  int64
 3   timestamp  81337 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [13]:
movies_metadata = movies_metadata[movies_metadata['movieId'].isin(common_movieids)]
movies_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6319 entries, 0 to 6318
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   movieId         6319 non-null   int64 
 1   tags            6319 non-null   object
 2   bert_embedding  6319 non-null   object
dtypes: int64(1), object(2)
memory usage: 148.2+ KB


In [14]:
movies_features = movies_features.iloc[movies_features.index.isin(common_movieids)]
movies_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6319 entries, 1 to 131168
Data columns (total 80 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       6319 non-null   int64
 1   1       6319 non-null   int64
 2   2       6319 non-null   int64
 3   3       6319 non-null   int64
 4   4       6319 non-null   int64
 5   5       6319 non-null   int64
 6   6       6319 non-null   int64
 7   7       6319 non-null   int64
 8   8       6319 non-null   int64
 9   9       6319 non-null   int64
 10  10      6319 non-null   int64
 11  11      6319 non-null   int64
 12  12      6319 non-null   int64
 13  13      6319 non-null   int64
 14  14      6319 non-null   int64
 15  15      6319 non-null   int64
 16  16      6319 non-null   int64
 17  17      6319 non-null   int64
 18  18      6319 non-null   int64
 19  19      6319 non-null   int64
 20  20      6319 non-null   int64
 21  21      6319 non-null   int64
 22  22      6319 non-null   int64
 23  23      6319 non

### creation of cosine matrix

In [15]:
# Assuming ratings DataFrame is already loaded
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

# Fill NaN with a distinct value (e.g., -1) to differentiate from actual ratings
item_user_matrix_filled = user_item_matrix.T.fillna(0.5)

# Compute cosine similarity
item_similarity = cosine_similarity(item_user_matrix_filled)
item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns)

# Create training and testing sets
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)

def get_k_nearest_neighbors(item_similarity_df, movie_id, k=10):
    # Get the similarity scores for the movie with all other movies
    sim_scores = item_similarity_df[movie_id]
    # Sort the movies by similarity score in descending order and select the top k
    nearest_neighbors = sim_scores.sort_values(ascending=False).index[1:k+1]
    return nearest_neighbors

def predict_rating(user_item_matrix, item_similarity_df, user_id, movie_id, k=10):
    # Get k nearest neighbors
    nearest_neighbors = get_k_nearest_neighbors(item_similarity_df, movie_id, k)
    
    # Get the ratings of the user for the nearest neighbors
    neighbor_ratings = user_item_matrix.loc[user_id, nearest_neighbors]
    
    # If there are no ratings by nearest neighbors, return 0
    if neighbor_ratings.isna().all():
        return 0
    
    # Predict the rating based on the majority vote of neighbor ratings
    return Counter(neighbor_ratings.dropna()).most_common(1)[0][0]

# Predict ratings for all users and movies in the test set
y_true_all = []
y_pred_all = []

for index, row in test_ratings.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    y_true_all.append(row['rating'])
    y_pred_all.append(predict_rating(user_item_matrix, item_similarity_df, user_id, movie_id, k=255))

# Convert to integers
y_true_all = [int(val) for val in y_true_all]
y_pred_all = [int(val) for val in y_pred_all]

# Generate classification report
print(classification_report(y_true_all, y_pred_all))

              precision    recall  f1-score   support

           0       0.72      1.00      0.84      7851
           1       1.00      0.65      0.78      8417

    accuracy                           0.82     16268
   macro avg       0.86      0.82      0.81     16268
weighted avg       0.86      0.82      0.81     16268



In [17]:
# Assuming ratings DataFrame is already loaded
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

# Assuming movies_features DataFrame is already loaded
item_metadata_matrix = movies_features

# Fill NaN with a distinct value (e.g., 0) to ensure it doesn't affect similarity
item_metadata_matrix_filled = item_metadata_matrix.fillna(0)

# Compute cosine similarity
item_similarity = cosine_similarity(item_metadata_matrix_filled)
item_similarity_df = pd.DataFrame(item_similarity, index=item_metadata_matrix.index, columns=item_metadata_matrix.index)

# Apply thresholding to item similarity matrix
threshold = 0.3 # Adjust as needed
item_similarity_df_thresholded = item_similarity_df.applymap(lambda x: x if x > threshold else 0)

# Create training and testing sets
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)


# Predict ratings for all users and movies in the test set using item-item similarity
y_true_all = []
y_pred_all = []

# Predict ratings using thresholded similarity matrix
for index, row in test_ratings.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    y_true_all.append(row['rating'])
    # Get k nearest neighbors based on thresholded item-item similarity
    nearest_neighbors = item_similarity_df_thresholded[movie_id].nlargest(255).index
    
    # Get the ratings of the user for the nearest neighbors
    neighbor_ratings = user_item_matrix.loc[user_id, nearest_neighbors]
    
    # If there are no ratings by nearest neighbors, predict 0
    if neighbor_ratings.isna().all():
        y_pred_all.append(0)
    else:
        # Predict the rating based on the majority vote of neighbor ratings
        y_pred_all.append(Counter(neighbor_ratings.dropna()).most_common(1)[0][0])
      


# Convert to integers
y_true_all = [int(val) for val in y_true_all]
y_pred_all = [int(val) for val in y_pred_all]

# Generate classification report
print(classification_report(y_true_all, y_pred_all))

  item_similarity_df_thresholded = item_similarity_df.applymap(lambda x: x if x > threshold else 0)


              precision    recall  f1-score   support

           0       0.76      0.75      0.76      7851
           1       0.77      0.78      0.78      8417

    accuracy                           0.77     16268
   macro avg       0.77      0.77      0.77     16268
weighted avg       0.77      0.77      0.77     16268



In [15]:

def string_to_array(s):
    # Remove square brackets and split by spaces
    s = s.strip('[]')
    # Convert string to list of floats
    return np.array([float(x) for x in s.split()])

# Convert string representations to numpy arrays
movies_metadata['bert_embedding'] = movies_metadata['bert_embedding'].apply(string_to_array)

# Now stack the embeddings
movie_embeddings = np.stack(movies_metadata['bert_embedding'].values)

# Compute cosine similarity
cosine_sim = cosine_similarity(movie_embeddings, movie_embeddings)


Shape of movie_embeddings: (6319, 768)
Shape of cosine_sim: (6319, 6319)
Min cosine similarity: 0.43199873728462207
Max cosine similarity: 1.0000000000000018


In [16]:
# Create a reverse mapping of indices and movieId
indices = pd.Series(movies_metadata.index, index=movies_metadata['movieId']).drop_duplicates()

def get_recommendations(movieId, cosine_sim=cosine_sim):
    if movieId not in indices:
        return []
    idx = indices[movieId]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Top 10 similar movies
    movie_indices = [i[0] for i in sim_scores]
    return movies_metadata['movieId'].iloc[movie_indices]

# Step 2: Integrate User Ratings
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

# Function to get k-nearest neighbors based on item-item similarity
def get_k_nearest_neighbors(item_similarity_df, movie_id, k=10):
    if movie_id not in item_similarity_df.columns:
        return pd.Index([])
    sim_scores = item_similarity_df[movie_id]
    nearest_neighbors = sim_scores.sort_values(ascending=False).index[1:k+1]
    return nearest_neighbors

# Function to predict rating for a specific user and movie
def predict_rating(user_item_matrix, item_similarity_df, user_id, movie_id, k=10):
    nearest_neighbors = get_k_nearest_neighbors(item_similarity_df, movie_id, k)
    if nearest_neighbors.empty:
        return 0
    neighbor_ratings = user_item_matrix.loc[user_id, nearest_neighbors]
    if neighbor_ratings.isna().all():
        return 0
    return Counter(neighbor_ratings.dropna()).most_common(1)[0][0]

# Compute the item similarity DataFrame using BERT embeddings
item_similarity_df = pd.DataFrame(cosine_sim, index=movies_metadata['movieId'], columns=movies_metadata['movieId'])

# Split the ratings data into training and testing sets
train_ratings = ratings.sample(frac=0.8, random_state=42)
test_ratings = ratings.drop(train_ratings.index)

# Evaluate at K=255
K = 255
y_true_all = []
y_pred_all = []

for index, row in test_ratings.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    y_true_all.append(row['rating'])
    y_pred_all.append(predict_rating(user_item_matrix, item_similarity_df, user_id, movie_id, k=K))

# Convert to integers
y_true_all = [int(val) for val in y_true_all]
y_pred_all = [int(val) for val in y_pred_all]

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_true_all, y_pred_all))

print("Confusion Matrix:")
print(confusion_matrix(y_true_all, y_pred_all))


Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.66      0.65      7803
           1       0.67      0.66      0.66      8464

    accuracy                           0.66     16267
   macro avg       0.66      0.66      0.66     16267
weighted avg       0.66      0.66      0.66     16267

Confusion Matrix:
[[5117 2686]
 [2916 5548]]


### combined version

In [15]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Assuming ratings, movies_features, movies_metadata DataFrames are already loaded

# Method 1: Collaborative Filtering
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
item_user_matrix_filled = user_item_matrix.T.fillna(0.5)
item_similarity_cf = cosine_similarity(item_user_matrix_filled)
item_similarity_df_cf = pd.DataFrame(item_similarity_cf, index=user_item_matrix.columns, columns=user_item_matrix.columns)

# Method 2: Metadata-based Similarity
item_metadata_matrix_filled = movies_features.fillna(0)
item_similarity_metadata = cosine_similarity(item_metadata_matrix_filled)
item_similarity_df_metadata = pd.DataFrame(item_similarity_metadata, index=item_metadata_matrix_filled.index, columns=item_metadata_matrix_filled.index)
threshold = 0.3
item_similarity_df_metadata_thresholded = item_similarity_df_metadata.applymap(lambda x: x if x > threshold else 0)

# Method 3: BERT-based Similarity
def string_to_array(s):
    s = s.strip('[]')
    return np.array([float(x) for x in s.split()])

movies_metadata['bert_embedding'] = movies_metadata['bert_embedding'].apply(string_to_array)
movie_embeddings = np.stack(movies_metadata['bert_embedding'].values)
cosine_sim_bert = cosine_similarity(movie_embeddings, movie_embeddings)
item_similarity_df_bert = pd.DataFrame(cosine_sim_bert, index=movies_metadata['movieId'], columns=movies_metadata['movieId'])

# Split the ratings data into training and testing sets
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)

# Function to get k-nearest neighbors
def get_k_nearest_neighbors(item_similarity_df, movie_id, k=255):
    if movie_id not in item_similarity_df.columns:
        return pd.Index([])
    sim_scores = item_similarity_df[movie_id]
    nearest_neighbors = sim_scores.sort_values(ascending=False).index[:k]
    return nearest_neighbors

# Function to predict rating
def predict_rating(user_item_matrix, item_similarity_df, user_id, movie_id, k=255):
    nearest_neighbors = get_k_nearest_neighbors(item_similarity_df, movie_id, k)
    if nearest_neighbors.empty:
        return 0
    neighbor_ratings = user_item_matrix.loc[user_id, nearest_neighbors]
    if neighbor_ratings.isna().all():
        return 0
    return Counter(neighbor_ratings.dropna()).most_common(1)[0][0]

# Evaluate at K=255 for combined predictions
K = 255
y_true_all = []
y_pred_all = []

for index, row in test_ratings.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    y_true_all.append(row['rating'])
    
    # Predictions from the three methods
    pred_cf = predict_rating(user_item_matrix, item_similarity_df_cf, user_id, movie_id, k=K)
    pred_metadata = predict_rating(user_item_matrix, item_similarity_df_metadata_thresholded, user_id, movie_id, k=K)
    pred_bert = predict_rating(user_item_matrix, item_similarity_df_bert, user_id, movie_id, k=K)
    
    # Combine predictions using majority voting
    combined_prediction = Counter([pred_cf, pred_metadata, pred_bert]).most_common(1)[0][0]
    y_pred_all.append(combined_prediction)

# Convert to integers
y_true_all = [int(val) for val in y_true_all]
y_pred_all = [int(val) for val in y_pred_all]

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_true_all, y_pred_all))

print("Confusion Matrix:")
print(confusion_matrix(y_true_all, y_pred_all))


  item_similarity_df_metadata_thresholded = item_similarity_df_metadata.applymap(lambda x: x if x > threshold else 0)


Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      7851
           1       0.87      0.86      0.87      8417

    accuracy                           0.86     16268
   macro avg       0.86      0.86      0.86     16268
weighted avg       0.86      0.86      0.86     16268

Confusion Matrix:
[[6726 1125]
 [1138 7279]]
