In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import ast
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [2]:
ratings = pd.read_csv(r'filtered_ratings.csv')
ratings.rename(columns={'Rating': 'rating'}, inplace=True)
ratings.rename(columns={'UserID': 'userId'}, inplace=True)
ratings.rename(columns={'MovieID': 'movieId'}, inplace=True)
ratings['rating'] = ratings['rating'].apply(lambda x: 1 if x > 3.5 else 0)
ratings

Unnamed: 0,userId,movieId,rating,Timestamp
0,1,1193,1,2000-12-31 22:12:40
1,1,661,0,2000-12-31 22:35:09
2,1,914,0,2000-12-31 22:32:48
3,1,3408,1,2000-12-31 22:04:35
4,1,2355,1,2001-01-06 23:38:11
...,...,...,...,...
997294,6040,1091,0,2000-04-26 02:35:41
997295,6040,1094,1,2000-04-25 23:21:27
997296,6040,562,1,2000-04-25 23:19:06
997297,6040,1096,1,2000-04-26 02:20:48


In [3]:
All_parts_objects = pd.read_csv(r'1M_YOLO_features.csv', index_col=1)
columns_to_keep = [str(i) for i in range(80)]
movies_features = All_parts_objects[columns_to_keep]
movies_features

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,194,0,1,1,3,0,0,1,0,0,...,0,0,1,33,0,1,0,5,0,1
2,629,0,43,0,3,0,0,5,0,2,...,0,0,0,9,16,14,4,0,0,3
3,709,0,11,0,0,0,0,2,26,0,...,0,0,0,0,0,2,0,0,0,0
4,1141,0,23,0,0,0,0,8,0,1,...,0,0,0,0,2,0,0,0,0,0
6,1049,0,115,0,12,5,1,11,0,9,...,0,0,2,0,0,0,1,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,1040,0,45,0,0,0,0,0,0,0,...,0,0,6,2,2,0,0,0,0,0
3949,373,0,4,0,0,0,0,0,0,1,...,0,0,19,0,7,0,5,0,0,4
3950,1129,0,3,0,0,0,1,6,0,0,...,0,0,5,0,0,0,0,0,0,0
3951,468,0,37,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# Remove rows where all features are 0
movies_features = movies_features[(movies_features != 0).any(axis=1)]
movies_features

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,194,0,1,1,3,0,0,1,0,0,...,0,0,1,33,0,1,0,5,0,1
2,629,0,43,0,3,0,0,5,0,2,...,0,0,0,9,16,14,4,0,0,3
3,709,0,11,0,0,0,0,2,26,0,...,0,0,0,0,0,2,0,0,0,0
4,1141,0,23,0,0,0,0,8,0,1,...,0,0,0,0,2,0,0,0,0,0
6,1049,0,115,0,12,5,1,11,0,9,...,0,0,2,0,0,0,1,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,1040,0,45,0,0,0,0,0,0,0,...,0,0,6,2,2,0,0,0,0,0
3949,373,0,4,0,0,0,0,0,0,1,...,0,0,19,0,7,0,5,0,0,4
3950,1129,0,3,0,0,0,1,6,0,0,...,0,0,5,0,0,0,0,0,0,0
3951,468,0,37,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
movies_metadata = pd.read_csv(r'1M_BERT_embeddings_tag_and_whisper_trans.csv')

In [6]:
movies_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2410 entries, 0 to 2409
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   movieId         2410 non-null   int64 
 1   tags            2410 non-null   object
 2   bert_embedding  2410 non-null   object
dtypes: int64(1), object(2)
memory usage: 56.6+ KB


In [7]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 997299 entries, 0 to 997298
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   userId     997299 non-null  int64 
 1   movieId    997299 non-null  int64 
 2   rating     997299 non-null  int64 
 3   Timestamp  997299 non-null  object
dtypes: int64(3), object(1)
memory usage: 30.4+ MB


In [8]:
movies_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2415 entries, 1 to 3952
Data columns (total 80 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       2415 non-null   int64
 1   1       2415 non-null   int64
 2   2       2415 non-null   int64
 3   3       2415 non-null   int64
 4   4       2415 non-null   int64
 5   5       2415 non-null   int64
 6   6       2415 non-null   int64
 7   7       2415 non-null   int64
 8   8       2415 non-null   int64
 9   9       2415 non-null   int64
 10  10      2415 non-null   int64
 11  11      2415 non-null   int64
 12  12      2415 non-null   int64
 13  13      2415 non-null   int64
 14  14      2415 non-null   int64
 15  15      2415 non-null   int64
 16  16      2415 non-null   int64
 17  17      2415 non-null   int64
 18  18      2415 non-null   int64
 19  19      2415 non-null   int64
 20  20      2415 non-null   int64
 21  21      2415 non-null   int64
 22  22      2415 non-null   int64
 23  23      2415 non-n

In [9]:
unique_movieids_ratings = ratings['movieId'].unique()
len(unique_movieids_ratings)

3626

In [10]:
unique_movieids_movies_metadata = movies_metadata['movieId'].unique()
len(unique_movieids_movies_metadata)

2410

In [11]:
unique_movieids_movies_features = movies_features.index.unique()
len(unique_movieids_movies_features)

2415

In [12]:
common_movieids = list(set(unique_movieids_movies_metadata) & set(unique_movieids_ratings) & set(unique_movieids_movies_features))
len(common_movieids)

2407

In [13]:
ratings = ratings[ratings['movieId'].isin(common_movieids)]
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 733247 entries, 1 to 997297
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   userId     733247 non-null  int64 
 1   movieId    733247 non-null  int64 
 2   rating     733247 non-null  int64 
 3   Timestamp  733247 non-null  object
dtypes: int64(3), object(1)
memory usage: 28.0+ MB


In [14]:
movies_metadata = movies_metadata[movies_metadata['movieId'].isin(common_movieids)]
movies_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2407 entries, 0 to 2409
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   movieId         2407 non-null   int64 
 1   tags            2407 non-null   object
 2   bert_embedding  2407 non-null   object
dtypes: int64(1), object(2)
memory usage: 75.2+ KB


In [15]:
movies_features = movies_features.iloc[movies_features.index.isin(common_movieids)]
movies_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2407 entries, 1 to 3952
Data columns (total 80 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       2407 non-null   int64
 1   1       2407 non-null   int64
 2   2       2407 non-null   int64
 3   3       2407 non-null   int64
 4   4       2407 non-null   int64
 5   5       2407 non-null   int64
 6   6       2407 non-null   int64
 7   7       2407 non-null   int64
 8   8       2407 non-null   int64
 9   9       2407 non-null   int64
 10  10      2407 non-null   int64
 11  11      2407 non-null   int64
 12  12      2407 non-null   int64
 13  13      2407 non-null   int64
 14  14      2407 non-null   int64
 15  15      2407 non-null   int64
 16  16      2407 non-null   int64
 17  17      2407 non-null   int64
 18  18      2407 non-null   int64
 19  19      2407 non-null   int64
 20  20      2407 non-null   int64
 21  21      2407 non-null   int64
 22  22      2407 non-null   int64
 23  23      2407 non-n

### creation of cosine matrix

In [16]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Assuming ratings, movies_features, movies_metadata DataFrames are already loaded

# Method 1: Collaborative Filtering
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
item_user_matrix_filled = user_item_matrix.T.fillna(0.5)
item_similarity_cf = cosine_similarity(item_user_matrix_filled)
item_similarity_df_cf = pd.DataFrame(item_similarity_cf, index=user_item_matrix.columns, columns=user_item_matrix.columns)

# Method 2: Metadata-based Similarity
item_metadata_matrix_filled = movies_features.fillna(0)
item_similarity_metadata = cosine_similarity(item_metadata_matrix_filled)
item_similarity_df_metadata = pd.DataFrame(item_similarity_metadata, index=item_metadata_matrix_filled.index, columns=item_metadata_matrix_filled.index)
threshold = 0.3
item_similarity_df_metadata_thresholded = item_similarity_df_metadata.applymap(lambda x: x if x > threshold else 0)

# Method 3: BERT-based Similarity
def string_to_array(s):
    s = s.strip('[]')
    return np.array([float(x) for x in s.split()])

movies_metadata['bert_embedding'] = movies_metadata['bert_embedding'].apply(string_to_array)
movie_embeddings = np.stack(movies_metadata['bert_embedding'].values)
cosine_sim_bert = cosine_similarity(movie_embeddings, movie_embeddings)
item_similarity_df_bert = pd.DataFrame(cosine_sim_bert, index=movies_metadata['movieId'], columns=movies_metadata['movieId'])

# Split the ratings data into training and testing sets
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)

# Function to get k-nearest neighbors
def get_k_nearest_neighbors(item_similarity_df, movie_id, k=255):
    if movie_id not in item_similarity_df.columns:
        return pd.Index([])
    sim_scores = item_similarity_df[movie_id]
    nearest_neighbors = sim_scores.sort_values(ascending=False).index[:k]
    return nearest_neighbors

# Function to predict rating
def predict_rating(user_item_matrix, item_similarity_df, user_id, movie_id, k=255):
    nearest_neighbors = get_k_nearest_neighbors(item_similarity_df, movie_id, k)
    if nearest_neighbors.empty:
        return 0
    neighbor_ratings = user_item_matrix.loc[user_id, nearest_neighbors]
    if neighbor_ratings.isna().all():
        return 0
    return Counter(neighbor_ratings.dropna()).most_common(1)[0][0]

# Evaluate at K=255 for combined predictions
K = 255
y_true_all = []
y_pred_all = []

for index, row in test_ratings.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    y_true_all.append(row['rating'])
    
    # Predictions from the three methods
    pred_cf = predict_rating(user_item_matrix, item_similarity_df_cf, user_id, movie_id, k=K)
    pred_metadata = predict_rating(user_item_matrix, item_similarity_df_metadata_thresholded, user_id, movie_id, k=K)
    pred_bert = predict_rating(user_item_matrix, item_similarity_df_bert, user_id, movie_id, k=K)
    
    # Combine predictions using majority voting
    combined_prediction = Counter([pred_cf, pred_metadata, pred_bert]).most_common(1)[0][0]
    y_pred_all.append(combined_prediction)

# Convert to integers
y_true_all = [int(val) for val in y_true_all]
y_pred_all = [int(val) for val in y_pred_all]

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_true_all, y_pred_all))

print("Confusion Matrix:")
print(confusion_matrix(y_true_all, y_pred_all))


  item_similarity_df_metadata_thresholded = item_similarity_df_metadata.applymap(lambda x: x if x > threshold else 0)


Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.69      0.74     61450
           1       0.79      0.88      0.83     85200

    accuracy                           0.80    146650
   macro avg       0.80      0.78      0.79    146650
weighted avg       0.80      0.80      0.79    146650

Confusion Matrix:
[[42122 19328]
 [10494 74706]]


In [19]:
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support, accuracy_score

k_values = [5, 20, 50, 100, 150, 255, 500, 750, 1000]
results = {}

for K in k_values:
    print(f"\nEvaluating at K={K}")
    y_true_all = []
    y_pred_all = []
    for index, row in test_ratings.iterrows():
        user_id = row['userId']
        movie_id = row['movieId']
        y_true_all.append(row['rating'])
        
        # Predictions from the three methods
        pred_cf = predict_rating(user_item_matrix, item_similarity_df_cf, user_id, movie_id, k=K)
        pred_metadata = predict_rating(user_item_matrix, item_similarity_df_metadata_thresholded, user_id, movie_id, k=K)
        pred_bert = predict_rating(user_item_matrix, item_similarity_df_bert, user_id, movie_id, k=K)
        
        # Combine predictions using majority voting
        combined_prediction = Counter([pred_cf, pred_metadata, pred_bert]).most_common(1)[0][0]
        y_pred_all.append(combined_prediction)

    # Convert to integers
    y_true_all = [int(val) for val in y_true_all]
    y_pred_all = [int(val) for val in y_pred_all]

    # Generate and print classification report
    print("Classification Report:")
    print(classification_report(y_true_all, y_pred_all))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true_all, y_pred_all))

    # Store results for plotting
    precision, recall, _, _ = precision_recall_fscore_support(y_true_all, y_pred_all, average='weighted')
    accuracy = accuracy_score(y_true_all, y_pred_all)
    results[K] = {'accuracy': accuracy, 'precision': precision, 'recall': recall}

# Plotting
plt.figure(figsize=(12, 6))
plt.plot(k_values, [results[k]['accuracy'] for k in k_values], color='blue', label='Accuracy')
plt.plot(k_values, [results[k]['precision'] for k in k_values], color='red', label='Precision')
plt.plot(k_values, [results[k]['recall'] for k in k_values], color='green', label='Recall')

plt.xlabel('K Value')
plt.ylabel('Score')
plt.title('Accuracy, Precision, and Recall vs K Value')
plt.legend()
plt.grid(True)
plt.show()


Evaluating at K=5
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     61450
           1       1.00      1.00      1.00     85200

    accuracy                           1.00    146650
   macro avg       1.00      1.00      1.00    146650
weighted avg       1.00      1.00      1.00    146650

Confusion Matrix:
[[61254   196]
 [  181 85019]]

Evaluating at K=20
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96     61450
           1       0.96      0.98      0.97     85200

    accuracy                           0.96    146650
   macro avg       0.96      0.96      0.96    146650
weighted avg       0.96      0.96      0.96    146650

Confusion Matrix:
[[58393  3057]
 [ 2123 83077]]

Evaluating at K=50
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.87      0.89     61450
           

KeyboardInterrupt: 