### Data preprocessing

In [2]:
# ! pip install networkx
# ! pip install keras
# ! pip install tensorflow 
import keras
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from keras.layers import Input, Embedding, Dot, Concatenate, Dense, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Reshape
from keras.models import Model
import tensorflow as tf


In [42]:
# Read the ratings, movie and user tables from the ml-100k files.
ratings_df = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=['userId', 'movieId', 'rating', 'timestamp'],
)
movies_df = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    usecols=[0, 1, 2],
    names=['movieId', 'title', 'release_date'],
).iloc[:50]
users_df = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=['userId', 'age', 'gender', 'occupation', 'zip_code'],
).iloc[:100]

# Keep only the ratings that the first 100 users gave to the first 50 movies.
user_ids = users_df['userId'].values
movie_ids = movies_df['movieId'].values
from_kept_user = ratings_df['userId'].isin(user_ids)
for_kept_movie = ratings_df['movieId'].isin(movie_ids)
ratings_df = ratings_df[from_kept_user & for_kept_movie]


In [43]:
# Print the first rows and the (rows, columns) shape of each table,
# in the order ratings -> movies -> users.
for frame in (ratings_df, movies_df, users_df):
    print(frame.head())
    print(frame.shape)


     userId  movieId  rating  timestamp
39        7       32       4  891350932
40       10       16       4  877888877
48       99        4       5  886519097
117      62       21       3  879373460
136      59       23       5  888205300
(834, 4)
   movieId              title release_date
0        1   Toy Story (1995)  01-Jan-1995
1        2   GoldenEye (1995)  01-Jan-1995
2        3  Four Rooms (1995)  01-Jan-1995
3        4  Get Shorty (1995)  01-Jan-1995
4        5     Copycat (1995)  01-Jan-1995
(50, 3)
   userId  age gender  occupation zip_code
0       1   24      M  technician    85711
1       2   53      F       other    94043
2       3   23      M      writer    32067
3       4   24      M  technician    43537
4       5   33      F       other    15213
(100, 5)


In [44]:
# Attach the movie columns (joined on movieId) and then the user columns
# (joined on userId) to every rating row.
ratings_df = ratings_df.merge(movies_df, on='movieId').merge(users_df, on='userId')
ratings_df.shape


(834, 10)

In [45]:
# Show the first five rows of the merged frame.
ratings_df.head(n=5)


Unnamed: 0,userId,movieId,rating,timestamp,title,release_date,age,gender,occupation,zip_code
0,7,32,4,891350932,Crumb (1994),01-Jan-1994,57,M,administrator,91344
1,7,4,5,891351772,Get Shorty (1995),01-Jan-1995,57,M,administrator,91344
2,7,23,3,891351383,Taxi Driver (1976),16-Feb-1996,57,M,administrator,91344
3,7,47,5,891352692,Ed Wood (1994),01-Jan-1994,57,M,administrator,91344
4,7,25,3,891352451,"Birdcage, The (1996)",08-Mar-1996,57,M,administrator,91344


In [46]:
# Drop everything except the identifiers, the rating and the movie title.
kept_columns = ['userId', 'movieId', 'rating', 'title']
ratings_df = ratings_df[kept_columns]
ratings_df.head()


Unnamed: 0,userId,movieId,rating,title
0,7,32,4,Crumb (1994)
1,7,4,5,Get Shorty (1995)
2,7,23,3,Taxi Driver (1976)
3,7,47,5,Ed Wood (1994)
4,7,25,3,"Birdcage, The (1996)"


In [47]:
# (rows, columns) of the ratings frame at this point in the notebook.
ratings_df.shape


(834, 4)

In [49]:
# Check for missing values: info() lists the non-null count and dtype of every column.
ratings_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 834 entries, 0 to 833
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   userId   834 non-null    int64 
 1   movieId  834 non-null    int64 
 2   rating   834 non-null    int64 
 3   title    834 non-null    object
dtypes: int64(3), object(1)
memory usage: 32.6+ KB


In [51]:
# Number of missing values per column.
ratings_df.isna().sum()

userId     0
movieId    0
rating     0
title      0
dtype: int64

In [54]:
# Number of distinct movies and distinct users that appear in the ratings.
print(ratings_df['movieId'].nunique())
print(ratings_df['userId'].nunique())    # Fewer than the 100 loaded users: the others have no rating for any of the 50 kept movies (the row count was 834 both before and after the merges, so the merges dropped nothing)


50
88


In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): `ratings` is not defined in any cell visible here. This cell
# assumes a frame with 'user_id', 'title' and 'rating' columns — confirm where
# it is created and make sure that cell runs first.
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

# User x title rating tables. Unrated cells are left as NaN for now so that
# they are ignored (instead of being counted as a rating of 0) by the mean.
train_matrix = train.pivot_table(index='user_id', columns='title', values='rating')
test_matrix = test.pivot_table(index='user_id', columns='title', values='rating')

# Per-user mean over the ratings that were actually observed in the training set.
mean_ratings = train_matrix.mean(axis=1)

# Users that only occur in the test split have no training mean; fall back to
# the overall training mean for them. Reindexing also keeps test_matrix from
# gaining rows for users that only occur in the training split.
test_user_means = mean_ratings.reindex(test_matrix.index).fillna(train['rating'].mean())

# Mean-centre the observed ratings, then encode "not rated" as 0
# (i.e. exactly the user's average).
train_matrix = train_matrix.sub(mean_ratings, axis=0).fillna(0)
test_matrix = test_matrix.sub(test_user_means, axis=0).fillna(0)


In [None]:
from scipy.sparse import csr_matrix

# Store both centred rating tables in compressed-sparse-row form.
train_sparse, test_sparse = (
    csr_matrix(matrix.values) for matrix in (train_matrix, test_matrix)
)


In [None]:
# ! pip install networkx
# ! pip install scikit-surprise

from surprise import Dataset
from surprise import Reader
import networkx as nx
from networkx.algorithms.link_analysis.pagerank_alg import pagerank

# Define the rating scale
reader = Reader(rating_scale=(1, 5))

# Load train and test frames into surprise datasets
trainset = Dataset.load_from_df(train[['user_id', 'item_id', 'rating']], reader)
testset = Dataset.load_from_df(test[['user_id', 'item_id', 'rating']], reader)

# Build the surprise Trainset once and reuse it for both loops below.
full_trainset = trainset.build_full_trainset()

# Bipartite user-item rating graph.
# * User nodes are the surprise inner user ids (plain ints), the same keys that
#   `user_pageranks` uses.
# * Item nodes are ('item', raw_item_id) tuples. Inner user ids and inner item
#   ids both start at 0, so using bare ints for both would merge unrelated
#   users and items into a single node.
# * The graph is undirected so the random walk can go user -> item -> other
#   user -> other item. With user -> item edges only, every item is a dead end
#   and items the user has not rated never receive any score.
G = nx.Graph()
for uid, iid, rating in full_trainset.all_ratings():
    G.add_edge(uid, ('item', full_trainset.to_raw_iid(iid)), weight=rating)

# Personalized PageRank per user, restarting at that user's node.
# Only item nodes are kept, so user_pageranks[user] maps item node -> score.
user_pageranks = {}
for user_id in full_trainset.all_users():
    node_scores = pagerank(G, alpha=0.85, personalization={user_id: 1}, weight='weight')
    user_pageranks[user_id] = {
        node: score for node, score in node_scores.items() if isinstance(node, tuple)
    }


In [None]:
def get_recommendations(user_id, num_items=10):
    """Return the best-scoring items the user has not rated yet.

    Reads two notebook-level objects: ``user_pageranks`` (user id -> mapping of
    node -> PageRank score) and ``G`` (the rating graph, in which a user's
    neighbours are the items that user rated).

    Parameters
    ----------
    user_id
        Key of the user in ``user_pageranks`` / node of the user in ``G``.
    num_items : int, default 10
        Maximum number of recommendations; values <= 0 give an empty list.

    Returns
    -------
    list of (node, score) tuples, highest score first.

    Raises
    ------
    KeyError
        If ``user_id`` has no entry in ``user_pageranks`` or no node in ``G``.
    """
    scores = user_pageranks[user_id]

    # Already-rated items are read from the graph itself rather than from the
    # global `trainset`, which a later cell rebinds to a different object.
    excluded = set(G[user_id])
    # The walk restarts at the user's own node, so it usually has the top
    # score; it is not an item and must never be recommended.
    excluded.add(user_id)

    recommendations = []
    for node, score in sorted(scores.items(), key=lambda pair: pair[1], reverse=True):
        if len(recommendations) >= num_items:
            break
        if node not in excluded:
            recommendations.append((node, score))
    return recommendations


In [None]:
from surprise import Dataset
from surprise import Reader
# Imported under an alias on purpose: a plain `train_test_split` would replace
# the scikit-learn function of the same name that the DataFrame split in the
# CNN section relies on.
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise import BaselineOnly
from surprise import accuracy

reader = Reader(rating_scale=(1, 5))

# Load data from DataFrame
data = Dataset.load_from_df(ratings[['user_id', 'item_id', 'rating']], reader)

# Split data into train and test sets (fixed seed so the metrics are reproducible)
trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)

# Fit the baseline model on the trainset
algo = BaselineOnly()
algo.fit(trainset)

# Test the model on the testset; rmse() and mae() print and return the metric
predictions = algo.test(testset)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)


Estimating biases using als...
RMSE: 0.9503
MAE:  0.7558


In [None]:
from surprise import BaselineOnly, accuracy

# Fit a fresh baseline estimator on the current trainset and score it on the
# current testset (RMSE first, then MAE).
algo = BaselineOnly()
algo.fit(trainset)
predictions = algo.test(testset)
rmse, mae = accuracy.rmse(predictions), accuracy.mae(predictions)


Estimating biases using als...
RMSE: 0.9503
MAE:  0.7558


In [None]:
# Four hand-written ratings for a new user (id 1000), one row per rating.
new_ratings = pd.DataFrame(
    [
        (1000, 1, 5, 0),
        (1000, 2, 4, 0),
        (1000, 6, 2, 0),
        (1000, 10, 3, 0),
    ],
    columns=['user_id', 'item_id', 'rating', 'timestamp'],
)

In [None]:
# Append the new user's ratings. ignore_index=True renumbers the rows; without
# it the appended rows keep their own labels 0..3, which would duplicate any
# existing labels 0..3 in `ratings`.
# NOTE(review): re-running this cell appends the same four rows again, and the
# models fitted in earlier cells do not see these ratings unless they are refit.
ratings = pd.concat([ratings, new_ratings], ignore_index=True)

In [None]:
# Ask for the single best recommendation for user 101 and print it.
recommendations = get_recommendations(user_id=101, num_items=1)
print(recommendations)


### 2nd approach: CNN

In [None]:
# Compute the mean rating for each movie
movie_means = ratings_df.groupby('movieId').rating.mean()

# User x movie rating table (NaN where a user has not rated a movie), with the
# columns in the same order as movie_means.
user_movie_ratings = ratings_df.pivot_table(
    index='userId', columns='movieId', values='rating', aggfunc='mean'
).reindex(columns=movie_means.index)

# Pairwise Pearson correlation between movies over the users who rated both.
# This replaces the nested Python loop that re-filtered ratings_df for every pair.
similarity_values = (
    user_movie_ratings.corr()
    # The correlation is undefined (NaN) when two movies share fewer than two
    # raters or one side is constant; treat that as "no similarity" so no
    # NaN-weighted edge ends up in the graph.
    .fillna(0.0)
    # PageRank edge weights must be non-negative, so negatively correlated
    # pairs get no edge.
    .clip(lower=0.0)
    .to_numpy(copy=True)
)
np.fill_diagonal(similarity_values, 1.0)  # every movie is fully similar to itself
movie_similarities = pd.DataFrame(
    similarity_values, index=movie_means.index, columns=movie_means.index
)

# Construct a graph of movies with edges representing similarity.
# from_numpy_array labels the nodes 0..n-1, so node i is the movie
# movie_means.index[i], not movieId i.
movie_graph = nx.from_numpy_array(movie_similarities.values)


In [None]:
# Define a set of seed movies for Personalized PageRank
seed_movies = [49, 18, 25, 2, 31]

# Calculate the Personalized PageRank scores for the seed movies
pr = nx.pagerank(movie_graph, alpha=0.75, personalization={movie: 1 if movie in seed_movies else 0 for movie in movie_means.index}, max_iter=1000, tol=1e-7)


In [None]:
# `pr` is keyed by graph node, i.e. by position in movie_means.index.
# Translate it to a Series keyed by movieId before joining.
pagerank_by_movie = pd.Series(
    {movie_means.index[node]: score for node, score in pr.items()},
    name='pagerank',
)

# Join the Personalized PageRank scores with the movies dataframe. The result
# is indexed by movieId (the movieId column is kept as well) so that later
# `.loc[<movieId>]` lookups hit the right movie.
movies_df_with_pr = (
    movies_df.set_index('movieId', drop=False)
    .rename_axis(None)
    .join(pagerank_by_movie)
)

# Encode the genres as 0/1 columns named 'genre_<name>'.
# The title column only holds "Name (year)", so genres cannot be parsed from
# it. In ml-100k/u.item the genres are the 19 flag columns at positions 5..23,
# in the order below (per the MovieLens 100K README).
genres = [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
genre_flags = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    usecols=[0] + list(range(5, 24)),
    names=['movieId'] + ['genre_' + genre for genre in genres],
).set_index('movieId')
movies_df_with_pr = movies_df_with_pr.join(genre_flags)



In [None]:
# Make sure the scikit-learn splitter is the one bound to this name: the
# surprise section of this notebook also works with a `train_test_split`, and
# that one cannot split a pandas DataFrame.
from sklearn.model_selection import train_test_split

# Split the data into training and test sets (seeded like the other splits in
# this notebook so the reported metrics are reproducible)
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

# Define the input layers for the CNN
user_input = Input(shape=(1,))                       # one userId per sample
movie_input = Input(shape=(1,))                      # one movieId per sample
movie_input_encoded = Input(shape=(len(genres),))    # one 0/1 flag per genre


In [None]:
# Define the embedding layers for the users and movies.
# Ids are 1-based, so the embedding tables need max(id) + 1 rows; with
# len(users_df) / len(movies_df) rows the largest id would be out of range.
EMBEDDING_DIM = 50
n_user_ids = int(users_df['userId'].max()) + 1
n_movie_ids = int(movies_df['movieId'].max()) + 1

# Flatten (batch, 1, dim) -> (batch, dim) so the embeddings have the same rank
# as the genre vector and can be concatenated with it.
user_embedding = Flatten()(Embedding(input_dim=n_user_ids, output_dim=EMBEDDING_DIM)(user_input))
movie_embedding = Flatten()(Embedding(input_dim=n_movie_ids, output_dim=EMBEDDING_DIM)(movie_input))

# Concatenate the embeddings with the genre encoding (the genre input itself;
# previously it was declared as a model input but never connected).
feature_vector = Concatenate()([user_embedding, movie_embedding, movie_input_encoded])

# Conv1D expects (batch, steps, channels): treat each feature as one step with
# a single channel.
input_layer = Reshape((2 * EMBEDDING_DIM + len(genres), 1))(feature_vector)


In [None]:
# Define the CNN layers
cnn_layer = Conv1D(filters=32, kernel_size=3, activation='relu')(input_layer)
cnn_layer = Dropout(rate=0.2)(cnn_layer)
cnn_layer = MaxPooling1D(pool_size=2)(cnn_layer)
cnn_layer = Flatten()(cnn_layer)
cnn_layer = Dense(units=64, activation='relu')(cnn_layer)
cnn_layer = Dropout(rate=0.5)(cnn_layer)
# Sigmoid output: probability that the user likes the movie (rating >= 4)
output_layer = Dense(units=1, activation='sigmoid')(cnn_layer)

# Compile the model
model = Model(inputs=[user_input, movie_input, movie_input_encoded], outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy')

# Genre flags looked up by movieId. Indexing by the movieId column (rather
# than by whatever row index the frame has) avoids picking the wrong movie's
# flags or failing on the largest id.
genre_columns = ['genre_' + genre for genre in genres]
genre_lookup = movies_df_with_pr.set_index('movieId')[genre_columns]

# Plain numpy arrays, with float labels, are what Keras consumes most reliably.
train_inputs = [
    train_df.userId.to_numpy(),
    train_df.movieId.to_numpy(),
    genre_lookup.loc[train_df.movieId].to_numpy(dtype='float32'),
]
train_labels = (train_df.rating >= 4).to_numpy(dtype='float32')

# Fit the model
model.fit(train_inputs, train_labels, epochs=10, batch_size=32, validation_split=0.2)


In [None]:
# Genre flags looked up by movieId (not by row position) so every test row
# gets the flags of its own movie.
test_genre_columns = ['genre_' + genre for genre in genres]
test_genre_features = (
    movies_df_with_pr.set_index('movieId')[test_genre_columns]
    .loc[test_df.movieId]
    .to_numpy(dtype='float32')
)

# Predict the like-probability for the test set
y_pred = model.predict([test_df.userId.to_numpy(), test_df.movieId.to_numpy(), test_genre_features])

# Convert the predicted probabilities to binary labels; ravel() turns the
# (n, 1) model output into the 1-D shape the metric functions expect.
y_pred_binary = (y_pred >= 0.5).astype(int).ravel()

# Ground truth: a rating of 4 or 5 counts as "liked"
y_true = (test_df.rating >= 4).to_numpy()

# Compute the accuracy, precision, and recall
accuracy = accuracy_score(y_true, y_pred_binary)
precision = precision_score(y_true, y_pred_binary)
recall = recall_score(y_true, y_pred_binary)

print('Accuracy: {:.2f}%'.format(accuracy * 100))
print('Precision: {:.2f}%'.format(precision * 100))
print('Recall: {:.2f}%'.format(recall * 100))
