<a href="https://colab.research.google.com/github/Kkhokho/Movie_Recommendation_System/blob/main/Final_MF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
user_cols = ['user_id','age','sex','job','zipcode']
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]
movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols

In [72]:
users = pd.read_csv('/content/drive/MyDrive/ml-100k/u.user',sep='|', names=user_cols, encoding='latin-1')
ratings = pd.read_csv('/content/drive/MyDrive/ml-100k/u.data',sep='\t', names=ratings_cols, encoding='latin-1')
movies = pd.read_csv('/content/drive/MyDrive/ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')

In [None]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  943 non-null    int64 
 1   age      943 non-null    int64 
 2   sex      943 non-null    object
 3   job      943 non-null    object
 4   zipcode  943 non-null    object
dtypes: int64(2), object(3)
memory usage: 37.0+ KB


In [None]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   user_id         100000 non-null  int64
 1   movie_id        100000 non-null  int64
 2   rating          100000 non-null  int64
 3   unix_timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [None]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie_id            1682 non-null   int64  
 1   title               1682 non-null   object 
 2   release_date        1681 non-null   object 
 3   video_release_date  0 non-null      float64
 4   imdb_url            1679 non-null   object 
 5   genre_unknown       1682 non-null   int64  
 6   Action              1682 non-null   int64  
 7   Adventure           1682 non-null   int64  
 8   Animation           1682 non-null   int64  
 9   Children            1682 non-null   int64  
 10  Comedy              1682 non-null   int64  
 11  Crime               1682 non-null   int64  
 12  Documentary         1682 non-null   int64  
 13  Drama               1682 non-null   int64  
 14  Fantasy             1682 non-null   int64  
 15  Film-Noir           1682 non-null   int64  
 16  Horror

In [73]:
users["user_id"] = users["user_id"].astype(int)
movies['release_date'] = movies['release_date'].fillna('0-0-0')
# Since the ids start at 1, we shift them to start at 0
users["user_id"] = users["user_id"].apply(lambda x: str(x-1))
movies["movie_id"] = movies["movie_id"].apply(lambda x: str(x-1))
movies["year"] = movies['release_date'].apply(lambda x: int(str(x).split('-')[-1]) if x != 'nan' else np.nan)
ratings["movie_id"] = ratings["movie_id"].apply(lambda x: str(x-1))
ratings["user_id"] = ratings["user_id"].apply(lambda x: str(x-1))
ratings["rating"] = ratings["rating"].apply(lambda x: float(x))

In [57]:
def map_ids_to_indices(data):
    user_mapping = {user_id: index for index, user_id in enumerate(data["user_id"].unique())}
    movie_mapping = {movie_id: index for index, movie_id in enumerate(data["movie_id"].unique())}

    data["user_index"] = data["user_id"].map(user_mapping)
    data["movie_index"] = data["movie_id"].map(movie_mapping)

    return data

In [44]:
# Create mappings for movieId and userId to unique indices
ratings = map_ids_to_indices(ratings)

# Define a custom dataset class
class Dataset(Dataset):
    def __init__(self, ratings_df):
        self.user_indices = torch.tensor(ratings_df["user_index"].values, dtype=torch.long)
        self.movie_indices = torch.tensor(ratings_df["movie_index"].values, dtype=torch.long)
        self.ratings = torch.tensor(ratings_df["rating"].values, dtype=torch.float32)

    def __len__(self):
        return len(self.user_indices)

    def __getitem__(self, idx):
        return self.user_indices[idx], self.movie_indices[idx], self.ratings[idx]

# Create an instance of the custom dataset
dataset = Dataset(ratings)
# Create a batch for dataloader
batch_size = 64
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)



In [45]:
num_users = len(ratings["user_id"].unique())
num_items = len(ratings["movie_id"].unique())
latent_dim = 5
learning_rate = 0.001

In [46]:
class Matrix_Factorization(nn.Module):
  def __init__(self,num_users,num_items,latent_dim):
    super(Matrix_Factorization,self).__init__()
    self.user_embeddings = nn.Embedding(num_users,latent_dim)
    self.item_embeddings = nn.Embedding(num_items,latent_dim)

  def forward(self,user_indices,item_indices):
      user_latent = self.user_embeddings(user_indices)
      item_latent = self.item_embeddings(item_indices)
      return torch.sum(user_latent*item_latent,dim=1)

model = Matrix_Factorization(num_users,num_items,latent_dim)

In [47]:
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [48]:
num_epochs = 5
for epoch in range(num_epochs):
    for user_indices, item_indices, true_ratings in dataloader:
        optimizer.zero_grad()
        predicted_ratings = model(user_indices, item_indices)
        loss = criterion(predicted_ratings, true_ratings)
        loss.backward()
        optimizer.step()

In [None]:
# new_user = {
#     'user_id': ...,
#     'age': ...,
#     'sex': ...,
#     'job': ...,
#     'zipcode': ...
# }
def solve_newuser(new_user,users_data,ratings_data,movies_data):
  users_data.append(new_user,ignore_index=True)
  ratings_data.drop("unix_timestamp",axis=1,inplace=True)
  new_data_users = [{'user_id': new_user.get("user_id"), 'movie_id': movie_id, 'rating': ratings_data["rating"].mean()}for movie_id in movies_data['movie_id']]
  ratings_data = ratings_data.append(new_data_users,ignore_index=True)

In [108]:
# new_movie = {
#     'movie_id': 1682,
#     'title': "Time Travel (2023)",
#     'imdb_url': "",
#     'release_date' :'10/10/2023',
#     'genre':"Action, Adventure"
# }
def solve_newmovie(new_movie,users_data,ratings_data,movies_data):
    movies_data.drop("video_release_date",axis=1,inplace=True)
    movies_data.drop("release_date",axis=1,inplace=True)

    dict = []
    for s in new_movie.get("genre").split(','):
      s = s.replace(' ','')
      dict.append(s)

    genre_mapping = {}
    for genre in genre_cols:
      if genre in dict:
          genre_mapping[genre] = 1
      else:
          genre_mapping[genre] = 0

    new_movie.update(genre_mapping)

    new_movie["year"] = new_movie["release_date"].split('/')[-1]
    new_movie.pop("release_date")
    new_movie.pop("genre")
    movies_data = movies_data.append(new_movie,ignore_index=True)
    # movies_data = movies_data.set_index("movie_id")

    X = movies_data[["genre_unknown","Action","Adventure","Animation","Children","Comedy","Crime","Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western","year"]]
    X["year"] = X["year"].astype(int)

    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=5, random_state=0).fit(X)
    pred_label = kmeans.predict(X)
    new_movie_data = pd.DataFrame([new_movie])
    new_movie_data = new_movie_data[["genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western", "year"]]
    predict = kmeans.predict(new_movie_data)
    labels = kmeans.labels_
    cluster_0_data = X[labels == 0]
    cluster_1_data = X[labels == 1]
    cluster_2_data = X[labels == 2]
    cluster_3_data = X[labels == 3]
    cluster_4_data = X[labels == 4]
    index_values = cluster_0_data.index
    # Convert the index values to a list if needed
    index_list = index_values.tolist()
    filtered_movies = ratings_data[ratings_data['movie_id'].isin(index_list)]
    new_data_movies = [{'user_id': user_id, 'movie_id': new_movie.get("movie_id"), 'rating': ratings_data["rating"].mean()}for user_id in users_data['user_id']]
    # ratings_data.drop("movie_index",axis=1,inplace=True)
    # ratings_data.drop("user_index",axis=1,inplace=True)
    ratings_data = ratings_data.append(new_data_movies,ignore_index=True)

In [111]:
from torch.utils.data import DataLoader, TensorDataset
# Đọc dữ liệu từ tệp u1.base và xây dựng tập dữ liệu huấn luyện
# train_data = pd.read_csv("/content/drive/MyDrive/ml-100k/u1.base", sep='\t', names=["user_id", "movie_id", "rating", "timestamp"])

train_data = map_ids_to_indices(train_data)
# Xây dựng mô hình Matrix Factorization
model = Matrix_Factorization(num_users, num_items, latent_dim)


# Chuyển dữ liệu huấn luyện thành các tensors PyTorch
train_dataset = Dataset(train_data)

# Tạo DataLoader cho batching
batch_size = 64
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
num_epochs = 20
for epoch in range(num_epochs):
    for user_indices, item_indices, true_ratings in dataloader:
        optimizer.zero_grad()
        predicted_ratings = model(user_indices, item_indices)
        loss = criterion(predicted_ratings, true_ratings)
        loss.backward()
        optimizer.step()


In [56]:
test_ratings = pd.read_csv("/content/drive/MyDrive/ml-100k/u2.test", sep='\t', names=["user_id", "movie_id", "rating", "timestamp"])

# Chuyển đổi chỉ mục cho dữ liệu kiểm thử
test_ratings = map_ids_to_indices(test_ratings)
user_indices = torch.tensor(test_ratings["user_index"].values, dtype=torch.long)
movie_indices = torch.tensor(test_ratings["movie_index"].values, dtype=torch.long)

# Sử dụng mô hình để đưa ra dự đoán
predicted_ratings = model(user_indices,movie_indices)
predicted_ratings = predicted_ratings.detach().numpy()
# So sánh dự đoán và giá trị thực tế
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(predicted_ratings, test_ratings["rating"], squared=False)
print(f"RMSE on test data: {rmse}")


RMSE on test data: 1.5172575434821483


In [None]:
predicted_ratings

array([-0.91061246,  0.21900567, -0.96782583, ...,  2.5534167 ,
        0.39067698, -1.6449723 ], dtype=float32)

In [109]:
new_movie = {
    'movie_id' : 0,
    'title' : "Toy Story (1995)",
    'imdb_url' : "http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)",
    'release_date' : '01/01/1995',
    'genre' : "Animation, Children, Comedy, Crime"
}

In [110]:
users = pd.read_csv('/content/drive/MyDrive/ml-100k/u.user',sep='|', names=user_cols, encoding='latin-1')
movies = pd.read_csv('/content/drive/MyDrive/ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')
train_data = pd.read_csv("/content/drive/MyDrive/ml-100k/u1.base", sep='\t', names=["user_id", "movie_id", "rating", "timestamp"])
users["user_id"] = users["user_id"].astype(int)
movies['release_date'] = movies['release_date'].fillna('0-0-0')
# Since the ids start at 1, we shift them to start at 0
users['user_id'] = users['user_id'].apply(lambda x: str(x-1))
movies['movie_id'] = movies['movie_id'].apply(lambda x: str(x-1))
movies['year'] = movies['release_date'].apply(lambda x: int(str(x).split('-')[-1]) if x != 'nan' else np.nan)
movies = movies[movies['movie_id'] != '0']
train_data = train_data[train_data['movie_id'] != '0']

solve_newmovie(new_movie=new_movie,
               users_data = users,
               movies_data = movies,
               ratings_data=train_data)

  movies_data = movies_data.append(new_movie,ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["year"] = X["year"].astype(int)
  ratings_data = ratings_data.append(new_data_movies,ignore_index=True)


In [112]:
test_ratings = pd.read_csv("/content/drive/MyDrive/ml-100k/u1.test", sep='\t', names=["user_id", "movie_id", "rating", "timestamp"])
# Chuyển đổi chỉ mục cho dữ liệu kiểm thử
test_ratings = map_ids_to_indices(test_ratings)

user_indices = torch.tensor(test_ratings["user_index"].values, dtype=torch.long)
movie_indices = torch.tensor(test_ratings["movie_index"].values, dtype=torch.long)

# Sử dụng mô hình để đưa ra dự đoán
predicted_ratings = model(user_indices,movie_indices)
predicted_ratings = predicted_ratings.detach().numpy()
# So sánh dự đoán và giá trị thực tế
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(predicted_ratings, test_ratings["rating"], squared=False)
print(f"RMSE on test data: {rmse}")


RMSE on test data: 1.599934231853058


In [104]:
movies

Unnamed: 0,movie_id,title,imdb_url,genre_unknown,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
1,1,GoldenEye (1995),http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995
2,2,Four Rooms (1995),http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995
3,3,Get Shorty (1995),http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1995
4,4,Copycat (1995),http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1995
5,5,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1677,Mat' i syn (1997),http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1998
1678,1678,B. Monkey (1998),http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1998
1679,1679,Sliding Doors (1998),http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1998
1680,1680,You So Crazy (1994),http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1994


In [106]:
train_data

Unnamed: 0,user_id,movie_id,rating,timestamp,user_index,movie_index
0,1,1,5,874965758,0,0
1,1,2,3,876893171,0,1
2,1,3,4,878542960,0,2
3,1,4,3,876893119,0,3
4,1,5,3,889751712,0,4
...,...,...,...,...,...,...
79995,943,1067,2,875501756,942,901
79996,943,1074,4,888640250,942,906
79997,943,1188,3,888640250,942,1027
79998,943,1228,3,888640275,942,1082


In [107]:
new_movie

{'movie_id': 0,
 'title': 'Toy Story (1995)',
 'imdb_url': 'http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)',
 'genre_unknown': 0,
 'Action': 0,
 'Adventure': 0,
 'Animation': 1,
 'Children': 1,
 'Comedy': 1,
 'Crime': 1,
 'Documentary': 0,
 'Drama': 0,
 'Fantasy': 0,
 'Film-Noir': 0,
 'Horror': 0,
 'Musical': 0,
 'Mystery': 0,
 'Romance': 0,
 'Sci-Fi': 0,
 'Thriller': 0,
 'War': 0,
 'Western': 0,
 'year': '1995'}