In [10]:
!pip install torch==1.12.0 torchvision==0.13.0

Collecting torch==1.12.0
  Downloading torch-1.12.0-cp310-none-macosx_10_9_x86_64.whl (133.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.6/133.6 MB[0m [31m664.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:06[0m
[?25hCollecting torchvision==0.13.0
  Downloading torchvision-0.13.0-cp310-cp310-macosx_10_9_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m658.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: torch, torchvision
  Attempting uninstall: torch
    Found existing installation: torch 2.0.1
    Uninstalling torch-2.0.1:
      Successfully uninstalled torch-2.0.1
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.15.2
    Uninstalling torchvision-0.15.2:
      Successfully uninstalled torchvision-0.15.2
Successfully installed torch-1.12.0 torchvision-0.13.0


In [7]:
!pip install torchvision

Collecting torchvision
  Downloading torchvision-0.15.2-cp310-cp310-macosx_10_9_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hCollecting torch==2.0.1
  Using cached torch-2.0.1-cp310-none-macosx_10_9_x86_64.whl (143.4 MB)
Collecting sympy
  Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting filelock
  Downloading filelock-3.12.2-py3-none-any.whl (10 kB)
Collecting mpmath>=0.19
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.2/536.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: mpmath, sympy, filelock, torch, torchvision
Successfully installed filelock-3.12.2 mpmath-1.3.0 sympy-1.12 torch-2.0.1 tor

In [16]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Define the DeepFM model
class DeepFM(nn.Module):
    def __init__(self, num_genres, embedding_dim, hidden_units):
        super(DeepFM, self).__init__()
        
        # Embedding layers
        self.genre_embedding = nn.Embedding(num_genres, embedding_dim)
        
        # FM layers
        self.fm_linear = nn.Linear(embedding_dim + 1, 1)
        
        # Deep layers
        self.deep_layers = nn.Sequential()
        input_dim = embedding_dim + 1
        for i, units in enumerate(hidden_units):
            self.deep_layers.add_module(f"linear_{i}", nn.Linear(input_dim, units))
            self.deep_layers.add_module(f"relu_{i}", nn.ReLU())
            input_dim = units
        
        self.deep_output = nn.Linear(input_dim, 1)
        
    def forward(self, genres, ratings):
        genre_embedding = self.genre_embedding(genres)
        
        # FM component
        fm_input = torch.cat([genre_embedding, ratings.unsqueeze(1)], dim=1)
        fm_output = torch.sum(fm_input, dim=1)
        
        # Deep component
        deep_input = torch.cat([genre_embedding.flatten(start_dim=1), ratings.unsqueeze(1)], dim=1)
        deep_output = self.deep_layers(deep_input)
        deep_output = self.deep_output(deep_output)
        
        # Combine FM and Deep outputs
        output = fm_output + deep_output
        
        return output.squeeze()

# Load the dataset
movies = pd.read_csv('data/new_movies_metadata.csv')

# Preprocess the data
movies = movies[['original_title', 'name_genres', 'vote_average']]
movies = movies.dropna()
movies = movies.reset_index(drop=True)

# Encode genres using LabelEncoder
genres_encoder = LabelEncoder()
movies['genres_encoded'] = genres_encoder.fit_transform(movies['name_genres'])

# Scale vote_average using MinMaxScaler
scaler = MinMaxScaler()
movies['vote_average_scaled'] = scaler.fit_transform(movies[['vote_average']])

# Split data into training and testing sets
train_data, test_data = train_test_split(movies, test_size=0.2, random_state=42)

# Define the dataset class
class MovieDataset(Dataset):
    def __init__(self, data):
        self.genres = torch.LongTensor(data['genres_encoded'].values)
        self.ratings = torch.FloatTensor(data['vote_average_scaled'].values)
        self.labels = torch.FloatTensor(data['vote_average_scaled'].values)
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.genres[idx], self.ratings[idx], self.labels[idx]

# Prepare the training and testing datasets
train_dataset = MovieDataset(train_data)
test_dataset = MovieDataset(test_data)

# Define the data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Initialize the model
num_genres = len(genres_encoder.classes_)
embedding_dim = 32
hidden_units = [64, 32]
model = DeepFM(num_genres, embedding_dim, hidden_units)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())


In [17]:
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = 0.0
    model.train()
    for users, items, ratings in train_loader:
        optimizer.zero_grad()
        predictions = model(users, items)
        loss = criterion(predictions, ratings)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * len(users)
    train_loss /= len(train_dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}")

# Evaluation
model.eval()
test_loss = 0.0
with torch.no_grad():
    for users, items, ratings in test_loader:
        predictions = model(users, items)
        loss = criterion(predictions, ratings)
        test_loss += loss.item() * len(users)
test_loss /= len(test_dataset)
print(f"Test Loss: {test_loss:.4f}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/10, Train Loss: 13.1765
Epoch 2/10, Train Loss: 4.3117
Epoch 3/10, Train Loss: 2.4780
Epoch 4/10, Train Loss: 1.6380
Epoch 5/10, Train Loss: 1.1370
Epoch 6/10, Train Loss: 0.8048
Epoch 7/10, Train Loss: 0.5711
Epoch 8/10, Train Loss: 0.4043
Epoch 9/10, Train Loss: 0.2855
Epoch 10/10, Train Loss: 0.2006
Test Loss: 2.0160


  return F.mse_loss(input, target, reduction=self.reduction)


In [25]:
def recommend_movies(user_ids, model, top_k=5):
    model.eval()
    user_ids = torch.LongTensor(user_ids)
    genres = torch.arange(num_genres).unsqueeze(0).expand(len(user_ids), -1)
    ratings = torch.zeros(len(user_ids), genres.shape[1])  # Modify the ratings tensor dimensions
    
    with torch.no_grad():
        predictions = model(genres, ratings)  # Pass the modified ratings tensor
        _, top_indices = torch.topk(predictions, top_k)
    
    recommended_movies = []
    for i, user_id in enumerate(user_ids):
        user_movies = movies[movies['genres_encoded'] == user_id.item()]
        top_movies = user_movies.iloc[top_indices[i]].reset_index(drop=True)
        recommended_movies.append(top_movies)
    
    return recommended_movies


In [21]:
ratings = pd.read_csv('data/ratings.csv')
ratings


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
...,...,...,...,...
26024284,270896,58559,5.0,1257031564
26024285,270896,60069,5.0,1257032032
26024286,270896,63082,4.5,1257031764
26024287,270896,64957,4.5,1257033990


In [32]:
ratings[ratings["userId"] == 123]

Unnamed: 0,userId,movieId,rating,timestamp
10422,123,1,3.0,968555453
10423,123,21,5.0,968555495
10424,123,34,2.0,968555249
10425,123,39,4.0,968555549
10426,123,45,5.0,968555344
...,...,...,...,...
10516,123,3264,3.0,968569637
10517,123,3476,5.0,968569430
10518,123,3499,5.0,968569383
10519,123,3550,2.0,968569430


In [34]:
user_id = 123
recommended_movies = recommend_movies(ratings[ratings["userId"] == 123], model, top_k=5)
print(recommended_movies)

ValueError: could not determine the shape of object type 'DataFrame'

In [35]:
import torch
import numpy as np

def recommend_movies(user_ids, model, top_k=5):
    model.eval()
    user_ids = torch.LongTensor(user_ids["movieId"].values)  # Extracting the movieId column
    genres = torch.arange(num_genres).unsqueeze(0).expand(len(user_ids), -1)
    ratings = torch.zeros(len(user_ids), genres.shape[1])  # Modify the ratings tensor dimensions

    with torch.no_grad():
        predictions = model(user_ids, genres)
        _, movie_indices = torch.topk(predictions, top_k, dim=1)
        recommended_movies = movie_indices.tolist()

    return recommended_movies

user_id = 123
recommended_movies = recommend_movies(ratings[ratings["userId"] == user_id], model, top_k=5)
print(recommended_movies)


RuntimeError: Tensors must have same number of dimensions: got 2 and 3

In [14]:
movies = pd.read_csv('data/new_movies_metadata.csv')
movies.columns

Index(['adult', 'budget', 'homepage', 'id', 'imdb_id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'production_companies', 'release_date', 'revenue', 'runtime', 'status',
       'tagline', 'title', 'video', 'vote_average', 'vote_count',
       'name_belongs_to_collection', 'id_belongs_to_collection',
       'poster_path_belongs_to_collection',
       'backdrop_path_belongs_to_collection', 'name_genres', 'id_genres',
       'name_production_countries', 'iso_3166_1_production_companies',
       'name_production_companies', 'id_production_companies'],
      dtype='object')