In [1]:
import pandas as pd
import numpy as np
# from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import re
import os


In [3]:
ratings_df = pd.read_csv('/home/manishn/recommend/data/ml-latest-small/ratings.csv')

# Load the movies data
movies_df = pd.read_csv('/home/manishn/recommend/data/ml-latest-small/movies.csv')

# Merge the DataFrames on 'movieId'
merged_df = pd.merge(ratings_df, movies_df, on='movieId')

# Optionally, reorder the columns if needed
merged_df = merged_df[['userId', 'movieId', 'title', 'genres', 'rating', 'timestamp']]


In [4]:

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('merged_ratings.csv', index=False)
merged_df
data=pd.DataFrame()
data=merged_df
# data=pd.read_csv('/home/manishn/recommend/data/ml-latest-small/ratings.csv')
data.head()


Unnamed: 0,userId,movieId,title,genres,rating,timestamp
0,1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,964982703
1,5,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,847434962
2,7,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.5,1106635946
3,15,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.5,1510577970
4,17,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.5,1305696483


In [55]:
MatrixData=data.drop(['timestamp','genres','title'], axis=1)
MovieUserMatrix = MatrixData.pivot_table(index='movieId',columns='userId',values='rating')
MovieUserMatrix.head()
MovieUserMatrix_Y=MovieUserMatrix.fillna(5)
MovieUserMatrix_Y
MovieUserMatrix_R=MovieUserMatrix.copy()
MovieUserMatrix_R=MovieUserMatrix_R.where(~MovieUserMatrix_R.notna(), 1)
MovieUserMatrix_R=MovieUserMatrix_R.fillna(0)
MovieUserMatrix_R
Y=MovieUserMatrix_Y.values #converting to numpy array
R=MovieUserMatrix_R.values
Y.shape

(9724, 610)

In [56]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

def split_data(Y, R, test_size=0.2):
    num_movies, num_users = Y.shape
    test_indices = np.random.choice(num_movies, size=int(num_movies * test_size), replace=False)
    
    # Initialize the test set
    Y_test = np.full_like(Y, fill_value=-1, dtype=np.float32)  # Use -1 for missing ratings
    R_test = np.zeros_like(R, dtype=np.float32)  # Binary indicator for test set
    
    # Fill the test set
    for idx in test_indices:
        Y_test[idx, :] = Y[idx, :]
        R_test[idx, :] = R[idx, :]
    
    # Create the training set by masking the test indices
    Y_train = Y.copy()
    R_train = R.copy()
    
    Y_train[test_indices, :] = -1  # Set test entries to -1 in the training set
    R_train[test_indices, :] = 0    # Set test entries to 0 in the training set
    
    return Y_train, R_train, Y_test, R_test


In [None]:

Y_train, R_train, Y_test, R_test = split_data(Y, R)

# Step 2: Convert the NumPy arrays to PyTorch tensors
Y_train_tensor = torch.FloatTensor(Y_train)
R_train_tensor = torch.FloatTensor(R_train)
Y_test_tensor = torch.FloatTensor(Y_test)
R_test_tensor = torch.FloatTensor(R_test)

# Step 3: Create a PyTorch Dataset and DataLoader for training
train_dataset = TensorDataset(Y_train_tensor, R_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)


In [57]:
class Autoencoder(nn.Module):
    def __init__(self, num_users, latent_dim,dropout=0.2):
        super(Autoencoder, self).__init__()
        self.dropout=nn.Dropout(dropout)
        self.encoder = nn.Linear(num_users, latent_dim)
        self.decoder = nn.Linear(latent_dim, num_users)
    
    def forward(self, x):
        encoded = self.dropout(torch.relu(self.encoder(x)))
        decoded = torch.sigmoid(self.dropout(self.decoder(encoded)))
        return decoded


In [61]:
num_movies, num_users = Y.shape
latent_dim = 200

model = Autoencoder(num_users, latent_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
Y_train_tensor = Y_train_tensor.to(device)
R_train_tensor = R_train_tensor.to(device)

# Step 5: Training loop
num_epochs = 500

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    
    for batch_Y, batch_R in train_loader:
        batch_Y = batch_Y.to(device)
        batch_R = batch_R.to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        output = model(batch_Y)
        
        # Only compute the loss where R == 1 (i.e., where the user provided a rating)
        loss = criterion(output * batch_R, batch_Y * batch_R)
        epoch_loss += loss.item()
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}')

# Step 6: Testing the model after training
model.eval()
with torch.no_grad():
    predicted_ratings = model(Y_test_tensor.to(device))  # Get predicted ratings for the test set
    predicted_ratings = predicted_ratings.cpu().numpy()



Epoch 1/500, Loss: 0.1129
Epoch 2/500, Loss: 0.1083
Epoch 3/500, Loss: 0.1082
Epoch 4/500, Loss: 0.1082
Epoch 5/500, Loss: 0.1082
Epoch 6/500, Loss: 0.1083
Epoch 7/500, Loss: 0.1082
Epoch 8/500, Loss: 0.1082
Epoch 9/500, Loss: 0.1082
Epoch 10/500, Loss: 0.1083
Epoch 11/500, Loss: 0.1082
Epoch 12/500, Loss: 0.1083
Epoch 13/500, Loss: 0.1082
Epoch 14/500, Loss: 0.1082
Epoch 15/500, Loss: 0.1083
Epoch 16/500, Loss: 0.1082
Epoch 17/500, Loss: 0.1083
Epoch 18/500, Loss: 0.1083
Epoch 19/500, Loss: 0.1082
Epoch 20/500, Loss: 0.1082
Epoch 21/500, Loss: 0.1082
Epoch 22/500, Loss: 0.1082
Epoch 23/500, Loss: 0.1082
Epoch 24/500, Loss: 0.1082
Epoch 25/500, Loss: 0.1082
Epoch 26/500, Loss: 0.1082
Epoch 27/500, Loss: 0.1082
Epoch 28/500, Loss: 0.1083
Epoch 29/500, Loss: 0.1082
Epoch 30/500, Loss: 0.1082
Epoch 31/500, Loss: 0.1082
Epoch 32/500, Loss: 0.1082
Epoch 33/500, Loss: 0.1081
Epoch 34/500, Loss: 0.1082
Epoch 35/500, Loss: 0.1082
Epoch 36/500, Loss: 0.1082
Epoch 37/500, Loss: 0.1083
Epoch 38/5

In [78]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Assuming Y_train_tensor and R_train_tensor are already defined
num_movies, num_users = Y.shape
latent_dim = 200

model = Autoencoder(num_users, latent_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
Y_train_tensor = Y_train_tensor.to(device)
R_train_tensor = R_train_tensor.to(device)

# Step 5: Training loop
num_epochs = 500

# Define RMSE computation function
def computeRMSE(Y, R, Y_pred):
    error = (Y_pred - Y) * R  # Only consider entries where R == 1
    squared_error = np.square(error)
    mse = np.sum(squared_error) / np.count_nonzero(R)
    return np.sqrt(mse)

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    
    for batch_Y, batch_R in train_loader:
        batch_Y = batch_Y.to(device)
        batch_R = batch_R.to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        output = model(batch_Y)
        
        # Only compute the loss where R == 1 (i.e., where the user provided a rating)
        loss = criterion(output * batch_R, batch_Y * batch_R)
        epoch_loss += loss.item()
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    
    # Calculate training RMSE after each epoch
    with torch.no_grad():
        predicted_ratings = model(Y_train_tensor)  # Get predicted ratings for the training set
        predicted_ratings = predicted_ratings.cpu().numpy()  # Convert to NumPy for RMSE calculation
        train_rmse = computeRMSE(Y_train_tensor.cpu().numpy(), R_train_tensor.cpu().numpy(), predicted_ratings)
    
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(train_loader):.4f}, Train RMSE: {train_rmse:.4f}')

# Step 6: Testing the model after training
model.eval()
with torch.no_grad():
    predicted_ratings = model(Y_test_tensor.to(device))  # Get predicted ratings for the test set
    predicted_ratings = predicted_ratings.cpu().numpy()

# Step 8: Compute RMSE for the test set
rmse = computeRMSE(Y_test, R_test, predicted_ratings)
print(f"Test RMSE: {rmse:.4f}")


Epoch 1/500, Loss: 0.1129, Train RMSE: 2.8166
Epoch 2/500, Loss: 0.1085, Train RMSE: 2.8146
Epoch 3/500, Loss: 0.1085, Train RMSE: 2.8152
Epoch 4/500, Loss: 0.1086, Train RMSE: 2.8144
Epoch 5/500, Loss: 0.1085, Train RMSE: 2.8151
Epoch 6/500, Loss: 0.1086, Train RMSE: 2.8151
Epoch 7/500, Loss: 0.1085, Train RMSE: 2.8157
Epoch 8/500, Loss: 0.1086, Train RMSE: 2.8155
Epoch 9/500, Loss: 0.1085, Train RMSE: 2.8159
Epoch 10/500, Loss: 0.1085, Train RMSE: 2.8159
Epoch 11/500, Loss: 0.1085, Train RMSE: 2.8149
Epoch 12/500, Loss: 0.1086, Train RMSE: 2.8148
Epoch 13/500, Loss: 0.1086, Train RMSE: 2.8157
Epoch 14/500, Loss: 0.1085, Train RMSE: 2.8153
Epoch 15/500, Loss: 0.1086, Train RMSE: 2.8156
Epoch 16/500, Loss: 0.1085, Train RMSE: 2.8170
Epoch 17/500, Loss: 0.1085, Train RMSE: 2.8174
Epoch 18/500, Loss: 0.1085, Train RMSE: 2.8158
Epoch 19/500, Loss: 0.1085, Train RMSE: 2.8149
Epoch 20/500, Loss: 0.1085, Train RMSE: 2.8153
Epoch 21/500, Loss: 0.1085, Train RMSE: 2.8148
Epoch 22/500, Loss: 0.

In [79]:

# Convert predicted ratings back to NumPy for RMSE calculation
# predicted_ratings = predicted_ratings.cpu().numpy()

# Step 7: Define RMSE computation function
def computeTestError(Y_test, R_test, Y_pred):
    error = (Y_pred - Y_test) * R_test  # Only consider entries where R_test == 1
    squared_error = np.square(error)
    mse = np.sum(squared_error) / np.count_nonzero(R_test)
    return np.sqrt(mse)

# Step 8: Compute RMSE
rmse = computeTestError(Y_test, R_test, predicted_ratings)
print(f"Test RMSE: {rmse}")


Test RMSE: 2.6885017663140407


**improved model**

In [72]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

def split_data(Y, R, test_size=0.2):
    num_movies, num_users = Y.shape
    test_indices = np.random.choice(num_movies, size=int(num_movies * test_size), replace=False)
    
    # Initialize the test set
    Y_test = np.full_like(Y, fill_value=-1, dtype=np.float32)  # Use -1 for missing ratings
    R_test = np.zeros_like(R, dtype=np.float32)  # Binary indicator for test set
    
    # Fill the test set
    for idx in test_indices:
        Y_test[idx, :] = Y[idx, :]
        R_test[idx, :] = R[idx, :]
    
    # Create the training set by masking the test indices
    Y_train = Y.copy()
    R_train = R.copy()
    
    Y_train[test_indices, :] = -1  # Set test entries to -1 in the training set
    R_train[test_indices, :] = 0    # Set test entries to 0 in the training set
    
    return Y_train, R_train, Y_test, R_test

Y_train, R_train, Y_test, R_test = split_data(Y, R)

# Step 2: Convert the NumPy arrays to PyTorch tensors
Y_train_tensor = torch.FloatTensor(Y_train)
R_train_tensor = torch.FloatTensor(R_train)
Y_test_tensor = torch.FloatTensor(Y_test)
R_test_tensor = torch.FloatTensor(R_test)

# Step 3: Create a PyTorch Dataset and DataLoader for training
train_dataset = TensorDataset(Y_train_tensor, R_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)


In [71]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Define the Improved Autoencoder model
class ImprovedAutoencoder(nn.Module):
    def __init__(self, num_users, latent_dim, dropout=0.2):
        super(ImprovedAutoencoder, self).__init__()
        self.dropout = nn.Dropout(dropout)
        
        # Encoder with multiple hidden layers
        self.encoder = nn.Sequential(
            nn.Linear(num_users, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, latent_dim)
        )
        
        # Decoder with residual connections
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Linear(512, num_users),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.dropout(self.encoder(x))
        decoded = self.dropout(self.decoder(encoded))
        return decoded + x  # Residual connection

In [76]:



# Initialize the model, loss function, and optimizer
num_movies, num_users = Y.shape  # (9724, 610)
latent_dim = 200  # Latent feature size

model = ImprovedAutoencoder(num_users, latent_dim)
criterion = nn.MSELoss()  # Mean Squared Error Loss
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

Y_train_tensor = Y_train_tensor.to(device)
R_train_tensor = R_train_tensor.to(device)

Y_tensor = Y_train_tensor.to(device)
R_tensor = R_train_tensor.to(device)

# Training loop with early stopping
num_epochs = 100
best_loss = float('inf')
patience = 20
patience_counter = 0

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    
    for batch_Y, batch_R in train_loader:
        batch_Y = batch_Y.to(device)
        batch_R = batch_R.to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        output = model(batch_Y)
        
        # Only compute the loss where R == 1 (i.e., where the user provided a rating)
        loss = criterion(output * batch_R, batch_Y * batch_R)
        epoch_loss += loss.item()
        
        # Backward pass and optimizationa
        loss.backward()
        optimizer.step()
    
    # Update the learning rate scheduler
    scheduler.step(epoch_loss)
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}')

    # Early stopping
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        patience_counter = 0
        # Save model
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping!")
            break

# Load the best model for testing
model.load_state_dict(torch.load('best_model.pth'))

# Testing the model after training
model.eval()
with torch.no_grad():
    reconstructed = model(Y_tensor)
    test_loss = criterion(reconstructed * R_tensor, Y_tensor * R_tensor)
    print(f'Test Loss: {test_loss.item():.4f}')


    


Epoch 1/100, Loss: 0.0006
Epoch 2/100, Loss: 0.0000
Epoch 3/100, Loss: 0.0001
Epoch 4/100, Loss: 0.0001
Epoch 5/100, Loss: 0.0001
Epoch 6/100, Loss: 0.0001
Epoch 7/100, Loss: 0.0001
Epoch 8/100, Loss: 0.0001
Epoch 9/100, Loss: 0.0001
Epoch 10/100, Loss: 0.0001
Epoch 11/100, Loss: 0.0001
Epoch 12/100, Loss: 0.0001
Epoch 13/100, Loss: 0.0001
Epoch 14/100, Loss: 0.0001
Epoch 15/100, Loss: 0.0001
Epoch 16/100, Loss: 0.0001
Epoch 17/100, Loss: 0.0001
Epoch 18/100, Loss: 0.0001
Epoch 19/100, Loss: 0.0001
Epoch 20/100, Loss: 0.0001
Epoch 21/100, Loss: 0.0001
Epoch 22/100, Loss: 0.0001
Early stopping!
Test Loss: 0.0008


In [77]:

# Convert predicted ratings back to NumPy for RMSE calculation
reconstructed = reconstructed.cpu().numpy()

# Step 7: Define RMSE computation function
def computeTestError(Y_test, R_test, Y_pred):
    error = (Y_pred - Y_test) * R_test  # Only consider entries where R_test == 1
    squared_error = np.square(error)
    mse = np.sum(squared_error) / np.count_nonzero(R_test)
    return np.sqrt(mse)

# Step 8: Compute RMSE
rmse = computeTestError(Y_test, R_test, reconstructed)
print(f"Test RMSE: {rmse}")


Test RMSE: 4.151440080718873
