In [44]:
import torch
print(torch.__version__)

2.5.1


In [45]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("MovieRecSys").getOrCreate()
print(spark.version)

4.0.0


In [46]:
from pyspark.sql.functions import col

# Read ratings.csv
df = spark.read.csv("ratings.csv", header=True, inferSchema=True)

# Select first 1000 unique users
unique_users = df.select("userId").distinct().limit(50)

# Keep only rows from these users
df_limited = df.join(unique_users, on="userId", how="inner")

df_limited.show(5)
print("Rows after limiting:", df_limited.count())
print("Unique users:", df_limited.select("userId").distinct().count())

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|    31|     47|   5.0|
|    31|     50|   4.0|
|    31|    296|   4.5|
|    31|    318|   5.0|
|    31|    608|   4.5|
+------+-------+------+
only showing top 5 rows
Rows after limiting: 8683
Unique users: 50


In [47]:
from pyspark.ml.feature import StringIndexer, MinMaxScaler, VectorAssembler
from pyspark.sql.functions import col, round as spark_round
from pyspark.ml.functions import vector_to_array

# Encode users
user_indexer = StringIndexer(inputCol="userId", outputCol="user")
df_small = user_indexer.fit(df_limited).transform(df_limited)

# Encode movies
movie_indexer = StringIndexer(inputCol="movieId", outputCol="movie")
df_small = movie_indexer.fit(df_small).transform(df_small)

# Min-Max on Ratings
assembler = VectorAssembler(inputCols=["rating"], outputCol="rating_vec")
df_vec = assembler.transform(df_small)

scaler = MinMaxScaler(inputCol="rating_vec", outputCol="rating_scaled")
scaler_model = scaler.fit(df_vec)
scaled_df = scaler_model.transform(df_vec)

# Flatten vector -> float, then round to 1 decimal
scaled_df = scaled_df.withColumn(
    "rating_scaled",
    spark_round(vector_to_array(col("rating_scaled"))[0], 1)
)

# Keep only necessary columns
df_final = scaled_df.select("user", "movie", "rating_scaled")
df_final.show(5)


+----+-----+-------------+
|user|movie|rating_scaled|
+----+-----+-------------+
|31.0| 15.0|          1.0|
|31.0|  7.0|          0.8|
|31.0|  3.0|          0.9|
|31.0|  4.0|          1.0|
|31.0| 12.0|          0.9|
+----+-----+-------------+
only showing top 5 rows


In [48]:
# 80% train, 20% test
train_df, test_df = df_final.randomSplit([0.8, 0.2], seed=42)

#Converting PySpark DF to Pandas to Tensors
train_pd = train_df.toPandas()
test_pd = test_df.toPandas()

X_train = torch.tensor(train_pd[["user", "movie"]].values)
y_train = torch.tensor(train_pd["rating_scaled"].values)

X_test = torch.tensor(test_pd[["user", "movie"]].values)
y_test = torch.tensor(test_pd["rating_scaled"].values)

print("Train count:", X_train.shape[0])
print("Train Label:",y_train.shape[0])
print("Test count:", X_test.shape[0])
print("Test Label:",y_test.shape[0])

Train count: 7010
Train Label: 7010
Test count: 1673
Test Label: 1673


In [49]:
from torch.utils.data import TensorDataset, DataLoader

# Create datasets
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
#print(train_dataset[2200])

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

In [88]:
import torch
import torch.nn as nn

class CollabFiltering(nn.Module):
    def __init__(self, n_users, n_movies, emb_dim=8,hidden=8,dropout_p=0.3):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.movie_emb = nn.Embedding(n_movies, emb_dim)

        self.mlp = nn.Sequential(
            nn.Linear(n_users * 2, hidden),  # user+movie embeddings concatenated
            nn.ReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(hidden, 1),
            nn.Sigmoid()  # predict rating
        )

    def forward(self,user,movie):
        u = self.user_emb(user)
        m = self.movie_emb(movie)
        
        # Concatenate embeddings (instead of dot product)
        x = torch.cat([u, m], dim=1)
        
        # Pass through MLP
        return self.mlp(x).squeeze()

In [89]:
n_users = df_final.select("user").distinct().count()
n_movies = df_final.select("movie").distinct().count()

model = CollabFiltering(n_users, n_movies, emb_dim=50, hidden=64)

In [106]:
loss_fn = nn.SmoothL1Loss()  # regression on ratings
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
#optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-5)
#optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, weight_decay=1e-5)
#optimizer = torch.optim.Adagrad(model.parameters(), lr=0.01, weight_decay=1e-5)


In [107]:
n_epochs = 20
train_losses, test_losses = [], []
train_rmses, test_rmses = [], []

for epoch in range(n_epochs):
    # ---------- Training ----------
    model.train()
    total_loss, total_sq_error, total_samples = 0, 0, 0
    
    for X_batch, y_batch in train_loader:
        user_batch = X_batch[:, 0].long()
        movie_batch = X_batch[:, 1].long()
        
        preds = model(user_batch, movie_batch).squeeze()
        loss = loss_fn(preds, y_batch.float())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_sq_error += torch.sum((preds - y_batch) ** 2).item()
        total_samples += len(y_batch)
    
    train_loss = total_loss / len(train_loader)
    train_rmse = (total_sq_error / total_samples) ** 0.5
    train_losses.append(train_loss)
    train_rmses.append(train_rmse)
    
    # ---------- Testing ----------
    model.eval()
    total_loss, total_sq_error, total_samples = 0, 0, 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            user_batch = X_batch[:, 0].long()
            movie_batch = X_batch[:, 1].long()
            
            preds = model(user_batch, movie_batch).squeeze()
            loss = loss_fn(preds, y_batch.float())
            
            total_loss += loss.item()
            total_sq_error += torch.sum((preds - y_batch) ** 2).item()
            total_samples += len(y_batch)
    
    test_loss = total_loss / len(test_loader)
    test_rmse = (total_sq_error / total_samples) ** 0.5
    test_losses.append(test_loss)
    test_rmses.append(test_rmse)
    
    # ---------- Summary ----------
    print(f"\nEpoch {epoch+1}/{n_epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train RMSE: {train_rmse:.4f}")
    print(f"Test  Loss: {test_loss:.4f}, Test  RMSE: {test_rmse:.4f}")



Epoch 1/20
Train Loss: 0.0020, Train RMSE: 0.0640
Test  Loss: 0.0284, Test  RMSE: 0.2264

Epoch 2/20
Train Loss: 0.0028, Train RMSE: 0.0742
Test  Loss: 0.0284, Test  RMSE: 0.2264

Epoch 3/20
Train Loss: 0.0024, Train RMSE: 0.0689
Test  Loss: 0.0284, Test  RMSE: 0.2264

Epoch 4/20
Train Loss: 0.0023, Train RMSE: 0.0677
Test  Loss: 0.0291, Test  RMSE: 0.2284

Epoch 5/20
Train Loss: 0.0021, Train RMSE: 0.0655
Test  Loss: 0.0291, Test  RMSE: 0.2287

Epoch 6/20
Train Loss: 0.0021, Train RMSE: 0.0654
Test  Loss: 0.0292, Test  RMSE: 0.2289

Epoch 7/20
Train Loss: 0.0021, Train RMSE: 0.0646
Test  Loss: 0.0291, Test  RMSE: 0.2284

Epoch 8/20
Train Loss: 0.0021, Train RMSE: 0.0642
Test  Loss: 0.0293, Test  RMSE: 0.2296

Epoch 9/20
Train Loss: 0.0020, Train RMSE: 0.0631
Test  Loss: 0.0292, Test  RMSE: 0.2286

Epoch 10/20
Train Loss: 0.0020, Train RMSE: 0.0632
Test  Loss: 0.0291, Test  RMSE: 0.2283

Epoch 11/20
Train Loss: 0.0019, Train RMSE: 0.0622
Test  Loss: 0.0291, Test  RMSE: 0.2280

Epoch 1