In [1]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy

In [62]:
ids = pd.read_csv("ml-32m/links.csv")

In [None]:
print(ids)
print(ids.isnull().sum())

       movieId    imdbId     tmdbId
0            1    114709      862.0
1            2    113497     8844.0
2            3    113228    15602.0
3            4    114885    31357.0
4            5    113041    11862.0
...        ...       ...        ...
87580   292731  26812510  1032473.0
87581   292737  14907358   986674.0
87582   292753  12388280   948139.0
87583   292755     64027   182776.0
87584   292757  28995566  1174725.0

[87585 rows x 3 columns]
movieId      0
imdbId       0
tmdbId     124
dtype: int64


In [66]:
df_cleaned = ids.dropna(subset=["tmdbId"])

# Save the cleaned CSV file
output_path = "ml-32m/links_cleaned.csv"
df_cleaned.to_csv(output_path, index=False)

In [67]:
print(ids.isnull().sum())
print(df_cleaned.isnull().sum())

movieId      0
imdbId       0
tmdbId     124
dtype: int64
movieId    0
imdbId     0
tmdbId     0
dtype: int64


In [None]:
column_names = ["user_id", "item_id", "rating", "timestamp"]
dtype = {"user_id": int, "item_id": int, "rating": float, "timestamp": int}
ratings = pd.read_csv("ml-32m/ratings.csv", names=column_names, nrows=500000, dtype=dtype, header=0)

ratings["rating"] = pd.to_numeric(ratings["rating"], errors="coerce")  # Converts non-numeric values to NaN
ratings = ratings.dropna(subset=["rating"])

In [45]:


movie_columns = [
    "item_id",
    "title",
    "genres"
]
df_movies = pd.read_csv("ml-32m/movies.csv", names=movie_columns, encoding="latin-1", header=0)
df_movies["item_id"] = df_movies["item_id"].astype(int)
movie_titles = dict(zip(df_movies["item_id"], df_movies["title"]))
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[["user_id", "item_id", "rating"]], reader)

# Split the data into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Use SVD (Singular Value Decomposition) algorithm
model = SVD()

# Train the model on the training set
model.fit(trainset)

# Predict ratings for the test set
predictions = model.test(testset)

# Compute and print the accuracy
accuracy.rmse(predictions)

RMSE: 0.8531


0.8531334767970149

In [46]:
print(df_movies)

       item_id                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
87580   292731           The Monroy Affaire (2022)   
87581   292737          Shelter in Solitude (2023)   
87582   292753                         Orca (2023)   
87583   292755              The Angry Breed (1968)   
87584   292757           Race to the Summit (2023)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...                              

In [48]:
print(movie_titles[589])

Terminator 2: Judgment Day (1991)


In [55]:
new_user_id = max(ratings["user_id"]) + 1  # Automatically assign a new user ID (if necessary)

# Example preferences for the new user (you can change these based on the actual preferences)
new_user_preferences = {
    1: 4.0,  # Toy Story (1995)
    2: 5.0,  # Jumanji (1995)
    3: 3.5   # Grumpier Old Men (1995)
}

# Add these ratings to the original ratings DataFrame (this could be a temporary addition)
new_ratings = pd.DataFrame(list(new_user_preferences.items()), columns=["item_id", "rating"])
new_ratings["user_id"] = new_user_id

# Combine with the existing ratings data
ratings = pd.concat([ratings, new_ratings], ignore_index=True)



In [56]:
print(ratings)

        user_id  item_id  rating    timestamp
0             1       17     4.0  944249077.0
1             1       25     1.0  944250228.0
2             1       29     2.0  943230976.0
3             1       30     5.0  944249077.0
4             1       32     5.0  943228858.0
...         ...      ...     ...          ...
500001     3239        2     5.0          NaN
500002     3239        3     3.5          NaN
500003     3240        1     4.0          NaN
500004     3240        2     5.0          NaN
500005     3240        3     3.5          NaN

[500006 rows x 4 columns]


In [60]:
# Function to recommend top N items for a given user
def recommend(user_id, num_recommendations=20):
    # Get a list of all item_ids
    all_items = ratings["item_id"].unique()
    print(len(all_items))
    # Predict ratings for all items
    predicted_ratings = [model.predict(user_id, item_id).est for item_id in all_items]

    # Create a list of item_id and their predicted ratings
    item_ratings = list(zip(all_items, predicted_ratings))

    # Sort the items by predicted ratings in descending order
    item_ratings.sort(key=lambda x: x[1], reverse=True)

    # Get the top N items
    top_items = item_ratings[:num_recommendations]
    # Convert item_ids to movie titles
    top_items_with_titles = [
        (movie_titles[item_id], rating) for item_id, rating in top_items
    ]

    # Return the top N recommended items with titles
    return top_items_with_titles


# Example usage: Recommend top 5 items for user with user_id 196
user_id = 2
recommendations = recommend(new_user_id, 20)
print("Top 5 recommendations for user {}:".format(user_id))
for title, rating in recommendations:
    print(f"{title}: Predicted Rating {rating:.2f}")

84432
Top 5 recommendations for user 2:
Band of Brothers (2001): Predicted Rating 4.46
Shawshank Redemption, The (1994): Predicted Rating 4.44
Planet Earth (2006): Predicted Rating 4.39
Parasite (2019): Predicted Rating 4.38
Witness for the Prosecution (1957): Predicted Rating 4.35
Big Sleep, The (1946): Predicted Rating 4.32
Killing Fields, The (1984): Predicted Rating 4.31
Wild Strawberries (SmultronstÃ¤llet) (1957): Predicted Rating 4.31
Thin Blue Line, The (1988): Predicted Rating 4.31
Persona (1966): Predicted Rating 4.31
Godfather, The (1972): Predicted Rating 4.31
12 Angry Men (1957): Predicted Rating 4.30
All About Eve (1950): Predicted Rating 4.30
Sound of Metal (2019): Predicted Rating 4.29
His Girl Friday (1940): Predicted Rating 4.29
Double Indemnity (1944): Predicted Rating 4.29
Schindler's List (1993): Predicted Rating 4.29
Raise the Red Lantern (Da hong deng long gao gao gua) (1991): Predicted Rating 4.28
Rear Window (1954): Predicted Rating 4.28
Day of the Jackal, The (

In [61]:
def predict_for_new_user(new_user_id, new_user_ratings, model, all_movies):
    """
    Generate recommendations for a new user.

    Parameters:
    - new_user_id: ID of the new user (can be any integer)
    - new_user_ratings: Dictionary {item_id: rating} of movies rated by the new user
    - model: Trained SVD model
    - all_movies: List of all movies in the dataset

    Returns:
    - Top 10 recommended movie IDs
    """
    
    # Create a fake testset for the model with known ratings
    fake_testset = [(new_user_id, item_id, rating) for item_id, rating in new_user_ratings.items()]
    
    # Pretend this new user has watched some movies (helps calibrate predictions)
    model.test(fake_testset)
    
    # Get all movies the user hasn't rated yet
    unrated_movies = [item_id for item_id in all_movies if item_id not in new_user_ratings]
    
    # Predict ratings for all unrated movies
    predicted_ratings = {
        item_id: model.predict(new_user_id, item_id).est for item_id in unrated_movies
    }
    
    # Sort movies by predicted rating and return top 10
    recommendations = sorted(predicted_ratings.items(), key=lambda x: x[1], reverse=True)
    return recommendations[:10]  # Return top 10 movie recommendations


new_user_ratings = {1: 5, 50: 4, 100: 3}  # User likes movie ID 1, 50, 100

# List of all movies in the dataset
all_movies = ratings["item_id"].unique()

# Get recommendations
top_movies = predict_for_new_user(99999, new_user_ratings, model, all_movies)  # 99999 is a fake user ID
print(top_movies)

[(170705, 4.460166787708317), (318, 4.439737491688534), (159817, 4.386524624073444), (202439, 4.382842156535695), (5008, 4.3532903845526585), (1284, 4.320604114898792), (1299, 4.312329842064727), (5147, 4.308010802806914), (1189, 4.307981603625407), (7327, 4.305945417160865)]
