In [2]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
# Download the latest version of the Netflix Prize dataset and remember where
# kagglehub placed it; later cells build file paths from `path`.
dataset_handle = "netflix-inc/netflix-prize-data"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset folder:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset folder: C:\Users\Nirmal Choyal\.cache\kagglehub\datasets\netflix-inc\netflix-prize-data\versions\2


In [3]:
import os
import csv
import pandas as pd

In [4]:
# Show which files the downloaded dataset folder contains.
print()
print("Files inside dataset folder:")
dataset_files = os.listdir(path)
print(dataset_files)


Files inside dataset folder:
['combined_data_1.txt', 'combined_data_2.txt', 'combined_data_3.txt', 'combined_data_4.txt', 'movie_titles.csv', 'probe.txt', 'qualifying.txt', 'README']


In [10]:
# Peek at the raw layout of combined_data_1.txt by printing its first 5 lines
# (surrounding whitespace stripped).
file_path = os.path.join(path, "combined_data_1.txt")
with open(file_path, "r") as f:
    preview = [f.readline().strip() for _ in range(5)]
print(*preview, sep="\n")

1:
1488844,3,2005-09-06
822109,5,2005-05-13
885013,4,2005-10-19
30878,4,2005-12-26


In [6]:
# Read movie titles from movie_titles.csv into `movies` (MovieID, Year, Title).
path1 = os.path.join(path, "movie_titles.csv")

# Year field values that mean "unknown".
# NOTE(review): the Netflix Prize file is understood to write the literal NULL
# for a missing release year rather than an empty field; confirm against the data.
MISSING_YEAR_MARKERS = {"", "NULL"}

rows = []
# newline="" is what the csv module asks for on file objects it reads from.
with open(path1, "r", encoding="latin-1", newline="") as f:
    # Titles may contain commas that are not quoted (which is why the tail of
    # each row is re-joined below), so the file is not a quoted CSV. QUOTE_NONE
    # keeps any '"' inside a title as a literal character instead of letting it
    # open a quoted field that could swallow the following lines.
    reader = csv.reader(f, quoting=csv.QUOTE_NONE)
    for row in reader:
        if not row:
            continue  # blank line: csv.reader yields an empty list
        movie_id = int(row[0])
        # Year is kept as a string (as before); only the missing markers become None.
        year = None if row[1] in MISSING_YEAR_MARKERS else row[1]
        title = ",".join(row[2:])   # re-assemble titles that were split on commas
        rows.append([movie_id, year, title])

movies = pd.DataFrame(rows, columns=["MovieID", "Year", "Title"])
movies.head()


Unnamed: 0,MovieID,Year,Title
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW


## Creating a subset of the dataset

In [14]:
# Load the first MAX_ROWS ratings from combined_data_1.txt into `ratings`.
#
# File layout: a "<MovieID>:" header line, followed by one
# "<UserID>,<Rating>,<Date>" line per rating of that movie.
# Rows are taken in file order and reading stops at MAX_ROWS, so the sample
# only covers the movies whose blocks appear first in the file.
# (`os` and `pd` come from the notebook's import cell.)
file_path = os.path.join(path, "combined_data_1.txt")

rows = []
movie_id = None
MAX_ROWS = 500000   # adjust: 100k / 500k / 1M

with open(file_path, "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # blank line: nothing to parse
        if line.endswith(":"):
            # Header line: every rating until the next header belongs to this movie.
            movie_id = int(line[:-1])
            continue
        if movie_id is None:
            # A rating with no preceding header would otherwise get MovieID=None.
            raise ValueError(f"Rating line found before any movie header: {line!r}")

        user_id, rating, date = line.split(",")
        rows.append([int(user_id), movie_id, int(rating), date])

        if len(rows) >= MAX_ROWS:
            break

ratings = pd.DataFrame(rows, columns=["UserID", "MovieID", "Rating", "Date"])
print("Loaded ratings shape:", ratings.shape)

# Possible next step (not executed here): attach titles to the ratings with
#   merged = ratings.merge(movies, on="MovieID", how="left")
ratings.head()

Loaded ratings shape: (500000, 4)


Unnamed: 0,UserID,MovieID,Rating,Date
0,1488844,1,3,2005-09-06
1,822109,1,5,2005-05-13
2,885013,1,4,2005-10-19
3,30878,1,4,2005-12-26
4,823519,1,3,2004-05-03


In [15]:
# Keep only the ratings made by the 10,000 users with the most ratings in the sample.
user_rating_counts = ratings["UserID"].value_counts()
top_users = user_rating_counts.index[:10000]
from_top_user = ratings["UserID"].isin(top_users)
ratings_u = ratings.loc[from_top_user].copy()

print("After top users filter:", ratings_u.shape)


After top users filter: (95701, 4)


In [16]:
# Among the top users' ratings, keep only the 1,000 most-rated movies.
top_movies = ratings_u["MovieID"].value_counts().index[:1000]
for_top_movie = ratings_u["MovieID"].isin(top_movies)
ratings_um = ratings_u.loc[for_top_movie].copy()

print("After top movies filter:", ratings_um.shape)


After top movies filter: (95701, 4)


In [18]:
# Report how many distinct users / movies survive each filtering stage.
stage_checks = [
    ("Unique users in ratings:", ratings, "UserID"),
    ("Unique movies in ratings:", ratings, "MovieID"),
    ("Unique users after top-users:", ratings_u, "UserID"),
    ("Unique movies after top-users:", ratings_u, "MovieID"),
    ("Unique movies after top-movies:", ratings_um, "MovieID"),
]
for label, frame, column in stage_checks:
    print(label, frame[column].nunique())


Unique users in ratings: 215008
Unique movies in ratings: 148
Unique users after top-users: 10000
Unique movies after top-users: 148
Unique movies after top-movies: 66


In [17]:
# Alternative movie filter: keep movies with at least Y ratings from the top users.
# NOTE: this cell reassigns `ratings_um` and `top_movies`, replacing the values
# produced by the earlier top-movies cell. Its execution counter (In [17]) is
# lower than that of the summary cell above it (In [18]).
Y = 200  # example threshold: minimum ratings per movie

movie_counts = ratings_u["MovieID"].value_counts()
valid_movies = movie_counts.loc[movie_counts.ge(Y)].index

has_enough_ratings = ratings_u["MovieID"].isin(valid_movies)
ratings_um = ratings_u.loc[has_enough_ratings].copy()

# If more than 1000 movies remain, keep only the 1000 most-rated ones.
top_movies = ratings_um["MovieID"].value_counts().index[:1000]
in_top_movies = ratings_um["MovieID"].isin(top_movies)
ratings_um = ratings_um.loc[in_top_movies].copy()
