In [11]:
import pandas as pd
import zipfile
import urllib.request
from sklearn.decomposition import TruncatedSVD
import numpy as np

# Download the dataset
url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
output_file = 'ml-100k.zip'
urllib.request.urlretrieve(url, output_file)

# Extract the dataset
with zipfile.ZipFile(output_file, 'r') as zip_ref:
    zip_ref.extractall('ml-100k')


In [12]:
#panda dataframe
u_data_path = 'ml-100k/ml-100k/u.data'
u_item_path = 'ml-100k/ml-100k/u.item'

#naming
data_columns = ['UserId', 'ItemId', 'Rating', 'Timestamp']
item_columns = ['ItemId', 'Title'] + [f'col{i}' for i in range(22)]  # 24 columns in total

# Load data
ratings = pd.read_csv(u_data_path, sep='\t', names=data_columns)
items = pd.read_csv(u_item_path, sep='|', names=item_columns, encoding='latin-1')
items = items[['ItemId', 'Title']]
print(ratings.head())
print(items.head())


   UserId  ItemId  Rating  Timestamp
0     196     242       3  881250949
1     186     302       3  891717742
2      22     377       1  878887116
3     244      51       2  880606923
4     166     346       1  886397596
   ItemId              Title
0       1   Toy Story (1995)
1       2   GoldenEye (1995)
2       3  Four Rooms (1995)
3       4  Get Shorty (1995)
4       5     Copycat (1995)


In [13]:

# Keep only necessary columns from items
items = items[['ItemId', 'Title']]

# Filter users and items to create a manageable dataset
unique_users = ratings['UserId'].unique()
unique_items = ratings['ItemId'].unique()

print(f'Number of unique Users: {len(unique_users)}')
print(f'Number of unique Items: {len(unique_items)}')



Number of unique Users: 943
Number of unique Items: 1682


In [14]:
# Filter users and items to ensure they have a minimum number of ratings
min_user_ratings = 50
min_item_ratings = 50

filtered_users = ratings['UserId'].value_counts()[ratings['UserId'].value_counts() >= min_user_ratings].index
filtered_items = ratings['ItemId'].value_counts()[ratings['ItemId'].value_counts() >= min_item_ratings].index

filtered_ratings = ratings[ratings['UserId'].isin(filtered_users) & ratings['ItemId'].isin(filtered_items)]

print(f'Filtered number of unique Users: {filtered_users.shape[0]}')
print(f'Filtered number of unique Items: {filtered_items.shape[0]}')

# Create the utility matrix
utility_matrix = filtered_ratings.pivot_table(values='Rating', index='UserId', columns='ItemId', fill_value=0)
print(f'Utility matrix shape: {utility_matrix.shape}')

Filtered number of unique Users: 568
Filtered number of unique Items: 603
Utility matrix shape: (568, 603)


In [15]:
# Apply SVD
svd = TruncatedSVD(n_components=20)
latent_matrix = svd.fit_transform(utility_matrix)

# Function for  recommendations
def get_recommendations(item_id, latent_matrix, item_names, num_recommendations=5):
    item_index = utility_matrix.columns.get_loc(item_id)
    item_latent_vector = svd.components_[:, item_index]
    
    # Calculate similarities with other items
    similarities = np.dot(svd.components_.T, item_latent_vector)
    
    # Get top N recommendations
    similar_indices = np.argsort(similarities)[-num_recommendations-1:-1][::-1]
    similar_items = [utility_matrix.columns[i] for i in similar_indices]
    
    # Get item names
    similar_item_names = item_names[item_names['ItemId'].isin(similar_items)]['Title']
    return similar_item_names

In [16]:
# Test1
sample_item_id = filtered_items[0]
recommendations = get_recommendations(sample_item_id, latent_matrix, items)
print(f'Recommended Items for {items[items["ItemId"] == sample_item_id]["Title"].values[0]}:')
print(recommendations)

Recommended Items for Star Wars (1977):
0                     Toy Story (1995)
126              Godfather, The (1972)
171    Empire Strikes Back, The (1980)
173     Raiders of the Lost Ark (1981)
180          Return of the Jedi (1983)
Name: Title, dtype: object
