In [90]:
import pandas as pd
import glob
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import numpy as np
from math import sqrt
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds


## Collecting Data

In [52]:

# 1. Load ratings data
ratings = pd.read_csv(r'C:\Projects\elevvo\Movie Recommendation (Task 5)\ml-100k\u.data',
    sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

# 2. Load movie titles
movies = pd.read_csv(r'C:\Projects\elevvo\Movie Recommendation (Task 5)\ml-100k\u.item',
                        sep='|', encoding='latin-1', header=None,
                        names=['item_id', 'title'], usecols=[0, 1])

# 3. Merge ratings and movies
df = pd.merge(ratings, movies, on='item_id')

# 4. Create user-movie rating matrix
user_movie_matrix = df.pivot_table(index='user_id', columns='title', values='rating')
matrix_filled = user_movie_matrix.fillna(0)

# 5. Compute item-item similarity
item_sim = cosine_similarity(matrix_filled.T)
item_sim_df = pd.DataFrame(item_sim, index=matrix_filled.columns, columns=matrix_filled.columns)

# 6. Recommend function
def recommend(movie_name, n=5):
    return item_sim_df[movie_name].sort_values(ascending=False).iloc[1:n+1]

# Example
print(recommend("Star Wars (1977)"))


title
Return of the Jedi (1983)          0.884476
Raiders of the Lost Ark (1981)     0.764885
Empire Strikes Back, The (1980)    0.749819
Toy Story (1995)                   0.734572
Godfather, The (1972)              0.697332
Name: Star Wars (1977), dtype: float64


In [53]:
df

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)
...,...,...,...,...,...
99995,880,476,3,880175444,"First Wives Club, The (1996)"
99996,716,204,5,879795543,Back to the Future (1985)
99997,276,1090,1,874795795,Sliver (1993)
99998,13,225,2,882399156,101 Dalmatians (1996)


In [54]:


# 1. Set path
path = r'C:\Projects\elevvo\Movie Recommendation (Task 5)\ml-100k'

# 2. Match all u1–u5 base & test files
files = glob.glob(path + r'\u[1-5].*')  # matches u1.base, u1.test, ..., u5.base, u5.test

# 3. Load and combine
df_list = [pd.read_csv(f, sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp']) for f in files]
all_ratings = pd.concat(df_list, ignore_index=True)


In [55]:
# Load movies
movies = pd.read_csv(path + r'\u.item', sep='|', encoding='latin-1',
                        names=['item_id', 'title'], usecols=[0, 1], header=None)

# Load users
users = pd.read_csv(path + r'\u.user', sep='|',
                    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])

# Merge
df = pd.merge(all_ratings, movies, on='item_id')
df = pd.merge(df, users, on='user_id')



In [56]:
df

Unnamed: 0,user_id,item_id,rating,timestamp,title,age,gender,occupation,zip_code
0,1,1,5,874965758,Toy Story (1995),24,M,technician,85711
1,1,2,3,876893171,GoldenEye (1995),24,M,technician,85711
2,1,3,4,878542960,Four Rooms (1995),24,M,technician,85711
3,1,4,3,876893119,Get Shorty (1995),24,M,technician,85711
4,1,5,3,889751712,Copycat (1995),24,M,technician,85711
...,...,...,...,...,...,...,...,...,...
499995,943,1028,2,875502096,Grumpier Old Men (1995),22,M,student,77841
499996,943,1044,3,888639903,"Paper, The (1994)",22,M,student,77841
499997,943,1047,2,875502146,Multiplicity (1996),22,M,student,77841
499998,943,1228,3,888640275,Under Siege 2: Dark Territory (1995),22,M,student,77841


## Data Cleaning

In [57]:
df.describe()

Unnamed: 0,user_id,item_id,rating,timestamp,age
count,500000.0,500000.0,500000.0,500000.0,500000.0
mean,462.48475,425.53013,3.52986,883528900.0,32.96985
std,266.613354,330.797033,1.125669,5343835.0,11.562577
min,1.0,1.0,1.0,874724700.0,7.0
25%,254.0,175.0,3.0,879448700.0,24.0
50%,447.0,322.0,4.0,882826900.0,30.0
75%,682.0,631.0,4.0,888260000.0,40.0
max,943.0,1682.0,5.0,893286600.0,73.0


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   user_id     500000 non-null  int64 
 1   item_id     500000 non-null  int64 
 2   rating      500000 non-null  int64 
 3   timestamp   500000 non-null  int64 
 4   title       500000 non-null  object
 5   age         500000 non-null  int64 
 6   gender      500000 non-null  object
 7   occupation  500000 non-null  object
 8   zip_code    500000 non-null  object
dtypes: int64(5), object(4)
memory usage: 34.3+ MB


In [59]:
df.isnull().sum()

user_id       0
item_id       0
rating        0
timestamp     0
title         0
age           0
gender        0
occupation    0
zip_code      0
dtype: int64

In [60]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title,age,gender,occupation,zip_code
0,1,1,5,874965758,Toy Story (1995),24,M,technician,85711
1,1,2,3,876893171,GoldenEye (1995),24,M,technician,85711
2,1,3,4,878542960,Four Rooms (1995),24,M,technician,85711
3,1,4,3,876893119,Get Shorty (1995),24,M,technician,85711
4,1,5,3,889751712,Copycat (1995),24,M,technician,85711


In [61]:
df['age'].value_counts()

age
27    32115
24    22780
20    20445
25    20065
22    19895
      ...  
62      230
7       215
66      185
10      155
11      135
Name: count, Length: 61, dtype: int64

In [62]:
df['gender'].value_counts()

gender
M    371300
F    128700
Name: count, dtype: int64

In [63]:
df['occupation'].value_counts()

occupation
student          109785
other             53315
educator          47210
engineer          40875
programmer        39005
administrator     37395
writer            27680
librarian         26365
technician        17530
executive         17015
healthcare        14020
artist            11540
entertainment     10475
scientist         10290
marketing          9750
retired            8045
lawyer             6725
none               4505
salesman           4280
doctor             2700
homemaker          1495
Name: count, dtype: int64

In [64]:
df['zip_code'].value_counts()

zip_code
55414    5515
20009    4390
10019    4250
22902    4160
61820    4085
         ... 
08832     100
75230     100
30803     100
15017     100
23112     100
Name: count, Length: 795, dtype: int64

In [65]:
df['rating'].value_counts()

rating
4    170870
3    135725
5    106005
2     56850
1     30550
Name: count, dtype: int64

In [66]:
df.columns

Index(['user_id', 'item_id', 'rating', 'timestamp', 'title', 'age', 'gender',
       'occupation', 'zip_code'],
      dtype='object')

In [67]:
# movie_avg = df.groupby('item_id')['rating'].mean()
# keep_movies = movie_avg[movie_avg >= 2.5].index        # dropping movies with low raiting for better acc
# df = df[df['item_id'].isin(keep_movies)]

In [68]:
# # dropping unimportant cols
# df = df.drop(columns=['user_id', 'item_id', 'timestamp', 'zip_code'])


In [69]:
df

Unnamed: 0,user_id,item_id,rating,timestamp,title,age,gender,occupation,zip_code
0,1,1,5,874965758,Toy Story (1995),24,M,technician,85711
1,1,2,3,876893171,GoldenEye (1995),24,M,technician,85711
2,1,3,4,878542960,Four Rooms (1995),24,M,technician,85711
3,1,4,3,876893119,Get Shorty (1995),24,M,technician,85711
4,1,5,3,889751712,Copycat (1995),24,M,technician,85711
...,...,...,...,...,...,...,...,...,...
499995,943,1028,2,875502096,Grumpier Old Men (1995),22,M,student,77841
499996,943,1044,3,888639903,"Paper, The (1994)",22,M,student,77841
499997,943,1047,2,875502146,Multiplicity (1996),22,M,student,77841
499998,943,1228,3,888640275,Under Siege 2: Dark Territory (1995),22,M,student,77841


### Encoding


In [70]:
df = pd.get_dummies(df, columns=['gender'], drop_first=False)
le = LabelEncoder()
df['occupation'] = le.fit_transform(df['occupation'])

In [71]:
df['gender_F'] = df['gender_F'].astype(int)
df['gender_M'] = df['gender_M'].astype(int)


In [72]:
df

Unnamed: 0,user_id,item_id,rating,timestamp,title,age,occupation,zip_code,gender_F,gender_M
0,1,1,5,874965758,Toy Story (1995),24,19,85711,0,1
1,1,2,3,876893171,GoldenEye (1995),24,19,85711,0,1
2,1,3,4,878542960,Four Rooms (1995),24,19,85711,0,1
3,1,4,3,876893119,Get Shorty (1995),24,19,85711,0,1
4,1,5,3,889751712,Copycat (1995),24,19,85711,0,1
...,...,...,...,...,...,...,...,...,...,...
499995,943,1028,2,875502096,Grumpier Old Men (1995),22,18,77841,0,1
499996,943,1044,3,888639903,"Paper, The (1994)",22,18,77841,0,1
499997,943,1047,2,875502146,Multiplicity (1996),22,18,77841,0,1
499998,943,1228,3,888640275,Under Siege 2: Dark Territory (1995),22,18,77841,0,1


### Normalization

In [73]:
df.describe()

Unnamed: 0,user_id,item_id,rating,timestamp,age,occupation,gender_F,gender_M
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,462.48475,425.53013,3.52986,883528900.0,32.96985,11.07634,0.2574,0.7426
std,266.613354,330.797033,1.125669,5343835.0,11.562577,6.622823,0.437202,0.437202
min,1.0,1.0,1.0,874724700.0,7.0,0.0,0.0,0.0
25%,254.0,175.0,3.0,879448700.0,24.0,4.0,0.0,0.0
50%,447.0,322.0,4.0,882826900.0,30.0,13.0,0.0,1.0
75%,682.0,631.0,4.0,888260000.0,40.0,18.0,1.0,1.0
max,943.0,1682.0,5.0,893286600.0,73.0,20.0,1.0,1.0


In [74]:
scaler = MinMaxScaler()
df[['age', 'occupation']] = scaler.fit_transform(df[['age', 'occupation']])
df

Unnamed: 0,user_id,item_id,rating,timestamp,title,age,occupation,zip_code,gender_F,gender_M
0,1,1,5,874965758,Toy Story (1995),0.257576,0.95,85711,0,1
1,1,2,3,876893171,GoldenEye (1995),0.257576,0.95,85711,0,1
2,1,3,4,878542960,Four Rooms (1995),0.257576,0.95,85711,0,1
3,1,4,3,876893119,Get Shorty (1995),0.257576,0.95,85711,0,1
4,1,5,3,889751712,Copycat (1995),0.257576,0.95,85711,0,1
...,...,...,...,...,...,...,...,...,...,...
499995,943,1028,2,875502096,Grumpier Old Men (1995),0.227273,0.90,77841,0,1
499996,943,1044,3,888639903,"Paper, The (1994)",0.227273,0.90,77841,0,1
499997,943,1047,2,875502146,Multiplicity (1996),0.227273,0.90,77841,0,1
499998,943,1228,3,888640275,Under Siege 2: Dark Territory (1995),0.227273,0.90,77841,0,1


## Evaluation

In [75]:
print(df.columns)


Index(['user_id', 'item_id', 'rating', 'timestamp', 'title', 'age',
       'occupation', 'zip_code', 'gender_F', 'gender_M'],
      dtype='object')


In [86]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Load dataset (adjust path as needed)
ratings = pd.read_csv(r"C:\Projects\elevvo\Movie Recommendation (Task 5)\ml-100k\u.data", sep="\t", names=["user_id", "item_id", "rating", "timestamp"])
movies = pd.read_csv(r"C:\Projects\elevvo\Movie Recommendation (Task 5)\ml-100k\u.item", sep="|", encoding="latin-1", names=["item_id", "title"], usecols=[0, 1])

# Merge ratings with movie titles
df = ratings.merge(movies, on="item_id")

# Drop unnecessary columns
df = df.drop(columns=["timestamp", "item_id"])

# Step 1: Split into train and test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Step 2: Create user-item matrix from training data
user_item_matrix = train_df.pivot_table(index='user_id', columns='title', values='rating')
user_item_filled = user_item_matrix.fillna(0)

# Step 3: Compute user similarity
user_sim = cosine_similarity(user_item_filled)
np.fill_diagonal(user_sim, 0)
user_sim_df = pd.DataFrame(user_sim, index=user_item_filled.index, columns=user_item_filled.index)

# Step 4: Get top-K recommendations for a user
def get_top_k_recommendations(user_id, k):
    if user_id not in user_item_matrix.index:
        return []

    similar_users = user_sim_df[user_id].sort_values(ascending=False).index
    user_seen = set(user_item_matrix.loc[user_id].dropna().index)
    scores = {}

    for sim_user in similar_users:
        sim_user_ratings = user_item_matrix.loc[sim_user].dropna()
        for movie, rating in sim_user_ratings.items():
            if movie not in user_seen:
                scores[movie] = scores.get(movie, 0) + rating
        if len(scores) >= k:
            break

    top_k = sorted(scores, key=scores.get, reverse=True)[:k]
    return top_k

# Step 5: Get actual liked movies from test set
def get_actual_liked(user_id, threshold=4):
    return test_df[(test_df['user_id'] == user_id) & (test_df['rating'] >= threshold)]['title'].tolist()

# Step 6: Precision@K calculation
def precision_at_k(actual, predicted, k):
    if not actual:
        return 0
    hits = sum([1 for item in predicted if item in actual])
    return hits / k

# Step 7: Evaluate average Precision@K
def evaluate_precision_at_k(k):
    common_users = list(set(train_df['user_id']) & set(test_df['user_id']))
    precisions = []

    for user_id in common_users:
        predicted = get_top_k_recommendations(user_id, k)
        actual = get_actual_liked(user_id)
        if predicted:
            precisions.append(precision_at_k(actual, predicted, k))

    avg_precision = np.mean(precisions) if precisions else 0
    print(f"Average Precision@{k}: {avg_precision:.4f}")

# Step 8: Try multiple k values
for k in [5, 10, 20]:
    evaluate_precision_at_k(k)


Average Precision@5: 0.1232
Average Precision@10: 0.1146
Average Precision@20: 0.1033


## Bonus 

In [88]:
item_user_matrix = df.pivot_table(index="title", columns="user_id", values="rating").fillna(0)
user_item_matrix = df.pivot_table(index="user_id", columns="title", values="rating").fillna(0)

### 1. Item-Based Collaborative Filtering

item_similarity = cosine_similarity(item_user_matrix)
item_sim_df = pd.DataFrame(item_similarity, index=item_user_matrix.index, columns=item_user_matrix.index)

def recommend_similar_items(movie_title, k=5):
    if movie_title not in item_sim_df.index:
        return []
    scores = item_sim_df[movie_title].drop(movie_title).sort_values(ascending=False)
    return scores.head(k)

### 2. Matrix Factorization (SVD)

svd = TruncatedSVD(n_components=20, random_state=42)
user_factors = svd.fit_transform(user_item_matrix)
item_factors = svd.components_
reconstructed_matrix = np.dot(user_factors, item_factors)
reconstructed_df = pd.DataFrame(reconstructed_matrix, index=user_item_matrix.index, columns=user_item_matrix.columns)

def recommend_svd(user_id, k=5):
    user_rated = user_item_matrix.loc[user_id]
    preds = reconstructed_df.loc[user_id]
    unseen = user_rated[user_rated == 0].index
    recommendations = preds[unseen].sort_values(ascending=False).head(k)
    return recommendations

### Examples
print("Top 5 similar to 'Star Wars (1977)':")
print(recommend_similar_items("Star Wars (1977)", k=5))

print("\nTop 5 recommended for user 1 via SVD:")
print(recommend_svd(user_id=1, k=5))

Top 5 similar to 'Star Wars (1977)':
title
Return of the Jedi (1983)          0.884476
Raiders of the Lost Ark (1981)     0.764885
Empire Strikes Back, The (1980)    0.749819
Toy Story (1995)                   0.734572
Godfather, The (1972)              0.697332
Name: Star Wars (1977), dtype: float64

Top 5 recommended for user 1 via SVD:
title
Trainspotting (1996)                 3.838705
Sense and Sensibility (1995)         3.307991
Close Shave, A (1995)                3.306180
E.T. the Extra-Terrestrial (1982)    3.279469
Heathers (1989)                      3.216939
Name: 1, dtype: float64


In [92]:

# 1. Create user-item matrix
user_item_matrix = df.pivot_table(index='user_id', columns='title', values='rating').fillna(0)
user_item_np = user_item_matrix.values

# 2. Apply SVD
U, sigma, Vt = svds(user_item_np, k=20)  # k = number of latent factors
sigma = np.diag(sigma)

# 3. Reconstruct predicted matrix
pred_ratings = np.dot(np.dot(U, sigma), Vt)
pred_df = pd.DataFrame(pred_ratings, index=user_item_matrix.index, columns=user_item_matrix.columns)

# 4. Recommend top-N unseen movies for a given user
def recommend_svd(user_id, num_recommendations=5):
    if user_id not in pred_df.index:
        print("User not found.")
        return []

    # Movies the user has already rated
    known_rated = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index

    # Drop already-rated movies and recommend top-rated
    user_predictions = pred_df.loc[user_id].drop(index=known_rated)
    recommendations = user_predictions.sort_values(ascending=False).head(num_recommendations)
    return recommendations

# Example
user_id = 10
recs = recommend_svd(user_id, num_recommendations=5)
print(f"Top 5 SVD recommendations for User {user_id}:\n")
print(recs)

Top 5 SVD recommendations for User 10:

title
Annie Hall (1977)                 3.920136
Schindler's List (1993)           3.852087
To Kill a Mockingbird (1962)      3.804486
Godfather: Part II, The (1974)    3.726570
Babe (1995)                       3.215682
Name: 10, dtype: float64
