## Loading Dataset

In [4]:
import pandas as pd

# Tab-separated ratings file with no header row, so the column names are
# supplied here ('timespam' is the spelling used throughout this notebook).
ratings = pd.read_csv(
    'u.data',
    sep='\t',
    names=['user_id', 'movie_id', 'ratings', 'timespam'],
)

print(ratings.head())


   user_id  movie_id  ratings   timespam
0      196       242        3  881250949
1      186       302        3  891717742
2       22       377        1  878887116
3      244        51        2  880606923
4      166       346        1  886397596


In [10]:
# Pipe-separated item file. All columns are named, but usecols keeps only the
# first two (movie id and title); the remaining names label discarded columns.
movies = pd.read_csv(
    'u.item',
    sep='|',
    encoding='latin-1',
    names=[
        'movie_id', 'Title', 'release_date', 'video_release_date', 'url', 'unknown',
        'Actions', 'Adventure', 'Animations', 'Childern', 'Comdey', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci-fi', 'Thriller', 'War', 'Western',
    ],
    usecols=[0, 1],
)

print(movies.head())

   movie_id              Title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)


In [11]:
# Inner join on movie_id so every rating row carries its movie title.
ratings_with_titles = pd.merge(ratings, movies, on='movie_id')
print(ratings_with_titles.head())

   user_id  movie_id  ratings   timespam                       Title
0      196       242        3  881250949                Kolya (1996)
1      186       302        3  891717742    L.A. Confidential (1997)
2       22       377        1  878887116         Heavyweights (1994)
3      244        51        2  880606923  Legends of the Fall (1994)
4      166       346        1  886397596         Jackie Brown (1997)


In [12]:
# Users as rows, movie titles as columns, ratings as cell values; combinations
# with no rating come out as NaN.
user_item_matrics = pd.pivot_table(
    ratings_with_titles,
    index='user_id',
    columns='Title',
    values='ratings',
)

print(user_item_matrics.head())

Title    'Til There Was You (1997)  1-900 (1994)  101 Dalmatians (1996)  \
user_id                                                                   
1                              NaN           NaN                    2.0   
2                              NaN           NaN                    NaN   
3                              NaN           NaN                    NaN   
4                              NaN           NaN                    NaN   
5                              NaN           NaN                    2.0   

Title    12 Angry Men (1957)  187 (1997)  2 Days in the Valley (1996)  \
user_id                                                                 
1                        5.0         NaN                          NaN   
2                        NaN         NaN                          NaN   
3                        NaN         2.0                          NaN   
4                        NaN         NaN                          NaN   
5                        NaN        

## Preprocessing

### Removing Missing Values

In [13]:
# Number of missing values in each column of the ratings table.
print(ratings.isna().sum())

user_id     0
movie_id    0
ratings     0
timespam    0
dtype: int64


In [14]:
# Number of missing values in each column of the movies table.
print(movies.isna().sum())

movie_id    0
Title       0
dtype: int64


In [16]:
# Drop, in place, every ratings row that contains at least one missing value.
ratings.dropna(axis=0, how='any', inplace=True)

In [17]:
# Show the first five rows after the missing-value cleanup.
print(ratings.head(n=5))

   user_id  movie_id  ratings   timespam
0      196       242        3  881250949
1      186       302        3  891717742
2       22       377        1  878887116
3      244        51        2  880606923
4      166       346        1  886397596


In [18]:
# Drop, in place, every movies row that contains at least one missing value.
movies.dropna(axis=0, how='any', inplace=True)

### Remove Duplicate Values

In [20]:
# Flag rows that repeat an earlier (user_id, movie_id) pair, then count them.
duplicates = ratings.duplicated(['user_id', 'movie_id'], keep='first')

print(duplicates.sum())

0


In [22]:
# Keep only the last row for each (user_id, movie_id) pair, modifying ratings in place.
ratings.drop_duplicates(
    ['user_id', 'movie_id'],
    keep='last',
    inplace=True,
)

### Handle NaN Values

In [23]:
# Copy of the user-item matrix in which every missing rating is replaced by 0.

user_item_matrics_filled = user_item_matrics.fillna(value=0)


In [26]:
# Mean-center each user's ratings (the centering step behind Pearson correlation).
#
# Rows are users and columns are titles, so the per-user mean must be taken
# across the columns (axis=1). The previous axis=0 produced per-movie means
# indexed by Title, which could not align with the user_id row index in sub()
# and left the whole result as zeros after fillna.
user_mean=user_item_matrics.mean(axis=1)

# axis=0 aligns user_mean (indexed by user_id) with the row index, so each
# user's own mean is subtracted from that user's row. Cells that are NaN stay
# NaN through the subtraction and are then set to 0.
user_item_matrics_center=user_item_matrics.sub(user_mean,axis=0).fillna(0)


### Remove Rare Movies

In [27]:
# Keep only movies that received at least `min_ratings` ratings.
min_ratings=5
# Number of rating rows per movie_id.
movies_counts=ratings['movie_id'].value_counts()
popular_movies=movies_counts[movies_counts>=min_ratings].index
# The closing ")]" was missing here, which made the cell a syntax error.
ratings=ratings[ratings['movie_id'].isin(popular_movies)]

### Remove Inactive Users

In [30]:
# Keep only users who have rated at least `min_rated_user` movies.
min_rated_user = 5

# Number of rating rows per user_id.
user_counts = ratings['user_id'].value_counts()
active_user = user_counts.index[user_counts >= min_rated_user]

is_active = ratings['user_id'].isin(active_user)
ratings = ratings[is_active]
del is_active  # temporary mask only; do not leave an extra notebook variable behind




### Merge the filtered ratings with the movie titles

In [34]:
# Rebuild the ratings/title join from the current `ratings` table.
ratings_with_titles = pd.merge(ratings, movies, on='movie_id')



In [41]:
# Rebuild the user x title matrix from the current ratings_with_titles and
# replace missing ratings with 0 in a single chained expression.
user_item_matrics = pd.pivot_table(
    ratings_with_titles,
    index='user_id',
    columns='Title',
    values='ratings',
).fillna(0)

print(user_item_matrics.head())

Title    'Til There Was You (1997)  1-900 (1994)  101 Dalmatians (1996)  \
user_id                                                                   
1                              0.0           0.0                    2.0   
2                              0.0           0.0                    0.0   
3                              0.0           0.0                    0.0   
4                              0.0           0.0                    0.0   
5                              0.0           0.0                    2.0   

Title    12 Angry Men (1957)  187 (1997)  2 Days in the Valley (1996)  \
user_id                                                                 
1                        5.0         0.0                          0.0   
2                        0.0         0.0                          0.0   
3                        0.0         2.0                          0.0   
4                        0.0         0.0                          0.0   
5                        0.0        

### Compute User-User Similarity

In [45]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (users) of the user-item matrix.
similarity_matrics = cosine_similarity(user_item_matrics)

# Label both axes with the user ids so rows/columns can be looked up by user_id.
similarity_matrics_df = pd.DataFrame(
    data=similarity_matrics,
    index=user_item_matrics.index,
    columns=user_item_matrics.index,
)

print(similarity_matrics_df.head())

user_id       1         2         3         4         5         6         7    \
user_id                                                                         
1        1.000000  0.169473  0.048542  0.064766  0.380984  0.431048  0.447028   
2        0.169473  1.000000  0.113393  0.179694  0.073644  0.242106  0.109221   
3        0.048542  0.113393  1.000000  0.349781  0.021598  0.074018  0.067805   
4        0.064766  0.179694  0.349781  1.000000  0.031813  0.068431  0.092027   
5        0.380984  0.073644  0.021598  0.031813  1.000000  0.238703  0.376967   

user_id       8         9         10   ...       934       935       936  \
user_id                                ...                                 
1        0.321096  0.078634  0.379850  ...  0.373396  0.120240  0.274454   
2        0.104257  0.162470  0.161664  ...  0.147095  0.310661  0.368342   
3        0.084419  0.062039  0.066377  ...  0.033885  0.043453  0.169447   
4        0.188060  0.101284  0.061007  ...  0.054615

### Recommend top-rated unseen movies

In [79]:
import numpy as np

def recommend_movies(user_id,user_item_matrics,similarity_matrics_df,k_neighbor=5,top_n=5):
    """Recommend movies the user has not rated, using similar users' ratings.

    Parameters
    ----------
    user_id : label
        Row label of the target user in both input frames.
    user_item_matrics : pandas.DataFrame
        Users as rows, movies as columns. A value of 0 is treated as
        "not rated" when selecting candidate movies.
    similarity_matrics_df : pandas.DataFrame
        User-by-user similarity scores, labelled by user id on both axes.
    k_neighbor : int, default 5
        Number of most similar users to use.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Predicted scores indexed by movie, highest first, limited to movies
        the target user has not rated.

    Raises
    ------
    KeyError
        If ``user_id`` is not a label of the input frames.
    """
    # Most similar users first; the target user is removed from the ranking.
    similar_user=similarity_matrics_df.loc[user_id].sort_values(ascending=False).drop(user_id).head(k_neighbor)
    similar_users_ratings = user_item_matrics.loc[similar_user.index]

    # Similarity-weighted sum of the neighbours' ratings for every movie.
    # NOTE(review): neighbours' 0 entries take part in this average as if they
    # were ratings of 0; left unchanged here to preserve the existing scores.
    weights = similar_user.to_numpy(dtype=float)
    weight_sum = np.dot(weights, similar_users_ratings.to_numpy(dtype=float))
    sum_of_weights = weights.sum()

    if sum_of_weights == 0:
        # No neighbour carries any similarity (or there are no neighbours):
        # dividing would yield NaN scores, so report "no signal" as 0.0.
        predicted = np.zeros(len(user_item_matrics.columns), dtype=float)
    else:
        predicted = weight_sum / sum_of_weights
    predicted_ratings = pd.Series(predicted, index=user_item_matrics.columns)

    # Only movies the target user has not rated (value 0) are candidates.
    rated_movies=user_item_matrics.loc[user_id]
    unrated_movies=rated_movies[rated_movies==0].index
    recommendations=predicted_ratings[unrated_movies].sort_values(ascending=False).head(top_n)

    return recommendations

# Top recommendations for user 2 with the default neighbour and result counts.
print(
    recommend_movies(
        user_id=2,
        user_item_matrics=user_item_matrics,
        similarity_matrics_df=similarity_matrics_df,
    )
)

Title
Amistad (1997)                4.015391
Lone Star (1996)              3.244770
Michael Collins (1996)        2.336432
Spitfire Grill, The (1996)    2.307649
Game, The (1997)              2.049479
dtype: float64


In [86]:
import random

def precision_at_k_hidden(user_id, k, user_item_matrix, similarity_matrix, threshold=3.5, hide_n=3):
    """Estimate precision@k for one user by hiding some of their liked movies.

    ``hide_n`` movies the user rated at or above ``threshold`` are picked at
    random and set to 0 in a copy of the matrix. Recommendations are then
    generated from that copy, and the result is the fraction of the ``k``
    recommendation slots filled by hidden movies.

    Parameters
    ----------
    user_id : label
        Row label of the user to evaluate.
    k : int
        Number of recommendations requested; also the precision denominator.
    user_item_matrix : pandas.DataFrame
        Users as rows, movies as columns; 0 is used to hide a rating.
    similarity_matrix : pandas.DataFrame
        User-by-user similarity, passed unchanged to ``recommend_movies``.
    threshold : float, default 3.5
        Minimum rating for a movie to count as relevant.
    hide_n : int, default 3
        Number of relevant movies to hide.

    Returns
    -------
    float
        ``len(hidden & recommended) / k``, or ``nan`` when the user has fewer
        than ``hide_n`` relevant movies. Previously a string was returned in
        that case, which broke callers formatting the result as a number.
    """
    # Step 1: movies this user rated at or above the relevance threshold
    user_ratings = user_item_matrix.loc[user_id]
    relevant_movies = user_ratings[user_ratings >= threshold].index.tolist()

    # Step 2: Randomly select some movies to hide
    if len(relevant_movies) < hide_n:
        # Keep the message visible, but return a numeric "undefined" value so
        # the return type is always float.
        print("Not enough movies to hide")
        return float("nan")
    hidden_movies = random.sample(relevant_movies, hide_n)

    # Step 3: Create a copy of the matrix & hide ratings
    modified_matrix = user_item_matrix.copy()
    modified_matrix.loc[user_id, hidden_movies] = 0  # Hide ratings

    # Step 4: Get recommendations
    # NOTE(review): similarity_matrix is used as passed in and is not recomputed
    # from modified_matrix; if it was built from the un-hidden ratings, the
    # hidden ratings still influence neighbour selection.
    recs = recommend_movies(user_id, modified_matrix, similarity_matrix, top_n=k)

    # Step 5: Check overlap (how many hidden movies appear in recommendations)
    recommended_movies = recs.index[:k]
    intersection = set(recommended_movies) & set(hidden_movies)

    precision = len(intersection) / k

    print(f"Hidden movies: {hidden_movies}")
    print(f"Recommended movies: {recommended_movies.tolist()}")
    print(f"Intersection: {intersection}")

    return precision

# Evaluate precision@5 for user 8 with the default threshold and hide count.
precision = precision_at_k_hidden(
    8,
    k=5,
    user_item_matrix=user_item_matrics,
    similarity_matrix=similarity_matrics_df,
)
print(f"Precision : {precision:.2f}")



Hidden movies: ['Star Trek VI: The Undiscovered Country (1991)', 'Godfather: Part II, The (1974)', 'Butch Cassidy and the Sundance Kid (1969)']
Recommended movies: ['Godfather: Part II, The (1974)', 'Die Hard 2 (1990)', 'Alien 3 (1992)', 'Titanic (1997)', 'Top Gun (1986)']
Intersection: {'Godfather: Part II, The (1974)'}
Precision : 0.20
