# Movie Recommendation System

This notebook builds and evaluates two collaborative filtering models: User-Based and Item-Based.

## 1. Imports

In [107]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

## Configure Parameters

In [108]:
# ID of the user whose recommendations are generated and evaluated.
test_user_id = 10
# Length of the recommendation list; the recommenders also use it as their neighbourhood size.
k = 10

## 2. Load Data

Defining column names and loading the `u.data` and `u.item` files.

In [109]:
# Column layout of the tab-separated ratings file.
r_cols = ["user_id", "movie_id", "rating", "timestamp"]

# Column layout of the pipe-separated movie file: five descriptive fields
# followed by 19 columns named genre_0 .. genre_18.
m_cols = [
    "movie_id",
    "title",
    "release date",
    "video release date",
    "IMDb URL",
    *(f"genre_{i}" for i in range(19)),
]

ratings_df = pd.read_csv("u.data", names=r_cols, sep="\t", encoding="latin-1")

# Only the id and the title are loaded; the id becomes the index.
movies_df = pd.read_csv(
    "u.item",
    names=m_cols,
    sep="|",
    usecols=["movie_id", "title"],
    index_col="movie_id",
    encoding="latin-1",
)

## 3. Inspect & Merge

Printing a preview of both dataframes (pandas truncates long frames to their first and last rows) and then merging them on `movie_id` into a single `df`.

In [110]:
# Preview both raw tables; each banner is followed by the frame and two blank lines.
print("RATINGS : ", "-----------------------------------------------", ratings_df, "\n", sep="\n")
print("MOVIES : ", "------------------------------------------------", movies_df, "\n", sep="\n")
print("MERGED : ", "------------------------------------------------", sep="\n")

# Attach the title to every rating row; `movie_id` is the index of movies_df
# and a regular column of ratings_df.
df = movies_df.merge(ratings_df, on="movie_id")
df

RATINGS : 
-----------------------------------------------
       user_id  movie_id  rating  timestamp
0          196       242       3  881250949
1          186       302       3  891717742
2           22       377       1  878887116
3          244        51       2  880606923
4          166       346       1  886397596
...        ...       ...     ...        ...
99995      880       476       3  880175444
99996      716       204       5  879795543
99997      276      1090       1  874795795
99998       13       225       2  882399156
99999       12       203       3  879959583

[100000 rows x 4 columns]


MOVIES : 
------------------------------------------------
                                              title
movie_id                                           
1                                  Toy Story (1995)
2                                  GoldenEye (1995)
3                                 Four Rooms (1995)
4                                 Get Shorty (1995)
5            

Unnamed: 0,movie_id,title,user_id,rating,timestamp
0,1,Toy Story (1995),308,4,887736532
1,1,Toy Story (1995),287,5,875334088
2,1,Toy Story (1995),148,4,877019411
3,1,Toy Story (1995),280,4,891700426
4,1,Toy Story (1995),66,3,883601324
...,...,...,...,...,...
99995,1678,Mat' i syn (1997),863,1,889289570
99996,1679,B. Monkey (1998),863,3,889289491
99997,1680,Sliding Doors (1998),863,2,889289570
99998,1681,You So Crazy (1994),896,3,887160722


## 4. Configs & Train/Test Split

Using the configuration variables (`test_user_id`, `k`) defined in the "Configure Parameters" cell above, and splitting the main dataframe into training (75%) and testing (25%) sets.

## 5. Build User-Item Matrix (Training Data)

Creating a user-item matrix using only the 75% training data.

### Build User-Based Similarity Matrix & Item-Based Similarity Matrix (From Training Data)

In [111]:
# Hold out 25% of the rating rows for evaluation; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.25)

# Users as rows, movie titles as columns, built from the training rows only.
# Cells without a rating are filled with 0.
train_user_item_matrix = train_df.pivot_table(
    values="rating",
    index="user_id",
    columns="title",
).fillna(0)

# user-based: cosine similarity between the rows (users) of the matrix
train_user_similarity = cosine_similarity(train_user_item_matrix)
train_user_similarity_df = pd.DataFrame(
    train_user_similarity,
    index=train_user_item_matrix.index,
    columns=train_user_item_matrix.index,
)

# item-based: transpose so that movies become the rows, then repeat
item_user_matrix = train_user_item_matrix.transpose()
item_similarity = cosine_similarity(item_user_matrix)
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=item_user_matrix.index,
    columns=item_user_matrix.index,
)

In [112]:
def get_user_based_recommendations(user_id, k):
    """Recommend unseen movies liked by the users most similar to ``user_id``.

    Reads the notebook-level ``train_user_similarity_df`` (user x user
    similarity) and ``train_user_item_matrix`` (user x title ratings, 0 meaning
    "not rated").

    Each of the ``k`` most similar users votes for every movie they rated
    above 3; a vote is worth that user's similarity to ``user_id``. Movies the
    target user has already rated are never recommended.

    Parameters
    ----------
    user_id : hashable
        Label of the target user in both notebook-level frames.
    k : int
        Number of neighbours to consult and maximum number of movies to
        return. Values <= 0 yield an empty result.

    Returns
    -------
    pandas.DataFrame
        Columns ``Movie`` and ``Score``, sorted by descending score, at most
        ``k`` rows.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in the training frames.
    """
    # Remove the target user by label rather than by assuming they sort first:
    # with a similarity tie, a positional "[1:]" could keep the user themselves
    # in the neighbourhood and drop a genuine neighbour instead.
    neighbours = (
        train_user_similarity_df[user_id]
        .drop(user_id)
        .sort_values(ascending=False)
        .iloc[:max(k, 0)]
    )

    target_ratings = train_user_item_matrix.loc[user_id]
    already_rated = set(target_ratings[target_ratings > 0].index)

    scores = {}
    for neighbour_id, similarity in neighbours.items():
        neighbour_ratings = train_user_item_matrix.loc[neighbour_id]
        # "> 3" already excludes the 0 placeholder used for unrated movies.
        for movie in neighbour_ratings[neighbour_ratings > 3].index:
            if movie not in already_rated:
                scores[movie] = scores.get(movie, 0) + similarity

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

    return pd.DataFrame(ranked, columns=['Movie', 'Score']).head(k)

In [113]:
def get_item_based_recommendations(user_id, k):
    """Recommend unseen movies similar to the ones ``user_id`` rated highly.

    Reads the notebook-level ``train_user_item_matrix`` (user x title ratings,
    0 meaning "not rated") and ``item_similarity_df`` (title x title
    similarity).

    For every movie the user rated 4 or higher, the ``k`` most similar other
    movies each receive ``similarity * rating`` points. Movies the user has
    already rated are never recommended.

    Parameters
    ----------
    user_id : hashable
        Label of the target user in ``train_user_item_matrix``.
    k : int
        Number of similar movies to consider per liked movie and maximum
        number of movies to return. Values <= 0 yield an empty result.

    Returns
    -------
    pandas.DataFrame
        Columns ``Movie`` and ``Score``, sorted by descending score, at most
        ``k`` rows.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in ``train_user_item_matrix``.
    """
    user_ratings = train_user_item_matrix.loc[user_id]
    # A set gives constant-time membership tests inside the nested loop.
    already_rated = set(user_ratings[user_ratings > 0].index)
    liked = user_ratings[user_ratings >= 4]
    neighbourhood_size = max(k, 0)

    scores = {}
    for movie, rating in liked.items():
        # Exclude the movie itself by label: on a similarity tie a positional
        # "[1:]" could keep the movie itself and drop a real neighbour.
        similar_movies = (
            item_similarity_df[movie]
            .drop(movie)
            .sort_values(ascending=False)
            .iloc[:neighbourhood_size]
        )

        for candidate, similarity in similar_movies.items():
            if candidate not in already_rated:
                scores[candidate] = scores.get(candidate, 0) + similarity * rating

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

    return pd.DataFrame(ranked, columns=['Movie', 'Score']).head(k)

In [129]:
def get_percision(user_based="user"):
    """Print Precision@k of one recommender for the configured test user.

    Reads the notebook-level ``test_user_id``, ``k`` and ``test_df``. A
    held-out movie counts as relevant when the test user rated it above 3 in
    ``test_df``; precision is the number of recommended movies that are
    relevant, divided by ``k``.

    Parameters
    ----------
    user_based : str or bool, default "user"
        ``"user"`` or ``"user_based"`` (any letter case) or ``True`` selects
        the user-based recommender. Any other value selects the item-based
        recommender.

    Returns
    -------
    None
        The report is printed, nothing is returned.
    """
    # The selector used to be compared against a list holding "user" twice, so
    # spellings such as "User" or True silently ran the item-based model.
    wants_user_model = user_based is True or (
        isinstance(user_based, str)
        and user_based.strip().lower() in {"user", "user_based"}
    )

    if wants_user_model:
        recommendations = get_user_based_recommendations(test_user_id, k)
        label = "User_based"
    else:
        recommendations = get_item_based_recommendations(test_user_id, k)
        label = "Item_based"
    recommended_list = recommendations["Movie"].tolist()

    # Relevant movies: held-out ratings above 3 by the test user.
    test_user_df = test_df[test_df["user_id"] == test_user_id]
    actual_list = test_user_df[test_user_df["rating"] > 3]["title"].tolist()

    relevant = set(actual_list)
    num_hits = sum(1 for movie in recommended_list if movie in relevant)

    # Guard against a non-positive k so the report never divides by zero.
    precision = num_hits / k if k > 0 else 0.0

    print(f"using {label}_Matrix : \n---------------------")
    print(f"Recs for User {test_user_id}: {recommended_list}")
    print(f"\nTrue Hits in Test Set: {actual_list}\n\nLen : {len(actual_list)}")
    print(f"\nHits: {num_hits}")
    print(f"\nPrecision@{k} for this user is: {precision:.0%}\n---------------------------------------------------\n")

In [130]:
# Report Precision@k for the configured test user with each recommender in turn.
get_percision("user")  # user-based model
get_percision("item")  # item-based model

using User_based_Matrix : 
---------------------
Recs for User 10: ['North by Northwest (1959)', 'Taxi Driver (1976)', 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)', 'Babe (1995)', 'To Kill a Mockingbird (1962)', 'Lawrence of Arabia (1962)', 'Annie Hall (1977)', 'This Is Spinal Tap (1984)', 'Streetcar Named Desire, A (1951)', 'Searching for Bobby Fischer (1993)']

True Hits in Test Set: ['My Left Foot (1989)', 'Die Hard (1988)', 'Shine (1996)', 'Hoop Dreams (1994)', 'Better Off Dead... (1985)', 'Rear Window (1954)', 'Deer Hunter, The (1978)', 'African Queen, The (1951)', 'Lawrence of Arabia (1962)', 'When Harry Met Sally... (1989)', 'Emma (1996)', 'Patton (1970)', 'Secret of Roan Inish, The (1994)', 'Blues Brothers, The (1980)', 'Forrest Gump (1994)', 'North by Northwest (1959)', 'Substance of Fire, The (1996)', 'Psycho (1960)', 'Game, The (1997)', 'Full Monty, The (1997)', "It's a Wonderful Life (1946)", 'Gandhi (1982)', 'Wild Bunch, The (1969)', 'Cluel