In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the datasets
columns_u_data = ['user_id', 'item_id', 'rating', 'timestamp']
u_data = pd.read_csv('../data/raw/ml-100k/u1.base', sep='\t', names=columns_u_data)

columns_u_user = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
u_user = pd.read_csv('../data/raw/ml-100k/u.user', sep='|', names=columns_u_user)

columns_u_item = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDB_URL',
                  'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
                  'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                  'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
u_item = pd.read_csv('../data/raw/ml-100k/u.item', sep='|', names=columns_u_item, encoding='latin-1')

print(u_data.head())
print(u_user.head())
print(u_item.head())

   user_id  item_id  rating  timestamp
0        1        1       5  874965758
1        1        2       3  876893171
2        1        3       4  878542960
3        1        4       3  876893119
4        1        5       3  889751712
   user_id  age gender  occupation zip_code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213
   movie_id              title release_date  video_release_date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            IMDB_URL  unknown  Action  \
0  http://us.imdb.com/M/ti

## 1. Recommending Movies Similar to Favorite Movies
In this section, we will be working on recommending for the user movies similar to the movies he liked before. This type of recommendation is called "Content Based Recommendation" as we are recommending movies to the user according to the content they like regardless of the user information itself.

We will implement this using the following steps:
1. Form a pivot table that contains the user ratings for each movie.
2. Extract the user's ratings
3. Compute the TF-IDF matrix for the movies
4. Compute the cosine similarity
5. Sort the movies based on similarity scores
6. Get the top k movie recommendations

In [12]:
# Merge user data with movies data
users_and_movies = pd.merge(u_data, u_item[['movie_id', 'title']], left_on='item_id', right_on='movie_id')

# Create a user-item matrix with ratings
user_item_movies = users_and_movies.pivot_table(index='user_id', columns='title', values='rating')

# Fill NaN values with 0 (assuming no rating means a rating of 0)
user_item_movies = user_item_movies.fillna(0)

print(user_item_movies)

title    'Til There Was You (1997)  1-900 (1994)  101 Dalmatians (1996)  \
user_id                                                                   
1                              0.0           0.0                    0.0   
2                              0.0           0.0                    0.0   
3                              0.0           0.0                    0.0   
4                              0.0           0.0                    0.0   
5                              0.0           0.0                    0.0   
...                            ...           ...                    ...   
939                            0.0           0.0                    0.0   
940                            0.0           0.0                    0.0   
941                            0.0           0.0                    0.0   
942                            0.0           0.0                    0.0   
943                            0.0           0.0                    0.0   

title    12 Angry Men (1

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

def get_content_based_recommendations(userID, user_movies_matrix, top_k=5):
    # Extract the user's ratings
    user_ratings = user_movies_matrix.loc[userID]

    # Create a text representation of the movies, weighted by the user's ratings
    weighted_movies = []
    for title, rating in user_ratings.items():
        if rating > 0:
            weighted_movies.extend([title] * int(rating * 2))  # Weighting by rating

    #Compute the TF-IDF matrix for the movies
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(user_movies_matrix.columns)

    # Transform the weighted movies list into a vector
    user_profile_vector = tfidf_vectorizer.transform([" ".join(weighted_movies)])

    # Compute the cosine similarity
    cosine_similarities = cosine_similarity(user_profile_vector, tfidf_matrix)

    # Sort the movies based on similarity scores
    similarity_scores = list(enumerate(cosine_similarities.flatten()))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the top k movie recommendations
    movie_indices = [x[0] for x in similarity_scores[:top_k]]
    recommended_movies = [user_movies_matrix.columns[i] for i in movie_indices]

    return recommended_movies


In [14]:
# Example
userId = 1
recommendations = get_content_based_recommendations(userId, user_item_movies)
print(recommendations)

['Quick and the Dead, The (1995)', 'Man of the House (1995)', 'Run of the Country, The (1995)', 'Man of the Year (1995)', 'Show, The (1995)']


## 2. Recommended movies based on user's demographic information
In this section, we will be working on recommending for the users movies based on his demographic information such as their age, gender, and occupation.

We will implement this using the following steps:
1. Group users by demographics and calculate the average rating for each movie within each group.
2. Find the demographic group of the target user.
3. Recommend movies that are highly rated within that demographic group.

In [15]:
def get_demographic_based_recommendations(userID, user_movies_matrix, top_k=5):
    # Merge the user demographic data with movie ratings
    merged_data = pd.merge(u_user, user_movies_matrix, left_on='user_id', right_index=True)

    # Group by demographics and calculate average ratings
    grouped_data = merged_data.groupby(['age', 'gender', 'occupation', 'zip_code']).mean()

    # Find the user's demographic group
    user_demographics = u_user.loc[u_user['user_id'] == userID, ['age', 'gender', 'occupation', 'zip_code']]
    if user_demographics.empty:
        return []  # Return an empty list if user's demographic data is not found

    # Extract the user's demographic information
    age, gender, occupation, zip_code = user_demographics.iloc[0]

    # Get the average ratings for this demographic group
    try:
        demographic_ratings = grouped_data.loc[age, gender, occupation, zip_code]
    except KeyError:
        return []  # Return an empty list if the demographic group is not found

    # Sort movies by their average rating within the demographic group
    recommended_movies = demographic_ratings.sort_values(ascending=False).head(top_k).index.tolist()

    return recommended_movies

In [16]:
# Example
userId = 1
recommendations = get_demographic_based_recommendations(userId, user_item_movies)
print(recommendations)

['Full Monty, The (1997)', 'Professional, The (1994)', 'Nikita (La Femme Nikita) (1990)', 'Graduate, The (1967)', 'Godfather, The (1972)']


## 3. Merging the two recommendation systems

In this section, we will be working on merging the two recommendation systems we have built in the previous sections. We will define the function that calls the previous two functions and merge the results.

In [17]:
# Function to get all recommedations for a user
def get_all_recommendations(userID, user_movies_matrix, top_k=10):
    content_based_recommendations = get_content_based_recommendations(userID, user_movies_matrix, top_k)
    demographic_based_recommendations = get_demographic_based_recommendations(userID, user_movies_matrix, top_k)

    # Merge the two recommendation lists
    all_recommendations = content_based_recommendations + demographic_based_recommendations

    # Remove duplicates from the list
    all_recommendations = list(set(all_recommendations))

    return all_recommendations

In [18]:
# Example
userId = 1
recommendations = get_all_recommendations(userId, user_item_movies)
print(recommendations)

['Full Monty, The (1997)', 'Godfather, The (1972)', 'Run of the Country, The (1995)', 'French Twist (Gazon maudit) (1995)', 'Professional, The (1994)', 'Wife, The (1995)', 'Man of the House (1995)', 'Man of the Year (1995)', 'Nikita (La Femme Nikita) (1990)', 'When Harry Met Sally... (1989)', 'Ghost and the Darkness, The (1996)', 'Graduate, The (1967)', 'Gattaca (1997)', 'Quick and the Dead, The (1995)', 'Show, The (1995)', 'Aliens (1986)', 'Remains of the Day, The (1993)', 'Last of the Mohicans, The (1992)', 'Amadeus (1984)']


# Evaluation

In [19]:
from sklearn.metrics import mean_squared_error
from math import sqrt

test_data = pd.read_csv('../data/raw/ml-100k/u1.test', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

# Function to calculate MSE and RMSE
def evaluate_recommender(recommender_func, user_movies_matrix, test_data):
    predicted_ratings = []
    actual_ratings = []

    for _, row in test_data.iterrows():
        user_id, item_id, actual_rating, _ = row
        recommended_movies = recommender_func(user_id, user_movies_matrix)

        # Check if the recommended movie is in the test set
        if item_id in recommended_movies:
            predicted_ratings.append(1)  # Recommended
        else:
            predicted_ratings.append(0)  # Not recommended

        actual_ratings.append(actual_rating)

    mse = mean_squared_error(actual_ratings, predicted_ratings)
    rmse = sqrt(mse)

    return mse, rmse

In [20]:
# Evaluate the recommender system
mse, rmse = evaluate_recommender(get_all_recommendations, user_item_movies, test_data)
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

Mean Squared Error (MSE): 2.4435
Root Mean Squared Error (RMSE): 1.56316985641
