<a href="https://colab.research.google.com/github/Kyalo-oss/Movie-lens-recommendation-system-/blob/main/Movielens_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call by the download loop below.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries; the
# download loop splits each entry on ':' and URL-decodes the second part.
# NOTE(review): the URL carries X-Goog-Date / X-Goog-Expires parameters, so it
# looks like a time-limited signed link - confirm it is still valid before use.
DATA_SOURCE_MAPPING = 'movielens:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F108335%2F258538%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240604%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240604T153623Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D9080a4071bec0153cbca2ebec73faf8dc724d367e1cdfb3b19bae70b2cb6678e3e4a331d6a35e744932078b257756c3509ee5423b0ce0ec1762754a7eb06789497f372d884f4b5fd476e0dd26e1c0f04d62af354dc9214f6f0e558877ee0728c8e3be7c2fedd9e1d4aba04f04c72c6f486c1ca6f4564d6132ea40555b1f958cc3cd7ba371a2042667953483c84cbec2b351cf967953b7dc9fd8e921eecb529bd46a7ed480e5ae73102f362b7a17026a4c80e1cfe29eccd120127d2d6a214c2f558182c1c1a554663a74d8efb389eada89c0e6971ed7f629d30e98283d20512bf044e51fa6ebdc029cba862fb1eac924d55b11f5d2a6faab1e2af9974eb69904f'

# Directories that stand in for Kaggle's input and working locations.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere else in the visible notebook.
KAGGLE_SYMLINK='kaggle'

# IPython shell escape (not plain Python): unmount any existing mount at the
# input path, discarding error output.
!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory; a missing directory is not an error.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Create ../input and ../working links pointing at the directories above;
# links left over from a previous run are kept as they are.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download and unpack every data source listed in DATA_SOURCE_MAPPING.
# Each comma-separated entry is "<directory>:<url-encoded URL>"; the archive is
# extracted into KAGGLE_INPUT_PATH/<directory>. A failed download is reported
# and skipped so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so an entry still parses if the URL part
    # ever contains an unencoded colon.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header can be absent; treat the size as unknown (0) instead
            # of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # 50-character progress bar; stays empty when the size is unknown.
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # replace the module and break the next loop iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found in it
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# MOVIELENS RECOMMENDATION SYSTEM

Movie recommendation systems are essential in the current digital era because people looking for personalized recommendations can be overwhelmed by the sheer amount of available movie content. This project seeks to build a strong recommendation system using the MovieLens dataset. We look for patterns and trends by carefully preparing and examining the dataset. Our goal is to develop a recommendation system that makes movie selection easier and improves the user experience by offering personalized suggestions, boosting content discovery, and encouraging engagement with the varied world of cinema, using collaborative filtering techniques such as matrix factorization or nearest-neighbor methods.

In [None]:
# Import Necessary Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from surprise.model_selection import train_test_split
from surprise import Dataset, Reader, SVD, accuracy



# DATA PREPARATION AND EXPLORATION


In [None]:
# Read movies.dat. Column names are supplied explicitly, and the multi-character
# '::' separator is parsed with the Python engine.
movies = pd.read_csv(
    '/kaggle/input/movielens/movies.dat',
    sep='::',
    names=['MovieID', 'Title', 'Genres'],
    engine='python',
    encoding='latin1',
)

# Preview the first ten rows
print("Movies:", movies.head(10), sep="\n")

# Count the null entries in each column
print("Movies missing values:", movies.isnull().sum(), sep="\n")




In [None]:
# Read users.dat with explicit column names ('::'-separated, Python engine)
users = pd.read_csv(
    '/kaggle/input/movielens/users.dat',
    sep='::',
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    engine='python',
    encoding='latin1',
)

# Preview the first ten rows
print("Users:", users.head(10), sep="\n")

# Count the missing entries in each column
print("Users missing values:", users.isna().sum(), sep="\n")

In [None]:
# Remove the Occupation and Zip-code columns from users in place
users.drop(columns=['Occupation', 'Zip-code'], inplace=True)


In [None]:
# Read ratings.dat with explicit column names ('::'-separated, Python engine)
ratings = pd.read_csv(
    '/kaggle/input/movielens/ratings.dat',
    sep='::',
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    engine='python',
    encoding='latin1',
)

# Preview the first ten rows
print("Ratings:", ratings.head(10), sep="\n")

# Count the missing entries in each column
print("ratings missing values:", ratings.isna().sum(), sep="\n")

In [None]:
# Rebind ratings to a copy without the Timestamp column
ratings = ratings.drop(columns="Timestamp")


In [None]:
# Display the first ten rows of ratings (Timestamp was dropped in the previous cell)
ratings.head(10)

# DATA ANALYSIS AND VISUALIZATION

# Analysis of User Engagement and Content Popularity


In this section we explore trends in user interaction and popular content on our platform. By analyzing interaction frequency (that is, the number of ratings users provide), we can derive important insights about user behavior and content consumption.

**User Engagement Analysis**

First, we determine which users are the most engaged with our platform. Through this, we can identify the most active users—those who routinely watch a lot of movies—regardless of the ratings given. Comprehending the engagement patterns of users is imperative for proficient user segmentation, focused marketing campaigns, and the provision of customized suggestions based on individual preferences.

**Content Popularity Assessment**

Furthermore, we explore the popularity of movies within our platform, irrespective of their ratings. By analyzing the frequency of movie views and interactions, we identify the movies that garner the highest levels of viewership. This insight is invaluable for content acquisition strategies, licensing negotiations, and decisions regarding future content production or acquisition. In this case, since we don't have the number of times each user has watched a movie, we use the number of ratings a movie has received as a proxy for how often it has been watched.

These are crucial steps towards optimizing platform performance, enhancing user experience, and refining content curation algorithms to better align with user preferences and viewing habits.


In [None]:

# Attach each rating's user attributes by joining ratings to users on 'UserID'
user_ratings = pd.merge(ratings, users, on='UserID')

# Show the first ten joined rows
user_ratings.head(n=10)

First, we show the total number of users on the platform, followed by a breakdown by gender.

In [None]:
# Gender Distribution

# user_ratings is ratings joined to users, so a user appears once per rating.
# Keep a single row per UserID before counting; otherwise the figures below
# would be numbers of ratings, not numbers of users.
gender_counts = user_ratings.drop_duplicates(subset='UserID')['Gender'].value_counts()

# .get with a default keeps this working if one gender is absent from the data
Male = gender_counts.get('M', 0)
Female = gender_counts.get('F', 0)

print("Number of users by Gender: ")
print("Males: ", Male)
print("Females: ", Female)
print("Total :", Male + Female)

# Colors for Gender bars, in the same order as the bars themselves
colors = ['blue' if gender == 'M' else 'coral' for gender in gender_counts.index]

# Gender Plot

plt.figure(figsize=(9,6))
plt.bar(gender_counts.index, gender_counts.values, width=0.5 ,color=colors)
plt.xlabel("Gender")
plt.ylabel("Number of Users")
plt.title("Gender Distribution")
plt.show()

In [None]:
# Count the ratings submitted by each user; value_counts sorts the result
# from the highest count to the lowest
No_of_ratings = user_ratings['UserID'].value_counts()

# The first ten entries are therefore the ten most active users
top_10_users_ratings = No_of_ratings.iloc[:10]

print("Top 10 Users with the highest number of ratings:", top_10_users_ratings, sep="\n")

In [None]:
# Join every rating to its movie record on the shared "MovieID" key
movie_ratings = pd.merge(movies, ratings, on="MovieID")

# Show the first ten joined rows
movie_ratings.head(n=10)

In [None]:
# Number of ratings per (MovieID, Title) pair, sorted from most to fewest
no_of_movie_ratings = movie_ratings.value_counts(subset=['MovieID', 'Title'])

# Report the ten most-rated movies
print("Movies with the hightest No of ratings: ", no_of_movie_ratings.head(10), sep="\n")

In [None]:
# Summary statistics of the per-movie rating counts computed in the previous cell
no_of_movie_ratings.describe()

In [None]:
# One figure with two side-by-side panels: users on the left, movies on the right
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 10))

# Left panel: the ten users who submitted the most ratings
No_of_ratings.head(10).plot.bar(ax=ax[0], color='salmon')
ax[0].set(
    title='Top 10 Users with Highest Number of Ratings',
    xlabel='User ID',
    ylabel='Number of Ratings',
)

# Right panel: the ten movies that received the most ratings
no_of_movie_ratings.head(10).plot.bar(ax=ax[1], color='skyblue')
ax[1].set(
    title='Top 10 Movies with Highest Number of Ratings',
    xlabel='Movie Title',
    ylabel='Number of Ratings',
)
ax[1].tick_params(axis='x', rotation=90)

# Tighten the layout so labels do not overlap, then render
plt.tight_layout()
plt.show()

# Top 10 Users who rated the most movies

From the visualizations above we can see that each of the top 10 users has rated over 1,000 movies. Several insights follow from this:

- **Influence of Highly Active Users:** Highly active users might have an influence on other users through their ratings and reviews. Their engagement can help drive recommendations and influence the viewing choices of less active users.

- **Usage Patterns:** The data can also indicate different usage patterns. For instance, these top users might spend more time on the platform and might be more receptive to new features, premium content, or personalized recommendations.

- **Content Consumption Trends:** These top users can provide valuable insights into content consumption trends. Understanding what these highly engaged users are watching and rating can help in identifying popular genres or types of content that might appeal to other users.

We treat the top 10 as outliers because the descriptive statistics show a very different picture for typical users: 75% of users have rated 350 movies or fewer, which supports the idea that the majority of users are not as highly engaged as the top 10. The median is 123.5 movies (50% of users have rated that many or fewer), which indicates a central tendency towards lower engagement than the highly active top 10. 25% of users have rated 33 movies or fewer, which suggests that a significant portion of the user base has relatively low engagement.

# Top 10 Movies with the Highest Number of Ratings

This list gives a good idea of which movies are watched most and of the level of engagement on the platform. Although individual ratings range from 1 to 5, the large number of ratings indicates a high level of engagement with, and interest in, these titles among users.

**Top Rated Movies:**

- American Beauty (1999) is the most popular (most-rated) movie on the platform, with 3428 ratings.

- Other heavily rated titles include Star Wars, Jurassic Park, Saving Private Ryan, and The Matrix, which are among the most popular films.

- The list predominantly features well-known and critically acclaimed movies from various genres and time periods.

**Genres and Eras:**

The movies span different genres such as drama (American Beauty), sci-fi (Star Wars, The Matrix), action (Terminator 2), and thriller (Silence of the Lambs).

The films also cover a range of release years, from the late 1970s (Star Wars: Episode IV) to the late 1990s (The Matrix).

**Consistent Popularity:**

The high number of ratings indicates sustained popularity over time, suggesting these movies have a lasting appeal and are frequently viewed and rated by new users.

In [None]:
# Genre with the most Ratings

# Count ratings per genre label, not catalogue entries: join every rating to
# its movie first, so each rating adds one to the count of that movie's
# 'Genres' value. Counting movies['Genres'] directly would only measure how
# many movies carry each label.
Genres_rating = movies.merge(ratings, on='MovieID')['Genres'].value_counts()

# Show the 50 genre labels with the most ratings
Genres_rating.head(50)


In [None]:
# Bar chart of the first 50 entries of Genres_rating
plt.figure(figsize=(20, 10))
Genres_rating.head(50).plot.bar(color='coral')

# Label the axes that were just drawn on, then render
plt.gca().set(xlabel='Genre', ylabel='Rating', title='Top 50 Rated Genres')
plt.show()

**This visualization shows genres which are rated the most.**

In [None]:
# Re-wrap movies in a DataFrame (it already is one, coming from pd.read_csv)
movies = pd.DataFrame(movies)

# Join the user/rating rows to their movie records on 'MovieID'
Master_data = pd.merge(user_ratings, movies, on='MovieID')

# Show the combined table
Master_data

In [None]:
# Summary statistics for the numeric columns of the merged table
Master_data.describe()

In [None]:
# Print column names, dtypes and non-null counts for the merged table
Master_data.info()

In [None]:
# Add an AgeGroup column that buckets Age into decades

# Bucket edges; with right=False each bucket is [low, high), e.g. 10 <= age < 20
bins = [0, 10, 20, 30, 40, 50, 60, float('inf')]

# Labels '0-9' ... '50-59' are derived from consecutive edges; the last,
# open-ended bucket is labelled '60+'
labels = [f'{low}-{high - 1}' for low, high in zip(bins[:-2], bins[1:-1])] + ['60+']

Master_data['AgeGroup'] = pd.cut(
    x=Master_data['Age'],
    bins=bins,
    labels=labels,
    right=False,
)

# Show each row's age next to the bucket it was assigned to
print(Master_data.loc[:, ['UserID', 'Age', 'AgeGroup']])



In [None]:
# Distribution of Age

# Master_data is built from the ratings table, so a user appears once per
# rating. Keep a single row per UserID so the bars count users, as the y-axis
# label states, rather than ratings.
user_ages = Master_data.drop_duplicates(subset='UserID')['Age']

plt.figure(figsize=(10,6))
plt.hist(user_ages, bins=5, edgecolor='k')
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('No of Users')
plt.show()

In [None]:
# How many ratings each movie received

# Group the Rating column by MovieID and count the entries in each group
ratings_count = Master_data['Rating'].groupby(Master_data['MovieID']).count()

# Histogram over movies: x = ratings received, y = movies with that many
plt.figure(figsize=(10, 6))
plt.hist(ratings_count, bins=50, edgecolor='k', alpha=0.7)
plt.gca().set(
    title='Movie Ratings Distribution',
    xlabel='Number of Ratings',
    ylabel='Number of Movies',
)
plt.show()



In [None]:
# Create a histogram of the rating values
# Bin edges sit at 0.5, 1.5, ..., 5.5 so that each bar is centred on one rating
# value; bins=5 over range (1, 5) produced 0.8-wide bins that do not line up
# with the values. Assumes ratings lie on the 1-5 scale used for the model.
plt.hist(Master_data['Rating'], bins=np.arange(0.5, 6.0, 1.0), edgecolor='black')
plt.xticks(range(1, 6))
plt.xlabel('Rating')
# Each bar counts individual ratings (rows of Master_data), not movies
plt.ylabel('Number of Ratings')
plt.title('Movie Ratings Distribution')
plt.show()

In [None]:
# Keep only the rows whose Rating equals 5
highly_rated_movies = Master_data.loc[Master_data['Rating'].eq(5)]

# Show the five-star rows
highly_rated_movies

In [None]:
# Number of 5-star ratings per title, as a two-column (Title, Rating) frame
# NOTE: this rebinds movie_ratings, which earlier held the movies/ratings merge
movie_ratings = (
    Master_data.loc[Master_data['Rating'].eq(5)]
    .groupby('Title')['Rating']
    .count()
    .reset_index()
)

# Order titles from most to fewest 5-star ratings
top_rated_movies = movie_ratings.sort_values(by='Rating', ascending=False)

# Bar chart of the first twenty titles in that order
plt.figure(figsize=(12, 6))
plt.bar(top_rated_movies['Title'].head(20), top_rated_movies['Rating'].head(20))
plt.gca().set(
    xlabel='Movie Title',
    ylabel='Number of 5-Star Ratings',
    title='Top-Rated Movies with Many 5-Star Ratings',
)
plt.xticks(rotation=90)  # vertical labels keep long titles readable
plt.show()

In [None]:
# Keep only the rows whose Rating equals 1
low_rated_movies = Master_data.loc[Master_data['Rating'].eq(1)]

# Show the one-star rows
low_rated_movies

In [None]:

# Number of 1-star ratings per title, turned into a two-column frame
movie_ratings_count = low_rated_movies['Title'].value_counts().reset_index()
movie_ratings_count.columns = ['Title', 'Number of 1-Star Ratings']

# Order titles from most to fewest 1-star ratings
top_low_rated_movies = movie_ratings_count.sort_values(
    by='Number of 1-Star Ratings',
    ascending=False,
)

# Bar chart of the first twenty titles in that order
plt.figure(figsize=(12, 6))
plt.bar(
    top_low_rated_movies['Title'].head(20),
    top_low_rated_movies['Number of 1-Star Ratings'].head(20),
)
plt.gca().set(
    xlabel='Movie Title',
    ylabel='Number of 1-Star Ratings',
    title='Movies with Many 1-Star Ratings',
)
plt.xticks(rotation=90)
plt.show()


# Collaborative Filtering Model

**Singular Value Decomposition (SVD)**

In [None]:

# Prepare the dataset for Surprise

# Surprise reads (user, item, rating) triples; the Reader declares the 1-5 scale
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(Master_data[['UserID', 'MovieID', 'Rating']], reader)

# Split the data into a training set (80%) and a testing set (20%)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Build a collaborative filtering model (SVD)
# random_state seeds the factor initialisation, so repeated runs give the same
# model and metrics (the split above was already seeded, the model was not).
model = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=42)
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Evaluate the model using Root Mean Squared Error (RMSE)
# verbose=False stops accuracy.rmse from printing the value itself, which
# previously made every metric appear twice in the output.
rmse = accuracy.rmse(predictions, verbose=False)
print(f"RMSE: {rmse:.2f}")

# Print Mean Absolute Error (MAE)
mae = accuracy.mae(predictions, verbose=False)
print(f"MAE: {mae:.2f}")

# Collect the true rating (r_ui) and the model's estimate (est) of each prediction
actual_ratings = []
predicted_ratings = []
for pred in predictions:
    actual_ratings.append(pred.r_ui)
    predicted_ratings.append(pred.est)

# R-squared = 1 - RSS / TSS
mean_rating = sum(actual_ratings) / len(actual_ratings)

# Total sum of squares: spread of the true ratings around their mean
TSS = sum([(rating - mean_rating) ** 2 for rating in actual_ratings])

# Residual sum of squares: squared error of the estimates
RSS = sum([
    (actual - predicted) ** 2
    for actual, predicted in zip(actual_ratings, predicted_ratings)
])

rsquared = 1 - (RSS / TSS)

print(f"R-squared: {rsquared:.4f}")




# Recommend movies for a specific user
user_id = 5 # Users ID
top_n = 10  # Number of top recommendations to display

# Movies this user has already rated; the set gives fast membership tests
movies_rated_by_user = Master_data.loc[Master_data['UserID'] == user_id, 'MovieID']
rated_movie_ids = set(movies_rated_by_user)

# Candidates: every distinct movie in the data that the user has not rated.
# Working on the distinct ids avoids filtering the whole ratings table.
movies_to_recommend = [
    movie_id
    for movie_id in Master_data['MovieID'].unique()
    if movie_id not in rated_movie_ids
]

# Predicted rating of this user for every candidate movie
user_predictions = [model.predict(user_id, movie_id) for movie_id in movies_to_recommend]

# Sort by predicted rating, best first, and keep the top_n.
# Candidate ids are already distinct, so no extra de-duplication is needed.
sorted_predictions = sorted(user_predictions, key=lambda x: x.est, reverse=True)
top_predictions = sorted_predictions[:top_n]

# MovieID -> Title lookup built once (first title seen per id), instead of
# scanning Master_data again for every recommended movie
title_by_movie_id = Master_data.drop_duplicates(subset='MovieID').set_index('MovieID')['Title']

recommended_movies = {prediction.iid for prediction in top_predictions}
top_movie_titles = [title_by_movie_id.loc[prediction.iid] for prediction in top_predictions]

print(f"Top {top_n} Movie Recommendations for User {user_id}:\n")
for i, title in enumerate(top_movie_titles):
    print(f"{i + 1}. {title}")

While the model achieved an RMSE of 0.87 and an MAE of 0.69, its performance is considered moderate, which is common for recommender systems based on collaborative filtering.