<a href="https://colab.research.google.com/github/Jesteban247/Movie-Recommendation-System-with-Hugging-Face-Transformers/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download the Kaggle Dataset

In [1]:
# Download the dataset from Kaggle
!kaggle datasets download -d PromptCloudHQ/imdb-data

# Unzip the dataset
!unzip /content/imdb-data.zip

Dataset URL: https://www.kaggle.com/datasets/PromptCloudHQ/imdb-data
License(s): other
Downloading imdb-data.zip to /content
  0% 0.00/134k [00:00<?, ?B/s]
100% 134k/134k [00:00<00:00, 54.1MB/s]
Archive:  /content/imdb-data.zip
  inflating: IMDB-Movie-Data.csv     


# Load and Explore the Dataset

In [2]:
# Import necessary libraries
import pandas as pd

# Read the extracted IMDB movie CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/IMDB-Movie-Data.csv')

# Show the first five rows as a quick sanity check
df.head(n=5)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


# Simple Exploratory Data Analysis (EDA)

In [3]:
# Column names, dtypes, and non-null counts
df.info()

# Number of missing values in each column
null_counts = df.isna().sum()
print("\nNull values:\n", null_counts)

# Summary statistics of the dataset
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Genre               1000 non-null   object 
 3   Description         1000 non-null   object 
 4   Director            1000 non-null   object 
 5   Actors              1000 non-null   object 
 6   Year                1000 non-null   int64  
 7   Runtime (Minutes)   1000 non-null   int64  
 8   Rating              1000 non-null   float64
 9   Votes               1000 non-null   int64  
 10  Revenue (Millions)  872 non-null    float64
 11  Metascore           936 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 93.9+ KB

Null values:
 Rank                    0
Title                   0
Genre                   0
Description             0
Director                0


Unnamed: 0,Rank,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
count,1000.0,1000.0,1000.0,1000.0,1000.0,872.0,936.0
mean,500.5,2012.783,113.172,6.7232,169808.3,82.956376,58.985043
std,288.819436,3.205962,18.810908,0.945429,188762.6,103.25354,17.194757
min,1.0,2006.0,66.0,1.9,61.0,0.0,11.0
25%,250.75,2010.0,100.0,6.2,36309.0,13.27,47.0
50%,500.5,2014.0,111.0,6.8,110799.0,47.985,59.5
75%,750.25,2016.0,123.0,7.4,239909.8,113.715,72.0
max,1000.0,2016.0,191.0,9.0,1791916.0,936.63,100.0


# Install Sentence-Transformers

In [4]:
# Install Sentence-Transformers for text embeddings
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Downloading sentence_transformers-3.1.0-py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.1.0


# Combine Genre and Description

In [5]:
# Build one text field per movie: the genre list, a space, then the description
df['combined'] = df['Genre'].str.cat(df['Description'], sep=" ")

# Preview the new column next to the fields it was built from
preview_columns = ['Title', 'Genre', 'Description', 'combined']
df[preview_columns].head()

Unnamed: 0,Title,Genre,Description,combined
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,"Action,Adventure,Sci-Fi A group of intergalact..."
1,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...","Adventure,Mystery,Sci-Fi Following clues to th..."
2,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,"Horror,Thriller Three girls are kidnapped by a..."
3,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...","Animation,Comedy,Family In a city of humanoid ..."
4,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,"Action,Adventure,Fantasy A secret government a..."


# Encode Combined Text Using Sentence-Transformers

In [6]:
# Import Sentence-Transformers and initialize the model
from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 sentence-transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Embed the combined text of every movie in a single encode() call
combined_texts = df['combined'].tolist()
combined_embeddings = model.encode(combined_texts)

print("Embeddings created successfully")

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings created successfully


In [7]:
# Preview the embeddings: as the final expression of the cell, the array is
# echoed as the cell's output
combined_embeddings

array([[-0.01661812, -0.02497773, -0.0201374 , ..., -0.03324123,
        -0.01465607, -0.02214789],
       [-0.06069481,  0.00556127, -0.00047559, ..., -0.01428042,
        -0.04840579, -0.02556889],
       [ 0.00688116, -0.06274199, -0.04401769, ..., -0.00816624,
        -0.02621295, -0.04979355],
       ...,
       [-0.03342928, -0.04918838,  0.06239574, ...,  0.05788609,
        -0.02431428, -0.00781342],
       [-0.05868548,  0.00801864, -0.02061603, ...,  0.03522763,
        -0.00500898, -0.11482406],
       [ 0.03614448,  0.00701489, -0.05121483, ...,  0.06858651,
         0.07987772,  0.03546877]], dtype=float32)

# Define Recommendation Function with Similarity Scores

In [8]:
# Import the necessary function for similarity
from sklearn.metrics.pairwise import cosine_similarity

# Define the recommendation function that uses cosine similarity and displays similarity scores
def get_recommendations(query, combined_embeddings, df, top_n=5):
    """Return the rows of ``df`` whose embeddings are most similar to ``query``.

    The query is embedded with the notebook-level ``model`` and compared with
    every row of ``combined_embeddings`` using cosine similarity.

    Args:
        query: Free-text search string.
        combined_embeddings: Array of embeddings, one row per row of ``df``
            and in the same order.
        df: DataFrame the recommendations are selected from.
        top_n: Maximum number of rows to return. ``0`` yields an empty
            result; a value larger than ``len(df)`` returns every row.

    Returns:
        A copy of the selected rows ordered from most to least similar, with
        an added ``similarity_score`` column.

    Raises:
        ValueError: If ``top_n`` is negative, or if the number of embeddings
            does not match the number of rows in ``df``.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Encode the query
    query_embedding = model.encode([query])

    # Calculate cosine similarity between the query and combined embeddings
    similarities = cosine_similarity(query_embedding, combined_embeddings)[0]

    # Positional indices from the similarity vector are used to index df, so
    # the two must line up one-to-one; otherwise rows would be silently
    # mismatched with scores.
    if similarities.shape[0] != len(df):
        raise ValueError(
            f"combined_embeddings has {similarities.shape[0]} rows but df has {len(df)}"
        )

    # Reverse the ascending argsort first and slice afterwards. Slicing with
    # [-top_n:] before reversing would return every row for top_n=0, because
    # [-0:] is the same slice as [0:].
    top_indices = similarities.argsort()[::-1][:top_n]

    # Create a DataFrame with the top results and their similarity scores
    recommendations = df.iloc[top_indices].copy()
    recommendations['similarity_score'] = similarities[top_indices]

    return recommendations

# Get Movie Recommendations

In [9]:
# Free-text description of the kind of movie we are looking for
query = "Funny movies with zombies"

# Retrieve the ten closest matches for the query
recommendations = get_recommendations(
    query=query,
    combined_embeddings=combined_embeddings,
    df=df,
    top_n=10,
)

# Show only the columns of interest, including the similarity score
display_columns = ['Title', 'Genre', 'Description', 'Rating', 'similarity_score']
recommendations.loc[:, display_columns]

Unnamed: 0,Title,Genre,Description,Rating,similarity_score
876,Warm Bodies,"Comedy,Horror,Romance",After a highly unusual zombie saves a still-li...,6.9,0.654963
363,Zombieland,"Adventure,Comedy,Horror",A shy student trying to reach his family in Oh...,7.7,0.637174
908,Slither,"Comedy,Horror,Sci-Fi","A small town is taken over by an alien plague,...",6.5,0.63347
895,Planet Terror,"Action,Comedy,Horror","After an experimental bio-weapon is released, ...",7.1,0.599066
921,Scouts Guide to the Zombie Apocalypse,"Action,Comedy,Horror","Three scouts, on the eve of their last camp-ou...",6.3,0.563348
537,The Do-Over,"Action,Adventure,Comedy",Two down-on-their-luck guys decide to fake the...,5.7,0.533523
607,Horrible Bosses,"Comedy,Crime",Three friends conspire to murder their awful b...,6.9,0.520518
380,What We Do in the Shadows,"Comedy,Fantasy,Horror",A documentary team films the lives of a group ...,7.6,0.504286
378,Swiss Army Man,"Adventure,Comedy,Drama",A hopeless man stranded on a deserted island b...,7.1,0.503205
719,Neighbors,Comedy,After they are forced to live next to a frater...,6.4,0.492379
