https://www.kaggle.com/code/mehmetisik/content-based-recommendation

In [1]:
import os
# Detect whether the notebook is running inside Google Colab.
try:
    import google.colab  # noqa: F401 -- imported only to probe for the Colab runtime
    IN_COLAB = True
except ImportError:
    # Only a missing module means "not Colab"; any other error should surface
    # instead of being swallowed by a bare except.
    IN_COLAB = False

path_absolute = ''
if IN_COLAB:
    print("El código se está ejecutando en Google Colab.")
    from google.colab import drive

    drive.mount('/content/drive')
    path_absolute = '/content/drive/Othercomputers/Mi_portátil/TFM/WorkSpace/'

    # Switch to the project folder on Google Drive.
    os.chdir(path_absolute)

    # List the files and folders found in that folder.
    contenido_carpeta = os.listdir(path_absolute)
    print("Contenido de la carpeta en Google Drive:")
    print(contenido_carpeta)
else:
    print("El código se está ejecutando en un entorno local.")
    # Normalize Windows separators so later string concatenation works on any OS.
    path_absolute = os.getcwd().replace("\\", "/")

datasets_path = "/datasets/"
# Strip a trailing slash first: the Colab base path ends with "/", which used to
# produce ".../WorkSpace//datasets/".
path_absolute = path_absolute.rstrip("/") + datasets_path

El código se está ejecutando en un entorno local.


![CBR](https://miro.medium.com/v2/resize:fit:1400/1*H_MMnrpLQrqTSJHdDOCMoA.png)

# What is Content Based Recommendation

Content-based recommendation, also known as content-based filtering, is a type of system or algorithm that provides recommendations to a user based on their interests and preferences. Such systems analyze the user's past preferences and likes, and suggest new items with similar content.

Content-based recommendation analyzes the content of items and determines the ones that are suitable for the user based on similarity criteria. For example, when making a movie recommendation, the system can take into account the genres, actors, directors, and other features of the movies the user has liked or watched. Based on this information, the system suggests other movies with similar characteristics.

This recommendation system can utilize text analysis, tagging, categorization, or other content features along with the user profile or history to better understand the user's preferences. For instance, when making a music recommendation, the system can analyze features such as genres, instruments, tempo, and rhythm.

Content-based recommendation systems can be effective in providing personalized recommendations based on user preferences. Because the suggestions are derived from the user's own past data, they are more likely to capture the user's interest and provide a better user experience.

# Business Problem
Recommend movies similar to the ones that a visitor to our site has already watched.

# Road Map

- 1. Creating the **TF-IDF Matrix**
- 2. Creation of **Cosine Similarity Matrix**
- 3. Making Recommendations Based on Similarities
- 4. Preparation of the Study Script


In [2]:
# import Required Libraries

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Display settings: no limit on the columns/rows shown, a 500-character display width,
# and no wrapping of wide frames across several lines.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.width', 500),
    ('display.expand_frame_repr', False),
):
    pd.set_option(option_name, option_value)

In [4]:
# Loading the Data Set
# df = pd.read_csv(path_absolute+"movies_metadata.csv")


In [5]:
# Read the merged CSV from the datasets folder resolved in the setup cell.
merged_csv_path = path_absolute + "df_mezclado_tags_ratings_movies_links_genTags.csv"
df = pd.read_csv(merged_csv_path)

In [6]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp_valoraciones,tag_df_mezclado_tags_ratings_movies_links_genMov,timestamp_etiquetas,title,genres,imdbId,tmdbId,tagId,relevance,tag_etiquetas_genómicas
0,3,260,4.0,1439472239,classic,1439472355,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,76759,11.0,75,0.26025,art
1,3,260,4.0,1439472239,sci-fi,1439472256,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,76759,11.0,75,0.26025,art
2,264,260,3.0,1543390081,aliens,1543390134,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,76759,11.0,75,0.26025,art
3,264,260,3.0,1543390081,oldie but goodie,1543390130,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,76759,11.0,75,0.26025,art
4,264,260,3.0,1543390081,scifi cult,1543390105,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,76759,11.0,75,0.26025,art


In [7]:
# Tidy the merged frame in one idempotent pass (re-running this cell no longer
# raises on columns that were already dropped):
#   * shorten the long column names,
#   * drop the external ids (imdbId, tmdbId),
#   * put gen_tag at position 5 (right after timestamp_tags in this dataset)
#     and move relevance and rating to the end.
nombres_nuevos = {
    'timestamp_valoraciones': 'timestamp_rt',
    'timestamp_etiquetas': 'timestamp_tags',
    'tag_df_mezclado_tags_ratings_movies_links_genMov': 'tag_by_user',
    'tag_etiquetas_genómicas': 'gen_tag',
}
columnas_a_eliminar = ['imdbId', 'tmdbId']
columnas_finales = ['relevance', 'rating']

# rename() ignores keys that are absent, so a second run is a no-op here.
df = df.rename(columns=nombres_nuevos)

# Every remaining column keeps its relative order; only the three moved columns
# are taken out and re-inserted at their target positions.
columnas_ordenadas = [
    columna for columna in df.columns
    if columna not in columnas_a_eliminar
    and columna not in columnas_finales
    and columna != 'gen_tag'
]
columnas_ordenadas.insert(5, 'gen_tag')

# Selecting by name both reorders the columns and leaves out the dropped ids.
df = df[columnas_ordenadas + columnas_finales]

In [8]:
df.head()

Unnamed: 0,userId,movieId,timestamp_rt,tag_by_user,timestamp_tags,gen_tag,title,genres,tagId,relevance,rating
0,3,260,1439472239,classic,1439472355,art,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,75,0.26025,4.0
1,3,260,1439472239,sci-fi,1439472256,art,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,75,0.26025,4.0
2,264,260,1543390081,aliens,1543390134,art,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,75,0.26025,3.0
3,264,260,1543390081,oldie but goodie,1543390130,art,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,75,0.26025,3.0
4,264,260,1543390081,scifi cult,1543390105,art,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,75,0.26025,3.0


In [9]:
df.shape

(8949372, 11)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8949372 entries, 0 to 8949371
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   userId          int64  
 1   movieId         int64  
 2   timestamp_rt    int64  
 3   tag_by_user     object 
 4   timestamp_tags  int64  
 5   gen_tag         object 
 6   title           object 
 7   genres          object 
 8   tagId           int64  
 9   relevance       float64
 10  rating          float64
dtypes: float64(2), int64(5), object(4)
memory usage: 751.1+ MB


In [11]:
# Missing values per column
valores_nulos_por_columna = df.isnull().sum()

# Missing values in the whole frame: the sum of the per-column counts
total_valores_nulos = valores_nulos_por_columna.sum()

# Report the per-column breakdown, then the overall total
print("Valores nulos por columna:", valores_nulos_por_columna, sep="\n")
print("\nTotal de valores nulos en el dataset:", total_valores_nulos)

Valores nulos por columna:
userId             0
movieId            0
timestamp_rt       0
tag_by_user       11
timestamp_tags     0
gen_tag            0
title              0
genres             0
tagId              0
relevance          0
rating             0
dtype: int64

Total de valores nulos en el dataset: 11


In [12]:
# Rows whose user tag is missing
filas_con_nulos = df.loc[df['tag_by_user'].isna()]

# Show those rows before discarding them
print("Filas con valores nulos en la columna tag:", filas_con_nulos, sep="\n")

# Discard the rows with a missing user tag
df = df.dropna(subset=['tag_by_user'])
print("DataFrame después de eliminar filas con valores nulos:")

# Recount the missing values (per column, then overall) and report them
valores_nulos_por_columna = df.isnull().sum()
total_valores_nulos = valores_nulos_por_columna.sum()
print("Valores nulos por columna:", valores_nulos_por_columna, sep="\n")
print("\nTotal de valores nulos en el dataset:", total_valores_nulos)

Filas con valores nulos en la columna tag:
         userId  movieId  timestamp_rt tag_by_user  timestamp_tags              gen_tag               title        genres  tagId  relevance  rating
250975   121710    33826    1228449251         NaN      1221450908          archaeology  Saint Ralph (2004)  Comedy|Drama     71    0.00975     4.5
1359566  121710    33826    1228449251         NaN      1221450908            bollywood  Saint Ralph (2004)  Comedy|Drama    149    0.33825     4.5
1920495  121710    33826    1228449251         NaN      1221450908       figure skating  Saint Ralph (2004)  Comedy|Drama    392    0.03200     4.5
3036225  121710    33826    1228449251         NaN      1221450908    parallel universe  Saint Ralph (2004)  Comedy|Drama    765    0.18150     4.5
5347654  121710    33826    1228449251         NaN      1221450908                  007  Saint Ralph (2004)  Comedy|Drama      1    0.03275     4.5
6489243  121710    33826    1228449251         NaN      1221450908   

In [13]:
df.shape

(8949361, 11)

In [14]:
# Working subset: the title, the three text columns (genres, tag_by_user, gen_tag) and the two numeric scores.
content_df = df[['title', 'genres', 'tag_by_user', 'gen_tag', 'relevance', 'rating']]

In [15]:
# Defensive: rows with a missing tag_by_user were already dropped from df earlier, so this should remove nothing.
content_df = content_df.dropna(subset=['tag_by_user'])

In [16]:
import re
import string

def remove_bars(text):
    """Return ``text`` with every ``|`` replaced by a single space."""
    bar_pattern = re.compile(r'\|')
    return bar_pattern.sub(' ', text)

def separate(text):
    """Normalize a comma-separated list of names into space-separated tokens.

    Each comma-separated item has any parenthesised text removed, is stripped
    of digits, spaces and ASCII punctuation, and is lower-cased, so a
    multi-word name collapses into a single token.

    Args:
        text: Comma-separated string, e.g. "Warner Bros., Lancaster Gate".

    Returns:
        The cleaned items joined by single spaces.
    """
    # Raw string: '\(' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    parenthesised = re.compile(r'\(.*\)')
    # One table, built once, deletes digits, spaces and punctuation in a single pass.
    strip_table = str.maketrans('', '', string.digits + ' ' + string.punctuation)
    return ' '.join(
        parenthesised.sub('', item).translate(strip_table).lower()
        for item in text.split(',')
    )

def remove_punc(text):
    """Lower-case ``text`` and delete ASCII punctuation and digits.

    Args:
        text: The string to clean. Non-string values (for example ``None`` or
            a float NaN) are treated as empty text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # The old code printed the error for such values and then failed with
        # UnboundLocalError, because the result variable was never assigned.
        return ''
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    return text.translate(strip_table).lower()

In [17]:
# Usage example for remove_punc; alternative sample inputs are kept commented out below.
text_with_bars = "oldie but goodie"

# text_with_bars = "Pixar Animation Studios"

# text_with_bars = "Warner Bros., Lancaster Gate"

cleaned_text = remove_punc(text_with_bars)
print(cleaned_text)

oldie but goodie


In [18]:
# Genres: turn the '|' separators into spaces, then strip punctuation/digits and lower-case.
content_df['genres'] = content_df['genres'].apply(
    lambda raw_genres: remove_punc(remove_bars(raw_genres))
)
# User tags: strip punctuation/digits and lower-case.
content_df['tag_by_user'] = content_df['tag_by_user'].map(remove_punc)

In [19]:
content_df.head()

Unnamed: 0,title,genres,tag_by_user,gen_tag,relevance,rating
0,Star Wars: Episode IV - A New Hope (1977),action adventure scifi,classic,art,0.26025,4.0
1,Star Wars: Episode IV - A New Hope (1977),action adventure scifi,scifi,art,0.26025,4.0
2,Star Wars: Episode IV - A New Hope (1977),action adventure scifi,aliens,art,0.26025,3.0
3,Star Wars: Episode IV - A New Hope (1977),action adventure scifi,oldie but goodie,art,0.26025,3.0
4,Star Wars: Episode IV - A New Hope (1977),action adventure scifi,scifi cult,art,0.26025,3.0


In [20]:
# Build one free-text "bag of words" per row from the genres, the user tag and
# the genome tag. Series.str.cat concatenates the columns in a vectorized way
# instead of a row-wise apply(axis=1). It also avoids the trailing space the old
# join produced by including the empty 'bag_of_words' placeholder column.
columnas_texto = ['genres', 'tag_by_user', 'gen_tag']
bag_of_words = content_df['genres'].str.cat(
    [content_df['tag_by_user'], content_df['gen_tag']],
    sep=' ',
)

# Kept for inspection: the three source columns next to the combined text.
content_df_aux = content_df[columnas_texto].assign(bag_of_words=bag_of_words)

# Replace the separate text columns of content_df with the single combined one;
# it takes the place of 'genres', so the column order stays
# title, bag_of_words, relevance, rating.
columnas_a_eliminar = ['tag_by_user', 'gen_tag']
content_df = (
    content_df
    .drop(columns=columnas_a_eliminar)
    .assign(genres=bag_of_words)
    .rename(columns={'genres': 'bag_of_words'})
)

content_df.head()

Unnamed: 0,title,bag_of_words,relevance,rating
0,Star Wars: Episode IV - A New Hope (1977),action adventure scifi classic art,0.26025,4.0
1,Star Wars: Episode IV - A New Hope (1977),action adventure scifi scifi art,0.26025,4.0
2,Star Wars: Episode IV - A New Hope (1977),action adventure scifi aliens art,0.26025,3.0
3,Star Wars: Episode IV - A New Hope (1977),action adventure scifi oldie but goodie art,0.26025,3.0
4,Star Wars: Episode IV - A New Hope (1977),action adventure scifi scifi cult art,0.26025,3.0


# 1. Creating the TF-IDF Matrix

In [21]:
# NOTE(review): `df` has no 'overview' column (see the KeyError below and the df.info() listing earlier).
# The text column built earlier in this notebook is content_df['bag_of_words'] — presumably the intended input; confirm.
df["overview"].head()

KeyError: 'overview'

In [None]:
df["overview"].isnull().sum()

In [None]:
# Ignore common English stop words (a, an, the, and, but, ...) that carry no useful meaning for us.

tfidf = TfidfVectorizer(stop_words="english")

In [None]:
# Replace the null values in the 'overview' column with empty strings to avoid errors in the following steps

df['overview'] = df['overview'].fillna('')

In [None]:
df["overview"].isnull().sum()

In [None]:
# fit and transform according to the tfidf object
# Those in rows are texts 'overview'. Those in columns are unique words.

tfidf_matrix = tfidf.fit_transform(df['overview'])

In [None]:
tfidf_matrix.shape

In [None]:
#If we want to see all the unique words in the columns

#tfidf.get_feature_names()


In [None]:
# TF-IDF scores as a dense array.
# NOTE(review): toarray() materializes the whole documents x vocabulary matrix in memory — confirm it fits for the data in use.

tfidf_matrix.toarray()

# 2. Creation of Cosine Similarity Matrix

In [None]:
# Cosine similarity for every pair of documents: row i of cosine_sim holds the similarity of document i to all documents.
# NOTE(review): the result has one row and one column per input row, so memory grows quadratically with the row count — confirm it is feasible.

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
cosine_sim.shape

In [None]:
cosine_sim

In [None]:
# To see how the movie in index 1 is similar to all the other movies

cosine_sim[1]

# 3. Making Recommendations Based on Similarities

In [None]:
# Map each title to its row *position* (0..n-1). cosine_sim rows and .iloc are
# positional, while df.index keeps gaps after the earlier dropna, so the index
# labels must not be used as positions here.

indices = pd.Series(np.arange(len(df)), index=df['title'])

In [None]:
indices.head()

In [None]:
# Count how often each title occurs; repeated titles are reduced to their last occurrence in the next cell.

indices.index.value_counts().head()

In [None]:
indices = indices[~indices.index.duplicated(keep='last')]

In [None]:
indices["Cinderella"]

In [None]:
indices["Sherlock Holmes"]

In [None]:
# I assign the index of the movie 'Sherlock Holmes' to the variable

movie_index = indices['Sherlock Holmes']

In [None]:
cosine_sim[movie_index]

In [None]:
cosine_sim[movie_index].shape

In [None]:
# Similarity scores between the movie 'Sherlock Holmes' and every movie, as a one-column DataFrame


similarity_scores = pd.DataFrame(cosine_sim[movie_index],
                                 columns=["score"])

In [None]:
# The similarities between the movie 'Sherlock Holmes' and all other movies

similarity_scores.head()

In [None]:
similarity_scores.shape

In [None]:
# Sort the similarity scores in descending order and keep entries 1..10. Entry 0 is skipped on the
# assumption that the movie itself sorts first — NOTE(review): ties at the top score can break this.

movie_indices = similarity_scores.sort_values("score", ascending=False)[1:11].index

In [None]:
# Go to the indexes we selected in our first data set

df['title'].iloc[movie_indices]

# 4. Preparation of the Study Script

In [None]:
def content_based_recommender(title, cosine_sim, dataframe, top_n=10):
    """Return the titles most similar to ``title`` according to ``cosine_sim``.

    Args:
        title: Title to look up in ``dataframe['title']``. If it occurs more
            than once, its last occurrence is used.
        cosine_sim: Square similarity matrix whose row/column ``i`` refers to
            the ``i``-th row of ``dataframe`` (by position).
        dataframe: Frame with a 'title' column, in the same row order that was
            used to build ``cosine_sim``.
        top_n: Number of recommendations to return. Defaults to 10.

    Returns:
        A Series with up to ``top_n`` titles, most similar first.

    Raises:
        KeyError: If ``title`` is not present in ``dataframe['title']``.
    """
    titles = dataframe['title']
    # Use row positions rather than index labels: cosine_sim and .iloc are
    # positional, and the frame's index may have gaps (e.g. after dropna).
    positions = pd.Series(np.arange(len(dataframe)), index=titles)
    positions = positions[~positions.index.duplicated(keep='last')]
    if title not in positions.index:
        raise KeyError(f"Title not found in dataframe: {title!r}")
    movie_position = int(positions[title])

    scores = pd.Series(np.asarray(cosine_sim[movie_position]).ravel())
    # Remove the movie itself explicitly instead of assuming it sorts first;
    # another row with the same top score could otherwise take its place.
    ranked = scores.drop(movie_position).sort_values(ascending=False, kind='mergesort')
    return titles.iloc[ranked.index[:top_n]]

In [None]:
content_based_recommender("Sherlock Holmes", cosine_sim, df)

In [None]:
content_based_recommender("The Matrix", cosine_sim, df)

In [None]:
content_based_recommender("The Godfather", cosine_sim, df)

In [None]:
content_based_recommender('The Dark Knight Rises', cosine_sim, df)

In [None]:
def calculate_cosine_sim(dataframe, text_column='overview'):
    """Build the pairwise cosine-similarity matrix of a text column's TF-IDF vectors.

    Args:
        dataframe: Frame containing ``text_column``. It is not modified.
        text_column: Name of the column holding the text. Defaults to
            'overview'.

    Returns:
        The result of ``cosine_similarity`` applied to the TF-IDF matrix
        against itself.

    Raises:
        KeyError: If ``text_column`` is not a column of ``dataframe``.
    """
    # Fill missing text on a local Series; the old code wrote the filled
    # column back into the caller's frame as a side effect.
    documents = dataframe[text_column].fillna('')
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(documents)
    return cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
# cosine_sim = calculate_cosine_sim(df)
# content_based_recommender('The Dark Knight Rises', cosine_sim, df)