# TF-IDF

In [1]:
import pandas as pd
import numpy as np

# Load the movie feature table. Forward slashes keep the relative path valid
# on every OS and avoid backslash escape sequences inside the string literal.
df = pd.read_csv('../Data/movies_features.csv')

In [2]:
# Preview the first row to check which columns were loaded.
df.head(1)

Unnamed: 0,genres,id,imdb_link,original_title,overview,poster_path,production_companies,release_date,runtime,vote_average,first_title_char
0,"Animation, Comedy, Family",862,https://www.imdb.com/title/tt0114709/,Toy Story,"Led by Woody, Andy's toys live happily in his ...",https://image.tmdb.org/t/p/w500/rhIRbceoE9lR4v...,Pixar Animation Studios,1995-10-30,81,7.7,T


### preprocess data

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK resources used below: the stop-word lists and the 'punkt' package
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Lookup sets of what the cleaning step discards:
# ASCII punctuation characters and English stop words
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sacha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sacha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Fonction pour nettoyer le texte
def pre_process_text(text: str)-> str:
    """
    Clean a movie overview before TF-IDF vectorisation.

    The text is lowercased and tokenised with ``word_tokenize``; stop words
    and punctuation tokens are dropped, and the remaining tokens are joined
    back together with single spaces.

    Args:
        text: raw overview. Any non-string value (for example the NaN that
            pandas uses for a missing overview) is treated as an empty
            overview.

    Returns:
        The cleaned overview, or "" when there is nothing to clean.
    """
    # A missing overview is not a str and has no .lower(); treat it as empty
    # instead of failing in the middle of a DataFrame.apply.
    if not isinstance(text, str):
        return ""

    tokens = word_tokenize(text.lower())

    # Checking every character (rather than the whole token) also removes
    # multi-character punctuation tokens such as "...", "--", "``" or "''",
    # which are not members of the single-character `punctuation` set.
    kept_tokens = [
        token for token in tokens
        if token not in stop_words
        and not all(char in punctuation for char in token)
    ]

    return " ".join(kept_tokens)

### tfidf matrix

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Clean every movie overview and collect the results in a plain list
summaries = [pre_process_text(overview) for overview in df['overview']]

# Fit a default TfidfVectorizer on the cleaned overviews and build the
# TF-IDF matrix in the same call
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(summaries)

# Report the shape of the TF-IDF matrix
print("La taille de la matrice TF-IDF est :", tfidf_matrix.shape)


La taille de la matrice TF-IDF est : (31269, 56791)


### cosine similarity

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
# Runs in roughly 30 sec

# Cast to float32 first (noted by the author as taking less time)
tfidf_matrix = tfidf_matrix.astype(np.float32)

# Cosine similarity between the overview vectors of every pair of movies;
# with a single argument the matrix is compared against itself
similarity_matrix = cosine_similarity(tfidf_matrix)

# Report the shape of the similarity matrix
print("La taille de la matrice de similarité cosinus est :", similarity_matrix.shape)


La taille de la matrice de similarité cosinus est : (31269, 31269)


### select the similar movies

In [7]:
def get_top_5_similar_movies(row:pd.Series) -> pd.Series:
    """
    Return the ids of the 5 movies most similar to the movie in the given row.

    Reads the module-level ``similarity_matrix`` and ``df``. ``row.name`` is
    used as the position of the movie in both, so ``df`` is expected to carry
    a default 0..n-1 index.

    Args:
        row: one row of ``df`` (as passed by ``df.apply(..., axis=1)``).

    Returns:
        A Series indexed by 'sim_movie_1' .. 'sim_movie_5' holding movie ids,
        most similar first. The movie itself is never part of the result.
        When fewer than 5 other movies exist, the remaining slots are NaN.
    """
    n_top = 5
    labels = [f'sim_movie_{rank}' for rank in range(1, n_top + 1)]

    # Similarity scores between the current movie and every movie
    similarity_scores = np.asarray(similarity_matrix[row.name])

    # Take one candidate more than needed so that removing the movie itself
    # still leaves n_top results; cap at the number of movies available.
    k = min(n_top + 1, similarity_scores.shape[0])
    # Passing range(k) as kth puts the k best scores in sorted order up front.
    candidates = np.argpartition(-similarity_scores, range(k))[:k]

    # Remove the movie itself explicitly: with tied scores (duplicate
    # overviews) or an all-zero row it is not guaranteed to be ranked first.
    candidates = candidates[candidates != row.name][:n_top]

    # Ids rather than titles, because titles contain duplicates
    top_ids = df['id'].to_numpy()[candidates]

    # reindex pads with NaN only when fewer than n_top ids were found
    return pd.Series(top_ids, index=labels[:len(top_ids)]).reindex(labels)


In [8]:
# Add the ids of the 5 most similar movies to every row (40-50 sec)
sim_columns = ['sim_movie_1', 'sim_movie_2', 'sim_movie_3', 'sim_movie_4', 'sim_movie_5']
df[sim_columns] = df.apply(get_top_5_similar_movies, axis=1)

# Save the enriched table. Forward slashes keep the relative path valid on
# every OS and avoid backslash escape sequences inside the string literal.
df.to_csv('../Data/movies_with_recommendation.csv', index=False)