# 1) Installation des bibliothèques


In [None]:
!pip install kagglehub transformers datasets faiss-cpu sentence-transformers

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-trans

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download the NLTK data packages (tokenizer models, stop words, WordNet),
# in the same order as before.
for nltk_resource in ("punkt", "stopwords", "wordnet", "punkt_tab"):
    nltk.download(nltk_resource)

from collections import Counter
from datasets import Dataset, load_from_disk
import faiss

# Transformers / RAG / DPR
from transformers import (
    DPRContextEncoder,
    DPRContextEncoderTokenizer,
    RagTokenizer,
    RagTokenForGeneration,
    RagRetriever
)

from sentence_transformers import SentenceTransformer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# 2) Téléchargement du dataset (via kagglehub)

In [None]:
import kagglehub

# Download "The Movies Dataset" through kagglehub; the call returns the local
# directory that holds the downloaded files.
path = kagglehub.dataset_download("rounakbanik/the-movies-dataset")

# Main metadata file inside that directory.
csv_path = os.path.join(path, "movies_metadata.csv")

df = pd.read_csv(csv_path, low_memory=False)
print("Nombre de lignes d’origine :", len(df))

# Keep only the subset of columns used by the rest of the notebook.
cols_to_keep = [
    "title",
    "overview",
    "genres",
    "original_language",
    "release_date",
    "vote_average",
    "popularity",
    "poster_path",
]

df = df[cols_to_keep]


Downloading from https://www.kaggle.com/api/v1/datasets/download/rounakbanik/the-movies-dataset?dataset_version_number=7...


100%|██████████| 228M/228M [00:06<00:00, 36.3MB/s]

Extracting files...





Nombre de lignes d’origine : 45466


# 3) Nettoyage et prétraitements


In [None]:
def remove_punctuation(text):
    """Return `text` with every character of `string.punctuation` removed."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

def to_lowercase(text):
    """Return a lower-cased copy of `text` (delegates to `text.lower()`)."""
    lowered = text.lower()
    return lowered

def _english_stop_words():
    """Return the NLTK English stop-word set, built on first use and then reused."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        # Build the set once: this function is applied row by row to the whole
        # DataFrame, and rebuilding the list on every call is wasted work.
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def remove_stopwords(text):
    """Tokenize `text` and drop the tokens found in NLTK's English stop-word list.

    Args:
        text: string to filter. The comparison is case-sensitive, so callers
            are expected to lower-case the text first (as `preprocess_text` does).

    Returns:
        The remaining tokens joined with single spaces.
    """
    stop_words = _english_stop_words()
    return ' '.join(w for w in word_tokenize(text) if w not in stop_words)

def lemmatize_text(text):
    """Tokenize `text` and join the WordNet lemma of each token with spaces."""
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

def preprocess_text(text):
    """Run the full cleaning pipeline on one text value.

    Non-string inputs (for example NaN) are treated as the empty string. The
    steps run in this order: lower-casing, punctuation removal, stop-word
    removal, lemmatization.
    """
    if not isinstance(text, str):
        text = ""
    pipeline = (to_lowercase, remove_punctuation, remove_stopwords, lemmatize_text)
    for step in pipeline:
        text = step(text)
    return text


In [None]:
# Row filters, applied in this order:
#   1. drop rows whose overview is missing,
#   2. keep only overviews longer than 30 characters,
#   3. drop duplicated (title, overview) pairs,
#   4. (optional) keep only films whose original language is "en".
df = df.dropna(subset=["overview"])
df = df[df["overview"].str.len() > 30]
df = df.drop_duplicates(subset=["title", "overview"])
df = df[df["original_language"] == "en"]

# Cleaned version of the "overview" field.
df = df.assign(processed_overview=df["overview"].apply(preprocess_text))

# Text that gets embedded: the raw title followed by the cleaned overview.
df = df.assign(
    text=df["title"].fillna("") + ". " + df["processed_overview"].fillna("")
)

# Renumber the rows 0..n-1 after filtering.
df = df.reset_index(drop=True)
print("Nombre de lignes après nettoyage :", len(df))
print(df[["title", "overview", "text"]].head(3))



Nombre de lignes après nettoyage : 32025
              title                                           overview  \
0         Toy Story  Led by Woody, Andy's toys live happily in his ...   
1           Jumanji  When siblings Judy and Peter discover an encha...   
2  Grumpier Old Men  A family wedding reignites the ancient feud be...   

                                                text  
0  Toy Story. led woody andys toy live happily ro...  
1  Jumanji. sibling judy peter discover enchanted...  
2  Grumpier Old Men. family wedding reignites anc...  


In [None]:
# Path of the CSV file that receives the cleaned dataset
csv_output_path = "/content/movies_metadata_clean.csv"

# Write the DataFrame as UTF-8 CSV, without the index column
df.to_csv(csv_output_path, index=False, encoding="utf-8")

print(f"✅ Dataset sauvegardé sous {csv_output_path} avec {len(df)} films.")

# Optional (disabled): mount Google Drive and save a copy of the CSV there.
# from google.colab import drive
# drive.mount('/content/drive')

# # Save to Google Drive
# df.to_csv("/content/drive/My Drive/movies_metadata_clean.csv", index=False, encoding="utf-8")
# print("✅ Fichier sauvegardé sur Google Drive !")


✅ Dataset sauvegardé sous /content/movies_metadata_clean.csv avec 32025 films.


In [None]:
import os
# List the entries of the current working directory.
print("Fichiers dans le répertoire actuel :", os.listdir())


Fichiers dans le répertoire actuel : ['.config', 'my_custom_dataset', 'movies_metadata_clean.csv', 'sample_data']


In [None]:
# Pour diminuer le temps de traitement des données (décommenter la ligne suivante pour ne garder que les 1000 premières lignes)
# df = df.iloc[:1000]


# 4) Création du dataset Hugging Face + Indexation

## 4.1) Approche DPR

In [None]:
# Run on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# DPR context encoder and its tokenizer, both loaded from the same checkpoint name.
ctx_encoder_name = "facebook/dpr-ctx_encoder-single-nq-base"
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(ctx_encoder_name)
ctx_encoder = DPRContextEncoder.from_pretrained(ctx_encoder_name)
ctx_encoder = ctx_encoder.to(device)
# Put the module in evaluation mode; it is only used to compute embeddings here.
ctx_encoder.eval()

def embed_passages_dpr(examples):
    """Embed a batch of passages with the DPR context encoder.

    Args:
        examples: batch mapping whose "text" entry holds the passages to
            encode (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        A dict with a single "embeddings" key holding the encoder's
        `pooler_output`, moved to the CPU and converted to a NumPy array.
    """
    encoded = ctx_tokenizer(
        examples["text"],
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    # Move every tokenizer output onto the same device as the encoder.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)
    # Gradients are not needed to compute embeddings.
    with torch.no_grad():
        pooled = ctx_encoder(**on_device).pooler_output
    return {"embeddings": pooled.cpu().numpy()}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 4.2) Approche Sentence-Transformers (souvent plus adaptée à la similarité)

In [None]:
# Example: 'all-mpnet-base-v2' is a very good Sentence-Transformer model; it is loaded on `device`.
sbert_model = SentenceTransformer('all-mpnet-base-v2', device=device)

def embed_passages_sbert(examples):
    """Embed a batch of passages with the Sentence-Transformers model.

    Args:
        examples: batch mapping whose "text" entry holds the passages to encode.

    Returns:
        A dict with a single "embeddings_sbert" key holding what
        `sbert_model.encode` returns for the batch.
    """
    # encode() does its own batching over the list of texts.
    vectors = sbert_model.encode(
        examples["text"],
        batch_size=8,
        show_progress_bar=False,
    )
    return {"embeddings_sbert": vectors}


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## 4.3) Création du dataset et ajout de l’index FAISS

In [None]:
dataset_hf = Dataset.from_pandas(df)

# DPR embeddings -> "embeddings" column
dataset_hf = dataset_hf.map(embed_passages_dpr, batched=True, batch_size=8)

# SBERT embeddings -> "embeddings_sbert" column
dataset_hf = dataset_hf.map(embed_passages_sbert, batched=True, batch_size=8)

# Two FAISS indexes, one per embedding column.
# DPR (and the RAG retriever that reuses this index) ranks passages by inner
# product, so the DPR index is built with that metric instead of the library
# default (L2 distance).
dataset_hf = dataset_hf.add_faiss_index(
    column="embeddings",
    index_name="index_dpr",
    metric_type=faiss.METRIC_INNER_PRODUCT,
)
# The SBERT index keeps the library default metric.
dataset_hf = dataset_hf.add_faiss_index(column="embeddings_sbert", index_name="index_sbert")

print(dataset_hf)


Map:   0%|          | 0/32025 [00:00<?, ? examples/s]

Map:   0%|          | 0/32025 [00:00<?, ? examples/s]

  0%|          | 0/33 [00:00<?, ?it/s]

  0%|          | 0/33 [00:00<?, ?it/s]

Dataset({
    features: ['title', 'overview', 'genres', 'original_language', 'release_date', 'vote_average', 'popularity', 'poster_path', 'processed_overview', 'text', 'embeddings', 'embeddings_sbert'],
    num_rows: 32025
})


# 5) Fonctions de recommandation (FAISS + RAG)

## 5.1) Recherche via FAISS et cosinus

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np

def recommend_faiss(query_vector, dataset, index_name="index_dpr", top_n=3):
    """Return up to `top_n` nearest rows of `dataset` for `query_vector`.

    Args:
        query_vector: NumPy query embedding; it is reshaped to (1, -1) before
            being handed to the index.
        dataset: object exposing `get_index(index_name).search(query, k)` and
            integer row access (`dataset[i]`) returning a mapping with
            "title" and "text" entries.
        index_name: name of the index to query.
        top_n: maximum number of recommendations to return.

    Returns:
        A list of dicts with keys "title" and "overview", best match first.
        "overview" carries the row's "text" column.
    """
    scores, indices = dataset.get_index(index_name).search(query_vector.reshape(1, -1), top_n)

    # The index may return the ids as a flat (top_n,) array or as a (1, top_n)
    # matrix. Flattening handles both and keeps every hit, not just the first.
    flat_ids = np.asarray(indices).reshape(-1)[:top_n]

    recommendations = []
    for raw_id in flat_ids:
        row_id = int(raw_id)  # numpy integer -> plain int for dataset indexing
        if row_id < 0:
            # FAISS reports -1 when fewer than top_n neighbours are available;
            # indexing with it would silently return the last row.
            continue
        row = dataset[row_id]
        recommendations.append(
            {
                "title": row["title"],
                "overview": row["text"]
            }
        )

    return recommendations



def recommend_cosine(query_vector, all_vectors, df_source, top_n=5):
    """Rank the rows of `df_source` by cosine similarity to `query_vector`.

    Args:
        query_vector: embedding of the query.
        all_vectors: one embedding per row of `df_source`, in the same order.
        df_source: DataFrame with "title" and "overview" columns.
        top_n: number of recommendations to return.

    Returns:
        A list of dicts with keys "title", "overview" and "score" (the cosine
        similarity as a float), highest similarity first.
    """
    sims = cosine_similarity([query_vector], all_vectors)[0]
    # Negating the similarities makes argsort return the best matches first.
    ranked_positions = np.argsort(-sims)[:top_n]
    return [
        {
            "title": df_source.iloc[idx]["title"],
            "overview": df_source.iloc[idx]["overview"],
            "score": float(sims[idx]),
        }
        for idx in ranked_positions
    ]


## 5.2) Exemple de requête utilisateur


In [None]:
def get_query_vector_dpr(query):
    """Embed `query` with the DPR encoder and return the first row of its pooler output.

    NOTE(review): the query is encoded with the context encoder (`ctx_encoder`
    / `ctx_tokenizer`). DPR normally pairs a separate question encoder with the
    context encoder. Confirm this choice is intentional.
    """
    encoded = ctx_tokenizer(
        query,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)
    with torch.no_grad():
        pooled = ctx_encoder(**encoded).pooler_output
    vectors = pooled.cpu().numpy()
    return vectors[0]

def get_query_vector_sbert(query):
    """Embed `query` with the Sentence-Transformers model and return its vector."""
    # encode() is given a one-element batch; the single result is unpacked below.
    batch = [query]
    vectors = sbert_model.encode(batch, show_progress_bar=False)
    return vectors[0]


In [None]:
def _print_recommendations(header, recs):
    """Print `header`, then the title and overview of each recommendation, numbered from 1."""
    print(header)
    for i, rec in enumerate(recs, start=1):
        print(f"\n--- Rec {i} ---")
        print("Title   :", rec["title"])
        print("Overview:", rec["overview"])


user_query = "I love horror movies with ghosts"

# 1) DPR: embed the query, search the DPR index, print the hits.
query_vec_dpr = get_query_vector_dpr(user_query)
faiss_recs_dpr = recommend_faiss(query_vec_dpr, dataset_hf, index_name="index_dpr", top_n=3)
_print_recommendations("=== DPR + FAISS Recommendations ===", faiss_recs_dpr)

# 2) SBERT: same steps against the SBERT index.
query_vec_sbert = get_query_vector_sbert(user_query)
faiss_recs_sbert = recommend_faiss(query_vec_sbert, dataset_hf, index_name="index_sbert", top_n=3)
_print_recommendations("\n=== SBERT + FAISS Recommendations ===", faiss_recs_sbert)


=== DPR + FAISS Recommendations ===

--- Rec 1 ---
Title   : Fear Itself
Overview: Fear Itself. girl haunted traumatic event take u mesmerising journey 100 year horror cinema explore filmmaker scare u – let

=== SBERT + FAISS Recommendations ===

--- Rec 1 ---
Title   : Mostly Ghostly 3: One Night in Doom House
Overview: Mostly Ghostly 3: One Night in Doom House. one enchanted jewel stand earth army evil spirit led devious ghoul phears help new girlfriend ghost pal max doyle race find crystal save world


## 5.3) RAG : création du Retriever custom

In [None]:
import os
import torch
from transformers import RagTokenizer, RagTokenForGeneration, RagRetriever

save_path = "my_custom_dataset"
index_path_dpr = os.path.join(save_path, "embeddings")

# The FAISS file is written inside save_path before save_to_disk() runs, so
# make sure the directory exists on a fresh runtime.
os.makedirs(save_path, exist_ok=True)

# Make sure the DPR index exists before saving it (it is dropped further down,
# so it is missing when this cell is re-run).
if not dataset_hf.is_index_initialized("index_dpr"):
    print("🚨 L'index 'index_dpr' n'est pas initialisé. Création en cours...")
    # Inner product is the similarity DPR/RAG retrieval uses.
    dataset_hf = dataset_hf.add_faiss_index(
        column="embeddings",
        index_name="index_dpr",
        metric_type=faiss.METRIC_INNER_PRODUCT,
    )

# 1) Save the FAISS index to its own file
dataset_hf.get_index("index_dpr").save(index_path_dpr)

# 2) Drop ALL FAISS indexes from the dataset
for index_name in dataset_hf.list_indexes():
    dataset_hf.drop_index(index_name)

# 3) Save the dataset itself, now without any index attached
dataset_hf.save_to_disk(save_path)

# Load the RAG tokenizer and generator
model_name = "facebook/rag-token-base"
rag_tokenizer = RagTokenizer.from_pretrained(model_name)
rag_model = RagTokenForGeneration.from_pretrained(model_name)

rag_retriever = RagRetriever.from_pretrained(
    model_name,
    index_name="custom",      # use our own index instead of the wiki index
    passages_path=save_path,  # local dataset directory
    index_path=index_path_dpr
)

device = "cuda" if torch.cuda.is_available() else "cpu"
rag_model.to(device)

print("✅ Tout fonctionne correctement !")


Saving the dataset (0/1 shards):   0%|          | 0/32025 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

(…)_encoder_tokenizer/tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

question_encoder_tokenizer/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)ncoder_tokenizer/special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.


(…)enerator_tokenizer/tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

generator_tokenizer/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

generator_tokenizer/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)erator_tokenizer/special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizerFast'.


pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/rag-token-base were not used when initializing RagTokenForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.bias', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing RagTokenForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagTokenForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called 

✅ Tout fonctionne correctement !


In [None]:
from datasets import load_from_disk
import torch

def recommend_with_rag(user_query, retriever, rag_model, tokenizer, top_n=3):
    """Retrieve `top_n` passages for `user_query` with the RAG question encoder and retriever.

    Args:
        user_query: free-text query.
        retriever: callable retriever; it is called with
            `question_hidden_states`, `question_input_ids` and `n_docs`, and
            its result must expose "doc_ids".
        rag_model: model exposing `question_encoder(input_ids=...)`.
        tokenizer: tokenizer called as `tokenizer(user_query, return_tensors="pt")`.
        top_n: number of documents requested from the retriever.

    Returns:
        A list of dicts with keys "title", "overview" and "poster_path", in the
        order returned by the retriever. Negative document ids are skipped.
    """
    # Follow the model's own device rather than a notebook-level global.
    model_device = getattr(rag_model, "device", None)
    if model_device is None:
        model_device = next(rag_model.parameters()).device

    # 1. Tokenize the user query
    inputs = tokenizer(user_query, return_tensors="pt")
    input_ids = inputs["input_ids"].to(model_device)

    # 2. Encode the query
    with torch.no_grad():
        question_hidden_states = rag_model.question_encoder(input_ids=input_ids)[0]

    question_hidden_states_np = question_hidden_states.cpu().numpy()

    # 3. Retrieve
    docs_dict = retriever(
        question_hidden_states=question_hidden_states_np,
        question_input_ids=input_ids,
        n_docs=top_n
    )

    doc_ids = docs_dict["doc_ids"][0]

    # Prefer the passages the retriever already holds. This avoids re-reading
    # the dataset from disk on every call and does not depend on the current
    # value of the global `save_path`. Fall back to the on-disk copy otherwise.
    passages = getattr(getattr(retriever, "index", None), "dataset", None)
    if passages is None:
        passages = load_from_disk(save_path)

    recs = []
    for doc_id in doc_ids:
        doc_id = int(doc_id)  # numpy integer -> plain int for dataset indexing
        if doc_id < 0:
            # FAISS reports -1 for missing neighbours; it must not be used as a row index.
            continue
        row = passages[doc_id]  # read the row once
        recs.append(
            {
                "title": row["title"],
                "overview": row["overview"],
                "poster_path": row["poster_path"]
            }
        )

    return recs

# RAG smoke test on a sample query: request 10 documents and print them.
rag_recs = recommend_with_rag(
    "I love horror movies with ghosts",
    rag_retriever,
    rag_model,
    rag_tokenizer,
    top_n=10,
)
print("\n=== RAG Recommendations ===")
for i, r in enumerate(rag_recs, start=1):
    print(f"\n--- {i} ---")
    print("Title   :", r["title"])
    print("Overview:", r["overview"])
    print("Chemin image: ", r["poster_path"])


=== RAG Recommendations ===

--- 1 ---
Title   : Screamers
Overview: Internationally known director Carla Garapedian follows the rock band System of a Down as they tour Europe and the US pointing out the horrors of modern genocide that began in Armenia in 1915 up though Darfur today.
Chemin image:  /2fmJfW52CjobQZvaEzpNhCVP8xI.jpg

--- 2 ---
Title   : The Death Kiss
Overview: The Death Kiss re-unites three of the stars of Dracula in a Hollywood movie-set murder mystery.
Chemin image:  /wKblRIOCaKsTskFGZBpqgKMAlcJ.jpg

--- 3 ---
Title   : Conception
Overview: Conception is a clever, romantic comedy that proves it takes more than sex to make a baby. From a couple fighting the odds of fertility to young teenagers losing their virginity, the film follows nine very different couples on the night they conceive, showing that sex can sometimes be more neurotic than erotic. The hilarious ensemble cast includes Emmy-winner Julie Bowen and Sarah Hyland (“Modern Family”), Connie Britton (“Friday 

In [None]:
import os
from datasets import load_from_disk

# NOTE(review): earlier in this notebook the dataset is saved under
# "my_custom_dataset"; this cell reads "data/my_custom_dataset". Confirm the
# directory has been moved or copied there.
save_path = "data/my_custom_dataset"

# Fail with an exception when the dataset directory is missing. Calling exit()
# from a notebook cell shuts the kernel down, which throws away all state.
if not os.path.exists(save_path):
    raise FileNotFoundError(
        f"🚨 Le dataset '{save_path}' n'existe pas ! Vérifiez son emplacement."
    )

# Load the Hugging Face dataset from disk
dataset_hf = load_from_disk(save_path)

print(f"✅ Dataset chargé avec {len(dataset_hf)} films.")


✅ Dataset chargé avec 32025 films.
