In [1]:
import csv, time, torch, os
from sentence_transformers import SentenceTransformer
# Checkpoint name handed to SentenceTransformer; kept in one constant so it is easy to swap.
MODEL_NAME = "all-MiniLM-L6-v2"
# Shared embedding model: the cells below call model.encode and model.similarity on it.
model = SentenceTransformer(MODEL_NAME)

  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [14]:
#__________DATA EXTRACTION__________
# Input files, given as relative paths (resolved against the current working directory).
# questions_file is read by extract_questions as ';'-delimited rows of `question;number`.
questions_file = "../../Data/testSet.csv"
# video_file is read by extract_videos as ';'-delimited rows of `title;tags;link;number`.
video_file = "../../Data/videos-question-form.csv"
def extract_questions(file_path):
    """Read the question test set and map each question to its expected number.

    The file is parsed as ';'-delimited CSV. Only rows with exactly two
    fields (``question;number``) are used; every other row (blank lines,
    rows with missing or extra fields) is skipped.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        dict mapping the whitespace-stripped question text to its number
        as an int. If a question appears more than once, the last row wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the second field of a two-field row is not an
            integer (for example a header row).
    """
    questions_dict = {}

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark; with plain "utf-8" the mark stays glued to the first question
    # (str.strip() does not remove it). newline='' lets the csv module handle
    # line endings itself, as its documentation requires.
    with open(file_path, mode='r', encoding='utf-8-sig', newline='') as file:
        reader = csv.reader(file, delimiter=";")
        for row in reader:
            if len(row) == 2:
                question, number = row[0].strip(), int(row[1].strip())
                questions_dict[question] = number

    return questions_dict
def extract_videos(file_path):
    """Read the video list and map each video title to its number.

    The file is parsed as ';'-delimited CSV whose rows are read as
    ``title;tags;link;number``. Only the first and fourth fields are kept;
    tags and link are ignored. Blank rows (no fields, or only empty or
    whitespace fields) are skipped, matching how extract_questions
    tolerates blank lines.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        dict mapping the whitespace-stripped title to its number as an int,
        in file order. If a title appears more than once, the last row wins.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank row has fewer than four fields.
        ValueError: If the fourth field is not an integer (for example a
            header row).
    """
    videos_dict = {}

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark; with plain "utf-8" the mark stays glued to the first title
    # (str.strip() does not remove it). newline='' lets the csv module handle
    # line endings itself, as its documentation requires.
    with open(file_path, mode='r', encoding='utf-8-sig', newline='') as file:
        reader = csv.reader(file, delimiter=";")
        for row in reader:
            # csv.reader yields [] for an empty line; a trailing blank line
            # must not abort the whole load.
            if not any(field.strip() for field in row):
                continue
            title, number = row[0].strip(), int(row[3].strip())
            videos_dict[title] = number

    return videos_dict

# Load both CSV files into the lookups used by the cells below:
# questions maps question text -> number, videos maps video title -> number.
questions = extract_questions(file_path=questions_file)
videos = extract_videos(file_path=video_file)

In [15]:
#__________SIMILARITY__________
# Encode every video title in a single model.encode call and report the
# total elapsed time plus the mean time per title.
start_time = time.perf_counter()
video_titles = list(videos.keys())
videos_embedding = model.encode(video_titles, convert_to_tensor=True)
end_time = time.perf_counter()
run_time = end_time - start_time
average_time = run_time / len(video_titles)
print("\nTemps de vectorisation des titres des vidéos: " + str(run_time)
      + "\nTemps moyen: " + str(average_time))


Temps de vectorisation des titres des vidéos: 0.38862749999998414
Temps moyen: 0.00826867021276562


In [16]:
#__________MAPPING__________
# For each question: encode it, score it against every video-title embedding,
# and keep the titles of the best-scoring videos, best first.
# videos_embedding was built from list(videos.keys()), so positions in this
# list line up with the similarity scores. Built once instead of per index.
video_titles = list(videos.keys())
# torch.topk raises when k exceeds the number of candidates, so cap it.
top_k = min(5, len(video_titles))
mapping = {}
start_time = time.perf_counter()
for query in questions:
    query_embedding = model.encode(query, convert_to_tensor=True)
    similarity_scores = model.similarity(query_embedding, videos_embedding)[0]
    # Retrieve the top_k most relevant videos (the score values are not needed)
    _, indices = torch.topk(similarity_scores, k=top_k)
    mapping[query] = [video_titles[idx] for idx in indices.tolist()]
end_time = time.perf_counter()
run_time = end_time - start_time
# Guard the mean so an empty question set does not raise ZeroDivisionError.
average_time = run_time / len(questions) if questions else 0.0
print("\nTemps de vectorisation des questions et de calcul de correspondance: " + str(run_time)
      + "\nTemps moyen: " + str(average_time))


Temps de vectorisation des questions et de calcul de correspondance: 10.396903800000018
Temps moyen: 0.0148104042735043


In [17]:
#__________MRR COMPUTATION__________
# Each question contributes 1/rank, where rank is the 1-based position of the
# first recommended video whose number equals the question's expected number;
# a question with no matching recommendation contributes nothing.
total = 0.0

for question, recommended_videos in mapping.items():
    expected_number = questions[question]
    correct_video = 0  # 0 means "no recommended video matched"
    for rank, video in enumerate(recommended_videos, start=1):
        if videos[video] == expected_number:
            correct_video = rank
            break
    if correct_video:
        total += 1 / correct_video

# Average over all questions, including those without a match.
score = total / len(questions)
print("\nMean Reciprocal Rank (MRR): {:.4f}".format(score))


Mean Reciprocal Rank (MRR): 0.7884
