In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import tensorflow as tf


### Поиск по описанию ver.1 (Word2Vec)

In [15]:
# Load the movie dataset from the CSV file stored on the mounted Google Drive
data = pd.read_csv('/content/drive/MyDrive/NLP/df_and_eda.csv')

In [16]:
# Preview the first rows of the dataset
data.head()

Unnamed: 0,id,title,genre,description,year
0,1,Oscar et la dame rose,drama,Listening in to a conversation between his doc...,2009.0
1,2,Cupid,thriller,A brother and sister with a past incestuous re...,1997.0
2,3,"Young, Wild and Wonderful",adult,As the bus empties the students for their fiel...,1980.0
3,4,The Secret Sin,drama,To help their unemployed father make ends meet...,1915.0
4,5,The Unrecovered,drama,The film's title refers not only to the un-rec...,2007.0


In [17]:
# Build a phrase embedding: lower-case the text, tokenize it, and average the word vectors

def get_phrase_embedding(phrase):
    """Embed a phrase as the mean of the vectors of its known tokens.

    The phrase is lower-cased and split with the module-level ``tokenizer``;
    tokens for which the module-level ``model`` has no vector are skipped.

    Args:
        phrase: Text to embed (must support ``.lower()``).

    Returns:
        The element-wise mean of the known token vectors, or a float32 zero
        vector of length ``model.vector_size`` when no token is known.
    """
    known_vectors = []
    for token in tokenizer.tokenize(phrase.lower()):
        if model.has_index_for(token):
            known_vectors.append(model[token])

    # Nothing recognisable in the phrase: fall back to the all-zeros vector
    if not known_vectors:
        return np.zeros([model.vector_size], dtype='float32')

    return np.mean(known_vectors, axis=0)


In [18]:
# Convert the description column into a plain Python list
data_list = data['description'].tolist()

In [20]:
# Example row from the dataset
data.iloc[20]

id                                                            21
title                                          O Signo das Tetas
genre                                                      drama
description    The Road of Milk narrates in existential drama...
year                                                      2016.0
Name: 20, dtype: object

In [21]:
# The same example taken from our list of movie descriptions
data_list[20]

"The Road of Milk narrates in existential drama tones the story of a man searching for his lost time, traveling across the roads of Brazil's countryside, making their way back to their homeland. In this journey, he will find his way to sign his past that will remake your life and show you the path of salvation. He is on the threshold between reason and madness, between salvation and death."

In [22]:
# Load the tokenizer and tokenize every lower-cased description
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
data_tok = [tokenizer.tokenize(x.lower()) for x in data_list]

In [23]:
# Use Gensim to train a Word2Vec model on our pre-tokenized data (data_tok).
# Note: `model` is bound to the `.wv` attribute of the trained object, not to the Word2Vec object itself.
from gensim.models import Word2Vec
model = Word2Vec(data_tok,
                 vector_size=64,      # Dimensionality of each word's vector representation
                 min_count=4,          # Minimum number of occurrences in the corpus for a word to be used in training
                 window=5).wv          # Context window size: number of words before and after the target word

In [24]:
# Look at the embedding vector of our example description
get_phrase_embedding('A gang of unemployed itinerant musicians play in the south of Stockholm. Then they get the chance to be an orchestra in a dance restaurant. It goes well until the female owner falls in love with one of them.')

array([ 0.37278265, -0.03240917, -0.46934313, -0.01633952, -0.36985892,
       -0.03988634,  0.15698676, -0.0838614 , -0.7195939 ,  0.7439665 ,
        0.29599938, -0.06292702,  0.44888654,  0.35149866, -0.98146594,
       -0.21769467,  0.93917847, -0.26374784,  0.4389417 ,  0.53723365,
        0.21003146,  1.0184541 , -0.6702491 , -0.48209053,  0.3309791 ,
        0.04097001, -0.63941616,  0.47990584,  0.03908385,  0.17412181,
       -1.0949821 , -0.06395699,  0.27325326, -0.08609416,  0.42452615,
        0.07384948, -0.12433602,  0.19042942,  0.10908147,  0.39378175,
        0.25949788, -0.08786989,  0.64624166, -0.01478484, -0.54891074,
        0.03988195, -0.8212854 ,  0.17736566,  1.1489497 , -0.41730848,
       -0.44326034,  1.3535517 ,  0.28276035, -0.32618997, -0.09545463,
        0.07847472,  0.16157678,  0.0895718 ,  1.3794484 ,  0.02431139,
       -0.42274734, -0.2825344 ,  1.0429529 , -0.36510265], dtype=float32)

In [25]:
# Put the vector representations of all descriptions into a numpy array
# NOTE(review): `data_vectors` (computed in a separate cell) appears to hold the same embeddings — confirm whether both are needed
chosen_phrases = data_list

phrase_vectors = np.asarray([get_phrase_embedding(x) for x in chosen_phrases])

In [13]:
# Compute the embeddings: one vector per description, stacked row-wise into a single array
data_vectors = np.vstack([get_phrase_embedding(l) for l in data_list])

In [27]:
# Pre-compute the L2 norm of every description embedding (one value per row);
# find_nearest uses these as the data-side denominator of the cosine similarity.
norms = np.linalg.norm(data_vectors, axis=1)
printable_set = set(string.printable)
# Keep only the descriptions made up entirely of printable ASCII characters.
# Iterate over the description strings (data_list): iterating over the DataFrame
# `data` itself yields its column labels, not the descriptions.
data_subset = [x for x in data_list if set(x).issubset(printable_set)]

In [30]:
#Функция для поиска ближайших соседей
def find_nearest(query, data, k=10):
    """Return the ``k`` entries of ``data`` most similar to ``query``.

    Similarity is the cosine between the query embedding (from
    ``get_phrase_embedding``) and the rows of the module-level
    ``data_vectors``, whose norms are read from the module-level ``norms``.
    ``data`` is indexed with row positions of ``data_vectors``, so it must be
    aligned with that array element-for-element.

    Args:
        query: Free-text search phrase.
        data: Indexable collection aligned with ``data_vectors``.
        k: Maximum number of results; values <= 0 give an empty list.

    Returns:
        A list of ``(index, data[index])`` tuples, most similar first.
    """
    # `[-k:]` with k == 0 would select *every* element, so handle it explicitly
    if k <= 0:
        return []

    eps = 1e-16
    query_vector = get_phrase_embedding(query)
    # Guard both norms: a query without any known token embeds to the zero
    # vector, and dividing by its norm would turn every similarity into NaN.
    query_norm = np.linalg.norm(query_vector) + eps
    similarities = data_vectors.dot(query_vector[:, None])[:, 0] / ((norms + eps) * query_norm)

    # argsort is ascending: take the last k positions and reverse them
    top_indices = similarities.argsort(axis=0)[-k:][::-1].tolist()

    return [(index, data[index]) for index in top_indices]

In [33]:
# Check on our example from the dataset
results = find_nearest(query='A gang of unemployed itinerant musicians play in the south of Stockholm. Then they get the chance to be an orchestra in a dance restaurant.', data=data_list, k=5)
for result in results:
    print(result)
# It found the film whose description we partly reused as the query, which is already encouraging.

(6960, 'George Gribble (George Formby) works for the local council as an odd job man in the small, industrial town of Tangleton. When two newspaper reporters arrive in town to research an article on town planning George seems to be the only person at work. He gives them a guided tour of the town, only for them to write a highly critical article emphasizing the fact that the council leader lives alone in a huge mansion while George shares a house with 13 other people. The council orders Gribble to conduct an opinion poll, but he ends up surveying the entire population of the town - who want change to improve their lot to the detriment of the town leaders business interests.')
(25062, 'Oslo, April 19th 1945, as the Third Reich is living its last days, a group of Nazis and sympathizers (a Wehrmacht general; an SS commander and his "assistant"; an Italian industrialist and his wife who is also the general\'s lover; a French collaborator) board a submarine that will take them to South Ameri

In [35]:
# Check on a made-up query
results = find_nearest(query="zombies eating hemans brain and destroy the city", data=data_list, k=5)
for result in results:
    index, _ = result  # Unpack the index (the description text is not needed here)
    movie_title = data.iloc[index]['title']  # Look up the movie title in the DataFrame `data`
    print("Index:", index, "Movie Title:", movie_title)

Index: 24050 Movie Title: Blue Gold: World Water Wars
Index: 51115 Movie Title: The Banner Saga 2
Index: 2304 Movie Title: Jet Black
Index: 41179 Movie Title: Lunatic Messiah
Index: 28730 Movie Title: Interstellar Wars


In [39]:
#Сделаем так, чтобы ещё показывало косинусное сходство

#Superheroes trying to save the world from aliens
#Romantic comedy about divorced people
#An old retired detective takes up arms again to avenge his son and dog cursed by witches.

def find_nearest(query, data, k=10):
    """Return the ``k`` entries of ``data`` most similar to ``query``, with scores.

    Similarity is the cosine between the query embedding (from
    ``get_phrase_embedding``) and the rows of the module-level
    ``data_vectors``, whose norms are read from the module-level ``norms``.
    ``data`` is indexed with row positions of ``data_vectors``, so it must be
    aligned with that array element-for-element.

    Args:
        query: Free-text search phrase.
        data: Indexable collection aligned with ``data_vectors``.
        k: Maximum number of results; values <= 0 give an empty list.

    Returns:
        A list of ``(index, data[index], cosine_similarity)`` tuples, most
        similar first.
    """
    # `[-k:]` with k == 0 would select *every* element, so handle it explicitly
    if k <= 0:
        return []

    eps = 1e-16
    query_vector = get_phrase_embedding(query)
    # Guard both norms: a query without any known token embeds to the zero
    # vector, and dividing by its norm would turn every similarity into NaN.
    query_norm = np.linalg.norm(query_vector) + eps
    similarities = data_vectors.dot(query_vector[:, None])[:, 0] / ((norms + eps) * query_norm)

    # argsort is ascending: take the last k positions and reverse them
    top_indices = similarities.argsort(axis=0)[-k:][::-1].tolist()

    # Attach the cosine similarity to every hit
    return [(index, data[index], similarities[index]) for index in top_indices]

# Our query
results = find_nearest(query="zombies eating hemans brain and destroy the city", data=data_list, k=5)
for result in results:
    index, _, similarity = result  # Unpack the index and the cosine similarity
    movie_title = data.iloc[index]['title']  # Look up the movie title in the DataFrame `data`
    print(index, movie_title, similarity)


24050 Blue Gold: World Water Wars 0.74280477
51115 The Banner Saga 2 0.73801404
2304 Jet Black 0.73217654
41179 Lunatic Messiah 0.73175913
28730 Interstellar Wars 0.7314051


Если судить по косинусному сходству, то результаты неплохие.
Однако, если посмотреть на сами фильмы, которые выдаёт наш код, видно, что к "зомби" они не имеют никакого отношения. Это связано с тем, что усреднённые векторы Word2Vec **не учитывают ни порядок слов, ни контекст**: у каждого слова один статический вектор, смысл фразы размывается при усреднении, а слова, которых нет в словаре модели, просто отбрасываются. Данная проблема решается в следующем ноутбуке, где применена модель **distilBERT**.