In [1]:
# Mount Google Drive so that files under /content/drive (the film table and
# poster images used further down) are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install gradio
!pip install fuzzywuzzy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gradio
  Downloading gradio-3.9.1-py3-none-any.whl (11.6 MB)
[K     |████████████████████████████████| 11.6 MB 7.5 MB/s 
Collecting python-multipart
  Downloading python-multipart-0.0.5.tar.gz (32 kB)
Collecting websockets>=10.0
  Downloading websockets-10.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 48.9 MB/s 
[?25hCollecting fastapi
  Downloading fastapi-0.86.0-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 4.3 MB/s 
Collecting pycryptodome
  Downloading pycryptodome-3.15.0-cp35-abi3-manylinux2010_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 59.3 MB/s 
[?25hCollecting markdown-it-py[linkify,plugins]
  Downloading markdown_it_py-2.1.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.7 

In [3]:
import random
import gradio as gr
import re
import argparse
import pandas as pd
from fuzzywuzzy import fuzz

from typing import List



In [4]:
def regex_for_query_without_quotes(name, suffix = "ы"):
    """Build the regex source for an unquoted ``<name>:<value>`` query field.

    Args:
        name: Field keyword as it appears in the query (e.g. ``"жанр"``).
            It is inserted into the pattern verbatim, without escaping.
        suffix: Ending allowed after ``name``. When non-empty, ``?`` is
            appended to it, so the single-character default ``"ы"`` becomes
            optional. Pass ``""`` to accept ``name`` only.

    Returns:
        Pattern string with one capture group holding the value: a run of
        word characters, commas and dots (no spaces).
    """
    if suffix != "":
        suffix += "?"
    # Raw f-string: ``\w`` has to reach the regex engine as-is. In a plain
    # string literal it is an invalid escape sequence and triggers a warning.
    return rf"{name}{suffix}:([\w,.]+)"


def regex_for_query_with_quotes(name, suffix = "ы"):
    """Build the regex source for a single-quoted ``<name>:'<value>'`` query field.

    Args:
        name: Field keyword as it appears in the query (e.g. ``"режиссер"``).
            It is inserted into the pattern verbatim, without escaping.
        suffix: Ending allowed after ``name``. When non-empty, ``?`` is
            appended to it, so the single-character default ``"ы"`` becomes
            optional. Pass ``""`` to accept ``name`` only.

    Returns:
        Pattern string with one capture group holding the text between the
        single quotes: word characters, commas, dots and spaces.
    """
    if suffix != "":
        suffix += "?"
    # Raw f-string: ``\w`` has to reach the regex engine as-is. In a plain
    # string literal it is an invalid escape sequence and triggers a warning.
    return rf"{name}{suffix}:'([\w,. ]+)'"

In [5]:
class Handler:
    """Base class for a single query-driven filtering step.

    A handler remembers which DataFrame column it works on and the type that
    values taken from the query should be converted to. Subclasses implement
    ``__call__``.
    """

    def __init__(self, column_name, value_type=str):
        """Store the target column name and the value conversion callable."""
        self.column_name, self.value_type = column_name, value_type

    def __call__(self, query: str, df: pd.DataFrame):
        """Apply the handler to ``df`` for ``query``; must be overridden."""
        raise NotImplementedError()


class ComplexHandler(Handler):
    """Filter a DataFrame by one ``<field>:<v1>[,<v2>...]`` clause of the query.

    The clause may be written unquoted (``жанр:драма,комедия``) or in single
    quotes (``жанр:'научная фантастика'``). Every comma-separated value must
    satisfy ``pred`` for a row to be kept.
    """

    def __init__(self, field_name, column_name, pred, cast_to_type=str, **kwargs):
        """
        Args:
            field_name: Keyword that introduces the clause in the query.
            column_name: DataFrame column the predicate is evaluated on.
            pred: Callable ``(column, value)`` returning a boolean mask that
                is AND-ed into the row selection.
            cast_to_type: Callable converting the raw query text to the value
                passed to ``pred``.
            **kwargs: Forwarded to the regex builders (e.g. ``suffix``).
        """
        super().__init__(column_name, cast_to_type)
        self.regex = re.compile(regex_for_query_without_quotes(field_name, **kwargs))
        self.regex_with_quotes = re.compile(regex_for_query_with_quotes(field_name, **kwargs))
        self.pred = pred


    def __call__(self, query: str, df: pd.DataFrame):
        """Return the rows of ``df`` matching this field's clause in ``query``.

        ``df`` is returned unchanged when the query has no clause for this
        field, or when a value cannot be converted / compared.
        """
        # The unquoted form wins when both are present; only the first
        # occurrence of the clause is used.
        matches = self.regex.findall(query) or self.regex_with_quotes.findall(query)
        if not matches:
            return df

        column = df[self.column_name]
        mask = pd.Series(True, index=column.index, dtype=bool)
        try:
            for raw_value in matches[0].split(','):
                raw_value = raw_value.strip()
                if not raw_value:
                    # A trailing or doubled comma yields an empty token;
                    # ignore it instead of discarding the whole filter.
                    continue
                mask &= self.pred(column, self.value_type(raw_value))
        except Exception:
            # Best effort: an unusable value disables this filter only.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            return df
        return df[mask]

In [6]:
class Pipeline:
    """Run a fixed sequence of handlers over a DataFrame, one after another."""

    def __init__(self, handlers: List[Handler]):
        """Keep the handlers in the order they should be applied."""
        self.handlers = handlers


    def __call__(self, query, df):
        """Feed ``df`` through every handler and return the final result.

        Each handler receives the same ``query`` and the output of the
        previous handler.
        """
        current = df
        for step in self.handlers:
            current = step(query, current)
        return current


def get_films(query: str, films: List[str], k: int):
    """Return up to ``k`` film titles matching the structured ``query``.

    The module-level ``df`` table is narrowed by the rating, year, director,
    genre, actor and country clauses found in ``query``, ordered by ``Rating``
    (highest first) and cut to ``k`` titles. If fewer than ``k`` rows remain,
    the rest is filled from ``get_similar``.

    Args:
        query: Raw user query, e.g. ``"жанр:приключения страна:США"``.
        films: List of already shown titles; the returned titles are appended
            to it in place.
        k: Number of titles to return.

    Returns:
        List of titles (filtered results first, then the fallback ones).
    """
    def contains(series, text):
        # Literal substring match; rows with a missing value never match
        # instead of producing NaN in the mask.
        return series.str.contains(text, na=False, regex=False)

    pipeline = Pipeline([
        ComplexHandler("рейтинг", "Rating", lambda s, r: s >= r, float),
        ComplexHandler("год", "Year", lambda s, r: s == r, int),
        ComplexHandler("режиссер", "Director", contains),
        ComplexHandler("жанр", "Genres", contains),
        ComplexHandler("актер", "Actors", contains),
        ComplexHandler("страна", "Countries", contains),
    ])
    matched = pipeline(query, df).sort_values('Rating', ascending=False)
    res = matched.iloc[:k].Title.values.tolist()
    if len(res) < k:
        # second scenario. get_similar appends to the list it is given, so it
        # gets a scratch list here; otherwise the fallback titles would end up
        # in ``films`` twice after the extend below.
        res.extend(get_similar(query, [], k - len(res)))
    films.extend(res)
    return res

In [7]:
# def get_films(query: str, films: List[str], k: int):
#     final = pd.DataFrame([])
#     for i, k in zip(df['all'], df['Title']):
#         final = final.append(pd.DataFrame([query, i, fuzz.token_set_ratio(query, i), k]).T)
#     final.columns = ['Query', 'Data', 'Similarity', 'Title']
#     final = final.sort_values('Similarity', ascending=False)
#     res = final.loc[:k].Title.values.tolist()
#     films.extend(res)
#     return res

In [8]:
# def get_films(query: str, films: List[str], k: int):
#     res = [f'film_{i+1}' for i in range(k)]
#     films.extend(res)
#     return res

def get_similar(query: str, films: List[str], k: int):
    """Stub for the free-text scenario: produce ``k`` placeholder titles.

    ``query`` is not used. The generated titles are appended to ``films`` in
    place and also returned.
    """
    placeholders = []
    for position in range(1, k + 1):
        placeholders.append(f'film_{position}_second_case')
    films.extend(placeholders)
    return placeholders

In [9]:
# Film table queried by get_films, indexed by FilmId. get_films reads the
# columns Title, Year, Director, Genres, Actors, Countries and Rating from it.
df = pd.read_csv('/content/drive/MyDrive/text2rec/top250.csv', index_col='FilmId')
# df = df[['Title', 'Director', 'Genres', 'Actors', 'ShortDescription', 'Year']]
# df['Actors'] = df['Actors'].apply(lambda x: ', '.join(str(i) for i in x.split(', ')[0:5]))
# df['Year'] = df['Year'].apply(lambda x: str(x))
# df['ShortDescription'].fillna(' ', inplace=True)
# df['ShortDescription'] = df['ShortDescription'].apply(lambda x: str(x))
# df['all'] = df['ShortDescription'] + ' ' + df['Director'] + ', ' + df['Actors'] + ', ' + df['Year'] + ', ' + df[
#     'Genres']
# df['all'] = df['all'].apply(lambda x: x.lower())

In [10]:
def get_recs(query: str, img_paths: List[str], k: int = 10) -> List[str]:
    """Return ``k`` recommended titles for ``query``.

    ``img_paths`` is handed to the selected search function, which appends
    the returned titles to it.
    """
    # The scenario is fixed to the structured search; the
    # random.randint(0, 1) alternative is switched off.
    is_query_first_scenario = 1
    if not is_query_first_scenario:
        return get_similar(query, img_paths, k)
    return get_films(query, img_paths, k)

def get_imgs(films, k=10, img_dir='/content/drive/MyDrive/text2rec'):
    """Map the last ``k`` titles in ``films`` to poster image paths.

    Args:
        films: List of film titles; only its tail is used.
        k: How many of the most recent titles to convert (default 10).
        img_dir: Directory holding ``<title>.jpg`` files.

    Returns:
        List of ``<img_dir>/<title>.jpg`` paths, at most ``k`` long. The
        files are not checked for existence.
    """
    if k <= 0:
        # films[-0:] would select the whole list rather than nothing.
        return []
    return [f'{img_dir}/{name}.jpg' for name in films[-k:]]

In [11]:
# Markdown shown at the top of the demo page (user-facing text, in Russian).
# The button name quoted in the last sentence matches the label of the submit
# button in the UI ("Отправить оценки").
description = (
    '# Проект Text2Rec\n '
    'Сервис предоставляет возможность поиска фильмов по произвольному запросу. \n '
    'Поддерживается 2 сценария работы: \n '
    '1) Поиск по ключевым параметрам: год, режиссер, жанр, актер, страна, рейтинг. \n '
    'Пример запроса: "жанр:приключения страна:США" \n '
    '2) Поиск по произвольному запросу. \n '
    'Пример запроса: "Фильмы про путешествия во времени" \n\n '
    'Пожалуйста, оцените релевантность выдачи(Нравится/Не нравится), '
    'это поможет улучшить работу алгоритма. Не забудьте нажать кнопку "Отправить оценки"'
)

In [12]:
callback = gr.CSVLogger()


def _film_card():
    """Create one result card (poster, title, like/dislike choice) in its own column."""
    with gr.Column():
        img = gr.Image(show_label=False)
        name = gr.Text(show_label=False)
        like = gr.Radio(show_label=False, choices=['Нравится', 'Не нравится'])
    return img, name, like


with gr.Blocks() as demo:
    # Per-session list of titles; get_recs appends to it, get_imgs reads it.
    films = gr.State([])
    gr.Markdown(description)
    query = gr.Textbox(label="Запрос")
    search_btn = gr.Button("Поиск")

    # Ten cards laid out as 4 + 4 + 2; the last row is centred by an empty
    # column on each side.
    cards = []
    for _ in range(2):
        with gr.Row():
            for _ in range(4):
                cards.append(_film_card())
    with gr.Row():
        with gr.Column():
            pass
        for _ in range(2):
            cards.append(_film_card())
        with gr.Column():
            pass

    send_btn = gr.Button("Отправить оценки")
    note = gr.Markdown('Оценки отправлены. Спасибо!', visible=False)

    img_list = [img for img, _, _ in cards]
    name_list = [name for _, name, _ in cards]
    like_list = [like for _, _, like in cards]

    # Two handlers on the same click: the first fills the title boxes and
    # appends the titles to `films`, the second turns the tail of `films` into
    # poster paths.
    # NOTE(review): the poster handler depends on the title handler having
    # already updated `films` - confirm Gradio's ordering for this setup.
    search_btn.click(fn=get_recs, inputs=[query, films], outputs=name_list)
    search_btn.click(fn=get_imgs, inputs=films, outputs=img_list)

    # Feedback logging: query, the ten titles and the ten ratings are stored
    # through the CSV logger under "collected_data".
    callback.setup([query, *name_list, *like_list], "collected_data")
    send_btn.click(lambda *args: callback.flag(args), [query, *name_list, *like_list], None, preprocess=False)
    send_btn.click(lambda: gr.update(visible=True), inputs=None, outputs=note)

In [13]:
# demo.queue()
# Start the app. With debug=True the cell keeps running so errors and logs
# stay visible (see the Colab notice in the output); share=True requests a
# public URL in addition to the local one.
demo.launch(debug=True, share=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://50a699eb6e97f8d8.gradio.app

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


Keyboard interruption in main thread... closing server.


(<gradio.routes.App at 0x7fe1fa93a290>,
 'http://127.0.0.1:7860/',
 'https://50a699eb6e97f8d8.gradio.app')

In [None]:
# Shut down the server started by demo.launch().
demo.close()

Closing server running on port: 7860


In [14]:
# Load the feedback collected through the "Отправить оценки" button: one row
# per submission with the query, the ten titles and the ten ratings.
# NOTE(review): assumes the logger's "collected_data" directory was created
# under /content (the working directory) - confirm.
data = pd.read_csv('/content/collected_data/log.csv')
data

Unnamed: 0,Запрос,component 1,component 2,component 3,component 4,component 5,component 6,component 7,component 8,component 9,...,component 14,component 15,component 16,component 17,component 18,component 19,component 20,flag,username,timestamp
0,жанр:приключения страна:США,Король Лев,Властелин колец: Возвращение короля,Тайна Коко,Клаус,Властелин колец: Братство Кольца,Интерстеллар,Назад в будущее,Властелин колец: Две крепости,Гладиатор,...,Нравится,,Нравится,Нравится,,,,,,2022-11-11 11:17:13.552920
