In [1]:
import string
from typing import Callable, List, Optional

import dill
import pandas as pd
import torch
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer

In [2]:
# Module-level Snowball stemmer configured for Russian.
snowball_stemmer = SnowballStemmer(language="russian")


def word_tokenizer(x):
    """Tokenize ``x`` with NLTK's ``word_tokenize`` using the Russian language setting."""
    return word_tokenize(x, language="russian")

In [3]:
class Handler:
    """Base class for per-token handlers.

    An instance is called with a single token. This base implementation
    does nothing with the token and returns ``None``.
    """

    def __init__(self):
        """Create the handler; the base class keeps no state."""

    def __call__(self, token: str):
        """Ignore ``token`` and return ``None``."""
        return None

class StopWords(Handler):
    """Handler that filters out tokens contained in a stop-word collection."""

    def __init__(self, stopwords):
        """Keep ``stopwords`` as a set so membership checks are cheap."""
        super().__init__()
        self.stopwords = set(stopwords)

    def __call__(self, token: str):
        """Return ``None`` when ``token`` is a stop word, else the token unchanged."""
        if token in self.stopwords:
            return None
        return token

class Lowercase(Handler):
    """Handler that converts each token to lower case."""

    def __init__(self):
        """Create the handler; no configuration is needed."""
        super().__init__()

    def __call__(self, token: str):
        """Return the lower-cased form of ``token``."""
        lowered = token.lower()
        return lowered

class SnowballStemmerWrapper(Handler):
    """Handler that stems each token with a Russian ``SnowballStemmer``."""

    def __init__(self):
        """Create the handler together with its own stemmer instance."""
        super().__init__()
        self.stemmer = SnowballStemmer(language="russian")

    def __call__(self, token: str):
        """Return the stem of ``token`` as produced by ``self.stemmer``."""
        stem = self.stemmer.stem(token)
        return stem

class RemovePunctualion(Handler):
    """Handler that deletes punctuation characters from a token.

    The class name keeps its existing spelling because it is the public
    identifier used to reference this handler.
    """

    def __init__(self, additional_punkt=""):
        """Build the deletion table.

        Args:
            additional_punkt: extra characters to delete in addition to
                ``string.punctuation``.
        """
        super().__init__()
        chars_to_delete = string.punctuation + additional_punkt
        # Third argument of str.maketrans lists the characters to delete.
        self.punct_trans = str.maketrans("", "", chars_to_delete)

    def __call__(self, token: str):
        """Return ``token`` without punctuation, or ``None`` if nothing is left."""
        stripped = token.translate(self.punct_trans)
        if stripped == "":
            return None
        return stripped

class TextPipeline:
    """Tokenize a text, pass every token through a chain of handlers, re-join.

    Each handler is called with a token and returns either a (possibly
    transformed) token, which is handed to the next handler, or ``None``,
    which drops the token. Surviving tokens are joined with a single space.
    """

    def __init__(
        self,
        handlers: Optional[List[Callable[[str], Optional[str]]]] = None,
        tokenizer: Callable[[str], List[str]] = str.split,
    ):
        """Configure the pipeline.

        Args:
            handlers: handlers applied to each token, in order. When omitted,
                the pipeline gets its own new empty list.
            tokenizer: callable that splits a text into tokens; defaults to
                ``str.split``.
        """
        self.tokenizer = tokenizer
        # A default of `list()` in the signature is evaluated once, so every
        # pipeline built without `handlers` would share (and mutate) the same
        # list. Use a None sentinel and create the list per instance instead.
        self.handlers = handlers if handlers is not None else []

    def __call__(self, text: str) -> str:
        """Return ``text`` after tokenizing, handling and re-joining its tokens."""
        result = []
        for token in self.tokenizer(text):
            for handler in self.handlers:
                token = handler(token)
                if token is None:
                    # A handler rejected the token: skip the remaining handlers.
                    break
            else:
                # Reached only when the handler loop finished without `break`,
                # i.e. no handler dropped the token.
                result.append(token)
        return " ".join(result)

In [62]:
def foo(x, id2_to_thing):
    """Look up ``x`` in the mapping ``id2_to_thing``.

    Returns the stored value when ``x`` is a key, otherwise ``None``.
    """
    return id2_to_thing.get(x)

def get_payload(
    metadata: pd.DataFrame,
    reviews: pd.DataFrame,
    tp: Optional[Callable[[str], str]] = None,
):
    """Build a per-review table enriched with film metadata.

    Args:
        metadata: frame with the columns ``kinopoisk_id``, ``title_ru`` and
            ``types``.
        reviews: frame with the columns ``film_id``, ``review_id`` and
            ``description``.
        tp: callable applied to every review text. When omitted, a new
            ``TextPipeline()`` is created for this call.

    Returns:
        A frame indexed like ``reviews`` with the columns ``film_id``,
        ``film_title``, ``film_type``, ``review_id`` and ``review``. Films
        missing from ``metadata`` get a missing title and type.
    """
    if tp is None:
        # Created per call: a default instance in the signature would be built
        # once when the function is defined, shared by every call, and left
        # stale if the TextPipeline definition is re-run afterwards.
        tp = TextPipeline()

    id2type = pd.Series(metadata["types"].values, index=metadata["kinopoisk_id"]).to_dict()
    id2film_title = pd.Series(metadata["title_ru"].values, index=metadata["kinopoisk_id"]).to_dict()

    film_ids = reviews["film_id"]
    result = pd.DataFrame()
    result["film_id"] = film_ids
    # dict.get returns None for ids that are absent from the metadata.
    result["film_title"] = film_ids.map(id2film_title.get)
    result["film_type"] = film_ids.map(id2type.get)
    result["review_id"] = reviews["review_id"]
    result["review"] = reviews["description"].apply(tp).to_list()
    return result

In [46]:
# Load the film metadata table; get_payload() reads its "kinopoisk_id",
# "title_ru" and "types" columns.
titles_genres = pd.read_csv("top991.csv")
# Disabled: renaming the metadata columns to the names used in the payload.
# titles_genres = titles_genres.rename({
#     "kinopoisk_id": "film_id", 
#     "title_ru": "film_title",
#     "types": "film_type"
# }, axis=1)
titles_genres

Unnamed: 0.1,Unnamed: 0,kinopoisk_id,title_ru,rating,types
0,0,535341,1+1,8.805,Фильм
1,1,1143242,Джентльмены,8.524,Фильм
2,2,462682,Волк с Уолл-стрит,7.948,Фильм
3,3,1318972,Гнев человеческий,7.568,Фильм
4,4,41519,Брат,8.282,Фильм
...,...,...,...,...,...
987,987,688384,Дом,7.294,Мультфильм
988,988,1161783,Полицейский с Рублевки. Новогодний беспредел,6.504,Фильм
989,989,4456600,Киллер,7.437,Сериал
990,990,533954,Золушка,7.102,Фильм


In [61]:
# Spot-check: select the metadata rows whose kinopoisk_id equals 1112132.
titles_genres[titles_genres["kinopoisk_id"] == 1112132]

Unnamed: 0.1,Unnamed: 0,kinopoisk_id,title_ru,rating,types


In [47]:
# Load the reviews table and add "coef", the ratio of the "pos" column to the
# "neg" column.
# NOTE(review): a row with neg == 0 produces inf (or NaN for 0/0) rather than
# raising — confirm that is acceptable for downstream use.
df = pd.read_csv("992en.csv")
df["coef"] = df["pos"] / df["neg"]
print(len(df))
df.head()

19231


Unnamed: 0.1,Unnamed: 0,film_id,review_id,type,date,pos,neg,author,title,description,coef
0,0,535341,1496976,NEGATIVE,2012-04-09T18:49:16,1055,828,Дмитрий Кожин,Фактор денег,Risking criticism I can’t help but write my op...,1.274155
1,1,535341,1497790,NEGATIVE,2012-04-10T20:31:29,497,1830,reminn,Богатые тоже плачут,The film '1+1' has not yet had time to die out...,0.271585
2,2,535341,1969687,NEGATIVE,2014-02-01T11:39:16,334,283,iamaminov,Шедевр XXI века?,"I do not consider this film a failure at all, ...",1.180212
3,3,535341,1458705,POSITIVE,2012-02-26T09:08:29,299,50,FunkyHustla,,"The old saying says: ""The best stories are sto...",5.98
4,4,535341,1565242,NEGATIVE,2012-07-11T13:35:27,255,378,Rotten critic,Съешь ещё этих мягких французских булок,"250 best films according to the site ""Kinopois...",0.674603


In [48]:
# Count the distinct film ids that appear in the reviews table.
film_ids = df["film_id"].unique()
len(film_ids)

984

In [51]:
# NOTE(review): `threshold` is not referenced by any visible cell, and the
# coef filter below is commented out, so `dft` is the same object as `df`.
threshold = 1
dft = df#[df["coef"] > 1]
# Use the first CUDA device when one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [63]:
# Build the payload with a handler-less pipeline (default str.split tokenizer,
# tokens re-joined with single spaces), then drop rows with any missing value.
text_pipeline = TextPipeline()
payload = get_payload(titles_genres, dft, text_pipeline)
payload = payload.dropna()
# Review texts as a plain list, in payload row order.
reviews = payload["review"].to_list()
payload.head()

Unnamed: 0,film_id,film_title,film_type,review_id,review
0,535341,1+1,Фильм,1496976,Risking criticism I can’t help but write my op...
1,535341,1+1,Фильм,1497790,The film '1+1' has not yet had time to die out...
2,535341,1+1,Фильм,1969687,"I do not consider this film a failure at all, ..."
3,535341,1+1,Фильм,1458705,"The old saying says: ""The best stories are sto..."
4,535341,1+1,Фильм,1565242,"250 best films according to the site ""Kinopois..."


In [64]:
# Sanity check: rows whose film title is still missing after dropna().
payload[payload["film_title"].isna()]

Unnamed: 0,film_id,film_title,film_type,review_id,review


In [65]:
# Load the sentence-embedding model and move it to the selected device; the
# last line displays the device the model reports.
all_mpnet_base_v2 = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
all_mpnet_base_v2 = all_mpnet_base_v2.to(device)
all_mpnet_base_v2.device

device(type='cuda', index=0)

In [66]:
# Encode every review text with the model; the shape is displayed as a check.
embs = all_mpnet_base_v2.encode(reviews, show_progress_bar=True)
embs.shape

Batches:   0%|          | 0/601 [00:00<?, ?it/s]

(19211, 768)

In [67]:
# Attach the embeddings as a column: list(embs) iterates over the first axis,
# giving one entry per encoded review. `reviews` was taken from
# payload["review"], so entries line up with payload rows by position.
payload["embs"] = list(embs)
payload

Unnamed: 0,film_id,film_title,film_type,review_id,review,embs
0,535341,1+1,Фильм,1496976,Risking criticism I can’t help but write my op...,"[0.021548994, 0.039492883, 0.011241139, 0.0262..."
1,535341,1+1,Фильм,1497790,The film '1+1' has not yet had time to die out...,"[0.031169811, 0.055136558, 0.009577731, -0.008..."
2,535341,1+1,Фильм,1969687,"I do not consider this film a failure at all, ...","[0.0069298116, 0.065891154, 0.0061246017, 0.04..."
3,535341,1+1,Фильм,1458705,"The old saying says: ""The best stories are sto...","[0.0038753524, 0.041133057, 0.009274759, 0.031..."
4,535341,1+1,Фильм,1565242,"250 best films according to the site ""Kinopois...","[0.03668189, 0.02977762, 0.00094309886, 0.0260..."
...,...,...,...,...,...,...
19206,12192,Жестокие игры,Фильм,28074,Cheap.Kinzo for Malchiks and puberty girls.Eve...,"[0.061675265, 0.005615119, 0.0072633647, -0.02..."
19207,12192,Жестокие игры,Фильм,676399,The film was praised by everyone who is not to...,"[0.06839343, -0.004652626, 0.01164023, 0.03514..."
19208,12192,Жестокие игры,Фильм,221414,Film with the background!Film Sensation!The fi...,"[0.018151414, -0.00044903017, 0.001762076, 0.0..."
19209,12192,Жестокие игры,Фильм,507991,The film is great!I advise you to see all repr...,"[0.025090998, 0.0030791296, -0.005748358, 0.02..."


In [68]:
# Serialize the payload frame with dill; the file name embeds the row count.
with open(f"payload_en_{len(payload)}.dill", 'wb') as f:
    dill.dump(payload, f)

In [69]:
# Load the reviews table stored in "kion_films_reviews.csv" and display it.
kion_films_reviews = pd.read_csv("kion_films_reviews.csv")
kion_films_reviews

Unnamed: 0,film_id,film_title,review_id,author_id,author_name,review_title,review_type,pos,neg,review_text,date
0,477647,Гостья,1755958,1414072,Lost__Soul,Прекрасное создание,NEGATIVE,241,123,\nНебезызвестный феномен современного кинемато...,2013-03-28 00:04:00
1,477647,Гостья,1755392,17062,rain 13,Идеальный мир паразитов,POSITIVE,138,62,"\nЕсли живы воспоминания, жив и ты (с)\n\nТема...",2013-03-27 11:21:00
2,477647,Гостья,1757153,423716,Aeger_Faber,Странница,NEGATIVE,153,90,"\nЕсли вы читаете эту рецензию, то, вероятно, ...",2013-03-29 16:59:00
3,477647,Гостья,1756803,1749081,клементина кручински,Копеечное счастье Стефани Майер.,NEGATIVE,141,84,"\nВсем, кто купился на мерцающий электрический...",2013-03-28 23:54:00
4,477647,Гостья,1761571,1685,korsar45,Безаналитическая история Говарда Лавью,NEUTRAL,46,12,\nШёл 2017-й год. Земля стремительно очищалась...,2013-04-04 19:33:00
...,...,...,...,...,...,...,...,...,...,...,...
397803,775410,Виселица,2622075,6943250,\nLionheart92\n,,NEUTRAL,19,5,\nВремя от времени Аль Пачино снимается в схож...,2017-11-26 16:55:00
397804,775410,Виселица,2757625,2026005,cyberlaw,Виселица с Аль Пачино,NEGATIVE,15,9,"\nВеликий актер. Он добился таких высот, что т...",2019-01-09 04:48:00
397805,775410,Виселица,2953490,2321465,Fozzy,Расследование убийств,POSITIVE,5,2,\nАль Пачино - легенда кинематографа (правда п...,2020-10-05 19:26:00
397806,775410,Виселица,2629158,1102751,neo1570,Затертая до дыр игра в маньяка.,NEGATIVE,25,23,"\nНовинка от режиссера Джонни Мартина, который...",2017-12-15 16:56:00
