### Imports

In [1]:
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel
from typing import List
from pyspark.sql import SparkSession

import torch
import pyspark.sql.functions as f

### Load HF model

In [2]:
# Fetch the tokenizer and the encoder from the HuggingFace Hub; both come
# from the same checkpoint, so the name is spelled out only once.
MODEL_NAME = 'BAAI/bge-small-en-v1.5'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# Put the model into evaluation mode; it is only used for inference here.
model.eval()


def batch_embeddings(batch: List[str]) -> torch.Tensor:
    """Embed a batch of texts with the module-level ``tokenizer`` and ``model``.

    The texts are tokenized together (with padding and truncation enabled),
    passed through the model without gradient tracking, and each text is
    represented by the hidden state at the first token position (CLS
    pooling), L2-normalized along the feature dimension.

    Args:
        batch: Texts to embed.

    Returns:
        A 2-D tensor with one L2-normalized row per input text. An empty
        ``batch`` yields a tensor with zero rows.
    """
    if not batch:
        # The tokenizer cannot encode an empty list (fast tokenizers raise
        # "IndexError: list index out of range"), so short-circuit with an
        # empty result of the right width instead of crashing.
        return torch.empty((0, model.config.hidden_size))

    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
        # Perform pooling. In this case, cls pooling: take the first token's
        # vector from the first model output for every item in the batch.
        sentence_embeddings = model_output[0][:, 0]
    # Normalize each embedding to unit L2 length.
    sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings

### Read anime ids and synopses

In [3]:
team = "team20"
warehouse = "/user/team20/project/hive/warehouse"

# Build (or reuse) a Hive-enabled Spark session that runs on YARN and points
# at the metastore and warehouse directory configured below.
spark = (
    SparkSession.builder
    .appName("{} - spark ML".format(team))
    .master("yarn")
    .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("spark.sql.avro.compression.codec", "snappy")
    .enableHiveSupport()
    .getOrCreate()
)
# Select the project database so later queries can use bare table names.
spark.sql("USE team20_projectdb")

DataFrame[]

In [4]:
# Fetch id/synopsis pairs, skipping rows whose synopsis is the literal '-'
# (presumably the placeholder for a missing synopsis -- confirm against the data).
synopsis_query = "SELECT id AS anime_id, synopsis FROM anime_part_buck WHERE synopsis != '-'"
spark_df = spark.sql(synopsis_query)
# Collect the result to the driver as a pandas DataFrame for local processing.
df = spark_df.toPandas()
df.head()

Unnamed: 0,anime_id,synopsis
0,54252,what would you do if one of your family member...
1,52976,"fleeing a traumatic childhood, lone mercenary ..."
2,51595,a strange and wonderful story about a special ...
3,51048,a comedic short film by yoji kuri about runners.
4,50953,the kingdom of metallicana is under attack fro...


### Get anime embeddings

In [5]:
batch_size = 4
synopsis_list = df['synopsis'].tolist()
synopsis_embs = []

# Walk the list in strides of batch_size. The last slice is simply shorter
# when the length is not a multiple of batch_size, so no separate remainder
# pass is needed. (The previous remainder pass read the loop variable after
# the loop, which is undefined -- or stale from another cell -- whenever the
# list holds fewer than batch_size items.)
for start in tqdm(range(0, len(synopsis_list), batch_size)):
    batch_synopsises = synopsis_list[start:start + batch_size]
    batch_embs = batch_embeddings(batch_synopsises)
    # extend() iterates the returned 2-D tensor row by row, so the list
    # receives one embedding per synopsis, in input order.
    synopsis_embs.extend(batch_embs)

  0%|          | 0/4973 [00:00<?, ?it/s]

IndexError: list index out of range

### Write anime embeddings to csv

In [8]:
# Sanity check: there must be exactly one embedding per synopsis. On failure
# the assertion message carries both counts.
n_embs, n_synopses = len(synopsis_embs), len(synopsis_list)
assert n_embs == n_synopses, (n_embs, n_synopses)

### Find user embeddings by averaging

In [None]:
# NOTE(review): this is the same anime id/synopsis query used earlier in the
# notebook, and it rebinds spark_df. The section title and the name scores_df
# suggest a user-score table was intended instead -- confirm the right table.
scores_query = "SELECT id AS anime_id, synopsis FROM anime_part_buck WHERE synopsis != '-'"
spark_df = spark.sql(scores_query)
scores_df = spark_df.toPandas()
scores_df.head()

### Test

In [None]:
# Two sample synopses used as a smoke test for batch_embeddings.
first_synopsis = """after a horrific alchemy experiment goes wrong in the elric household, brothers edward and alphonse are left in a catastrophic new reality. ignoring the alchemical principle banning human transmutation, the boys attempted to bring their recently deceased mother back to life. instead, they suffered brutal personal loss: alphonse's body disintegrated while edward lost a leg and then sacrificed an arm to keep alphonse's soul in the physical realm by binding it to a hulking suit of armor.
the brothers are rescued by their neighbor pinako rockbell and her granddaughter winry. known as a bio-mechanical engineering prodigy, winry creates prosthetic limbs for edward by utilizing "automail," a tough, versatile metal used in robots and combat armor. after years of training, the elric brothers set off on a quest to restore their bodies by locating the philosopher's stone—a powerful gem that allows an alchemist to defy the traditional laws of equivalent exchange.
as edward becomes an infamous alchemist and gains the nickname "fullmetal," the boys' journey embroils them in a growing conspiracy that threatens the fate of the world."""
second_synopsis = """eccentric scientist rintarou okabe has a never-ending thirst for scientific exploration. together with his ditzy but well-meaning friend mayuri shiina and his roommate itaru hashida, rintarou founds the future gadget laboratory in the hopes of creating technological innovations that baffle the human psyche. despite claims of grandeur, the only notable "gadget" the trio have created is a microwave that has the mystifying power to turn bananas into green goo.
however, when rintarou decides to attend neuroscientist kurisu makise's conference on time travel, he experiences a series of strange events that lead him to believe that there is more to the "phone microwave" gadget than meets the eye. apparently able to send text messages into the past using the microwave, rintarou dabbles further with the "time machine," attracting the ire and attention of the mysterious organization sern.
due to the novel discovery, rintarou and his friends find themselves in an ever-present danger. as he works to mitigate the damage his invention has caused to the timeline, he is not only fighting a battle to save his loved ones, but also one against his degrading sanity."""
sentences = [first_synopsis, second_synopsis]

# Embed both samples in a single batch and show the resulting tensor.
sample_embs = batch_embeddings(sentences)
print("Sentence embeddings:", sample_embs)

In [8]:
# Rough speed check: embed the same sample batch 1000 times and watch the
# tqdm rate; the embeddings themselves are discarded.
for _ in tqdm(range(1000)):
    batch_embeddings(sentences)

  0%|          | 0/1000 [00:00<?, ?it/s]