#### Note:
    Check how BERT converts data

# Import libs

In [59]:
import os, sys
import pickle

import pandas as pd
import numpy as np
from joblib import dump, load
from sqlalchemy import create_engine
from torch.optim import Adam

from others.database import SessionLocal, engine


import matplotlib.pyplot as plt
import seaborn as sns

# Load general data

In [2]:
# User attribute table; its categorical columns are one-hot encoded further below.
df_users = pd.read_parquet("../data/parquet_user_default_data.parquet")
# Post table; the `post_id`, `text` and `topic` columns are used further below.
df_posts = pd.read_parquet('../data/df_post_text_features_save_november_2023.parquet')

# Transform general data

#### Transform post data

##### TOPIC

In [3]:
# One-hot encode the post topic. `drop_first=True` leaves out the first
# category, which then acts as the implicit baseline (all dummy columns False).
dummy_topics = pd.get_dummies(
    data=df_posts[["topic"]],
    drop_first=True,
)

In [4]:
# Display the encoded topic columns for a visual sanity check.
dummy_topics

Unnamed: 0,topic_covid,topic_entertainment,topic_movie,topic_politics,topic_sport,topic_tech
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
7018,False,False,True,False,False,False
7019,False,False,True,False,False,False
7020,False,False,True,False,False,False
7021,False,False,True,False,False,False


##### TEXT

In [5]:
# Independent copy of just the identifier and raw text of every post, so later
# changes do not touch df_posts itself.
df_posts_text = df_posts.loc[:, ["post_id", "text"]].copy()

###### Load Model BERT

In [116]:
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import DataCollatorWithPadding
from tqdm import tqdm
from torch.utils.data import DataLoader
from datasets import Dataset
from datasets import load_dataset
from IPython.display import clear_output

from torch.utils.data import Subset

import numpy as np
import torch

In [7]:
# Tokenizer and encoder are loaded from the same `distilbert-base-cased`
# checkpoint name.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertModel.from_pretrained("distilbert-base-cased")

In [8]:
def tokenization(example):
    """Tokenize the ``text`` entries of a batch of examples.

    Uses the module-level ``tokenizer``. Special tokens are added, token type
    ids are not returned, and truncation is enabled.

    Args:
        example: mapping whose ``'text'`` entry holds the batch of texts to
            encode (this function is passed to ``Dataset.map`` with
            ``batched=True``).

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for that batch.
    """
    encoding_options = {
        "add_special_tokens": True,
        "return_token_type_ids": False,
        "truncation": True,
    }
    return tokenizer.batch_encode_plus(example["text"], **encoding_options)

In [9]:
# Wrap the post ids and texts in a Hugging Face Dataset so they can be
# tokenized with `Dataset.map`.
df_posts_text_hugging_face = Dataset.from_dict(
    {
        "post_id": df_posts_text["post_id"].values,
        "text": df_posts_text["text"].values,
    }
)

In [10]:
# Tokenize in batches, then expose only the two model inputs as torch tensors.
dataset_text = df_posts_text_hugging_face.map(function=tokenization, batched=True)
dataset_text.set_format(
    type="torch",
    columns=["input_ids", "attention_mask"],
)

Map:   0%|          | 0/7023 [00:00<?, ? examples/s]

In [11]:
# The collator pads each batch; shuffle stays off so the embedding rows keep
# the same order as the posts in the dataset.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
loader = DataLoader(
    dataset=dataset_text,
    batch_size=64,
    shuffle=False,
    collate_fn=data_collator,
    pin_memory=True,
)

In [12]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [19]:
@torch.inference_mode()
def get_embeddings_labels(model, loader):
    """Run ``model`` over ``loader`` and return one embedding per example.

    The embedding of an example is the ``last_hidden_state`` vector at
    sequence position 0. Despite the function's name, no labels are returned.

    Args:
        model: module with parameters, callable with ``attention_mask`` and
            ``input_ids`` keyword tensors and returning a mapping that
            contains ``'last_hidden_state'``.
        loader: iterable of batches; each batch is a mapping holding
            ``'attention_mask'`` and ``'input_ids'`` tensors.

    Returns:
        A CPU tensor with the per-batch embeddings concatenated along dim 0,
        in the order the loader yielded them.
    """
    model.eval()

    # Follow the model's own device instead of hard-coding CUDA, so the same
    # code also runs on a CPU-only machine.
    target_device = next(model.parameters()).device

    total_embeddings = []
    for batch in tqdm(loader):
        batch = {key: batch[key].to(target_device) for key in ('attention_mask', 'input_ids')}

        embeddings = model(**batch)['last_hidden_state'][:, 0, :]
        # Move each result to CPU right away so accelerator memory is not
        # held for the whole run.
        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0)

In [21]:
# Place the model on the selected device before running inference.
model = model.to(device)

In [22]:
# Run the model over every batch in `loader` and collect the resulting embeddings.
embedings = get_embeddings_labels(model, loader)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [01:23<00:00,  1.31it/s]


In [37]:
# Reuse the posts' index: the frame is later concatenated column-wise with
# df_posts_text and dummy_topics, and pandas aligns such a concat on the index.
# A fresh 0..n-1 index would silently misalign rows whenever the posts' index
# is not a plain range. A row-count mismatch now fails loudly here instead.
embeddings_df = pd.DataFrame(embedings.numpy(), index=df_posts_text.index)

In [163]:
# torch.save(embedings, 'distilbert_text_embeddings.pt')

In [175]:
# Final post feature table: post_id, topic dummies and embedding columns side
# by side. The raw text is dropped once it has been embedded.
df_posts_embedings = pd.concat(
    [df_posts_text, dummy_topics, embeddings_df],
    axis=1,
).drop(columns=["text"])

In [180]:
# df_posts_embedings.to_parquet("df_posts_embedings.parquet")

#### Transform User data

In [186]:
# Preview the first rows of the user table before encoding it.
df_users.head()

Unnamed: 0,id,gender,age,country,city,exp_group,os,source
0,200,1,34,Russia,Degtyarsk,3,Android,ads
1,201,0,37,Russia,Abakan,0,Android,ads
2,202,1,17,Russia,Smolensk,4,Android,ads
3,203,0,18,Russia,Moscow,1,iOS,ads
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads


In [189]:
# One-hot encode the country; the first category is dropped and serves as the
# implicit baseline.
dummies_country = pd.get_dummies(
    data=df_users["country"],
    drop_first=True,
)

In [190]:
# One-hot encode the experiment group, dropping the first category. The
# resulting column labels are the remaining group values themselves.
dummies_exp_group = pd.get_dummies(
    data=df_users["exp_group"],
    drop_first=True,
)

In [193]:
# One-hot encode the operating system; every category keeps its own column.
dummies_os = pd.get_dummies(data=df_users["os"])

In [195]:
# One-hot encode the traffic source; every category keeps its own column.
dummies_source = pd.get_dummies(data=df_users["source"])

In [198]:
# Remove the raw categorical columns. All of them except `city` were one-hot
# encoded above; `city` is dropped without an encoded replacement.
df_users = df_users.drop(
    columns=["country", "city", "exp_group", "os", "source"],
)

In [201]:
# Attach every block of dummy columns to the remaining user columns.
df_users = pd.concat(
    [df_users, dummies_country, dummies_exp_group, dummies_os, dummies_source],
    axis="columns",
)

In [215]:
# The experiment-group dummy columns are labelled with integers. Give them
# string names *before* persisting: written the other way round, the parquet
# file is saved with (or, depending on the pandas version, rejected because
# of) non-string column labels and never contains the exp_N names.
df_users = df_users.rename(columns={1: "exp_1", 2: "exp_2", 3: "exp_3", 4: "exp_4"})
df_users.to_parquet('../data/df_users_embedings.parquet')