In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import datetime
import re
import string
from tqdm import tqdm

import torch
import transformers
from transformers import BertModel, BertTokenizer, AutoTokenizer, RobertaModel
from torch.utils.data import Dataset, TensorDataset

pd.options.display.max_columns = 500



In [2]:
post_text_df = pd.read_csv("/kaggle/input/embeding-user-data/post_text_df.csv")
post_text_df.head()

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business


In [3]:
# Обработаем таблицу post_text_df
post_text_df.head()

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business


In [4]:
post_text_df['text'][0]

'UK economy facing major risks\n\nThe UK manufacturing sector will continue to face serious challenges over the next two years, the British Chamber of Commerce (BCC) has said.\n\nThe groups quarterly survey of companies found exports had picked up in the last three months of 2004 to their best levels in eight years. The rise came despite exchange rates being cited as a major concern. However, the BCC found the whole UK economy still faced major risks and warned that growth is set to slow. It recently forecast economic growth will slow from more than 3% in 2004 to a little below 2.5% in both 2005 and 2006.\n\nManufacturers domestic sales growth fell back slightly in the quarter, the survey of 5,196 firms found. Employment in manufacturing also fell and job expectations were at their lowest level for a year.\n\nDespite some positive news for the export sector, there are worrying signs for manufacturing, the BCC said. These results reinforce our concern over the sectors persistent inabili

In [5]:
def preprocessing(line):
    line = line.lower()
    line = line.replace('\n\n', ' ').replace('\n', ' ')
    line = re.sub(r'(?<=[\w\d])\b[^\w\s]+(?=\s)', ' ', line)
    return line

In [6]:
post_text_df['text'] = post_text_df['text'].apply(preprocessing)

In [7]:
post_text_df

Unnamed: 0,post_id,text,topic
0,1,uk economy facing major risks the uk manufactu...,business
1,2,aids and climate top davos agenda climate chan...,business
2,3,asian quake hits european shares shares in eur...,business
3,4,india power shares jump on debut shares in ind...,business
4,5,lacroix label bought by us firm luxury goods g...,business
...,...,...,...
7018,7315,ok i would not normally watch a farrelly brot...,movie
7019,7316,i give this movie 2 stars purely because of it...,movie
7020,7317,i cant believe this film was allowed to be mad...,movie
7021,7318,the version i saw of this film was the blockbu...,movie


In [8]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
# print(torch.cuda.get_device_name())

cuda:0


In [9]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
text_col = post_text_df['text']

In [11]:
from torch.utils.data import DataLoader

tokenized_dataset = tokenizer.batch_encode_plus(text_col.to_list(),
                                            add_special_tokens=True,
                                            padding=True,
                                            truncation=True, 
                                            return_token_type_ids=False)

In [12]:
dataset = TensorDataset(torch.tensor(tokenized_dataset['input_ids']),
                          torch.tensor(tokenized_dataset['attention_mask']))

from transformers import DataCollatorWithPadding
    

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
loader = DataLoader(dataset,
                    batch_size=5,
                    pin_memory=True,
                    shuffle=False)

In [14]:
model = model.to(device)
@torch.inference_mode()
def get_embedings(model, loader):
    model.eval()
    
    total_embeddings = []
    labels = []
    
    for batch in tqdm(loader):
        torch.cuda.empty_cache()
        batch = {'input_ids': batch[0].to(device),
                 'attention_mask':  batch[1].to(device)}
        input_ids, attention_mask = batch
        
        embeddings = model(**batch)['last_hidden_state'][:, 0,:]
        total_embeddings.append(embeddings.cpu())
    
    return torch.cat(total_embeddings, dim=0)

In [15]:
embeddings = get_embedings(model, loader)

100%|██████████| 1405/1405 [02:18<00:00, 10.11it/s]


In [16]:
embeddings

tensor([[-0.0288,  0.0188, -0.0012,  ..., -0.1546, -0.0679,  0.0406],
        [ 0.0011,  0.0438, -0.0143,  ..., -0.1370, -0.0337,  0.1035],
        [-0.0488,  0.0572,  0.0110,  ..., -0.1625, -0.0973,  0.0563],
        ...,
        [-0.0268,  0.0204, -0.0383,  ..., -0.1150, -0.0587,  0.0260],
        [-0.0756,  0.0163, -0.0128,  ..., -0.1794, -0.0071,  0.0160],
        [-0.0882,  0.0328,  0.0055,  ..., -0.1117, -0.0077,  0.0836]])

In [17]:
embeddings_list = embeddings.tolist()
embeddings_list

post_text_df = post_text_df.assign(embeddings=embeddings.tolist())

In [20]:
post_text_df.to_csv('post_text_with_embeddings.csv', index=False)

In [21]:
post_text_df.shape

(7023, 4)

In [22]:
post_text_with_embeddings = pd.read_csv("/kaggle/working/post_text_with_embeddings.csv")

In [23]:
post_text_with_embeddings.head()

Unnamed: 0,post_id,text,topic,embeddings
0,1,uk economy facing major risks the uk manufactu...,business,"[-0.02879844792187214, 0.018772652372717857, -..."
1,2,aids and climate top davos agenda climate chan...,business,"[0.0011228763032704592, 0.04378493130207062, -..."
2,3,asian quake hits european shares shares in eur...,business,"[-0.0488189235329628, 0.05721372738480568, 0.0..."
3,4,india power shares jump on debut shares in ind...,business,"[-0.024175573140382767, 0.021884476765990257, ..."
4,5,lacroix label bought by us firm luxury goods g...,business,"[-0.03241302818059921, 0.05497121810913086, -0..."
