In [1]:
import pandas as pd
import os 
import seaborn as sns
import torch
import pickle
import matplotlib.pyplot as plt
from transformers import BertConfig, BertTokenizer, BertModel
from transformers import BertForSequenceClassification
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE 

In [2]:
default_path = os.getcwd()
data_path = os.path.join(default_path, '../data')
base_model = os.path.join(default_path, '../base-model')
config_path = os.path.join(default_path, '../config')
config_file = "bert-base.json"

In [3]:
dsm_samp = pd.read_csv(os.path.join(data_path, 'dsm_samp_test.csv'))
dsm_samp.head(3)

Unnamed: 0,id,text,label
0,50gph3,every little insult even if it's online just h...,8
1,t3_wfhxs,"do you know why you're feeling depressed, or i...",0
2,58580,So I'm just gonna live in the countryside,9


In [4]:
tokenizer = BertTokenizer.from_pretrained(os.path.join(base_model, 'bert-base'), model_max_length=32)
config = BertConfig.from_pretrained(os.path.join(base_model, 'bert-base', 'bert_config.json'), output_hidden_states=True)
model = BertModel.from_pretrained(os.path.join(base_model, 'bert-base'), config=config)

Some weights of the model checkpoint at F:\AuD\base-model\bert-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
dsm_emb = []

for idx in range(len(dsm_samp)):
    encoded = tokenizer.encode_plus(
        text=dsm_samp.text[idx],  # the sentence to be encoded
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length = 32,  # maximum length of a sentence
        pad_to_max_length=True,  # Add [PAD]s
        return_attention_mask = True,  # Generate the attention mask
        return_tensors = 'pt',  # ask the function to return PyTorch tensors
    )
    if idx % 1000 == 0:
        print(idx)
    # print(encoded)
    input_ids = torch.tensor(encoded['input_ids'])
    attn_mask = torch.tensor(encoded['attention_mask'])
    token_type_ids = torch.tensor(encoded['token_type_ids'])
    outputs = model(input_ids, attn_mask, token_type_ids)
    hidden_states = outputs[2]
    token_vecs = hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    dsm_emb.append(list(sentence_embedding.detach().cpu().numpy()))

0


  input_ids = torch.tensor(encoded['input_ids'])
  attn_mask = torch.tensor(encoded['attention_mask'])
  token_type_ids = torch.tensor(encoded['token_type_ids'])


1000
2000
3000
4000
5000
6000


In [None]:
dsm_X = pd.DataFrame(dsm_emb, columns=range(len(dsm_emb[0])))
dsm_X

In [None]:
dsm_y = dsm_samp.label.tolist()

In [None]:
pca = PCA(n_components=2)
dsm_X = pca.fit_transform(dsm_X)

In [None]:
dsm_X[:, 0]

In [None]:
palette = sns.color_palette("bright", 9)
sns.scatterplot(x=dsm_X[:,0], y=dsm_X[:,1], hue=dsm_y, legend='full', palette=palette)
plt.show()

In [None]:
pca = PCA(n_components=30)
dsm_X = pca.fit_transform(dsm_X)

In [None]:
X_embedded = TSNE(n_components=2).fit_transform(dsm_X)

In [None]:
palette = sns.color_palette("bright", 9)
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], hue=dsm_y, legend='full', palette=palette)
plt.show()