In [1]:
import pandas as pd
import os 
import json
import torch
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from torch import nn
from transformers import BertForSequenceClassification
from attrdict import AttrDict
from transformers import BertConfig, BertTokenizer, BertModel
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE 

In [2]:
os.getcwd()

'F:\\AuD'

In [3]:
default_path = os.getcwd()
data_path = os.path.join(default_path, 'data')
base_model = os.path.join(default_path, 'base-model')
model_path = os.path.join(default_path, 'models')
config_path = os.path.join(default_path, 'config')
log_path = os.path.join(default_path, 'log')
config_file = "bert-base.json"

In [5]:
dsm_samp = pd.read_csv(os.path.join(data_path, 'dsm_samp_test.csv'))
dsm_samp.head(3)

Unnamed: 0,id,text,label
0,50gph3,every little insult even if it's online just h...,8
1,t3_wfhxs,"do you know why you're feeling depressed, or i...",0
2,58580,So I'm just gonna live in the countryside,9


In [6]:
torch.cuda.is_available()

False

In [7]:
len(dsm_samp)

19869

In [8]:
with open(os.path.join(config_path, 'training_config.json')) as f:
    training_config = AttrDict(json.load(f))

training_config.pad = 'max_length'
training_config.device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

### 1. Sentence Embedding

#### 1.1 non-trained model

In [10]:
tokenizer = BertTokenizer.from_pretrained(os.path.join(base_model, 'bert-small'), model_max_length=128)
config = BertConfig.from_pretrained(os.path.join(base_model, 'bert-small', 'bert_config.json'), output_hidden_states=True)
model = BertModel.from_pretrained(os.path.join(base_model, 'bert-small'), config=config)

Some weights of the model checkpoint at F:\AuD\base-model\bert-small were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
config.max_position_embeddings = 128

In [12]:
model.to(training_config.device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 512, padding_idx=0)
    (position_embeddings): Embedding(512, 512)
    (token_type_embeddings): Embedding(2, 512)
    (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=512, out_features=512, bias=True)
            (key): Linear(in_features=512, out_features=512, bias=True)
            (value): Linear(in_features=512, out_features=512, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=512, out_features=512, bias=True)
            (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
dsm_emb = []

for idx in range(len(dsm_samp)):
    encoded = tokenizer.encode_plus(
        text=dsm_samp.text[idx],  # the sentence to be encoded
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length = 64,  # maximum length of a sentence
        pad_to_max_length=True,  # Add [PAD]s
        return_attention_mask = True,  # Generate the attention mask
        return_tensors = 'pt',  # ask the function to return PyTorch tensors
    )
    # print(encoded)
    input_ids = torch.tensor(encoded['input_ids']).to(training_config.device)
    attn_mask = torch.tensor(encoded['attention_mask']).to(training_config.device)
    token_type_ids = torch.tensor(encoded['token_type_ids']).to(training_config.device)
    
    outputs = model(input_ids, attn_mask, token_type_ids)
    hidden_states = outputs[2]
    token_vecs = hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    dsm_emb.append(list(sentence_embedding.detach().cpu().numpy()))

In [None]:
dsm_X = pd.DataFrame(dsm_emb, columns=range(len(dsm_emb[0])))
dsm_X

In [None]:
dsm_y = dsm_samp.label.tolist()

In [None]:
pca = PCA(n_components=30)
dsm_X = pca.fit_transform(dsm_X)

In [None]:
X_embedded = TSNE(n_components=2).fit_transform(dsm_X)

In [None]:
palette = sns.color_palette("bright", 10)
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], hue=dsm_y, legend='full', palette=palette)
plt.show()

#### 1.2 Trained Model

In [23]:
tokenizer = BertTokenizer.from_pretrained(os.path.join(base_model, 'bert-mini'), model_max_length=128)
config = BertConfig.from_pretrained(os.path.join(base_model, 'bert-mini', 'bert_config.json'), num_labels=10, output_hidden_states=True)
model = BertForSequenceClassification.from_pretrained(os.path.join(base_model, 'bert-mini'), config=config)

Some weights of the model checkpoint at F:\AuD\base-model\bert-mini were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were no

In [24]:
config.max_position_embeddings = 128

In [25]:
model_name = os.path.join(model_path, 'DSM-5.pt')

In [26]:
# model.load_state_dict(torch.load(model_name))
model.load_state_dict(torch.load(model_name, map_location=torch.device('cpu')))
model.to(training_config.device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, element

In [27]:
dsm_emb = []

for idx in range(len(dsm_samp)):
    encoded = tokenizer.encode_plus(
        text=dsm_samp.text[idx],  # the sentence to be encoded
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length = 128,  # maximum length of a sentence
        pad_to_max_length=True,  # Add [PAD]s
        return_attention_mask = True,  # Generate the attention mask
        return_tensors = 'pt',  # ask the function to return PyTorch tensors
    )
    input_ids = torch.tensor(encoded['input_ids']).to(training_config.device)
    attn_mask = torch.tensor(encoded['attention_mask']).to(training_config.device)
    token_type_ids = torch.tensor(encoded['token_type_ids']).to(training_config.device)
    
    outputs = model(input_ids, attn_mask, token_type_ids)
    # last_hidden_state = outputs.last_hidden_state
    # print(len(last_hidden_state), len(last_hidden_state[0]), last_hidden_state[0])
    hidden_states = outputs[1]
    # print(len(hidden_states), len(hidden_states[0]), len(hidden_states[0][0]))
    # print(hidden_states[-1][0][0])
    # token_vecs = hidden_states[-2][0]
    # sentence_embedding = torch.mean(token_vecs, dim=0)
    # dsm_emb.append(list(sentence_embedding.detach().cpu().numpy()))
    dsm_emb.append(hidden_states[-1][0][0].detach().cpu().numpy())

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  input_ids = torch.tensor(encoded['input_ids']).to(training_config.device)
  attn_mask = torch.tensor(encoded['attention_mask']).to(training_config.device)
  token_type_ids = torch.tensor(encoded['token_type_ids']).to(training_config.device)


In [28]:
dsm_X = pd.DataFrame(dsm_emb, columns=range(len(dsm_emb[0])))
dsm_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,-1.486853,-0.193270,0.151135,0.146182,0.427602,-0.362908,0.400432,0.097074,-0.102784,0.368796,...,-1.179803,-0.065481,-0.909711,0.803173,-0.347903,-0.200656,-0.810690,-1.159843,-0.862530,-1.357067
1,0.024803,-0.813630,0.518797,1.542021,0.242508,0.156137,-1.340185,-0.003623,0.019987,-1.743095,...,1.029202,-0.774437,-0.238063,-0.627681,1.745501,-0.110484,0.751730,0.960535,-0.767883,0.144055
2,-0.411822,1.214387,-0.142876,-0.874122,1.236772,0.329782,-0.971316,-0.770505,-0.271521,1.710381,...,0.055863,-1.427106,-0.851285,1.050934,-0.015948,2.187469,-0.181016,-0.389234,-0.190878,-0.217400
3,0.062845,-0.826382,0.527836,1.536701,0.197945,0.121790,-1.343516,-0.041023,0.025586,-1.695547,...,1.019377,-0.819648,-0.175562,-0.640132,1.820292,-0.040953,0.630163,1.051598,-0.888924,0.124368
4,0.037301,-0.790814,0.619905,1.597858,0.248374,0.074239,-1.391596,-0.084606,-0.075381,-1.832390,...,1.064606,-0.738650,-0.155709,-0.703536,1.506873,-0.068177,0.690584,0.992289,-0.767494,0.192331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19864,-1.448124,-0.345054,0.128609,0.329477,0.312694,-0.542881,0.578909,0.459581,0.058677,0.189265,...,-1.122634,-0.195104,-0.789639,0.713112,-0.216910,-0.328524,-0.948527,-1.070781,-0.685449,-1.133018
19865,0.245643,-0.703774,0.331653,1.529896,0.557994,-0.131982,-1.296736,0.092015,-0.208416,-1.717511,...,1.154158,-0.490438,-0.150028,-0.618349,1.545427,-0.136561,0.735391,0.542333,-0.461790,0.080199
19866,-0.406430,1.418849,-0.025132,-0.802424,1.109272,0.414215,-0.683377,-0.712544,-0.225217,1.743096,...,0.086616,-1.464859,-0.903901,1.120344,-0.080405,2.149774,-0.323670,-0.299840,-0.325195,-0.167066
19867,0.105223,-0.802513,0.561354,1.569160,0.192706,0.187522,-1.309656,-0.053082,-0.058957,-1.693745,...,1.066048,-0.775402,-0.232960,-0.596793,1.799102,-0.100361,0.663616,1.068503,-0.854274,0.134337


In [29]:
dsm_X.to_csv(os.path.join(data_path, 'emb', 'mini-10-emb.csv'), index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'F:\\AuD\\data\\emb\\mini-10-emb.csv'

In [None]:
pca = PCA(n_components=30)
dsm_X = pca.fit_transform(dsm_X)

In [None]:
dsm_y = dsm_samp.label.tolist()

In [None]:
X_embedded = TSNE(n_components=2).fit_transform(dsm_X)

In [None]:
with open(os.path.join(data_path, 'emb', 'tiny-9-embedded.pickle'), 'wb') as f:
    pickle.dump(X_embedded, f, pickle.HIGHEST_PROTOCOL)

In [None]:
with open(os.path.join(data_path, 'emb', 'tiny-9-embedded.pickle'), 'rb') as f:
    X_embedded = pickle.load(f)

In [None]:
palette = sns.color_palette("bright", 10)
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], hue=dsm_y, legend='full', palette=palette)
plt.show()

### 2. Word Embedding

In [None]:
text = 'I am depressed'
text2 = 'she is angry to me'

In [None]:
encoded = tokenizer.encode_plus(
    text=text2,  # the sentence to be encoded
    add_special_tokens=True,  # Add [CLS] and [SEP]
    max_length = 64,  # maximum length of a sentence
    pad_to_max_length=True,  # Add [PAD]s
    return_attention_mask = True,  # Generate the attention mask
    return_tensors = 'pt',  # ask the function to return PyTorch tensors
)
input_ids = torch.tensor(encoded['input_ids']).to(training_config.device)
attn_mask = torch.tensor(encoded['attention_mask']).to(training_config.device)
token_type_ids = torch.tensor(encoded['token_type_ids']).to(training_config.device)