## NLP - Task 4
## Применение BERT


In [1]:
! pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/db/98c3ea1a78190dac41c0127a063abf92bd01b4b0b6970a6db1c2f5b66fa0/transformers-4.0.1-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 12.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 49.6MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 52.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=a3b4eef36ce86c9c25

In [2]:
import pandas as pd

import torch 
import torch.utils.data as data_utils

from transformers import BertTokenizer, BertModel

In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

### Датасет
https://www.kaggle.com/datatattle/covid-19-nlp-text-classification

In [5]:
! unzip ./archive.zip

Archive:  ./archive.zip
  inflating: Corona_NLP_test.csv     
  inflating: Corona_NLP_train.csv    


In [6]:
# Training split; the file is read as ISO-8859-1 and TweetAt is parsed into dates.
data_train = pd.read_csv(
    './Corona_NLP_train.csv',
    encoding='ISO-8859-1',
    parse_dates=['TweetAt'],
)
data_train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,2020-03-16,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,2020-03-16,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,2020-03-16,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,2020-03-16,My food stock is not the only one which is emp...,Positive
4,3803,48755,,2020-03-16,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [7]:
# Distinct sentiment labels that occur in the training split.
set(data_train['Sentiment'])

{'Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral', 'Positive'}

In [8]:
# Ordinal class ids: labels are listed from most negative (0) to most positive (4).
sentiment_to_idx = {
    label: class_id
    for class_id, label in enumerate([
        'Extremely Negative',
        'Negative',
        'Neutral',
        'Positive',
        'Extremely Positive',
    ])
}

In [9]:
# Encode each training label as its class id; an unknown label raises KeyError.
data_train['Target'] = [sentiment_to_idx[label] for label in data_train['Sentiment']]

In [10]:
# Preview the training frame to check the new Target column next to Sentiment.
data_train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Target
0,3799,48751,London,2020-03-16,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,2
1,3800,48752,UK,2020-03-16,advice Talk to your neighbours family to excha...,Positive,3
2,3801,48753,Vagabonds,2020-03-16,Coronavirus Australia: Woolworths to give elde...,Positive,3
3,3802,48754,,2020-03-16,My food stock is not the only one which is emp...,Positive,3
4,3803,48755,,2020-03-16,"Me, ready to go at supermarket during the #COV...",Extremely Negative,0


In [11]:
# Test split, read with the same encoding and date parsing as the training split.
data_test = pd.read_csv(
    './Corona_NLP_test.csv',
    encoding='ISO-8859-1',
    parse_dates=['TweetAt'],
)

In [12]:
# Encode each test label as its class id; an unknown label raises KeyError.
data_test['Target'] = [sentiment_to_idx[label] for label in data_test['Sentiment']]

In [13]:
# Preview the test frame to check the new Target column next to Sentiment.
data_test.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Target
0,1,44953,NYC,2020-02-03,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,0
1,2,44954,"Seattle, WA",2020-02-03,When I couldn't find hand sanitizer at Fred Me...,Positive,3
2,3,44955,,2020-02-03,Find out how you can protect yourself and love...,Extremely Positive,4
3,4,44956,Chicagoland,2020-02-03,#Panic buying hits #NewYork City as anxious sh...,Negative,1
4,5,44957,"Melbourne, Victoria",2020-03-03,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,2


### Эмбеддинги

In [14]:
# Independent copies of the tweet text (features) and class ids (labels) for training.
X_train, y_train = (data_train[column].copy() for column in ('OriginalTweet', 'Target'))

In [15]:
# Independent copies of the tweet text (features) and class ids (labels) for testing.
X_test, y_test = (data_test[column].copy() for column in ('OriginalTweet', 'Target'))

In [16]:
# Pretrained tokenizer and encoder from the same 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The encoder is used only as a fixed feature extractor, so gradient tracking
# is switched off for every parameter (requires_grad_ returns the module itself).
bert = BertModel.from_pretrained('bert-base-uncased').requires_grad_(False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [17]:
# Move the model to the selected device so it matches the inputs that
# encode_sentence_with_bert sends to the same device.
bert = bert.to(device)

In [25]:
def encode_sentence_with_bert(sentence, device=device):
    """Encode ``sentence`` with the notebook-level ``bert`` model.

    The text is tokenized with the notebook-level ``tokenizer`` (truncation
    enabled, PyTorch tensors returned), every tokenizer tensor is moved to
    ``device``, and the model is run with gradient tracking disabled.

    Args:
        sentence: Text to encode.
        device: Device the tokenizer tensors are moved to before the forward
            pass. Defaults to the notebook-level ``device`` as it was when
            this function was defined.

    Returns:
        ``last_hidden_state[0, 0]`` from the model output: the hidden vector
        at the first token position of the first sequence in the batch
        (presumably the ``[CLS]`` position for this tokenizer; confirm).

    Note:
        Assumes ``bert`` already lives on ``device``; the model itself is not
        moved here.
    """
    model_inputs = {
        name: value.to(device)
        for name, value in tokenizer(sentence, truncation=True, return_tensors='pt').items()
    }

    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        return bert(**model_inputs)['last_hidden_state'][0, 0]

In [28]:
train_len, test_len = len(X_train), len(X_test)

# Encode every tweet with the frozen BERT encoder, one embedding per text.
# Each split iterates over its OWN series: the earlier version enumerated
# X_train for the test split and indexed X_test[i], so the index ran over the
# training-set length instead of the test-set length.
# Iterating over the values (rather than X[i]) also avoids depending on the
# series having a 0..n-1 integer index.
train_embeddings = []
for current_text in X_train:
    train_embeddings.append(encode_sentence_with_bert(current_text))

test_embeddings = []
for current_text in X_test:
    test_embeddings.append(encode_sentence_with_bert(current_text))