In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import numpy as np
import torch

In [3]:
# Hugging Face hub identifier of the checkpoint whose architecture and
# tokenizer are loaded below.
# Disabled alternative: MODEL_NAME = "roberta-base"
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

In [7]:
# Build the sequence-classification model and its tokenizer from the hub
# checkpoint named by MODEL_NAME.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Overwrite the hub weights with the locally saved state dict.
# map_location='cpu' makes the load independent of the device the checkpoint
# was saved from (e.g. a GPU that is absent on this machine); the model is
# moved to its target device in a later cell.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source.
state_dict = torch.load('bert_cla2.ckpt', map_location='cpu')
model.load_state_dict(state_dict)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [8]:
# Move the model's parameters and buffers onto the GPU; the returned module
# is the cell's last expression, so its structure is displayed.
model.to('cuda')

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [12]:
# Smoke test: classify one example sentence.
text = "From now on I shall forever be known as Stephen “sorry, I was on mute” Brown. #wfh"

# Tokenize to PyTorch tensors and place them on the same device as the model.
encoded_input = tokenizer(text, return_tensors='pt').to(model.device)

# Inference only: disabling autograd avoids building a graph for the forward
# pass (the logits then carry no grad_fn).
with torch.no_grad():
    output = model(**encoded_input)

In [13]:
# Display the model output produced for the example sentence above.
output

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.9264,  0.5326, -2.7659]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [14]:
import pandas as pd
# Load the texts to classify. Later cells read the 'date', 'renderedContent'
# and 'location' columns of this frame.
df = pd.read_csv('data.csv')

In [15]:
import nltk
import re
import string
def clean_text(text):
    '''Normalise a raw text string before tokenisation.

    The following steps are applied in order:
      1. lowercase the text;
      2. remove anything enclosed in square brackets;
      3. remove links (``http://``, ``https://`` or ``www.`` prefixed);
      4. remove angle-bracket spans such as HTML tags;
      5. remove ASCII punctuation (``string.punctuation``);
      6. replace newlines with a space, so the words on either side of a
         line break stay separate instead of being glued together;
      7. remove every word that contains a digit.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed spans
        may leave consecutive spaces behind.
    '''
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being parsed
    # as invalid Python string escapes.
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A newline separates words, so it becomes a space rather than nothing.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def text_preprocessing(text):
    """
    Clean a raw text and rebuild it from its word tokens.

    The text is first passed through ``clean_text``; the result is split
    into runs of word characters and those runs are joined back together
    with single spaces. Stop words are not filtered out.

    """
    word_splitter = nltk.tokenize.RegexpTokenizer(r'\w+')
    words = word_splitter.tokenize(clean_text(text))
    return ' '.join(words)

In [16]:
# Replace every entry of the 'renderedContent' column with its cleaned form.
df['renderedContent'] = df['renderedContent'].apply(text_preprocessing)

In [17]:
# Cleaned texts to classify, and an empty list that will receive one
# predicted class index per text.
texts = df['renderedContent'].values
tag = list()


In [18]:
# Classify every cleaned text and record the index of its highest logit.
# Start from an empty list so that re-running this cell does not append a
# second copy of the predictions (which would no longer match len(df)).
tag = []

# Inference only: no_grad avoids building an autograd graph on every
# forward pass, which saves memory and time over the whole dataset.
with torch.no_grad():
    for text in texts:
        # Cap the input at 512 tokens so an unusually long text cannot
        # exceed the model's position-embedding table.
        encoded_input = tokenizer(
            text, return_tensors='pt', truncation=True, max_length=512
        )
        # Put the tensors on whatever device the model lives on.
        encoded_input = encoded_input.to(model.device)
        output = model(**encoded_input)
        tag.append(output.logits.argmax().item())

In [19]:
# Store the predicted class index of each text in a 'sentiment' column.
# `tag` must hold exactly one entry per row of `df`.
# NOTE(review): presumably 0 = negative, 1 = neutral, 2 = positive, following
# the base model's label order -- confirm against the fine-tuned checkpoint.
df['sentiment'] = tag

In [20]:
# Keep only the columns needed downstream, write them to disk without the
# index column, then display the resulting frame.
df1 = df.loc[:, ['date', 'renderedContent', 'sentiment', 'location']]
df1.to_csv('data_sentiment.csv', index=False)
df1

Unnamed: 0,date,renderedContent,sentiment,location
0,2020-02-14 03:04:33+00:00,work from home im looking for stay at home mom...,1,New York
1,2020-02-04 23:21:24+00:00,limited time sign up for freeends march work f...,1,New York
2,2020-03-31 21:21:25+00:00,catch our own tiffany joy murchison mstiffanyj...,1,New York
3,2020-03-31 21:18:46+00:00,clocking out is near make sure to end of your ...,1,New York
4,2020-03-31 20:04:07+00:00,she s so cool lea leainny stayhome goldenretri...,2,New York
...,...,...,...,...
8326,2020-02-15 08:51:31+00:00,working saturday this weekend dmassociatesco m...,1,London
8327,2020-02-14 08:20:55+00:00,journey to work was a bitch todaynurstedcourt ...,0,London
8328,2020-02-09 21:20:44+00:00,rftravelnews edisonhoda magic i ll set my alar...,1,London
8329,2020-02-07 13:49:37+00:00,is it possible to drink too much tea workingfr...,1,London


In [21]:
# Export a 1% random sample of (text, predicted sentiment) pairs for manual
# evaluation. random_state pins the draw so the same rows are selected on
# every run and the hand-labelled file stays comparable across re-runs.
df[['renderedContent', 'sentiment']].sample(frac=0.01, random_state=42).to_csv('manual_eval.csv')