# Exploration of the data on hand

## 0 - Imports

In [1]:
import pandas as pd
import numpy as np
import os

from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
path = "/Users/armandkouyoumdjian/github/INTRO_NLP_A2/exercise2/"
os.chdir(path)

In [3]:
traindata = pd.read_csv("data/traindata.csv", sep="\t", header=None)
display(traindata.head())
traindata.columns = ['polarity', 'category', 'OTE/target', 'character offsets', 'review']
traindata.head()

Unnamed: 0,0,1,2,3,4
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...
2,positive,FOOD#QUALITY,food,98:102,The have over 100 different beers to offer thi...
3,negative,SERVICE#GENERAL,STAFF,5:10,THIS STAFF SHOULD BE FIRED.
4,positive,FOOD#STYLE_OPTIONS,menu,4:8,"The menu looked great, and the waiter was very..."


Unnamed: 0,polarity,category,OTE/target,character offsets,review
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...
2,positive,FOOD#QUALITY,food,98:102,The have over 100 different beers to offer thi...
3,negative,SERVICE#GENERAL,STAFF,5:10,THIS STAFF SHOULD BE FIRED.
4,positive,FOOD#STYLE_OPTIONS,menu,4:8,"The menu looked great, and the waiter was very..."


In [4]:
traindata.review[0]

"short and sweet – seating is great:it's romantic,cozy and private."

In [5]:
devdata = pd.read_csv("data/devdata.csv", sep="\t", header=None)
display(devdata.head())
devdata.columns = ['polarity', 'category', 'OTE/target', 'character offsets', 'review']
devdata.head()

Unnamed: 0,0,1,2,3,4
0,positive,LOCATION#GENERAL,neighborhood,54:66,"great food, great wine list, great service in ..."
1,negative,RESTAURANT#GENERAL,place,15:20,I thought this place was totally overrated.
2,positive,FOOD#QUALITY,Fish,0:4,Fish is so very fresh.
3,negative,SERVICE#GENERAL,manager,19:26,"I showed it to the manager, and he smilingly a..."
4,neutral,DRINKS#QUALITY,margaritas,63:73,"The food we ordered was excellent, although I ..."


Unnamed: 0,polarity,category,OTE/target,character offsets,review
0,positive,LOCATION#GENERAL,neighborhood,54:66,"great food, great wine list, great service in ..."
1,negative,RESTAURANT#GENERAL,place,15:20,I thought this place was totally overrated.
2,positive,FOOD#QUALITY,Fish,0:4,Fish is so very fresh.
3,negative,SERVICE#GENERAL,manager,19:26,"I showed it to the manager, and he smilingly a..."
4,neutral,DRINKS#QUALITY,margaritas,63:73,"The food we ordered was excellent, although I ..."


In [17]:
len(devdata)

376

## 1 - General Exploration

In [6]:
len(traindata)

1503

In [7]:
traindata.isna().sum()

polarity             0
category             0
OTE/target           0
character offsets    0
review               0
dtype: int64

In [8]:
traindata.dtypes

polarity             object
category             object
OTE/target           object
character offsets    object
review               object
dtype: object

## 2 - Exploration per category

In [7]:
traindata["polarity"].value_counts()

positive    1055
negative     390
neutral       58
Name: polarity, dtype: int64

In [8]:
traindata["category"].value_counts()

FOOD#QUALITY                603
SERVICE#GENERAL             263
AMBIENCE#GENERAL            188
RESTAURANT#GENERAL          138
FOOD#STYLE_OPTIONS           98
FOOD#PRICES                  58
DRINKS#QUALITY               41
RESTAURANT#MISCELLANEOUS     39
DRINKS#STYLE_OPTIONS         26
RESTAURANT#PRICES            20
LOCATION#GENERAL             16
DRINKS#PRICES                13
Name: category, dtype: int64

We notice some **class imbalance**, we should reweight, just like in https://www.aclweb.org/anthology/W19-6120.pdf. 

In [9]:
# Clean the category name


## 3 - Sentiment Analysis

### 3.1 -  Using Hugging-Face pre-trained transformer + Only the review (Not the category)

### NOTE: We have to use a classifier only trained on the model's data, so this would not work in the end


In [9]:
import transformers
transformers.__version__

'3.1.0'

In [10]:
from transformers import pipeline

In [11]:
print(pipeline('sentiment-analysis')('I hate you'))

[{'label': 'NEGATIVE', 'score': 0.9991129040718079}]


In [12]:
# Let'see how the default - pretrained - pipelineperforms

def give_polarity(sentence, nlp_pipeline):
    lbl = nlp_pipeline(sentence)[0]['label']
    
    return lbl.lower()


In [13]:
nlp = pipeline('sentiment-analysis')


In [14]:
give_polarity("What a lovely day!", nlp)

'positive'

In [15]:
pipeline

<function transformers.pipelines.pipeline(task: str, model: Optional = None, config: Union[str, transformers.configuration_utils.PretrainedConfig, NoneType] = None, tokenizer: Union[str, transformers.tokenization_utils.PreTrainedTokenizer, NoneType] = None, framework: Union[str, NoneType] = None, **kwargs) -> transformers.pipelines.Pipeline>

Test on the whole dataset

In [16]:
traindata["prediction"] = traindata["review"].progress_apply(lambda x: give_polarity(x, nlp))

  0%|          | 0/1503 [00:00<?, ?it/s]

In [17]:
#  Accuracy
acc= (traindata["prediction"]==traindata["polarity"]).sum()/len(traindata)

print(f"Accuracy using pretrained transformer: {acc:.2f}")

Accuracy using pretrained transformer: 0.83


In [16]:
devdata["prediction"] = devdata["review"].progress_apply(lambda x: give_polarity(x, nlp))

#  Accuracy
acc= (devdata["prediction"]==devdata["polarity"]).sum()/len(devdata)

print(f"Accuracy using pretrained transformer: {acc:.2f}")

  0%|          | 0/376 [00:00<?, ?it/s]

Accuracy using pretrained transformer: 0.84


### 3.2 - Transformer from scratch

**Resources**:
- http://jalammar.github.io/illustrated-bert/

- Tutorial video: https://www.youtube.com/watch?v=8N-nM3QW7O0&t=1s&ab_channel=VenelinValkov

- https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

In [18]:
traindata.drop("polarity", axis=1, inplace=True)

In [19]:
traindata.columns

Index(['category', 'OTE/target', 'character offsets', 'review', 'prediction'], dtype='object')

In [20]:
from transformers import BertForSequenceClassification


In [21]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [22]:
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [23]:
# we cannot feed a sentence directly to the model
model("I am testing the model")

AttributeError: 'str' object has no attribute 'size'

### Tokenizer

In [12]:
from transformers import BertTokenizer
import torch

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text_batch = ["I love Pixar.", "I don't care for Pixar."]
encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)

Downloading: 100%|██████████| 232k/232k [00:00<00:00, 578kB/s]


In [6]:
encoding

{'input_ids': tensor([[  101,  1045,  2293, 14255, 18684,  2099,  1012,   102,     0,     0,
             0,     0],
        [  101,  1045,  2123,  1005,  1056,  2729,  2005, 14255, 18684,  2099,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [13]:
#labels = torch.tensor([1,0]).unsqueeze(0)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']


In [20]:
output = model(input_ids=encoding['input_ids'], attention_mask=encoding['attention_mask'])


In [21]:
output

(tensor([[0.1601, 0.3013],
         [0.1279, 0.2943]], grad_fn=<AddmmBackward>),)

In [16]:
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element