# Exploration of the data on hand

## 0 - Imports

In [4]:
import pandas as pd
import numpy as np
import os

In [5]:
# Project root that contains the `data/` folder read by the loading cell below.
# Set the INTRO_NLP_A2_DIR environment variable to point at your own checkout;
# when it is unset, the original author's local path is used as before.
path = os.environ.get(
    "INTRO_NLP_A2_DIR",
    "/Users/armandkouyoumdjian/github/INTRO_NLP_A2/exercise2/",
)
# Make this directory the working directory so relative paths such as
# "data/traindata.csv" resolve against it.
os.chdir(path)

In [6]:
# The file is tab-separated and read with header=None, so the column
# names are attached by hand right after loading.
column_names = ['polarity', 'category', 'OTE/target', 'character offsets', 'review']
traindata = pd.read_csv("data/traindata.csv", header=None, sep="\t")
traindata.columns = column_names
traindata.head()

Unnamed: 0,polarity,category,OTE/target,character offsets,review
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...
2,positive,FOOD#QUALITY,food,98:102,The have over 100 different beers to offer thi...
3,negative,SERVICE#GENERAL,STAFF,5:10,THIS STAFF SHOULD BE FIRED.
4,positive,FOOD#STYLE_OPTIONS,menu,4:8,"The menu looked great, and the waiter was very..."


## 1 - General Exploration

In [7]:
# Number of rows in the training frame
traindata.shape[0]

1503

In [8]:
# Count of missing values in each column
traindata.isnull().sum()

polarity             0
category             0
OTE/target           0
character offsets    0
review               0
dtype: int64

In [9]:
# Show the dtype pandas assigned to each column
traindata.dtypes

polarity             object
category             object
OTE/target           object
character offsets    object
review               object
dtype: object

## 2 - Exploration per category

In [10]:
# Label distribution: number of rows per polarity value
traindata.loc[:, "polarity"].value_counts()

positive    1055
negative     390
neutral       58
Name: polarity, dtype: int64

In [11]:
# Number of rows per aspect category
traindata.loc[:, "category"].value_counts()

FOOD#QUALITY                603
SERVICE#GENERAL             263
AMBIENCE#GENERAL            188
RESTAURANT#GENERAL          138
FOOD#STYLE_OPTIONS           98
FOOD#PRICES                  58
DRINKS#QUALITY               41
RESTAURANT#MISCELLANEOUS     39
DRINKS#STYLE_OPTIONS         26
RESTAURANT#PRICES            20
LOCATION#GENERAL             16
DRINKS#PRICES                13
Name: category, dtype: int64

We notice some **class imbalance**, both across polarities and across categories, so we should reweight the classes, as done in https://www.aclweb.org/anthology/W19-6120.pdf.

## 3 - Sentiment Analysis

### 3.1 - Using a Hugging Face pre-trained transformer on the review text only (not the category)

### NOTE: We have to use a classifier trained only on the model's data, so this approach would not work in the end


In [12]:
from transformers import pipeline

In [13]:
# Quick smoke test: build a sentiment-analysis pipeline and classify one sentence
demo_classifier = pipeline('sentiment-analysis')
print(demo_classifier('I hate you'))

[{'label': 'NEGATIVE', 'score': 0.9991129040718079}]


In [19]:
# Let's see how the default (pretrained) pipeline performs

def give_polarity(sentence, nlp_pipeline):
    """Return the lower-cased label of the first prediction for `sentence`.

    Args:
        sentence: Text passed as-is to `nlp_pipeline`.
        nlp_pipeline: Callable that takes the sentence and returns an
            indexable collection whose first item has a 'label' entry.

    Returns:
        The 'label' value of the first prediction, converted to lower case.
    """
    predictions = nlp_pipeline(sentence)
    top_prediction = predictions[0]
    return top_prediction['label'].lower()


In [20]:
# Sentiment pipeline reused by the cells below; only the task is specified here
nlp = pipeline(task='sentiment-analysis')


In [21]:
# Sanity check of the helper on a single sentence
give_polarity(sentence="What a lovely day!", nlp_pipeline=nlp)

'positive'

Test on the whole dataset

In [22]:
# Classify every review one at a time and store the lower-cased label
traindata["prediction"] = traindata["review"].map(
    lambda review_text: give_polarity(review_text, nlp)
)

### 3.2 - Transformer from scratch

In [31]:
# Accuracy: share of rows whose predicted label equals the gold polarity
is_correct = traindata["prediction"] == traindata["polarity"]
acc = is_correct.mean()

print(f"Accuracy using pretrained transformer: {acc:.2f}")

Accuracy using pretrained transformer: 0.83


In [None]:
# Remove the gold labels from the frame. Rebinding the name (rather than
# `inplace=True`) together with `errors="ignore"` keeps this cell re-runnable:
# a second execution no longer raises KeyError once "polarity" is already gone.
traindata = traindata.drop(columns="polarity", errors="ignore")