In [143]:
! pip install transformers



# Sentiment Analysis using BERT

## Importing Dependencies

In [144]:
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification

## Downloading Dataset

In [145]:
df = pd.read_csv('https://gist.githubusercontent.com/Mukilan-Krishnakumar/e998ecf27d11b84fe6225db11c239bc6/raw/74dbac2b992235e555df9a0a4e4d7271680e7e45/imdb_movie_reviews.csv')
df.head()

Unnamed: 0,text,sentiment
0,"My daughter liked it but I was aghast, that a ...",neg
1,I... No words. No words can describe this. I w...,neg
2,this film is basically a poor take on the old ...,neg
3,"This is a terrible movie, and I'm not even sur...",neg
4,First of all this movie is a piece of reality ...,pos


We will drop the sentiment which comes along with the dataset and predict our own sentiment using BERT

In [146]:
df = df.drop('sentiment',axis=1)

## Model Building and Evaluation

In [147]:
tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = BertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [148]:
def sentiment_score(movie_review):
	token = tokenizer.encode(movie_review, return_tensors = 'pt')
	result = model(token)
	return int(torch.argmax(result.logits))+1

In [149]:
df['sentiment'] = df['text'].apply(lambda x: sentiment_score(x[:512]))

In [123]:
df.head()

Unnamed: 0,text,sentiment
0,"My daughter liked it but I was aghast, that a ...",3
1,I... No words. No words can describe this. I w...,1
2,this film is basically a poor take on the old ...,2
3,"This is a terrible movie, and I'm not even sur...",1
4,First of all this movie is a piece of reality ...,4


In [89]:
pip install emoji==0.6.0

Collecting emoji==0.6.0
  Downloading emoji-0.6.0.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m946.4 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-0.6.0-py3-none-any.whl size=49720 sha256=a059796106ab9617b24a8f3154800a3ce8ce755305fa8e28163e108488f4e849
  Stored in directory: /root/.cache/pip/wheels/1b/bd/d9/310c33c45a553798a714e27e3b8395d37128425442b8c78e07
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.6.0


## Pre-trained BERT on Twitter US Election 2020 for Stance Detection towards Donald Trump (KE-MLM)

In [150]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

In [151]:
# choose GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [152]:
# select mode path here
pretrained_LM_path = "kornosk/bert-election2020-twitter-stance-trump-KE-MLM"

In [153]:
# load model
tokenizer = AutoTokenizer.from_pretrained(pretrained_LM_path)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_LM_path)

id2label = {
    0: "AGAINST",
    1: "FAVOR",
    2: "NONE"
}


In [154]:
##### Prediction Neutral #####
sentence = "Hello World."
inputs = tokenizer(sentence.lower(), return_tensors="pt")
outputs = model(**inputs)
predicted_probability = torch.softmax(outputs[0], dim=1)[0].tolist()

print("Sentence:", sentence)
print("Prediction:", id2label[np.argmax(predicted_probability)])
print("Against:", predicted_probability[0])
print("Favor:", predicted_probability[1])
print("Neutral:", predicted_probability[2])

Sentence: Hello World.
Prediction: NONE
Against: 0.07368938624858856
Favor: 0.11205834150314331
Neutral: 0.8142523169517517


In [155]:
##### Prediction Favor #####
sentence = "Go Go Trump!!!"
inputs = tokenizer(sentence.lower(), return_tensors="pt")
outputs = model(**inputs)
predicted_probability = torch.softmax(outputs[0], dim=1)[0].tolist()

print("Sentence:", sentence)
print("Prediction:", id2label[np.argmax(predicted_probability)])
print("Against:", predicted_probability[0])
print("Favor:", predicted_probability[1])
print("Neutral:", predicted_probability[2])

Sentence: Go Go Trump!!!
Prediction: FAVOR
Against: 0.2513195872306824
Favor: 0.4501757025718689
Neutral: 0.2985047996044159


In [156]:
##### Prediction Against #####
sentence = "Trump is the worst."
inputs = tokenizer(sentence.lower(), return_tensors="pt")
outputs = model(**inputs)
predicted_probability = torch.softmax(outputs[0], dim=1)[0].tolist()

print("Sentence:", sentence)
print("Prediction:", id2label[np.argmax(predicted_probability)])
print("Against:", predicted_probability[0])
print("Favor:", predicted_probability[1])
print("Neutral:", predicted_probability[2])

Sentence: Trump is the worst.
Prediction: AGAINST
Against: 0.7339825630187988
Favor: 0.11051063984632492
Neutral: 0.15550683438777924


In [158]:
sentence = "Science is true."
inputs = tokenizer(sentence.lower(), return_tensors="pt")
outputs = model(**inputs)
predicted_probability = torch.softmax(outputs[0], dim=1)[0].tolist()

print("Sentence:", sentence)
print("Prediction:", id2label[np.argmax(predicted_probability)])
print("Against:", predicted_probability[0])
print("Favor:", predicted_probability[1])
print("Neutral:", predicted_probability[2])

Sentence: Science is true.
Prediction: NONE
Against: 0.14677490293979645
Favor: 0.13593906164169312
Neutral: 0.7172859907150269


##  🙊 Detoxify Toxic Comment Classification with ⚡ Pytorch Lightning and 🤗 Transformers

In [174]:
# install detoxify

! pip install detoxify




In [197]:
results=Detoxify('original').predict('I joked to my friend and said shut up you are a liar ')

In [198]:
print(results)

{'toxicity': 0.9385704, 'severe_toxicity': 0.0037372643, 'obscene': 0.1116012, 'threat': 0.001874594, 'insult': 0.65328133, 'identity_attack': 0.00457644}


In [195]:
print(results)

{'toxicity': 0.97135735, 'severe_toxicity': 0.006518601, 'obscene': 0.2137416, 'threat': 0.0018301342, 'insult': 0.7385384, 'identity_attack': 0.004855454}


In [176]:
results_unbiased=Detoxify('unbiased').predict('shut up you are a liar')

In [177]:
results_turkish = Detoxify('multilingual').predict('kapa çeneni sen bir yalancısın')

In [178]:
print(results)

{'toxicity': 0.9841488, 'severe_toxicity': 0.012881926, 'obscene': 0.2888863, 'threat': 0.0023240668, 'insult': 0.83928835, 'identity_attack': 0.00809011}


In [179]:
print(results_unbiased)

{'toxicity': 0.9947649, 'severe_toxicity': 0.0004373019, 'obscene': 0.011551815, 'identity_attack': 0.001791593, 'insult': 0.987027, 'threat': 0.0005621685, 'sexual_explicit': 0.0011744519}


In [180]:
print(results_turkish)

{'toxicity': 0.9954861, 'severe_toxicity': 0.015652074, 'obscene': 0.122717656, 'identity_attack': 0.0065702004, 'insult': 0.23378459, 'threat': 0.021746922, 'sexual_explicit': 0.12384143}


In [185]:
results_german = Detoxify('multilingual').predict('Halt die fresse du bist ein Lügner!')

In [186]:
print(results_german)

{'toxicity': 0.89689934, 'severe_toxicity': 0.00070730655, 'obscene': 0.026918555, 'identity_attack': 0.015749967, 'insult': 0.47567183, 'threat': 0.00074008835, 'sexual_explicit': 0.0012803869}


In [191]:
print(pd.DataFrame(results,results_unbiased,results_german).round(5))

                 toxicity  severe_toxicity  obscene  identity_attack   insult  \
toxicity          0.98415          0.01288  0.28889          0.00809  0.83929   
severe_toxicity   0.98415          0.01288  0.28889          0.00809  0.83929   
obscene           0.98415          0.01288  0.28889          0.00809  0.83929   
identity_attack   0.98415          0.01288  0.28889          0.00809  0.83929   
insult            0.98415          0.01288  0.28889          0.00809  0.83929   
threat            0.98415          0.01288  0.28889          0.00809  0.83929   
sexual_explicit   0.98415          0.01288  0.28889          0.00809  0.83929   

                  threat sexual_explicit  
toxicity         0.00232             NaN  
severe_toxicity  0.00232             NaN  
obscene          0.00232             NaN  
identity_attack  0.00232             NaN  
insult           0.00232             NaN  
threat           0.00232             NaN  
sexual_explicit  0.00232             NaN  


Sources

https://huggingface.co/kornosk/bert-election2020-twitter-stance-trump-KE-MLM

https://huggingface.co/unitary/toxic-bert

https://github.com/unitaryai/detoxify

https://huggingface.co/docs/transformers/model_doc/bert

https://huggingface.co/models?other=politics&sort=trending

https://wandb.ai/mukilan/BERT_Sentiment_Analysis/reports/An-Introduction-to-BERT-And-How-To-Use-It--VmlldzoyNTIyOTA1#bert-based-models



