In [2]:
!pip install -q transformers




In [3]:
import csv
import urllib.request

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification

# The tokenizer and the classifier are both loaded from the same checkpoint id.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Cap the tokenizer at 512 tokens, the same limit review_sentiment passes as
# max_length. A larger cap (such as 1071) exceeds the number of positions a
# RoBERTa-base model accepts, so long untruncated text would get through the
# tokenizer and then fail inside the model.
tokenizer.model_max_length = 512


In [5]:
# Download the label mapping for the task: a tab-separated text file in which
# the label name is the second field of each row.
task = 'sentiment'
labels = []  # stays empty if the download below raises
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"

with urllib.request.urlopen(mapping_link) as response:
    mapping_lines = response.read().decode('utf-8').split("\n")

# Rows with fewer than two fields (e.g. a trailing blank line) are skipped.
labels = [fields[1] for fields in csv.reader(mapping_lines, delimiter='\t') if len(fields) > 1]

In [6]:
# Display the parsed label list to confirm the mapping was loaded.
labels

['negative', 'neutral', 'positive']

In [7]:
# Score one example sentence and print every label from most to least likely.
encoded_input = tokenizer("The sun is out", return_tensors='pt')
output = model(**encoded_input)

# Take the first item of the first output and turn its values into probabilities.
scores = softmax(output[0][0].detach().numpy())

# argsort sorts ascending, so reverse it to put the highest score first.
ranking = np.argsort(scores)[::-1]

for position, class_idx in enumerate(ranking, start=1):
    label = labels[class_idx]
    probability = scores[class_idx]
    print(f"{position}) {label} {np.round(float(probability), 4)}")

1) neutral 0.5085
2) positive 0.479
3) negative 0.0125


In [8]:
# Class indices ordered from the highest to the lowest score.
ranking

array([1, 2, 0], dtype=int64)

In [9]:
def review_sentiment(review: str):
    """Score one piece of text with the sentiment model.

    Args:
        review: Text to classify. It is truncated to at most 512 tokens
            before being passed to the model.

    Returns:
        A list of ``(label, probability)`` tuples, one per model output
        class, ordered from the most to the least probable class. The label
        names are looked up in the module-level ``labels`` list.
    """
    encoded_input = tokenizer(review, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        output = model(**encoded_input)
    logits = output.logits.detach().numpy()[0]
    # scipy's softmax shifts by the max logit before exponentiating, so it
    # cannot overflow the way a bare exp(x) / sum(exp(x)) can.
    scores = softmax(logits)
    ranking = np.argsort(scores)[::-1]
    return [(labels[i], scores[i]) for i in ranking]

In [10]:
# Quick sanity check of review_sentiment on a short greeting.
review_sentiment("Hello!")

[('positive', 0.8134766), ('neutral', 0.17542176), ('negative', 0.011101609)]

In [11]:
# Load the reviews CSV; the path is relative to the current working directory.
df=pd.read_csv('data/Coursera_reviews.csv')

In [12]:
# Add one placeholder column per sentiment label, each initialised to 0.
for sentiment_column in ('positive', 'neutral', 'negative'):
    df[sentiment_column] = 0

In [13]:
# List the column names to confirm the three sentiment columns were added.
df.columns.tolist()

['reviews',
 'reviewers',
 'date_reviews',
 'rating',
 'course_id',
 'positive',
 'neutral',
 'negative']

In [14]:
def apply_review_sentiment(row):
    """Return a copy of ``row`` with its sentiment columns filled in.

    Intended for use with ``DataFrame.apply(..., axis=1)``.

    Args:
        row: A row (pandas Series) that has a ``'reviews'`` entry holding the
            text to score.

    Returns:
        A new Series with the same entries as ``row``, plus one entry per
        label returned by ``review_sentiment`` set to that label's score.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by DataFrame.apply.
    scored = row.copy()
    for sentiment, score in review_sentiment(row['reviews']):
        scored[sentiment] = score
    return scored

In [15]:
# Keep only the first 100 reviews and score them one row at a time.
df = df.head(100).apply(apply_review_sentiment, axis=1)

In [16]:
# Display the scored reviews, including the three sentiment columns.
df

Unnamed: 0,reviews,reviewers,date_reviews,rating,course_id,positive,neutral,negative
0,"Pretty dry, but I was able to pass with just t...",By Robert S,"Feb 12, 2020",4,google-cbrs-cpi-training,0.628795,0.206076,0.165129
1,would be a better experience if the video and ...,By Gabriel E R,"Sep 28, 2020",4,google-cbrs-cpi-training,0.068340,0.768785,0.162875
2,Information was perfect! The program itself wa...,By Jacob D,"Apr 08, 2020",4,google-cbrs-cpi-training,0.710999,0.214451,0.074550
3,A few grammatical mistakes on test made me do ...,By Dale B,"Feb 24, 2020",4,google-cbrs-cpi-training,0.168156,0.438748,0.393096
4,Excellent course and the training provided was...,By Sean G,"Jun 18, 2020",4,google-cbrs-cpi-training,0.978909,0.016020,0.005071
...,...,...,...,...,...,...,...,...
95,Poorly produced.,By Tahseen S,"Mar 11, 2020",1,financial-markets-global,0.010035,0.081914,0.908052
96,I find the professor is hard to understand bec...,By David S,"May 28, 2020",1,financial-markets-global,0.005719,0.065606,0.928674
97,Don't waste your time! It's unstructured pile ...,By Sergey F,"May 02, 2020",1,financial-markets-global,0.015717,0.146580,0.837703
98,too short videos,By Sudiksha,"Sep 01, 2019",1,financial-markets-global,0.039034,0.307478,0.653488
