### Read in Data and libraries

In [1]:
#import libraries

import pandas as pd
import numpy as np

In [2]:
#import dataset

df = pd.read_csv("/content/drive/MyDrive/data.csv")

### Data Dimension

In [3]:
#size of data

print(df.shape)

(595, 3)


In [4]:
#see some random rows

df.sample(5)

Unnamed: 0,id,Video,Comment
443,444,Video Three,I just started learning mysql in my journey fo...
551,552,Video One,"Thank u bhaiya, it means a lot to me . I belon..."
446,447,Video Three,Thank you for sharing great SQL knowledge.
521,522,Video One,"Sir, Today's class was really good. All the to..."
120,121,Video Two,Great learning. Kindly do more videos with thi...


## Understand NLTK Concept

In [5]:
#Import library

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:
#Create one example from dataset

example = df['Comment'][25]
print(example)

I love your videos! Thank you so much! They're really helpful.


In [7]:
#Tokenize the example

token = nltk.word_tokenize(example)
print(token)

['I', 'love', 'your', 'videos', '!', 'Thank', 'you', 'so', 'much', '!', 'They', "'re", 'really', 'helpful', '.']


## Sentiment Analysis

In [8]:
#Import Libraries

from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [9]:
#Build the model

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
#Train the model

model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [11]:
#Apply model on example

print(example)

I love your videos! Thank you so much! They're really helpful.


In [12]:
# Step-1 : Tokenize the example

encoded_text = tokenizer(example, return_tensors='pt')
print(encoded_text)

{'input_ids': tensor([[   0,  100,  657,  110, 3424,  328, 3837,   47,   98,  203,  328,  252,
          214,  269, 7163,    4,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [13]:
# Step-2 : Encoded the result

output = model(**encoded_text)

output

SequenceClassifierOutput(loss=None, logits=tensor([[-2.1232, -1.1101,  4.1413]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [14]:
# Step-3 : Remove extra things

scores = output[0][0].detach().numpy()
scores

array([-2.1231842, -1.1100707,  4.1413403], dtype=float32)

In [15]:
# Step-4 : Convert to scale 0 to 1 (Exponential)

from scipy.special import softmax
scores = softmax(scores)
scores

array([0.00188912, 0.00520296, 0.9929079 ], dtype=float32)

In [16]:
# Step-5 : Push the result in a dictionary

scores_dict = {
    'Negative' : scores[0],
    'Neuatral' : scores[1],
    'Positive' : scores[2]
}
print(scores_dict)

{'Negative': 0.0018891245, 'Neuatral': 0.005202956, 'Positive': 0.9929079}


In [17]:
#All steps in a function

def score(example):
  encoded_text = tokenizer(example, return_tensors='pt')
  output = model(**encoded_text)
  scores = output[0][0].detach().numpy()
  scores = softmax(scores)
  scores_dict = {
    'Negative' : scores[0],
    'Neuatral' : scores[1],
    'Positive' : scores[2]
}
  return scores_dict

In [18]:
#Create a free dictionary

result = {}

In [19]:
#Apply the model on whole dataset

from tqdm.notebook import tqdm

for i, row in tqdm(df.iterrows(), total = len(df)):
    try:
        text = row['Comment']
        myid = row['id']
        result[myid] = score(text)
    except RuntimeError:
        print(f'Broke for id {myid}')

  0%|          | 0/595 [00:00<?, ?it/s]

Broke for id 3
Broke for id 335
Broke for id 344


In [20]:
#Convert result to dataframe

result_df = pd.DataFrame(result)
result_df

Unnamed: 0,1,2,4,5,6,7,8,9,10,11,...,586,587,588,589,590,591,592,593,594,595
Negative,0.480478,0.009779,0.003977,0.001604,0.03695,0.019957,0.002267,0.296133,0.037144,0.023899,...,0.001264,0.001473,0.003011,0.002382,0.004865,0.005401,0.040566,0.004231,0.00311,0.00482
Neuatral,0.479108,0.12398,0.022076,0.016402,0.27394,0.11436,0.010676,0.393086,0.146301,0.114096,...,0.011796,0.023852,0.015679,0.030865,0.022898,0.043872,0.139259,0.114846,0.074032,0.067265
Positive,0.040413,0.866241,0.973947,0.981994,0.68911,0.865683,0.987057,0.31078,0.816554,0.862005,...,0.986941,0.974675,0.98131,0.966753,0.972237,0.950727,0.820175,0.880923,0.922857,0.927915


In [21]:
#Transpose the table

result_df = pd.DataFrame(result).T
result_df

Unnamed: 0,Negative,Neuatral,Positive
1,0.480478,0.479108,0.040413
2,0.009779,0.123980,0.866241
4,0.003977,0.022076,0.973947
5,0.001604,0.016402,0.981994
6,0.036950,0.273940,0.689110
...,...,...,...
591,0.005401,0.043872,0.950727
592,0.040566,0.139259,0.820175
593,0.004231,0.114846,0.880923
594,0.003110,0.074032,0.922857


In [22]:
#Add ID as a new column

result_df = result_df.reset_index().rename(columns = {'index' : 'id'})
result_df

Unnamed: 0,id,Negative,Neuatral,Positive
0,1,0.480478,0.479108,0.040413
1,2,0.009779,0.123980,0.866241
2,4,0.003977,0.022076,0.973947
3,5,0.001604,0.016402,0.981994
4,6,0.036950,0.273940,0.689110
...,...,...,...,...
587,591,0.005401,0.043872,0.950727
588,592,0.040566,0.139259,0.820175
589,593,0.004231,0.114846,0.880923
590,594,0.003110,0.074032,0.922857


In [23]:
#See the final sentiment result

result_df = result_df.merge(df, how = 'left')
result_df

Unnamed: 0,id,Negative,Neuatral,Positive,Video,Comment
0,1,0.480478,0.479108,0.040413,Video Four,cross_validation is deprecated. You can use mo...
1,2,0.009779,0.123980,0.866241,Video Four,"youtubers like this guy, TechWithTim, Neural N..."
2,4,0.003977,0.022076,0.973947,Video Four,I have tried many other courses and tutorial b...
3,5,0.001604,0.016402,0.981994,Video Four,"Interestingly enough, I'm getting almost the s..."
4,6,0.036950,0.273940,0.689110,Video Four,Hey Harrison! Love your tutorials. Just a coup...
...,...,...,...,...,...,...
587,591,0.005401,0.043872,0.950727,Video One,I am a Web developer(fresher)\nI was searching...
588,592,0.040566,0.139259,0.820175,Video One,WAITING FOR IT FROM 1 MONTH HOPE TO LEARN NEW ...
589,593,0.004231,0.114846,0.880923,Video One,great lecture but notes will be there for this...
590,594,0.003110,0.074032,0.922857,Video One,Uh r inspiration for alot of people... keeping...
