In [1]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from scipy.special import softmax

# Load pre-trained model and tokenizer
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

# Define a function to calculate polarity scores using RoBERTa
def polairty_scores_robert(text):
    encoded_text = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    op = model(**encoded_text)
    scores = op[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {'robertneg': scores[0], 'robertnut': scores[1], 'robertnpos': scores[2]}
    return scores_dict

# Read the DataFrame
df = pd.read_csv("flipkart_reviews.csv")  # Provide the path to your dataset

# Apply sentiment analysis to each row
sentiment_results = []
for index, row in df.iterrows():
    review = row['Review']
    scores_dict = polairty_scores_robert(review)
    sentiment_results.append(scores_dict)

# Add sentiment analysis results to the DataFrame
sentiment_columns = ['robertneg', 'robertnut', 'robertnpos']
sentiment_df = pd.DataFrame(sentiment_results, columns=sentiment_columns)
df = pd.concat([df, sentiment_df], axis=1)

# Display the updated DataFrame
print(df)


  torch.utils._pytree._register_pytree_node(


Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


IndexError: index 2 is out of bounds for axis 0 with size 2

In [24]:
df = pd.read_csv("flipkart_reviews.csv")

In [2]:
df.Review.iloc[3]

'My wife is so happy and best product 👌🏻😘'

In [4]:
text = df.Review.iloc[3]
encoded_text = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
op = model(**encoded_text)
scores = op[0][0].detach().numpy()
scores = softmax(scores)
scores

array([0.48326042, 0.51673955], dtype=float32)

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

# Load pre-trained model and tokenizer
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Sentiment Scores: {'robertneg': 0.0015707917045801878, 'robertnut': 0.007419607602059841, 'robertnpos': 0.9910095930099487}


In [38]:

# Define a function to calculate sentiment scores using RoBERTa
def calculate_sentiment_scores(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Forward pass through the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Apply softmax to get probabilities
    probabilities = softmax(outputs.logits, axis=1).flatten().tolist()
    print(probabilities)
    # Return sentiment scores as a dictionary
    scores_dict = {
        'robertneg': probabilities[0],
        'robertnut': probabilities[1],
        'robertnpos': probabilities[2]
    }
    return scores_dict

# Example usage:
text = "Very Bad"
sentiment_scores = calculate_sentiment_scores(text)
print("Sentiment Scores:", sentiment_scores)


[0.8655868172645569, 0.12227807939052582, 0.012135038152337074]
Sentiment Scores: {'robertneg': 0.8655868172645569, 'robertnut': 0.12227807939052582, 'robertnpos': 0.012135038152337074}


In [22]:
sentiment_scores

[0.002877665450796485, 0.03500713035464287, 0.9621151685714722]

In [8]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

# Define a function to calculate sentiment scores using RoBERTa
def calculate_sentiment_scores(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Forward pass through the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Apply softmax to get probabilities
    probabilities = softmax(outputs.logits, axis=1).flatten().tolist()

    # Return sentiment scores as a list
    return probabilities

# Apply sentiment analysis to each row in the DataFrame
sentiment_scores_list = []
for index, row in df.iterrows():
    text = row['Review']
    sentiment_scores = calculate_sentiment_scores(text)
    sentiment_scores_list.append(sentiment_scores)

# Add sentiment scores to the DataFrame
sentiment_columns = ['robertneg', 'robertnut', 'robertnpos']
sentiment_df = pd.DataFrame(sentiment_scores_list, columns=sentiment_columns)
df = pd.concat([df, sentiment_df], axis=1)

# Display the updated DataFrame
print(df)

                                           Product_name  \
0     Lenovo Ideapad Gaming 3 Ryzen 5 Hexa Core 5600...   
1     Lenovo Ideapad Gaming 3 Ryzen 5 Hexa Core 5600...   
2     Lenovo Ideapad Gaming 3 Ryzen 5 Hexa Core 5600...   
3     DELL Inspiron Athlon Dual Core 3050U - (4 GB/2...   
4     DELL Inspiron Athlon Dual Core 3050U - (4 GB/2...   
...                                                 ...   
2299  MSI 27 inch Full HD IPS Panel Monitor (PRO MP2...   
2300  MSI 27 inch Full HD IPS Panel Monitor (PRO MP2...   
2301  MSI 27 inch Full HD IPS Panel Monitor (PRO MP2...   
2302  MSI 27 inch Full HD IPS Panel Monitor (PRO MP2...   
2303  MSI 27 inch Full HD IPS Panel Monitor (PRO MP2...   

                                                 Review  Rating  robertneg  \
0     Best under 60k Great performanceI got it for a...       5   0.005346   
1                                    Good perfomence...       5   0.005259   
2     Great performance but usually it has also that...  

In [13]:
two[df.columns[-4:]]

Unnamed: 0,Rating,robertneg,robertnut,robertnpos
13,2,0.476295,0.433912,0.089793
183,2,0.926307,0.065846,0.007847
203,2,0.926307,0.065846,0.007847
689,2,0.736681,0.233552,0.029767
769,2,0.736681,0.233552,0.029767
799,2,0.736681,0.233552,0.029767
860,2,0.700588,0.272478,0.026935
974,2,0.955419,0.040967,0.003614
1030,2,0.779997,0.209058,0.010945
1046,2,0.679641,0.249228,0.071131


In [20]:
df

Unnamed: 0,Rating,robertneg,robertnut,robertnpos
0,5,0.005346,0.041061,0.953593
1,5,0.005259,0.135149,0.859592
2,5,0.077736,0.282266,0.639997
3,5,0.001227,0.006407,0.992366
4,5,0.001508,0.010719,0.987773


In [19]:
df[df.columns[-4:]]

Unnamed: 0,Rating,robertneg,robertnut,robertnpos
0,5,0.005346,0.041061,0.953593
1,5,0.005259,0.135149,0.859592
2,5,0.077736,0.282266,0.639997
3,5,0.001227,0.006407,0.992366
4,5,0.001508,0.010719,0.987773


In [27]:
import pandas as pd

# Initialize thresholds for each sentiment class
neg_thresholds = []
nut_thresholds = []
pos_thresholds = []

# Calculate thresholds for each rating
for rating in range(1, 6):
    rating_data = df[df['Rating'] == rating]
    neg_threshold = rating_data['robertneg'].mean()
    nut_threshold = rating_data['robertnut'].mean()
    pos_threshold = rating_data['robertnpos'].mean()
    neg_thresholds.append(neg_threshold)
    nut_thresholds.append(nut_threshold)
    pos_thresholds.append(pos_threshold)

# Print thresholds for each sentiment class and rating
for rating in range(1, 6):
    print(f"Rating {rating}:")
    print(f"Negative Threshold: {neg_thresholds[rating-1]}")
    print(f"Neutral Threshold: {nut_thresholds[rating-1]}")
    print(f"Positive Threshold: {pos_thresholds[rating-1]}")
    print()


Rating 1:
Negative Threshold: 0.7804764147858014
Neutral Threshold: 0.16784208274001014
Positive Threshold: 0.05168150020282432

Rating 2:
Negative Threshold: 0.6775490829478139
Neutral Threshold: 0.2567918043502647
Positive Threshold: 0.06565912479393023

Rating 3:
Negative Threshold: 0.2561798040073232
Neutral Threshold: 0.2085956098312246
Positive Threshold: 0.53522458443019

Rating 4:
Negative Threshold: 0.07231745697883551
Neutral Threshold: 0.16061943494697925
Positive Threshold: 0.7670631085765427

Rating 5:
Negative Threshold: 0.020599798396413044
Neutral Threshold: 0.10060444753549659
Positive Threshold: 0.8787957529837936



In [25]:
sentiment_columns = ['robertneg', 'robertnut', 'robertnpos']
# sentiment_df = pd.DataFrame(sentiment_scores_list, columns=sentiment_columns)
df = pd.concat([df, sentiment_df], axis=1)

In [29]:
df[df.columns[-3:]]

Unnamed: 0,robertneg,robertnut,robertnpos
0,0.005346,0.041061,0.953593
1,0.005259,0.135149,0.859592
2,0.077736,0.282266,0.639997
3,0.001227,0.006407,0.992366
4,0.001508,0.010719,0.987773
...,...,...,...
2299,0.003550,0.042456,0.953994
2300,0.010585,0.626950,0.362466
2301,0.008338,0.083822,0.907840
2302,0.520648,0.456470,0.022883


In [31]:
df.to_csv('roberta.csv')