In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string 
# Apply matplotlib's built-in 'ggplot' style sheet to figures created after this point.
plt.style.use('ggplot')

import nltk

In [2]:
from nltk.corpus import stopwords

In [3]:
from nltk.stem import WordNetLemmatizer

In [4]:
#Reading and cleaning text
# Read the sample file inside a context manager so the handle is closed as soon
# as the contents are loaded, rather than whenever the garbage collector runs.
with open("SE/sampleText.txt", encoding="utf-8") as sample_file:
    text = sample_file.read()
# Normalise case, then delete every character listed in string.punctuation
# using a translation table that maps each of them to "nothing".
lower_case = text.lower()
cleaned_text = lower_case.translate(str.maketrans('', '', string.punctuation))

In [5]:
#Basic NLTK

In [6]:
# Split the lower-cased, punctuation-free text into word tokens with NLTK,
# then preview the first ten tokens as the cell output.
tokens = nltk.word_tokenize(cleaned_text)
tokens[:10]

['ask',
 'sityush',
 'to',
 'clean',
 'up',
 'his',
 'behavior',
 'than',
 'issue',
 'me']

In [7]:
# Removing Stop Words
# Load the English stop-word list once and keep it as a set: membership tests
# are then constant-time, and the corpus list is not rebuilt for every token.
english_stopwords = set(stopwords.words('english'))
# Keep tokens in their original order, dropping any that are stop words.
final_words = [word for word in tokens if word not in english_stopwords]

In [8]:
# Lemmatization - reduce each remaining token to its WordNet base form.
# NOTE: lemmatize() is called without a `pos` argument, so every token is
# treated as a noun (e.g. plural -> singular). Mappings such as better -> good
# would require passing pos='a'; that is intentionally not done here.
lemmatizer = WordNetLemmatizer()  # build once and reuse for every token
lemma_words = [lemmatizer.lemmatize(word) for word in final_words]

In [9]:
# Part-of-speech tag the full token list (this uses `tokens`, so stop words are
# still included), then preview the first ten (token, tag) pairs.
tagged = nltk.pos_tag(tokens)
tagged[:10]

[('ask', 'VB'),
 ('sityush', 'NN'),
 ('to', 'TO'),
 ('clean', 'VB'),
 ('up', 'RP'),
 ('his', 'PRP$'),
 ('behavior', 'NN'),
 ('than', 'IN'),
 ('issue', 'VB'),
 ('me', 'PRP')]

In [10]:
# VADER Sentiment Scoring

In [11]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Single analyzer instance, reused by every polarity_scores() call in the cells below.
sia = SentimentIntensityAnalyzer()

In [12]:
#EXPERIMENT
# Score the lemmatized, stop-word-filtered tokens, re-joined into one
# space-separated string.
exp=' '.join(lemma_words)
sia.polarity_scores(exp)

{'neg': 0.238, 'neu': 0.495, 'pos': 0.267, 'compound': 0.0772}

In [13]:
# Same scoring on the stop-word-filtered tokens *before* lemmatization, for comparison.
# Note: this rebinds `exp`, replacing the lemmatized string from the previous cell.
exp=' '.join(final_words)
sia.polarity_scores(exp)

{'neg': 0.222, 'neu': 0.505, 'pos': 0.273, 'compound': 0.128}

In [14]:
# Baseline: score the lower-cased, punctuation-free text directly (stop words kept, no lemmatization).
sia.polarity_scores(cleaned_text)

{'neg': 0.148, 'neu': 0.671, 'pos': 0.181, 'compound': 0.128}

In [15]:
#Roberta Pretrained Model

In [16]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [17]:
# Checkpoint identifier passed to from_pretrained(); a plain string literal is
# enough because nothing is interpolated into it.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# Load the tokenizer and the sequence-classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [18]:
# Run for Roberta Model
# The whole file is fed in as one sequence, so cap it explicitly at 512 tokens:
# longer inputs would otherwise exceed the model's position-embedding range and
# fail inside the forward pass. Inputs that already fit are unaffected.
encoded_text = tokenizer(
    cleaned_text,
    return_tensors='pt',
    truncation=True,
    max_length=512,
)
output = model(**encoded_text)
# output[0] holds the classification logits; [0] selects the single sequence in
# the batch. detach() drops the autograd graph before converting to NumPy.
scores = output[0][0].detach().numpy()
# Convert the raw logits into probabilities that sum to 1.
scores = softmax(scores)
# Index order used below: 0 = negative, 1 = neutral, 2 = positive.
scores_dict = {
    'roberta_neg' : scores[0],
    'roberta_neu' : scores[1],
    'roberta_pos' : scores[2]
}
print(scores_dict)

{'roberta_neg': 0.76321274, 'roberta_neu': 0.2285487, 'roberta_pos': 0.008238544}


In [None]:
#Pipeline to check accuracy
# Load your labeled dataset
data = pd.DataFrame({
    "text":["This is good day", "guess that depends on if you want to be on the jury", "I am not feeling good."],
    "label":["positive", "neutral", "negative"]
})
# Map labels to indices (0 = negative, 1 = neutral, 2 = positive, matching the
# index order used when reading the model's scores).
label_map = {"negative": 0, "neutral": 1, "positive": 2}


def predict_label_index(sentence):
    """Return the index of the highest-scoring sentiment class for `sentence`.

    Uses the notebook-level `tokenizer` and `model`. All intermediates stay
    local to this function, so the `text`, `encoded_text`, `output` and
    `scores` variables produced by earlier cells are not overwritten.

    Parameters
    ----------
    sentence : str
        Raw text to classify.

    Returns
    -------
    int
        Position of the largest logit (a key of `label_map`'s value space).
    """
    # Cap the sequence length so an over-long example cannot exceed the
    # model's position-embedding range.
    encoded = tokenizer(sentence, return_tensors='pt', truncation=True, max_length=512)
    logits = model(**encoded)[0][0].detach().numpy()
    # softmax is monotonic, so the argmax of the logits is the argmax of the
    # probabilities; the normalisation step is unnecessary for picking a label.
    return int(logits.argmax())


# Predict and evaluate: count the rows whose predicted index equals the mapped
# true label. Iterating the two columns with zip avoids building a Series per
# row (as iterrows does). An unknown label still raises KeyError via label_map.
correct_predictions = sum(
    predict_label_index(sentence) == label_map[label]
    for sentence, label in zip(data["text"], data["label"])
)

In [None]:
# Calculate accuracy
# Fraction of rows predicted correctly. If the dataset is empty, report NaN
# instead of raising ZeroDivisionError.
total_examples = len(data)
accuracy = correct_predictions / total_examples if total_examples else float("nan")
print(f"Accuracy: {accuracy:.2f}")