In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
# Load the review dataset from a CSV file in the working directory.
df=pd.read_csv('Review.csv')

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the full dataset.
df.shape

In [None]:
# Keep only the first 777 rows; every later cell that reads `df` sees this subset.
# NOTE(review): 777 is a magic number — presumably chosen to keep runtime manageable; confirm.
df=df.head(777)

In [None]:
# Preview the first three rows.
df.head(3)

In [None]:
# Take the review text at index label 5 as the running example for the demo cells below.
example=df['Text'][5]
example

In [None]:
# Fetch the 'punkt' resource, then split the example review into tokens.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab') — confirm against the installed version.
nltk.download('punkt')
tokens = nltk.word_tokenize(example)
tokens[:15]  # show the first 15 tokens

In [None]:
# Fetch the tagger resource and tag every token; show the first 10 (token, tag) pairs.
nltk.download('averaged_perceptron_tagger')
tagged = nltk.pos_tag(tokens)
tagged[:10]

In [None]:
# Fetch the chunker resources, chunk the tagged tokens and pretty-print the resulting tree.
nltk.download('maxent_ne_chunker')
nltk.download('words')
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
# Fetch the VADER lexicon and build the analyzer reused by every VADER call below.
nltk.download('vader_lexicon')
pk = SentimentIntensityAnalyzer()

In [None]:
# Sanity check on an obviously positive sentence.
pk.polarity_scores('I am so happy!')


In [None]:
# Sanity check on an obviously negative sentence.
pk.polarity_scores('This is the worst thing ever.')


In [None]:
# Score dictionary for the example review selected earlier.
pk.polarity_scores(example)


In [None]:
# Score every review in df with VADER, collecting the results keyed by review Id
res = {}
for index, row in tqdm(df.iterrows(), total=len(df)):
    myid, text = row['Id'], row['Text']
    # Anything that is not a string (e.g. a missing value) is reported and left out
    if not isinstance(text, str):
        print(f"Warning: Skipping row {myid} due to invalid text.")
        continue
    res[myid] = pk.polarity_scores(text)

In [None]:
# Turn the {Id: scores} mapping into one row per review, expose the Id as a
# column, and attach the original review columns.
# The join key is spelled out: without `on`, merge joins on every column name the
# two frames share, so a review column that happened to match a score name would
# silently become part of the key.
vaders = (
    pd.DataFrame(res).T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left', on='Id')
)

In [None]:
# Preview the first two rows of the merged VADER scores + review data.
vaders.head(2)

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Identifier of the pretrained sentiment checkpoint; the tokenizer and the
# classification model are both loaded from it so they stay in sync.
# (Plain string: the original f-string had no placeholders.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# VADER scores for the example review, shown again — presumably for side-by-side
# comparison with the RoBERTa scores computed in the next cell.
pk.polarity_scores(example)

In [None]:
# Run the example review through the loaded checkpoint step by step
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
# First element of the model output, first sequence, converted to probabilities
scores = softmax(output[0][0].detach().numpy())
scores_dict = dict(
    roberta_neg=scores[0],
    roberta_neu=scores[1],
    roberta_pos=scores[2],
)
scores_dict

In [None]:
def polarity_scores_roberta(example, max_length=512):
    """Score one text with the notebook-level RoBERTa `tokenizer` and `model`.

    The input is truncated to ``max_length`` tokens. Without truncation, a review
    longer than the model's position limit makes the forward pass raise a
    RuntimeError, and the calling loop then drops that review. The limit is
    passed explicitly because a tokenizer may not carry a usable default
    (NOTE(review): 512 is the usual RoBERTa-base limit — confirm for this checkpoint).

    Args:
        example: The text to score.
        max_length: Maximum number of tokens fed to the model. Defaults to 512.

    Returns:
        dict with keys ``roberta_neg``, ``roberta_neu`` and ``roberta_pos`` holding
        the softmax of the first three entries of the model output for the text.

    Note:
        ``tokenizer`` and ``model`` are looked up when the function is called, so
        it uses whatever those notebook-level names are bound to at that moment.
    """
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    output = model(**encoded_text)
    # output[0] is the first element of the model output; [0] selects the single sequence
    scores = softmax(output[0][0].detach().numpy())
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

In [None]:
# Score every review with both VADER and RoBERTa, keyed by review Id.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    # Read the fields before the try block so `myid` is always the current row's
    # Id when the except handler reports it.
    text = row['Text']
    myid = row['Id']
    # Same guard as the VADER-only loop: a non-string text (e.g. a missing value)
    # would raise an error that `except RuntimeError` below does not catch.
    if not isinstance(text, str):
        print(f"Warning: Skipping row {myid} due to invalid text.")
        continue
    try:
        vader_result = pk.polarity_scores(text)
        # Prefix the VADER keys so they cannot clash with the roberta_* keys
        vader_result_rename = {
            f"vader_{key}": value for key, value in vader_result.items()
        }
        roberta_result = polarity_scores_roberta(text)
        res[myid] = {**vader_result_rename, **roberta_result}
    except RuntimeError:
        # Best effort: report the failing review and continue with the next one
        print(f'Broke at id {myid}')

In [None]:
# Turn the {Id: combined scores} mapping into one row per review, expose the Id
# as a column, and attach the original review columns.
# The join key is spelled out: without `on`, merge joins on every column name the
# two frames share, which would silently change the key if names ever overlapped.
results_df = (
    pd.DataFrame(res).T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left', on='Id')
)

In [None]:
# List the columns available after merging the model scores with the review data.
results_df.columns


In [None]:
# Text of the Score == 1 review with the highest roberta_pos value
(
    results_df[results_df['Score'] == 1]
    .sort_values('roberta_pos', ascending=False)['Text']
    .values[0]
)

In [None]:
# Text of the Score == 1 review with the highest vader_pos value
(
    results_df[results_df['Score'] == 1]
    .sort_values('vader_pos', ascending=False)['Text']
    .values[0]
)

In [None]:
# Text of the Score == 5 review with the highest roberta_neg value
(
    results_df[results_df['Score'] == 5]
    .sort_values('roberta_neg', ascending=False)['Text']
    .values[0]
)

In [None]:
# Text of the Score == 5 review with the highest vader_neg value
(
    results_df[results_df['Score'] == 5]
    .sort_values('vader_neg', ascending=False)['Text']
    .values[0]
)

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
import torch
from datasets import Dataset

In [None]:
# Load the checkpoint that the following cells fine-tune.
# NOTE: this rebinds the notebook-level `tokenizer` and `model` names that
# polarity_scores_roberta reads at call time, so calling that function after
# this cell has run would use this checkpoint instead of the RoBERTa one.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
def tokenize_function(examples):
    """Tokenize the "Text" entries of `examples` with the notebook-level tokenizer.

    Sequences are padded to the maximum length and truncated when too long.
    Returns the tokenizer's output unchanged.
    """
    texts = examples["Text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Build the fine-tuning dataset with binary labels.
# The checkpoint being fine-tuned is used with two classes (0 = NEGATIVE,
# 1 = POSITIVE, see lets_predict), so the raw Score values cannot serve as class
# ids directly: any value outside {0, 1} is out of range for a 2-way head.
# NOTE(review): assumes Score is a 1-5 star rating; 3-star reviews are treated as
# neutral and dropped, 4-5 become POSITIVE (1), 1-2 become NEGATIVE (0) — confirm.
labeled_df = results_df.loc[results_df['Score'] != 3, ['Text', 'Score']].copy()
labeled_df['Score'] = (labeled_df['Score'] >= 4).astype(int)

# preserve_index=False keeps the filtered pandas index out of the dataset columns
dataset = Dataset.from_pandas(labeled_df, preserve_index=False)
dataset = dataset.map(tokenize_function, batched=True)
dataset = dataset.rename_column("Score", "labels")
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# Hold out part of the data so the per-epoch evaluation is measured on examples
# the model was not trained on (evaluating on the training set itself hides
# overfitting). The seed makes the split reproducible.
split = dataset.train_test_split(test_size=0.2, seed=42)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_dir="./logs",
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
)


In [None]:
# Run the fine-tuning loop with the Trainer configured above.
trainer.train()

# Save the model and the tokenizer into the same directory; lets_predict loads
# both back from this path.
model.save_pretrained("./fine_tuned_sentiment")
tokenizer.save_pretrained("./fine_tuned_sentiment")

In [None]:
from transformers import pipeline

# Build a "sentiment-analysis" pipeline with the library's default model.
# NOTE(review): sent_pipeline is not referenced anywhere else in the visible
# notebook — confirm it is still needed.
sent_pipeline = pipeline("sentiment-analysis")

In [None]:
# (tokenizer, model) pairs already loaded from disk, keyed by directory.
# Re-running this cell resets the cache, so a freshly saved model is picked up.
_FINE_TUNED_CACHE = {}


def _load_fine_tuned(model_dir):
    """Return the (tokenizer, model) pair saved in `model_dir`, loading it only once."""
    if model_dir not in _FINE_TUNED_CACHE:
        _FINE_TUNED_CACHE[model_dir] = (
            AutoTokenizer.from_pretrained(model_dir),
            AutoModelForSequenceClassification.from_pretrained(model_dir),
        )
    return _FINE_TUNED_CACHE[model_dir]


def lets_predict(s, model_dir="./fine_tuned_sentiment"):
    """Predict the sentiment label of one sentence with the fine-tuned model.

    The tokenizer and model are read from disk on the first call only and reused
    afterwards; previously every call reloaded both.

    Args:
        s: The sentence to classify.
        model_dir: Directory holding the saved tokenizer and model. Defaults to
            the path the training cell saves to.

    Returns:
        "NEGATIVE" if the highest-scoring class is 0, "POSITIVE" if it is 1.

    Raises:
        KeyError: If the predicted class index is neither 0 nor 1.
    """
    tokenizer_fine, model_fine = _load_fine_tuned(model_dir)

    inputs = tokenizer_fine(s, return_tensors="pt", truncation=True, padding=True)
    # Inference only: skip gradient tracking
    with torch.no_grad():
        logits = model_fine(**inputs).logits
    predicted_class = torch.argmax(logits, dim=1).item()

    label_mapping = {0: "NEGATIVE", 1: "POSITIVE"}
    return label_mapping[predicted_class]

# Read one sentence from the user and print the label predicted for it.
s = input("Enter a sentence: ")
print(lets_predict(s))

In [None]:
import pickle

In [None]:
# Persist the VADER analyzer to disk. The context manager closes the file even
# if pickling fails; the bare open() call previously left the handle open.
with open('pk.pkl', 'wb') as f:
    pickle.dump(pk, f)