In [None]:
import pandas as pd
import torch
from transformers import pipeline
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

# Constants
DATA_PATH = r'C:\Users\hieud\Documents\draft thesis\thesis\src\data\merged_clean_df.csv'

# Load dataset
print("📂 Loading dataset...")
df = pd.read_csv(DATA_PATH)
df.dropna(inplace=True)

# Remove neutral class & keep only 100 rows for testing
df = df[df['Sentiment'] != 1].head(100)
print(f"✅ Dataset loaded! Shape: {df.shape}")
print(df['Sentiment'].value_counts())  # Show class distribution


  from .autonotebook import tqdm as notebook_tqdm


📂 Loading dataset...
✅ Dataset loaded! Shape: (100, 2)
Sentiment
2    83
0    17
Name: count, dtype: int64


In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"

# Load Tokenizer & Model
print("🧠 Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Move model to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
print(f"🚀 Model is on {device} and ready for inference!")


🧠 Loading model and tokenizer...
🚀 Model is on cuda and ready for inference!


In [3]:
BATCH_SIZE = 64 

# Convert dataset to Hugging Face format
dataset = Dataset.from_pandas(df[['Text']])
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=0)

# Sentiment Analysis Pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0)
print("✅ Sentiment pipeline initialized!")


Device set to use cuda:0


✅ Sentiment pipeline initialized!


In [7]:
LABEL_MAPPING = {'LABEL_0': 0, 'LABEL_2': 2}  # Only map negative (0) and positive (2)
predictions = []

print("🚀 Running inference on 100 rows...")
for batch in tqdm(dataloader, desc="🔎 Predicting"):
    texts = [t for t in batch['Text'] if isinstance(t, str) and t.strip()]
    if not texts:
        continue  # Skip empty texts

    batch_preds = sentiment_pipeline(texts, truncation=True, max_length=128)

    # ✅ FILTER OUT NEUTRAL (LABEL_1) PREDICTIONS
    batch_preds = [pred for pred in batch_preds if pred['label'] in LABEL_MAPPING]  # Keep only 0 and 2

    predictions.extend(batch_preds)

print("✅ Inference completed!")

🚀 Running inference on 100 rows...


🔎 Predicting:   0%|          | 0/2 [00:00<?, ?it/s]

🔎 Predicting: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]

✅ Inference completed!





In [8]:
# Ensure predictions are populated
if not predictions:
    raise ValueError("⚠️ Predictions list is empty. Ensure inference ran correctly!")

# Convert predictions to DataFrame
df = df.iloc[:len(predictions)]  # Ensure matching sizes
df['Predicted_Sentiment'] = [LABEL_MAPPING.get(pred['label'], -1) for pred in predictions]
df['Confidence_Score'] = [pred['score'] for pred in predictions]

# Show sample results
print(df[['Text', 'Sentiment', 'Predicted_Sentiment', 'Confidence_Score']].head(10))




                                                 Text  Sentiment  \
1   nz 50 retailers dont even contactless credit c...          0   
2   forever acknowledge channel help lessons ideas...          2   
3   whenever go place doesnt take apple pay doesnt...          0   
4   apple pay convenient secure easy use used kore...          2   
6   got apple pay south africa 20202021 widely acc...          2   
8   united states abundance retailers accept apple...          2   
10  wow really went town psu test rack that's seri...          2   
11  lab exciting thing seen reallly going shake qu...          2   
12  linus engineer love lmg content across channel...          2   
13  used time linus smartest guy room video clearl...          2   

    Predicted_Sentiment  Confidence_Score  
1                     0          0.661250  
2                     2          0.845339  
3                     0          0.906098  
4                     2          0.653632  
6                     2        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Predicted_Sentiment'] = [LABEL_MAPPING.get(pred['label'], -1) for pred in predictions]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Confidence_Score'] = [pred['score'] for pred in predictions]


In [None]:
#reading data again 
df = pd.read_csv(DATA_PATH)
#drop neutral class
df = df[df['Sentiment'] != 1]
#saving data after removing neutral class
df.to_csv(r'C:\Users\hieud\Documents\draft thesis\thesis\src\data\latest.csv', index=False)
df

Unnamed: 0,Text,Sentiment
1,nz 50 retailers dont even contactless credit c...,0
2,forever acknowledge channel help lessons ideas...,2
3,whenever go place doesnt take apple pay doesnt...,0
4,apple pay convenient secure easy use used kore...,2
6,got apple pay south africa 20202021 widely acc...,2
...,...,...
1618903,woke school best feeling ever,2
1618904,thewdbcom cool hear old walt interviews,2
1618905,ready mojo makeover ask details,2
1618906,happy 38th birthday boo alll time tupac amaru ...,2
