In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import emoji
from tqdm import tqdm

# Evaluate a pre-trained BERTweet sentiment classifier on the airline tweets
# dataset: load model and data, predict a fixed-size sample in batches, then
# print accuracy, a per-class report and the confusion matrix.

# Load fine-tuned BERTweet sentiment model
model_name = "finiteautomata/bertweet-base-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Run on the GPU when one is available and make inference mode explicit
# (eval() disables dropout so predictions are deterministic).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Load dataset
file_path = "/content/drive/My Drive/Sentimental-Analysis/data/Tweets.csv"
df = pd.read_csv(file_path)

# Dataset label -> class id.
# NOTE(review): assumes the model's output indices follow the same
# negative/neutral/positive order — confirm against model.config.id2label.
label_map = {"negative": 0, "neutral": 1, "positive": 2}
class_names = list(label_map)
class_ids = list(label_map.values())

# Keep only rows whose sentiment is one of the known labels; an unmapped value
# would become NaN and make the sklearn metrics fail.
label_ids = df['airline_sentiment'].map(label_map)
has_label = label_ids.notna()
texts = df.loc[has_label, 'text'].astype(str).tolist()
labels = label_ids[has_label].astype(int).tolist()

preds = []
true = []

# Predict only the first 5 batches (5*32=160 samples) for speed, clamped to the
# number of rows actually available so no empty batch reaches the tokenizer.
batch_size = 32
max_samples = 160
n_samples = min(max_samples, len(texts))
if n_samples == 0:
    raise ValueError("No rows with a recognised airline_sentiment label to evaluate")

for i in tqdm(range(0, n_samples, batch_size)):
    batch_end = min(i + batch_size, n_samples)
    # Replace emoji with their textual ":name:" form before tokenising.
    batch_texts = [emoji.demojize(t) for t in texts[i:batch_end]]
    # Explicit max_length so truncation does not depend on the tokenizer
    # config carrying a usable model_max_length (BERTweet uses 128 tokens).
    inputs = tokenizer(
        batch_texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    ).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        batch_preds = torch.argmax(outputs.logits, dim=1).tolist()
    preds.extend(batch_preds)
    true.extend(labels[i:batch_end])

# Evaluate. Passing `labels` pins the class order and keeps the report and the
# 3x3 matrix well-formed even when a class is absent from the evaluated slice.
acc = accuracy_score(true, preds)
report = classification_report(true, preds, labels=class_ids, target_names=class_names)
cm = confusion_matrix(true, preds, labels=class_ids)

print("✅ Accuracy:", round(acc * 100, 2), "%")
print("\n📊 Classification Report:\n", report)
print("🧾 Confusion Matrix:\n", cm)




tokenizer_config.json:   0%|          | 0.00/338 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/949 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]


  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:08<00:33,  8.42s/it][A
 40%|████      | 2/5 [00:16<00:24,  8.21s/it][A
 60%|██████    | 3/5 [00:24<00:16,  8.11s/it][A
 80%|████████  | 4/5 [00:32<00:07,  7.91s/it][A
100%|██████████| 5/5 [00:36<00:00,  7.29s/it]

✅ Accuracy: 77.5 %

📊 Classification Report:
               precision    recall  f1-score   support

    negative       0.82      0.83      0.83        60
     neutral       0.77      0.61      0.68        54
    positive       0.73      0.89      0.80        46

    accuracy                           0.78       160
   macro avg       0.77      0.78      0.77       160
weighted avg       0.78      0.78      0.77       160

🧾 Confusion Matrix:
 [[50  8  2]
 [ 8 33 13]
 [ 3  2 41]]





In [9]:
import pandas as pd
# Load the dataset in this cell so it does not rely on a `df` left behind in
# the kernel by another cell, then list the available columns.
file_path = "/content/drive/My Drive/Sentimental-Analysis/data/Tweets.csv"
df = pd.read_csv(file_path)
print(df.columns)


Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')


In [1]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem. The dataset path read elsewhere
# in this notebook lives under this mount point, so this must run first.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Mounted at /content/drive
