In [1]:
import pandas as pd

# Load the manually labeled tweets and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="tweets_manual_labeling.csv")
df.head(n=5)


Unnamed: 0,date,tweet,manual_labeling
0,27/3/2023,Are you concerned about ChatGPT potentially re...,1
1,3/4/2025,AI continues to destroy jobs and always will,0
2,26/7/2023,More women than men stand to lose their jobs b...,0
3,12/7/2023,"AI Taking Jobs Could Benefit Economy, Says Mar...",1
4,23/9/2023,ai will destroy certain industries if left unc...,0


In [2]:
# Integer annotation code -> sentiment name (position in the list is the code)
label_mapping = dict(enumerate(["negative", "neutral", "positive"]))

# Create a new column with text labels
df["manual_labels"] = df["manual_labeling"].map(label_mapping)

# Series.map leaves NaN for any code that is not a key of label_mapping.
# Fail fast here instead of carrying unlabeled rows into the evaluation.
unmapped_codes = df.loc[df["manual_labels"].isna(), "manual_labeling"].unique()
if len(unmapped_codes) > 0:
    raise ValueError(
        f"Unmapped values in 'manual_labeling': {unmapped_codes.tolist()}"
    )

# Display to verify
df[["tweet", "manual_labeling", "manual_labels"]].head()


Unnamed: 0,tweet,manual_labeling,manual_labels
0,Are you concerned about ChatGPT potentially re...,1,neutral
1,AI continues to destroy jobs and always will,0,negative
2,More women than men stand to lose their jobs b...,0,negative
3,"AI Taking Jobs Could Benefit Economy, Says Mar...",1,neutral
4,ai will destroy certain industries if left unc...,0,negative


In [3]:
# Keep the first occurrence of each tweet text and renumber the rows from 0.
df = df.drop_duplicates(subset=["tweet"], ignore_index=True)

In [4]:
# Report how many rows are left after de-duplication.
print(f"Remaining tweets after removing duplicates: {df.shape[0]}")

Remaining tweets after removing duplicates: 2960


In [5]:
import re

def clean_tweet(tweet):
    """Normalize a raw tweet into lowercase, single-space-separated text.

    Steps, in order: lowercase, strip URLs, strip @mentions, drop the '#'
    symbol (the hashtag word itself is kept), drop every remaining character
    that is neither a word character nor whitespace, collapse whitespace.

    Args:
        tweet: Raw tweet text. Non-string values (for example None, or the
            float NaN pandas uses for a missing cell) are treated as an
            empty tweet instead of raising AttributeError on ``.lower()``.

    Returns:
        The cleaned text; "" when the input is not a string or when nothing
        survives cleaning.
    """
    if not isinstance(tweet, str):
        # Missing cells arrive as NaN/None and have no string methods.
        return ""
    tweet = tweet.lower()  # Convert to lowercase
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)  # Remove URLs
    tweet = re.sub(r"@\w+", "", tweet)  # Remove mentions
    tweet = re.sub(r"#", "", tweet)  # Remove the hashtag symbol
    tweet = re.sub(r"[^\w\s]", "", tweet)  # Remove punctuation and special characters
    tweet = re.sub(r"\s+", " ", tweet).strip()  # Remove extra spaces
    return tweet

# Derive the normalized text column from the raw tweets
df["clean_tweet"] = df["tweet"].map(clean_tweet)

# Preview raw and cleaned text side by side
df.loc[:, ["tweet", "clean_tweet"]].head()


Unnamed: 0,tweet,clean_tweet
0,Are you concerned about ChatGPT potentially re...,are you concerned about chatgpt potentially re...
1,AI continues to destroy jobs and always will,ai continues to destroy jobs and always will
2,More women than men stand to lose their jobs b...,more women than men stand to lose their jobs b...
3,"AI Taking Jobs Could Benefit Economy, Says Mar...",ai taking jobs could benefit economy says marc...
4,ai will destroy certain industries if left unc...,ai will destroy certain industries if left unc...


In [6]:
# Persist the cleaned, labeled tweets (index column omitted) for model evaluation
df.to_csv(path_or_buf="tweets_model_accuracy.csv", index=False)


In [7]:
# Reload the cleaned dataset from disk.
df = pd.read_csv("tweets_model_accuracy.csv")
# read_csv parses empty fields as NaN, so a tweet whose cleaned text was ""
# comes back as a float. Restore the empty string so later text processing
# always receives str values.
df["clean_tweet"] = df["clean_tweet"].fillna("")

In [8]:
# List the columns present after the CSV round trip, then preview the rows
print(df.columns)
df.head(n=5)

Index(['date', 'tweet', 'manual_labeling', 'manual_labels', 'clean_tweet'], dtype='object')


Unnamed: 0,date,tweet,manual_labeling,manual_labels,clean_tweet
0,27/3/2023,Are you concerned about ChatGPT potentially re...,1,neutral,are you concerned about chatgpt potentially re...
1,3/4/2025,AI continues to destroy jobs and always will,0,negative,ai continues to destroy jobs and always will
2,26/7/2023,More women than men stand to lose their jobs b...,0,negative,more women than men stand to lose their jobs b...
3,12/7/2023,"AI Taking Jobs Could Benefit Economy, Says Mar...",1,neutral,ai taking jobs could benefit economy says marc...
4,23/9/2023,ai will destroy certain industries if left unc...,0,negative,ai will destroy certain industries if left unc...


In [9]:
!pip install -q transformers
!pip install -q torch torchvision torchaudio
!pip install -q emoji

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m110.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m870.0 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

# Load tokenizer and model
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Use the GPU when one is available and fall back to CPU otherwise, rather
# than failing outright on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()  # eval mode: inference only, dropout disabled


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [11]:
# CardiffNLP model labels: class index -> sentiment name
id2label = dict(zip(range(3), ("negative", "neutral", "positive")))


In [12]:
def predict_sentiment(text, max_length=512):
    """Classify a single text with the loaded sentiment model.

    Args:
        text: The text to classify.
        max_length: Maximum number of tokens the tokenizer keeps. The
            tokenizer reports no built-in limit, so ``truncation=True`` has
            no effect unless this is given explicitly. The default of 512
            is chosen to fit the model's 514-entry position-embedding
            table (RoBERTa positions start after the padding index).

    Returns:
        The name in ``id2label`` of the highest-scoring class.
    """
    encoded_input = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=max_length
    ).to(model.device)  # follow the model to whichever device it lives on
    with torch.no_grad():
        output = model(**encoded_input)
        scores = torch.nn.functional.softmax(output.logits, dim=1)
        # One input text -> batch of one; take the arg-max over the class axis.
        predicted_label = torch.argmax(scores, dim=1).item()
    return id2label[predicted_label]


In [13]:
# Classify each cleaned tweet in turn and store the predicted label
df["model_prediction"] = [predict_sentiment(text) for text in df["clean_tweet"]]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [14]:
from sklearn.metrics import accuracy_score, classification_report

# Manual annotation is the reference; the model output is what gets scored
y_true, y_pred = df["manual_labels"], df["model_prediction"]

# Overall share of tweets where the two agree
accuracy = accuracy_score(y_true, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class precision, recall, F1 and support
print(classification_report(y_true, y_pred))


Model Accuracy: 0.83
              precision    recall  f1-score   support

    negative       0.80      0.92      0.86      1040
     neutral       0.84      0.80      0.82      1419
    positive       0.88      0.71      0.78       501

    accuracy                           0.83      2960
   macro avg       0.84      0.81      0.82      2960
weighted avg       0.83      0.83      0.83      2960



In [15]:
# Write the dataset, now including the model predictions, to disk (index column omitted)
df.to_csv(path_or_buf="tweets_model_accuracy_with_predictions.csv", index=False)