In [1]:
import pandas as pd
datapath = "../Datasets/twitter_dataset/twitter_training.csv"
df = pd.read_csv(datapath)
df.shape

(74681, 4)

In [2]:
df = df.head(1000) #selecting only first 1000 rows
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [3]:
df.columns = ['tweet_id','entity','sentiment','tweet_content']

In [4]:
df.head()

Unnamed: 0,tweet_id,entity,sentiment,tweet_content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [5]:
df['tweet_content'][7]

"So I spent a few hours doing something for fun... If you don't know I'm a HUGE @ Borderlands fan and Maya is one of my favorite characters."

In [6]:
df.isnull().sum() #finding the number of null values

tweet_id         0
entity           0
sentiment        0
tweet_content    4
dtype: int64

In [7]:
df.dropna(inplace = True) #drop null values


In [8]:
df.duplicated().sum() #sum of duplicate values


np.int64(31)

In [9]:
# Identify the duplicate rows (keeping all copies)
duplicate_rows = df[df.duplicated(keep=False)]

# Display the columns that are duplicating the content
print(duplicate_rows[['tweet_id', 'tweet_content']].sort_values(by='tweet_content'))

     tweet_id                                      tweet_content
143      2425                                    "What a bitch!"
145      2425                                    "What a bitch!"
515      2489  . :: Ah yes. A very very old image of demon Lo...
518      2489  . :: Ah yes. A very very old image of demon Lo...
61       2411                                               .. [
63       2411                                               .. [
592      2503                                             ......
591      2503                                             ......
623      2509  4 player local Borderlands 3. Such a great wee...
624      2509  4 player local Borderlands 3. Such a great wee...
689      2522  A Noob Trys To Make A Salvador Build Borderlan...
692      2522  A Noob Trys To Make A Salvador Build Borderlan...
967      2572                            AGAIN @ Borderlands WTF
966      2572                            AGAIN @ Borderlands WTF
702      2524  Beat DLC2 

In [10]:
df = df.drop_duplicates(subset=['tweet_content'], keep='first') #drop the duplicates
print(df.duplicated().sum()) #0 means no duplicate values
print(df.dropna().count()) #returns the count of unique values

0
tweet_id         959
entity           959
sentiment        959
tweet_content    959
dtype: int64


now there is unique set of 959 rows on which we can perform some preprocessing operation

**Converting to lower case**

In [11]:
df['tweet_content'] = df["tweet_content"].str.lower()

In [12]:
df['tweet_content'][7]

"so i spent a few hours doing something for fun... if you don't know i'm a huge @ borderlands fan and maya is one of my favorite characters."

remove html tags

In [13]:
import re
def remove_html_tags(text):
    pattern = re.compile("<.*?>")
    return pattern.sub(r" ",text)

In [14]:
df['tweet_content'] = df["tweet_content"].apply(remove_html_tags)

In [15]:
df['tweet_content'][7]

"so i spent a few hours doing something for fun... if you don't know i'm a huge @ borderlands fan and maya is one of my favorite characters."

**Remove URL**

In [16]:
def remove_url(text):
    pattern = re.compile(r"https?://\S+|www\.\S+")
    return pattern.sub(r"",text)

In [17]:
df['tweet_content'] = df['tweet_content'].apply(remove_url)

In [18]:
df.head()

Unnamed: 0,tweet_id,entity,sentiment,tweet_content
0,2401,Borderlands,Positive,i am coming to the borders and i will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


**Remove Punctuation**

In [19]:
import string,time
exclude = string.punctuation
def remove_punc1(text):
    return text.translate(str.maketrans('','',exclude))

In [20]:
df['tweet_content'] = df['tweet_content'].apply(remove_punc1)

In [21]:
df['tweet_content'][7]

'so i spent a few hours doing something for fun if you dont know im a huge  borderlands fan and maya is one of my favorite characters'

In [22]:
chat_words = {
    'AFAIK':'As Far As I Know',
    'AFK':'Away From Keyboard',
    'ASAP':'As Soon As Possible',
} 


{
    "FYI": "For Your Information",
    "ASAP": "As Soon As Possible",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "OMG": "Oh My God",
    "IMO": "In My Opinion",
    "LOL": "Laugh Out Loud",
    "TTYL": "Talk To You Later",
    "GTG": "Got To Go",
    "TTYT": "Talk To You Tomorrow",
    "IDK": "I Don't Know",
    "TMI": "Too Much Information",
    "IMHO": "In My Humble Opinion",
    "ICYMI": "In Case You Missed It",
    "AFAIK": "As Far As I Know",
    "BTW": "By The Way",
    "FAQ": "Frequently Asked Questions",
    "TGIF": "Thank God It's Friday",
    "FYA": "For Your Action",
    "ICYMI": "In Case You Missed It",
}

{'FYI': 'For Your Information',
 'ASAP': 'As Soon As Possible',
 'BRB': 'Be Right Back',
 'BTW': 'By The Way',
 'OMG': 'Oh My God',
 'IMO': 'In My Opinion',
 'LOL': 'Laugh Out Loud',
 'TTYL': 'Talk To You Later',
 'GTG': 'Got To Go',
 'TTYT': 'Talk To You Tomorrow',
 'IDK': "I Don't Know",
 'TMI': 'Too Much Information',
 'IMHO': 'In My Humble Opinion',
 'ICYMI': 'In Case You Missed It',
 'AFAIK': 'As Far As I Know',
 'FAQ': 'Frequently Asked Questions',
 'TGIF': "Thank God It's Friday",
 'FYA': 'For Your Action'}

**Chat conversion**

In [23]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            converted_word = chat_words[w.upper()].lower()
            new_text.append(converted_word)
        else:
            new_text.append(w)
    return " ".join(new_text)

In [24]:
df['tweet_content'] = df['tweet_content'].apply(chat_conversion)


In [25]:
df.head()

Unnamed: 0,tweet_id,entity,sentiment,tweet_content
0,2401,Borderlands,Positive,i am coming to the borders and i will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you all
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


**Stopwords removal**

In [26]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\megha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [28]:
len(stopwords.words('english'))

198

In [29]:
def remove_stopwords(text):
    new_text = []
    for w in text.split():
        if w in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(w)
    x = new_text[:]
    new_text.clear()
    return ' '.join(x)

In [30]:
df['tweet_content_stopwords']=df['tweet_content'].apply(remove_stopwords)

In [31]:
df.head()

Unnamed: 0,tweet_id,entity,sentiment,tweet_content,tweet_content_stopwords
0,2401,Borderlands,Positive,i am coming to the borders and i will kill you...,coming borders kill
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you all,im getting borderlands kill
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming borderlands murder
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting borderlands 2 murder
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting borderlands murder


**Handle Emoji's**

In [32]:
import emoji
def demogize_emoji(text):
    return emoji.demojize(str(text))

In [33]:
text = "Loved the movie. It was ðŸ˜˜ðŸ˜˜"
demogize_emoji(text)

'Loved the movie. It was :face_blowing_a_kiss::face_blowing_a_kiss:'

**Tokenization**

In [34]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\megha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [35]:
from nltk.tokenize import word_tokenize
sent1 = "I am going to Jacksonville"
print(word_tokenize(sent1))

['I', 'am', 'going', 'to', 'Jacksonville']


In [36]:
df.head()

Unnamed: 0,tweet_id,entity,sentiment,tweet_content,tweet_content_stopwords
0,2401,Borderlands,Positive,i am coming to the borders and i will kill you...,coming borders kill
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you all,im getting borderlands kill
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming borderlands murder
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting borderlands 2 murder
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting borderlands murder


let's use Spacy instead of NLTK for tokenization as the latter one is slower.

In [37]:
import spacy
nlp = spacy.load('en_core_web_sm')
def spacy_tokenize(text):
    doc = nlp(str(text)) #make sure you do not have null values before this.
    return [token.text for token in doc]

In [38]:
df['tokens'] = df['tweet_content'].apply(spacy_tokenize)
df.head()

Unnamed: 0,tweet_id,entity,sentiment,tweet_content,tweet_content_stopwords,tokens
0,2401,Borderlands,Positive,i am coming to the borders and i will kill you...,coming borders kill,"[i, am, coming, to, the, borders, and, i, will..."
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you all,im getting borderlands kill,"[i, m, getting, on, borderlands, and, i, will,..."
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming borderlands murder,"[i, m, coming, on, borderlands, and, i, will, ..."
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting borderlands 2 murder,"[i, m, getting, on, borderlands, 2, and, i, wi..."
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting borderlands murder,"[i, m, getting, into, borderlands, and, i, can..."


**Lemmatization**

In [39]:
import spacy
nlp = spacy.load('en_core_web_sm')
def lemmatization(text):
    doc = nlp(str(text))
    return [token.lemma_.lower() for token in doc]

In [40]:
df['lemmatized_tokens'] = df['tweet_content'].apply(lemmatization)
df.head()

Unnamed: 0,tweet_id,entity,sentiment,tweet_content,tweet_content_stopwords,tokens,lemmatized_tokens
0,2401,Borderlands,Positive,i am coming to the borders and i will kill you...,coming borders kill,"[i, am, coming, to, the, borders, and, i, will...","[i, be, come, to, the, border, and, i, will, k..."
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you all,im getting borderlands kill,"[i, m, getting, on, borderlands, and, i, will,...","[i, m, get, on, borderland, and, i, will, kill..."
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming borderlands murder,"[i, m, coming, on, borderlands, and, i, will, ...","[i, m, come, on, borderland, and, i, will, mur..."
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting borderlands 2 murder,"[i, m, getting, on, borderlands, 2, and, i, wi...","[i, m, get, on, borderland, 2, and, i, will, m..."
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting borderlands murder,"[i, m, getting, into, borderlands, and, i, can...","[i, m, get, into, borderland, and, i, can, mur..."


Let's proceed with simple sentiment analysis using distilbert from hugging face.

In [61]:
#convert the lemmatized tokens to single string
df['final_text'] = df['lemmatized_tokens'].apply(lambda x: ' '.join(x))

In [66]:
# zero shot classification
from transformers import pipeline

# Load the zero-shot pipeline
zero_shot_pipeline = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli"
)

# Define your exact four labels
candidate_labels = ['Positive', 'Negative', 'Neutral', 'Irrelevant']

# Process the data
# The input must be a dictionary containing the list of texts and the labels
results = zero_shot_pipeline(
    df['final_text'].tolist(), 
    candidate_labels=candidate_labels,
    multi_label=False # Set to False for single-choice classification
)

# The output structure is different; you'd extract the 'prediction' and 'score'.
df['new_label'] = [res['labels'][0] for res in results]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


ValueError: You must include at least one label and at least one sequence.

In [None]:
# this is binary classification - either positive or negative
from transformers import pipeline

# Load the model locally (one-time setup)
local_sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest"
)

# Convert your final text column to a list
tweets_for_analysis = df['final_text'].tolist()

# Run the pipeline in one bulk operation
local_results = local_sentiment_pipeline(tweets_for_analysis)

# Extract results (same structure as API results)
df['sentiment_label'] = [res['label'] for res in local_results]
df['sentiment_score'] = [res['score'] for res in local_results]

Device set to use cpu


In [63]:
# ADD THIS LINE to see the count of positive and negative labels
print("\n--- Sentiment Distribution ---")
print(df['sentiment_label'].value_counts())


--- Sentiment Distribution ---
sentiment_label
NEGATIVE    518
POSITIVE    441
Name: count, dtype: int64


In [65]:
# let's validate with the data.
print(df['sentiment'].value_counts())

sentiment
Positive      401
Neutral       271
Negative      185
Irrelevant    102
Name: count, dtype: int64


In [None]:
#if you want to use the hugging face api instead,
import os
import requests
import json
import time

API_URL = "https://router.huggingface.co/hf-inference/models/distilbert/distilbert-base-uncased-finetuned-sst-2-english"
HF_TOKEN = os.environ['HF_TOKEN']
headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json"
    }

def check_api_status():
    payload = {"inputs": "This tweet is awesome."}
    response = requests.post(API_URL, headers=headers, json=payload, timeout=10)
    return response.status_code
      

# Run the check
check_api_status()