## In this notebook we perform sentiment analysis on twitter data obtained from - https://www.kaggle.com/code/viathorr/sentiment-analysis-with-bert-and-roberta-79-acc

In [1]:
!pip install evaluate




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Import the necessary modules

In [2]:
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Perform data cleaning and preprocessing

In [3]:
train_path="train.csv"
test_path="test.csv"

train_df=pd.read_csv(train_path, encoding="iso-8859-1")
test_df=pd.read_csv(test_path, encoding="iso-8859-1")

train_df.shape, test_df.shape

((27481, 10), (4815, 9))

In [4]:
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [5]:
test_df.head()

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0


### Drop meaningless columns (any column other than text and sentiment)

In [6]:
cols_name=train_df.columns.to_list()
cols_name

['textID',
 'text',
 'selected_text',
 'sentiment',
 'Time of Tweet',
 'Age of User',
 'Country',
 'Population -2020',
 'Land Area (Km²)',
 'Density (P/Km²)']

In [7]:
drop_cols_train=filter(lambda x : x not in ("text", "sentiment"), cols_name)
train_df=train_df.drop(columns=drop_cols_train)
train_df.head()


Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [8]:
cols_name_test=test_df.columns.to_list()
drop_cols_test=filter(lambda x : x not in ("text", "sentiment"), cols_name_test)
test_df=test_df.drop(columns=drop_cols_test)
test_df.head()

Unnamed: 0,text,sentiment
0,Last session of the day http://twitpic.com/67ezh,neutral
1,Shanghai is also really exciting (precisely -...,positive
2,"Recession hit Veronique Branquinho, she has to...",negative
3,happy bday!,positive
4,http://twitpic.com/4w75p - I like it!!,positive


### Check for null values and fix them

In [9]:
train_df[train_df.isnull().any(axis=1)]

Unnamed: 0,text,sentiment
314,,neutral


In [10]:
test_df[test_df.isnull().any(axis=1)]

Unnamed: 0,text,sentiment
3534,,
3535,,
3536,,
3537,,
3538,,
...,...,...
4810,,
4811,,
4812,,
4813,,


In [11]:
train_df.dropna(inplace=True), test_df.dropna(inplace=True)

(None, None)

In [12]:
train_df[train_df.isnull().any(axis=1)]

Unnamed: 0,text,sentiment


In [13]:
test_df[test_df.isnull().any(axis=1)]

Unnamed: 0,text,sentiment


### Check for duplicate values and deal with them if they exist

In [14]:
train_df_duplicated=train_df.duplicated().sum()
print(train_df_duplicated)

0


In [15]:
test_df_duplicated=test_df.duplicated().sum()
print(test_df_duplicated)

0


### Let's now clean the data so that we dont have any URLs, hashtags or mentions in the data

In [16]:
# Clean text from links, hashtags, etc.
import re
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"#\w+", "", text)  # remove hashtags
    text = re.sub(r"@\w+", "", text)  # remove mentions
    # text = re.sub(r"[^\w\s]", "", text)  # remove special characters
    text = re.sub(r"[^\w\s,.!?'\":;()\-]", "", text)  
    text = text.strip()  # remove extra whitespace

    return text

In [17]:
train_df["text"]=train_df["text"].apply(clean_text)
train_df["text"]

0                       Id have responded, if I were going
1            Sooo SAD I will miss you here in San Diego!!!
2                                my boss is bullying me...
3                           what interview! leave me alone
4        Sons of , why couldnt they put them on the rel...
                               ...                        
27476    wish we could come see u on Denver  husband lo...
27477    Ive wondered about rake to.  The client has ma...
27478    Yay good for both of you. Enjoy the break - yo...
27479                               But it was worth it  .
27480    All this flirting going on - The ATG smiles. Y...
Name: text, Length: 27480, dtype: object

In [18]:
test_df["text"]=test_df["text"].apply(clean_text)
test_df["text"]

0                                 Last session of the day
1       Shanghai is also really exciting (precisely --...
2       Recession hit Veronique Branquinho, she has to...
3                                             happy bday!
4                                           - I like it!!
                              ...                        
3529    its at 3 am, im very tired but i cant sleep  b...
3530    All alone in this old house again.  Thanks for...
3531    I know what you mean. My little dog is sinking...
3532    _sutra what is your next youtube video gonna b...
3533                          - omgssh  ang cute ng bby.!
Name: text, Length: 3534, dtype: object

In [19]:
train_df.head()

Unnamed: 0,text,sentiment
0,"Id have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of , why couldnt they put them on the rel...",negative


In [20]:
train_df=train_df.rename(columns={"sentiment": "label"})
test_df=test_df.rename(columns={"sentiment": "label"})
train_df.head()

Unnamed: 0,text,label
0,"Id have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of , why couldnt they put them on the rel...",negative


### Change the labels from pos, neg and neutral to 0,1,2

In [21]:
train_df["label"] = train_df["label"].str.strip().str.lower()
test_df["label"]  = test_df["label"].str.strip().str.lower()

label2id={
    "negative":0,
    "positive":1,
    "neutral":2
}

#apply the mapping
train_df["label"]=train_df["label"].map(label2id)
test_df["label"]=test_df["label"].map(label2id)

print(train_df["label"].unique())
train_df.head()

[2 0 1]


Unnamed: 0,text,label
0,"Id have responded, if I were going",2
1,Sooo SAD I will miss you here in San Diego!!!,0
2,my boss is bullying me...,0
3,what interview! leave me alone,0
4,"Sons of , why couldnt they put them on the rel...",0


In [22]:
test_df.head()

Unnamed: 0,text,label
0,Last session of the day,2
1,Shanghai is also really exciting (precisely --...,1
2,"Recession hit Veronique Branquinho, she has to...",0
3,happy bday!,1
4,- I like it!!,1


### Convert the text in train and test dataset from text to numerical data

In [23]:
vectorizer=CountVectorizer()
X_train=vectorizer.fit_transform(train_df["text"])
y_train=train_df["label"]
X_test=vectorizer.transform(test_df["text"])
y_test=test_df["label"]

### Train the Naive Bayes classifier on the data

In [24]:
model=MultinomialNB()
model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


### Evaluate the model

In [25]:
y_pred_test=model.predict(X_test)
y_pred_train=model.predict(X_train)
test_accuracy=accuracy_score(y_test, y_pred_test)
train_accuracy=accuracy_score(y_train, y_pred_train)
print("Training accuracy", train_accuracy)
print("Testing accuracy", test_accuracy)

Training accuracy 0.8276928675400291
Testing accuracy 0.6581777023203169
