In [16]:
import kagglehub
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import re
import emoji
import os

In [17]:
path = kagglehub.dataset_download("kazanova/sentiment140")
print("Path to dataset files:", path)

Path to dataset files: C:\Users\Nicolás\.cache\kagglehub\datasets\kazanova\sentiment140\versions\2


In [18]:
file_path = os.path.join(path, "training.1600000.processed.noemoticon.csv")

columns = ["target", "id", "date", "flag", "user", "text"]
df = pd.read_csv(file_path, encoding="latin1", names=columns)

df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


# EDA

**Cleaning dataset**

To predict the sentiment behind the tweet we don't actually need most of the columns there are on the dataset, we actually only need the text and the tweet itself and the sentiment related to the tweet

In [19]:
df = df.drop(['id', 'date', 'flag', 'user'], axis=1)
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


**Visualazing data**

Lets look into the way data is organized in our dataset

In [20]:
temp = df.groupby('target').count()['text'].reset_index().sort_values(by='text',ascending=False)
temp.style.background_gradient(cmap='Purples')

Unnamed: 0,target,text
0,0,800000
1,4,800000


**Mapping labels**

Original target labels are 4 for positive, 2 for neutral and 0 for negative, but i think it would be easier to interprate and more intuitive if i changed the labels to 2 for positive, 1 for neutral and 0 for negative

In [21]:
# Mapping labels
label_mapping = {0: 0, 2: 1, 4:2}
df['target'] = df['target'].map(label_mapping)

df.tail()

Unnamed: 0,target,text
1599995,2,Just woke up. Having no school is the best fee...
1599996,2,TheWDB.com - Very cool to hear old Walt interv...
1599997,2,Are you ready for your MoJo Makeover? Ask me f...
1599998,2,Happy 38th Birthday to my boo of alll time!!! ...
1599999,2,happy #charitytuesday @theNSPCC @SparksCharity...


**Preprocessing tweets**

We want to clean each tweet so the program can read each tweet without any problems.

In [None]:
def preprocess_tweet(tweet):
    tweet = re.sub(r"@\w+", "", tweet)  # Eliminate mentions
    tweet = re.sub(r"#\w+", "", tweet)  # Eliminate hashtags
    tweet = re.sub(r"http\S+|www\S+", "", tweet)  # Eliminate URL's
    tweet = emoji.demojize(tweet)  # Convert emoji to text
    tweet = re.sub(r"[^a-zA-Z\s]", "", tweet)  # Eliminate special characters
    tweet = tweet.lower().strip()  # Eliminate uppercase and spaces
    return tweet

df['cleaned_text'] = df["text"].apply(preprocess_tweet)

In [None]:
df.head()

Unnamed: 0,target,text,cleaned_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",a thats a bummer you shoulda got david carr o...
1,0,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....",no its not behaving at all im mad why am i her...


**Dividing Data**

Now we divide the data into train and test so we can start training the model.