In [3]:

import re
import string 
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tqdm import tqdm

In [4]:
import pandas as pd 

In [5]:
# col_names = ['target', 'id', 'date', 'flag', 'user', 'text']

twitter_df = pd.read_csv("../data/raw/twitter-kaggle-sentiment-shrink.csv", 
                         encoding='latin-1', 
                        #  header=None, 
                        #  names=col_names,
                         engine='python')

twitter_df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs AHHH I HOPE YOUR OK!!!
1,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo..."
2,0,2300048954,Tue Jun 23 13:40:11 PDT 2009,NO_QUERY,sammydearr,@TiannaChaos i know just family drama. its la...
3,0,1993474027,Mon Jun 01 10:26:07 PDT 2009,NO_QUERY,Lamb_Leanne,School email won't open and I have geography ...
4,0,2256550904,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,yogicerdito,upper airways problem


In [6]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Clean text for processing

In [7]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [8]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)


In [9]:
tqdm.pandas()

In [10]:
twitter_df['cleaned_text'] = twitter_df['text'].progress_apply(clean_text)

100%|██████████| 50000/50000 [00:08<00:00, 5610.67it/s]


In [11]:
twitter_df['sentiment'] = twitter_df['target'].map({0 : 'negative',4:'positive'})

In [12]:
twitter_df[['text', 'cleaned_text','sentiment']].head()

Unnamed: 0,text,cleaned_text,sentiment
0,@chrishasboobs AHHH I HOPE YOUR OK!!!,ahhh hope ok,negative
1,"@misstoriblack cool , i have no tweet apps fo...",cool tweet app razr,negative
2,@TiannaChaos i know just family drama. its la...,know famili drama lamehey next time u hang kim...,negative
3,School email won't open and I have geography ...,school email wont open geographi stuff revis s...,negative
4,upper airways problem,upper airway problem,negative


In [13]:
# Convert sentiment from string labels to integers
sentiment_map = {'positive': 1, 'negative': -1, 'neutral': 0}
twitter_df['sentiment'] = twitter_df['sentiment'].map(sentiment_map)

# Convert object columns to string if needed
twitter_df['cleaned_text'] = twitter_df['cleaned_text'].astype(str)

In [14]:
twitter_df.to_csv('../data/processed/twitter-kaggle-sentiment-cleaned.csv', index=False)