In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kazanova/sentiment140")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/kazanova/sentiment140?dataset_version_number=2...


100%|██████████| 80.9M/80.9M [00:01<00:00, 43.3MB/s]

Extracting files...





Path to dataset files: /Users/marius/.cache/kagglehub/datasets/kazanova/sentiment140/versions/2


In [5]:
import os

print("Path to dataset files:", path)
print("Files in dataset directory:", os.listdir(path))

Path to dataset files: /Users/marius/.cache/kagglehub/datasets/kazanova/sentiment140/versions/2
Files in dataset directory: ['training.1600000.processed.noemoticon.csv']


In [11]:
import pandas as pd

file_path = os.path.join(path, "training.1600000.processed.noemoticon.csv")

columns = ['target', 'id', 'date', 'flag', 'user', 'text']

full_data = pd.read_csv(file_path, encoding='latin-1', names=columns)

In [19]:
#we are interested in the labels and tweets only

data = full_data[['target', 'text']].copy()
data.rename(columns={"target": "label"}, inplace=True)

print(data[:10])

   label                                               text
0      0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1      0  is upset that he can't update his Facebook by ...
2      0  @Kenichan I dived many times for the ball. Man...
3      0    my whole body feels itchy and like its on fire 
4      0  @nationwideclass no, it's not behaving at all....
5      0                      @Kwesidei not the whole crew 
6      0                                        Need a hug 
7      0  @LOLTrish hey  long time no see! Yes.. Rains a...
8      0               @Tatiana_K nope they didn't have it 
9      0                          @twittera que me muera ? 


In [22]:
import nltk
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


False

In [30]:
# Preprocessing function without parallelization (on my hardware took ~3min)
def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())  # Lowercase and tokenize

    # Remove punctuation and non-alphabetic characters
    tokens = [re.sub(r"[^a-zA-Z]", "", token) for token in tokens]
    tokens = [token for token in tokens if token]  # Remove empty strings

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return ' '.join(lemmatized_tokens) 

data['processed_text'] = data['text'].apply(preprocess_text)
new_data = data[['label', 'processed_text']]
print(new_data.head())


   label                                     processed_text
0      0  switchfoot http twitpiccomyzl awww bummer shou...
1      0  upset ca nt update facebook texting might cry ...
2      0  kenichan dived many time ball managed save res...
3      0                    whole body feel itchy like fire
4      0             nationwideclass behaving mad ca nt see


In [32]:
# Preprocessing function with parallelization (on my hardware took ~2min)
# Roughly a 33% speed increase

from joblib import Parallel, delayed
import nltk

nltk.download('stopwords')

def preprocess_text(text):
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    import re

    # Tokenize text
    tokens = word_tokenize(text.lower())  # Lowercase and tokenize

    # Remove punctuation and non-alphabetic characters
    tokens = [re.sub(r"[^a-zA-Z]", "", token) for token in tokens]
    tokens = [token for token in tokens if token]  # Remove empty strings

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return ' '.join(lemmatized_tokens)  # Return processed text as a single string

def preprocess_and_return(row):
    return row['label'], preprocess_text(row['text'])

processed_data = Parallel(n_jobs=-1)(delayed(preprocess_and_return)(row) for index, row in data.iterrows())
new_data = pd.DataFrame(processed_data, columns=['label', 'processed_text'])
print(new_data.head())

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


   label                                     processed_text
0      0  switchfoot http twitpiccomyzl awww bummer shou...
1      0  upset ca nt update facebook texting might cry ...
2      0  kenichan dived many time ball managed save res...
3      0                    whole body feel itchy like fire
4      0             nationwideclass behaving mad ca nt see
