# Load libraries

In [1]:
import tqdm
import re
import nltk

import pandas as pd

from functools import lru_cache
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.stats import mode

In [None]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

In [3]:
# Create a list of unique stop words
stop_words = set(stopwords.words('english'))

In [None]:
# Load file from https://www.kaggle.com/datasets/kazanova/sentiment140
data = pd.read_csv("path_to_file_from_link_above", header=None, usecols=[0, 5], names=['target', 'text'])
data

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [6]:
# Removing garbage from tweets
only_char = lambda x: " ".join(re.sub(r"[^a-zA-Z']", " ", x).split()).lower()
data['text'] = data['text'].apply(only_char)
print(data['text'].head(10))

0    switchfoot http twitpic com y zl awww that's a...
1    is upset that he can't update his facebook by ...
2    kenichan i dived many times for the ball manag...
3       my whole body feels itchy and like its on fire
4    nationwideclass no it's not behaving at all i'...
5                          kwesidei not the whole crew
6                                           need a hug
7    loltrish hey long time no see yes rains a bit ...
8                   tatiana k nope they didn't have it
9                                twittera que me muera
Name: text, dtype: object


In [7]:
# In this case, we select the token of the part of speech as the most common for the current word
def safe_get_pos(token):
    pos_synset_list = list()
    synsets = wordnet.synsets(token)
    for synset in synsets:
        pos_synset_list.append(synset.pos())
    return mode(pos_synset_list)[0][0] if len(synsets) > 0 else wordnet.NOUN

In [8]:
# Initialize the lemmatizer object
lemmatizer = WordNetLemmatizer()

In [9]:
# We will cache the results of the lemmatization function for each unique word.
@lru_cache(maxsize=None)
def cached_lemmatize(token):
    return lemmatizer.lemmatize(token, pos=safe_get_pos(token))

In [10]:
# We use the function from the article with a little modification in line 9
def checkExecTimeMystemOneText(texts):
    lol = lambda lst, sz: [lst[i:i+sz] for i in range(0, len(lst), sz)]
    txtpart = lol(texts, 10000)
    res = []
    for txtp in tqdm.tqdm(txtpart):
        alltexts = ' '.join([txt + ' brborderlinebr ' for txt in txtp])
        tokens = word_tokenize(alltexts)
        words = [cached_lemmatize(token) for token in tokens if token not in stop_words]
        doc = []
        for txt in words:
            if txt != '\n' and txt.strip() != '':
                if txt == 'brborderlinebr':
                    doc = " ".join(doc)
                    res.append(doc)
                    doc = []
                else:
                    doc.append(txt)
    return res
res = checkExecTimeMystemOneText(data['text'])
res

  0%|          | 0/160 [00:00<?, ?it/s]

100%|██████████| 160/160 [02:03<00:00,  1.30it/s]


["switchfoot http twitpic com zl awww 's bummer shoulda get david carr third day",
 "upset ca n't update facebook texting might cry result school today also blah",
 'kenichan dive many time ball manage save rest go bound',
 'whole body feel itchy like fire',
 "nationwideclass 's behave 'm mad ca n't see",
 'kwesidei whole crew',
 'need hug',
 "loltrish hey long time see yes rain bit bit lol 'm fine thanks 's",
 "tatiana k nope n't",
 'twittera que muera',
 "spring break plain city 's snow",
 'pierce ear',
 "caregiving could n't bear watch think ua loss embarrassing",
 'octolinz count idk either never talk anymore',
 "smarrison would 've first n't gun really though zac snyder 's doucheclown",
 'iamjazzyfizzle wish get watch miss iamlilnicki premiere',
 "hollis ' death scene hurt severely watch film wry director cut",
 'file tax',
 'lettya ahh ive always want see rent love soundtrack',
 'fakerpattypattz oh dear drink forget table drink',
 "alydesigns day n't get much do",
 "one friend ca