## Install requirements

In [4]:
# ! pip install nltk
# ! pip install pandas
# ! pip install autocorrect 

## Import libraries

In [5]:
import pandas as pd
import nltk
import re

## Load Data

In [6]:
df = pd.read_csv("Social_Media_Dataset.csv", header=None)
df.head()

Unnamed: 0,0
0,my teachers seeing me submit my assignment at ...
1,My uncle just got married. My uncle just got m...
2,This is sad. The story of this girls break up....
3,How much reddit karma is considered a lot? Two...
4,cat attack my cat decided to jump out from und...


In [7]:
data = df.to_string() 
data

"                                                                                                                                                                                                                                                                                                                                                                                                                                                   0\n0                                                                                                                                                                                                                                                          my teachers seeing me submit my assignment at 3am again: 👁️👄👁️ i wonder if my teachers are silently judging me :/\\n\\n\\n\\n&amp;#x200B;\\n\\n\\n\\nfiller filter flicker snicker bicker clicker\n1                                                                                                                  

## Preprocessing

### Data Cleaning Steps

#### a. Remove newlines and Tabs

In [8]:
clean_data = data.replace("\\n", " ") # remove new lines \n and replace with space
clean_data = clean_data.replace("\\t", " ") # remove tabs
clean_data = re.sub(re.compile(r'\s+'), " ", clean_data) # remove white spaces

clean_data

" 0 0 my teachers seeing me submit my assignment at 3am again: 👁️👄👁️ i wonder if my teachers are silently judging me :/ &amp;#x200B; filler filter flicker snicker bicker clicker 1 My uncle just got married. My uncle just got married to a beautiful wife and I would like to present him with something. I couldn’t think of anything else but to tell you guys. Congratulations Uncle Jeremy! I’m so happy for you! 2 This is sad. The story of this girls break up. https://www.reddit.com/r/Advice/comments/jco47r/im_only_13_and_im_being_stressed_over_some_stupid/?utm_medium=android_app&amp;utm_source=share I dont need any karma, go and suport her. 3 How much reddit karma is considered a lot? Two days ago marked my 1 year anniversary of being on reddit. And in that time I have accumulated a bit more than 80,000 karma. So I was just wondering, how much karma is considered a lot? 4 cat attack my cat decided to jump out from underneath a chair and attack me he barely even scratched me but my cat is ann

#### b. Remove Punctuation/ Unicode characters/ Special Characters

Before processing the text, we must remove the extra, non-semantic and non-informative characters in the sentence. These characters are digits, white spaces, and others non-informative characters like {, }, =, @, &, %, $, ^, <, >, !, ?, ;, \n, \t, (, .

we done the following steps in this section of data cleaning

* unicode characters
* remove URLs
* remove HTML tags
* unwanted digits
* spacial caracters and Punctuations

1. unicode characters

In [9]:
# encode text to ascii to remove emoji
clean_data = clean_data.encode("ascii", "ignore") 
clean_data = clean_data.decode()

clean_data

" 0 0 my teachers seeing me submit my assignment at 3am again:  i wonder if my teachers are silently judging me :/ &amp;#x200B; filler filter flicker snicker bicker clicker 1 My uncle just got married. My uncle just got married to a beautiful wife and I would like to present him with something. I couldnt think of anything else but to tell you guys. Congratulations Uncle Jeremy! Im so happy for you! 2 This is sad. The story of this girls break up. https://www.reddit.com/r/Advice/comments/jco47r/im_only_13_and_im_being_stressed_over_some_stupid/?utm_medium=android_app&amp;utm_source=share I dont need any karma, go and suport her. 3 How much reddit karma is considered a lot? Two days ago marked my 1 year anniversary of being on reddit. And in that time I have accumulated a bit more than 80,000 karma. So I was just wondering, how much karma is considered a lot? 4 cat attack my cat decided to jump out from underneath a chair and attack me he barely even scratched me but my cat is annoying h

In [10]:
# remove HTML tags
clean_data = re.sub(r'https?://[a-zA-Z0-9\.\/\-_?=;&]*', '', clean_data)
clean_data = re.sub(r'<[^>]+>', '', clean_data)
clean_data

" 0 0 my teachers seeing me submit my assignment at 3am again:  i wonder if my teachers are silently judging me :/ &amp;#x200B; filler filter flicker snicker bicker clicker 1 My uncle just got married. My uncle just got married to a beautiful wife and I would like to present him with something. I couldnt think of anything else but to tell you guys. Congratulations Uncle Jeremy! Im so happy for you! 2 This is sad. The story of this girls break up.  I dont need any karma, go and suport her. 3 How much reddit karma is considered a lot? Two days ago marked my 1 year anniversary of being on reddit. And in that time I have accumulated a bit more than 80,000 karma. So I was just wondering, how much karma is considered a lot? 4 cat attack my cat decided to jump out from underneath a chair and attack me he barely even scratched me but my cat is annoying his name is Eddy/Edmond he's always surprising us because most of the time he's super friendly fun times 5 The first YouTube video was made clo

2. unwanted digits

In [11]:
unwanted_digit = ['0','1','2','3','4','5','6','7','8','9']

for digit in unwanted_digit:
    clean_data = clean_data.replace(digit, "")

3. spacial caracters and Punctuations

In [12]:
unwanted_punc = ['"',"'",'=','@','&','%','.',',',':','\\','$','^','<','>','!','?','{','}',';','\n','\t','(',')','[',']','/','*','+','#','\u200c','\ufeff','-','_','|']

for punc in unwanted_punc:
    clean_data = clean_data.replace(punc, "")

#### c. Hashtag removes

In [13]:
clean_data = clean_data.replace("#", "")

#### d. Tokenization

One of the most important steps in processing natural languages ​​is the tokenization step, in which we separate the sentence or corpus into its component or tokens words.

* In this section, we used the NLTK library in Python to implement tokenization.
* An elementary and very simple method to extract the words of a sentence is to use spaces and separate the words, although, of course, in languages ​​such as Japanese where there are no spaces, we use other methods such as the Maximum Matching algorithm.

In [14]:
from nltk.tokenize import word_tokenize

tokens = word_tokenize(clean_data)
print(tokens, len(tokens))

['my', 'teachers', 'seeing', 'me', 'submit', 'my', 'assignment', 'at', 'am', 'again', 'i', 'wonder', 'if', 'my', 'teachers', 'are', 'silently', 'judging', 'me', 'ampxB', 'filler', 'filter', 'flicker', 'snicker', 'bicker', 'clicker', 'My', 'uncle', 'just', 'got', 'married', 'My', 'uncle', 'just', 'got', 'married', 'to', 'a', 'beautiful', 'wife', 'and', 'I', 'would', 'like', 'to', 'present', 'him', 'with', 'something', 'I', 'couldnt', 'think', 'of', 'anything', 'else', 'but', 'to', 'tell', 'you', 'guys', 'Congratulations', 'Uncle', 'Jeremy', 'Im', 'so', 'happy', 'for', 'you', 'This', 'is', 'sad', 'The', 'story', 'of', 'this', 'girls', 'break', 'up', 'I', 'dont', 'need', 'any', 'karma', 'go', 'and', 'suport', 'her', 'How', 'much', 'reddit', 'karma', 'is', 'considered', 'a', 'lot', 'Two', 'days', 'ago', 'marked', 'my', 'year', 'anniversary', 'of', 'being', 'on', 'reddit', 'And', 'in', 'that', 'time', 'I', 'have', 'accumulated', 'a', 'bit', 'more', 'than', 'karma', 'So', 'I', 'was', 'just',

#### e. Stop words

Stop words are available in abundance in any human language. By removing these words, we remove the low-level information from our text in order to give more focus to the important information.

* We have the stopwords of that language in any language, and we use the wordnet corpus in the ntlk library to view its list in English.

In [15]:
import nltk

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nader\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

For example, let's look at 20 of the English stop words that exist in wordnet.

In [16]:
from nltk.corpus import stopwords

stopwords.words("english")[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

First, we extract the tokens and then if any of these tokens are present in the list of stopwords, we remove them from the set of tokens.

In [17]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# step1: tokenization
tokens = word_tokenize(clean_data)

# step2: remove capitalization tokens
normal_tokens = []
for token in tokens:
    normal_tokens.append(token.lower())


# step3: remove stopwords
clean_stop_words_tokens = []
for token in normal_tokens:
    if token not in stopwords.words("english"): 
        clean_stop_words_tokens.append(token)
print(clean_stop_words_tokens, len(clean_stop_words_tokens))

['teachers', 'seeing', 'submit', 'assignment', 'wonder', 'teachers', 'silently', 'judging', 'ampxb', 'filler', 'filter', 'flicker', 'snicker', 'bicker', 'clicker', 'uncle', 'got', 'married', 'uncle', 'got', 'married', 'beautiful', 'wife', 'would', 'like', 'present', 'something', 'couldnt', 'think', 'anything', 'else', 'tell', 'guys', 'congratulations', 'uncle', 'jeremy', 'im', 'happy', 'sad', 'story', 'girls', 'break', 'dont', 'need', 'karma', 'go', 'suport', 'much', 'reddit', 'karma', 'considered', 'lot', 'two', 'days', 'ago', 'marked', 'year', 'anniversary', 'reddit', 'time', 'accumulated', 'bit', 'karma', 'wondering', 'much', 'karma', 'considered', 'lot', 'cat', 'attack', 'cat', 'decided', 'jump', 'underneath', 'chair', 'attack', 'barely', 'even', 'scratched', 'cat', 'annoying', 'name', 'eddyedmond', 'hes', 'always', 'surprising', 'us', 'time', 'hes', 'super', 'friendly', 'fun', 'times', 'first', 'youtube', 'video', 'made', 'closer', 'today', 'look', 'months', 'years', 'months', 'si

#### f. Remove URLs

In [18]:
clean_data = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', clean_data)

#### g. Remove HTML tags

In [19]:
clean_data = re.sub(r'<[^>]+>', ' ', clean_data)

#### h. Repeated characters reduction, for example: Hellllo → Hello

Sometimes in social networks, some of the main characters of a word are repeated several times to emphasize more. For example, the word frrrrrieeeeeeennnddss, we must return these words to their original form and remove the repeated character.

* We wrote a class to remove repeated parameters with name RepeatReplacer.
* For this operation, we use the wordnet corpus, which is a dictionary of English words.

In [20]:
import nltk

nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nader\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

RepeatReplacer Class for example the word of loooove

step1: (loo)(o)o(ve)    => o: remove

step2: (lo)(o)o(ve)     => o: remove

step3: (l)(o)o(ve)      => o: remove

step3: love             => are on wordne

In [21]:
import re
from nltk.corpus import wordnet


class RepeatReplacer():
    def __init__(self):
        # The beginning and the end char of word can be anything, but there must be a repeated character in the middle
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)') 
        self.repl = r'\1\2\3' # 1: start chars + a char, 2: middle char, 3: end chars

    def replace(self, word):
        if(wordnet.synsets(word)): 
            return word # if the word was in wordnet returns that word.
            
        repl_word = self.repeat_regexp.sub(self.repl, word) # Delete a repeated character
        if(repl_word != word): 
            return self.replace(repl_word) # Recursive function
        else: 
            return repl_word # The word is not in word net

In [22]:
replacer = RepeatReplacer()

test_token = "Hellllo"
replacer.replace(word=test_token)

'Hello'

In [23]:
replacer = RepeatReplacer()

clean_repeat_tokens = []
for token in clean_stop_words_tokens:
    clean = replacer.replace(word=token)
    clean_repeat_tokens.append(clean)
    if clean != token: print(token, "=> clean: ", clean)

reddit => clean:  redit
reddit => clean:  redit
eddyedmond => clean:  edyedmond
reddit => clean:  redit
reddit => clean:  redit
bankkkk => clean:  bank
friiiieeeennndddsssssssssssssssssssssssssss => clean:  friends
pllan => clean:  plan
reddit => clean:  redit
tomorrowthanks => clean:  tomorowthanks
whyyyyyy => clean:  why
zimmerinceptionmombasawav => clean:  zimerinceptionmombasawav
cringyembarrassing => clean:  cringyembarasing
app => clean:  ap
app => clean:  ap
heeelllllppppp => clean:  help
ucoolstarfish => clean:  ucolstarfish
redditor => clean:  reditor
ucoolstarfish => clean:  ucolstarfish
redditor => clean:  reditor
pillshow => clean:  pilshow
itt => clean:  it
qwertyyiopasdfghjklzxcvbnm => clean:  qwertyiopasdfghjklzxcvbnm
dddddd => clean:  dd


In [24]:
print(clean_repeat_tokens, len(clean_repeat_tokens))

['teachers', 'seeing', 'submit', 'assignment', 'wonder', 'teachers', 'silently', 'judging', 'ampxb', 'filler', 'filter', 'flicker', 'snicker', 'bicker', 'clicker', 'uncle', 'got', 'married', 'uncle', 'got', 'married', 'beautiful', 'wife', 'would', 'like', 'present', 'something', 'couldnt', 'think', 'anything', 'else', 'tell', 'guys', 'congratulations', 'uncle', 'jeremy', 'im', 'happy', 'sad', 'story', 'girls', 'break', 'dont', 'need', 'karma', 'go', 'suport', 'much', 'redit', 'karma', 'considered', 'lot', 'two', 'days', 'ago', 'marked', 'year', 'anniversary', 'redit', 'time', 'accumulated', 'bit', 'karma', 'wondering', 'much', 'karma', 'considered', 'lot', 'cat', 'attack', 'cat', 'decided', 'jump', 'underneath', 'chair', 'attack', 'barely', 'even', 'scratched', 'cat', 'annoying', 'name', 'edyedmond', 'hes', 'always', 'surprising', 'us', 'time', 'hes', 'super', 'friendly', 'fun', 'times', 'first', 'youtube', 'video', 'made', 'closer', 'today', 'look', 'months', 'years', 'months', 'since

#### i. Remove capitalization/ Case normalization

In [25]:
normal_tokens = []
for token in clean_repeat_tokens:
    normal_tokens.append(token.lower())

print(normal_tokens, len(normal_tokens))

['teachers', 'seeing', 'submit', 'assignment', 'wonder', 'teachers', 'silently', 'judging', 'ampxb', 'filler', 'filter', 'flicker', 'snicker', 'bicker', 'clicker', 'uncle', 'got', 'married', 'uncle', 'got', 'married', 'beautiful', 'wife', 'would', 'like', 'present', 'something', 'couldnt', 'think', 'anything', 'else', 'tell', 'guys', 'congratulations', 'uncle', 'jeremy', 'im', 'happy', 'sad', 'story', 'girls', 'break', 'dont', 'need', 'karma', 'go', 'suport', 'much', 'redit', 'karma', 'considered', 'lot', 'two', 'days', 'ago', 'marked', 'year', 'anniversary', 'redit', 'time', 'accumulated', 'bit', 'karma', 'wondering', 'much', 'karma', 'considered', 'lot', 'cat', 'attack', 'cat', 'decided', 'jump', 'underneath', 'chair', 'attack', 'barely', 'even', 'scratched', 'cat', 'annoying', 'name', 'edyedmond', 'hes', 'always', 'surprising', 'us', 'time', 'hes', 'super', 'friendly', 'fun', 'times', 'first', 'youtube', 'video', 'made', 'closer', 'today', 'look', 'months', 'years', 'months', 'since

#### j. Remove Whitespaces, for instance, He llo → Hello

In the first part, we removed the extra spaces and now we have made clean tokens and there is no need to perform this part anymore.

In [26]:
# remove white spaces and tabs
# clean_data = clean_data.replace("\\t", " ")
# clean_data = re.sub(re.compile(r'\s+'), " ", clean_data)

#### k. Typo Correction/ Misspelled words: big “dada” → big “data”

**Spell Correction**

Spelling errors happen a lot for various reasons such as proximity between keyboards and other reasons. In this section, we want to learn how to detect and correct it.

- Spell Error Detection
    - We make a dictionary of language words, where we can use the **wordnet corpus**. **The words that are not in this data set as dictionary words are considered misspelled**.

- Spell Error Correction Method
    1. We extract the list of **synonyms** or **SynSets** of the word, for which we can also use NLTK and Wordnet.
    2. We choose the word that has the s**mallest Editing distance** as the corrected word.

**What is Editing distance?**

The distance of converting one word to another using deletion, insertion and substitution of characters. Each of these operations has a point.
- [Delete: 1 point] [Insert: 1 point] [Substitution: 2 point]
- Example: We want to calculate the editing distance between two words, **there** are **three**
    - one delete + one insert : (there->thre) and (thre->three) => ED = 1+1=2

**spelling correction implementation**

To spelling correction in this project, we used the **autocorrect** library.
- we also implemented this operation from scratch with the of **NLTK** and **Wordnet** based on the editing distance and the given description, which is available at the following address. [Custom_Spell_Corrector](docs/Custom_Spell_Corrector.ipynb)

In [27]:
! pip install autocorrect



In [28]:
from autocorrect import Speller

speller = Speller()

print(speller("big dada"))
print(speller("corect"))

big data
correct


In [29]:
from autocorrect import Speller

speller = Speller()

clean_spell_tokens = []
for token in normal_tokens:
    clean = speller(token)
    if clean != token: print(token, "=> clean spell: ", clean)
    clean_spell_tokens.append(clean)

ampxb => clean spell:  amp
snicker => clean spell:  sticker
bicker => clean spell:  kicker
clicker => clean spell:  clicked
suport => clean spell:  sport
redit => clean spell:  edit
redit => clean spell:  edit
idk => clean spell:  id
idk => clean spell:  id
redit => clean spell:  edit
redit => clean spell:  edit
everytime => clean spell:  overtime
hapy => clean spell:  happy
clas => clean spell:  class
hmu => clean spell:  hu
tryna => clean spell:  try
dieing => clean spell:  diving
hapyi => clean spell:  happy
redit => clean spell:  edit
slacking => clean spell:  lacking
hapened => clean spell:  happened
opiniuns => clean spell:  opinions
emojis => clean spell:  emoji
reditor => clean spell:  editor
reditor => clean spell:  editor
prefered => clean spell:  preferred
pilshow => clean spell:  pillow
toim => clean spell:  tom
plae => clean spell:  place
xanax => clean spell:  canal
postingits => clean spell:  postings
itits => clean spell:  itis
temping => clean spell:  tempting
meno => 

In [30]:
print(clean_spell_tokens, len(clean_spell_tokens))

['teachers', 'seeing', 'submit', 'assignment', 'wonder', 'teachers', 'silently', 'judging', 'amp', 'filler', 'filter', 'flicker', 'sticker', 'kicker', 'clicked', 'uncle', 'got', 'married', 'uncle', 'got', 'married', 'beautiful', 'wife', 'would', 'like', 'present', 'something', 'couldnt', 'think', 'anything', 'else', 'tell', 'guys', 'congratulations', 'uncle', 'jeremy', 'im', 'happy', 'sad', 'story', 'girls', 'break', 'dont', 'need', 'karma', 'go', 'sport', 'much', 'edit', 'karma', 'considered', 'lot', 'two', 'days', 'ago', 'marked', 'year', 'anniversary', 'edit', 'time', 'accumulated', 'bit', 'karma', 'wondering', 'much', 'karma', 'considered', 'lot', 'cat', 'attack', 'cat', 'decided', 'jump', 'underneath', 'chair', 'attack', 'barely', 'even', 'scratched', 'cat', 'annoying', 'name', 'edyedmond', 'hes', 'always', 'surprising', 'us', 'time', 'hes', 'super', 'friendly', 'fun', 'times', 'first', 'youtube', 'video', 'made', 'closer', 'today', 'look', 'months', 'years', 'months', 'since', 'v

#### l. Stemming or lemmatization

Converting different forms of a word to the basic form of that word

* Delete extra characters from the beginning and end of the desired word
* Convert plural to singular
* Convert uppercase to lowercase
* Implementation of algorithms and linguistic commands to get to the root of the word

**Porter Stemmer**

This method is used by search engines to store the roots of the words instead of the words themselves and brings the following advantages

* Reduce the size of indexed words
* Increasing accuracy in search algorithms
This method tries to extract the root and basic form of a word based on the linguistic rules of natural languages. In the picture below, we can see the steps and examples of this algorithm.

In [31]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stemmer.stem("information")

'inform'

In [32]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

stem_tokens = []
for token in clean_spell_tokens:
    clean = stemmer.stem(token)
    if clean != token: print(token, "=>", clean)
    stem_tokens.append(clean)

teachers => teacher
seeing => see
assignment => assign
teachers => teacher
silently => silent
judging => judg
clicked => click
uncle => uncl
married => marri
uncle => uncl
married => marri
beautiful => beauti
something => someth
anything => anyth
else => els
guys => guy
congratulations => congratul
uncle => uncl
jeremy => jeremi
happy => happi
story => stori
girls => girl
considered => consid
days => day
marked => mark
anniversary => anniversari
accumulated => accumul
wondering => wonder
considered => consid
decided => decid
barely => bare
scratched => scratch
annoying => annoy
hes => he
always => alway
surprising => surpris
hes => he
friendly => friendli
times => time
youtube => youtub
months => month
years => year
months => month
since => sinc
years => year
months => month
december => decemb
chinese => chines
people => peopl
chinese => chines
language => languag
languages => languag
actually => actual
bored => bore
inspiration => inspir
favorite => favorit
bored => bore
guys => guy
s

In [33]:
print(stem_tokens, len(stem_tokens))

['teacher', 'see', 'submit', 'assign', 'wonder', 'teacher', 'silent', 'judg', 'amp', 'filler', 'filter', 'flicker', 'sticker', 'kicker', 'click', 'uncl', 'got', 'marri', 'uncl', 'got', 'marri', 'beauti', 'wife', 'would', 'like', 'present', 'someth', 'couldnt', 'think', 'anyth', 'els', 'tell', 'guy', 'congratul', 'uncl', 'jeremi', 'im', 'happi', 'sad', 'stori', 'girl', 'break', 'dont', 'need', 'karma', 'go', 'sport', 'much', 'edit', 'karma', 'consid', 'lot', 'two', 'day', 'ago', 'mark', 'year', 'anniversari', 'edit', 'time', 'accumul', 'bit', 'karma', 'wonder', 'much', 'karma', 'consid', 'lot', 'cat', 'attack', 'cat', 'decid', 'jump', 'underneath', 'chair', 'attack', 'bare', 'even', 'scratch', 'cat', 'annoy', 'name', 'edyedmond', 'he', 'alway', 'surpris', 'us', 'time', 'he', 'super', 'friendli', 'fun', 'time', 'first', 'youtub', 'video', 'made', 'closer', 'today', 'look', 'month', 'year', 'month', 'sinc', 'video', 'made', 'back', 'video', 'made', 'year', 'month', 'away', 'decemb', 'chin

**Lemmatizing**

One of the methods of converting words to the basic form

* Lemmatizing: In this method, based on the meaning of the word, we try to reach the basic form of the word, and this is a better method, and unlike stemming, meaningless words are not produced.
* Stemming: In this method, we pay attention to the form of the word and remove suffixes and prefixes to get the basic appearance of a word.

In [34]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer() # by default Part of speach is Noun or Name
print(stemmer.stem("believes"))
print(lemmatizer.lemmatize("believes"))

believ
belief


In [35]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer() # by default Part of speach is Noun or Name

lem_tokens = []
for token in clean_spell_tokens:
    clean = lemmatizer.lemmatize(token)
    if clean != token: print(token, "=>", clean)
    lem_tokens.append(clean)

teachers => teacher
teachers => teacher
guys => guy
congratulations => congratulation
girls => girl
days => day
hes => he
us => u
hes => he
times => time
months => month
years => year
months => month
years => year
months => month
languages => language
guys => guy
songs => song
bands => band
subs => sub
guys => guy
cameras => camera
learners => learner
drivers => driver
costs => cost
things => thing
movies => movie
works => work
uniforms => uniform
serves => serf
ps => p
ps => p
minutes => minute
lets => let
friends => friend
thoughts => thought
messages => message
needs => need
songs => song
thoughts => thought
scenarios => scenario
ios => io
logos => logo
photos => photo
logos => logo
opinions => opinion
games => game
gets => get
exceptions => exception
ones => one
investments => investment
teenagers => teenager
competitors => competitor
ways => way
pills => pill
holds => hold
friends => friend
friends => friend
lies => lie
lies => lie
lies => lie
lies => lie
lies => lie
wants => want

In [36]:
print(lem_tokens, len(lem_tokens))

['teacher', 'seeing', 'submit', 'assignment', 'wonder', 'teacher', 'silently', 'judging', 'amp', 'filler', 'filter', 'flicker', 'sticker', 'kicker', 'clicked', 'uncle', 'got', 'married', 'uncle', 'got', 'married', 'beautiful', 'wife', 'would', 'like', 'present', 'something', 'couldnt', 'think', 'anything', 'else', 'tell', 'guy', 'congratulation', 'uncle', 'jeremy', 'im', 'happy', 'sad', 'story', 'girl', 'break', 'dont', 'need', 'karma', 'go', 'sport', 'much', 'edit', 'karma', 'considered', 'lot', 'two', 'day', 'ago', 'marked', 'year', 'anniversary', 'edit', 'time', 'accumulated', 'bit', 'karma', 'wondering', 'much', 'karma', 'considered', 'lot', 'cat', 'attack', 'cat', 'decided', 'jump', 'underneath', 'chair', 'attack', 'barely', 'even', 'scratched', 'cat', 'annoying', 'name', 'edyedmond', 'he', 'always', 'surprising', 'u', 'time', 'he', 'super', 'friendly', 'fun', 'time', 'first', 'youtube', 'video', 'made', 'closer', 'today', 'look', 'month', 'year', 'month', 'since', 'video', 'made'