# Tokenization

In [14]:
text = 'Hi Everyone! This is TeamNLP. We are learning Natural Language Processing.'

In [15]:
text.split(' ')

['Hi',
 'Everyone!',
 'This',
 'is',
 'TeamNLP.',
 'We',
 'are',
 'learning',
 'Natural',
 'Language',
 'Processing.']

In [16]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gokul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
#This command splits text into sentences.
sent_tokens = sent_tokenize(text)
sent_tokens

['Hi Everyone!',
 'This is TeamNLP.',
 'We are learning Natural Language Processing.']

In [18]:
# This command splits the text into words
word_tokens = word_tokenize(text)
word_tokens

['Hi',
 'Everyone',
 '!',
 'This',
 'is',
 'TeamNLP',
 '.',
 'We',
 'are',
 'learning',
 'Natural',
 'Language',
 'Processing',
 '.']

# Stemming

In [19]:
from nltk.stem import PorterStemmer, SnowballStemmer
ps = PorterStemmer()

In [20]:
word = ('eats')
ps.stem(word)

'eat'

In [21]:
word = ('eating')
ps.stem(word)

'eat'

In [22]:
word = ('eaten')
ps.stem(word)

'eaten'

In [None]:
 text = 'Hi Everyone! This is TeamNLP. We are learning Natural Language Processing.'

In [24]:
word_tokens = word_tokenize(text)

In [25]:
stemmed_sentence = " ".join(ps.stem(word) for word in word_tokens)
stemmed_sentence

'hi everyon ! thi is teamnlp . we are learn natur languag process .'

# Lemmatization

In [42]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
sent_tokens = sent_tokenize(text)
print(sent_tokens)

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('workers'))


['Hi Everyone!', 'This is TeamNLP.', 'We are learning Natural Language Processing.']
worker


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gokul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gokul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
lemmatizer.lemmatize('workers')

'worker'

In [32]:
lemmatizer.lemmatize('words')

'word'

In [33]:
lemmatizer.lemmatize('feet')

'foot'

In [34]:
lemmatizer.lemmatize('stripes', 'v')

'strip'

In [35]:
lemmatizer.lemmatize('stripes', 'n')

'stripe'

In [39]:
 text = 'Hi Everyone! This is TeamNLP. We are learning Natural Language Processing.'

In [40]:
word_tokens = word_tokenize(text)

In [41]:
lemmatized_sentence = " ".join(lemmatizer.lemmatize(word.lower()) for word in word_tokens)
lemmatized_sentence

'hi everyone ! this is teamnlp . we are learning natural language processing .'

# Part of Speech Tagging (POS)

In [43]:
from nltk import pos_tag

In [46]:
import nltk


nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

sent_tokens = sent_tokenize(text)
print(sent_tokens)

word_tokens = word_tokenize(text)
print(word_tokens)

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('workers'))

pos_tags = pos_tag(['fighting'])
print(pos_tags)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gokul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gokul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gokul\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


['Your text here.']
['Your', 'text', 'here', '.']
worker
[('fighting', 'VBG')]


In [47]:
pos_tag(['fighting'])

[('fighting', 'VBG')]

In [48]:
 text = 'Hi Everyone! This is TeamNLP. We are learning Natural Language Processing.'

In [49]:
word_tokens = word_tokenize(text)

In [50]:
pos_tag(word_tokens)

[('Hi', 'NNP'),
 ('Everyone', 'NN'),
 ('!', '.'),
 ('This', 'DT'),
 ('is', 'VBZ'),
 ('TeamNLP', 'NNP'),
 ('.', '.'),
 ('We', 'PRP'),
 ('are', 'VBP'),
 ('learning', 'VBG'),
 ('Natural', 'NNP'),
 ('Language', 'NNP'),
 ('Processing', 'NNP'),
 ('.', '.')]

# Text Preprocessing (Clean Data)

In [52]:
import pandas as pd
import string
df = pd.read_csv(r'C:\Users\gokul\Downloads\Twitter Sentiments.csv')
# drop the columns
df = df.drop(columns=['id', 'label'], axis=1)
df.head()

Unnamed: 0,tweet
0,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty
3,#model i love u take with u all the time in ...
4,factsguide: society now #motivation


## Convert to lowercase

In [53]:
df['clean_text'] = df['tweet'].str.lower()
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,factsguide: society now #motivation,factsguide: society now #motivation


## Removal of Punctuations

In [54]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [55]:
def remove_punctuations(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('', '', punctuations))

In [56]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_punctuations(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is so...
1,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i cant use ca...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,model i love u take with u all the time in u...
4,factsguide: society now #motivation,factsguide society now motivation


## Removal of Stopwords

In [58]:
import nltk


nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords


sent_tokens = sent_tokenize(text)
print("Sentence Tokens:", sent_tokens)

word_tokens = word_tokenize(text)
print("Word Tokens:", word_tokens)

lemmatizer = WordNetLemmatizer()
print("Lemmatized word 'workers':", lemmatizer.lemmatize('workers'))

pos_tags = pos_tag(['fighting'])
print("POS Tagging:", pos_tags)


stop_words = set(stopwords.words('english'))
filtered_words = [word for word in word_tokens if word.lower() not in stop_words]
print("Filtered Words:", filtered_words)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gokul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gokul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gokul\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gokul\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Sentence Tokens: ['Hi Everyone!', 'This is TeamNLP.', 'We are learning Natural Language Processing.']
Word Tokens: ['Hi', 'Everyone', '!', 'This', 'is', 'TeamNLP', '.', 'We', 'are', 'learning', 'Natural', 'Language', 'Processing', '.']
Lemmatized word 'workers': worker
POS Tagging: [('fighting', 'VBG')]
Filtered Words: ['Hi', 'Everyone', '!', 'TeamNLP', '.', 'learning', 'Natural', 'Language', 'Processing', '.']


In [59]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [60]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

In [61]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drags kids d...
1,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit cant use cause do...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model love u take u time urð± ðððð...
4,factsguide: society now #motivation,factsguide society motivation


## Removal of Frequent Words

In [62]:
from collections import Counter
word_count = Counter()
for text in df['clean_text']:
    for word in text.split():
        word_count[word] += 1
        
word_count.most_common(10)

[('user', 17473),
 ('love', 2647),
 ('day', 2198),
 ('happy', 1663),
 ('amp', 1582),
 ('im', 1139),
 ('u', 1136),
 ('time', 1110),
 ('life', 1086),
 ('like', 1042)]

In [63]:
FREQUENT_WORDS = set(word for (word, wc) in word_count.most_common(3))
def remove_freq_words(text):
    return " ".join([word for word in text.split() if word not in FREQUENT_WORDS])

In [64]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_freq_words(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time urð± ðððð ð...
4,factsguide: society now #motivation,factsguide society motivation


## Removal of Rare Words

In [65]:
RARE_WORDS = set(word for (word, wc) in word_count.most_common()[:-10:-1])
RARE_WORDS

{'airwaves',
 'carnt',
 'chisolm',
 'ibizabringitonmallorcaholidayssummer',
 'isz',
 'mantle',
 'shirley',
 'youuuð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dâ\x9d¤ï¸\x8f',
 'ð\x9f\x99\x8fð\x9f\x8f¼ð\x9f\x8d¹ð\x9f\x98\x8eð\x9f\x8eµ'}

In [66]:
def remove_rare_words(text):
    return " ".join([word for word in text.split() if word not in RARE_WORDS])

In [67]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_rare_words(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time urð± ðððð ð...
4,factsguide: society now #motivation,factsguide society motivation


## Removal of Special characters

In [68]:
import re
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

In [69]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_spl_chars(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation


## Stemming

In [70]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [71]:
df['stemmed_text'] = df['clean_text'].apply(lambda x: stem_words(x))
df.head()

Unnamed: 0,tweet,clean_text,stemmed_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...,father dysfunct selfish drag kid dysfunct run
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...,thank lyft credit cant use caus dont offer whe...
2,bihday your majesty,bihday majesty,bihday majesti
3,#model i love u take with u all the time in ...,model u take u time ur,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation,factsguid societi motiv


## Lemmatization & POS Tagging

In [72]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize_words(text):
    # find pos tags
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

In [73]:
wordnet.NOUN

'n'

In [81]:
df['lemmatized_text'] = df['clean_text'].apply(lambda x: lemmatize_words(x))
df.head()

Unnamed: 0,tweet,clean_text,stemmed_text,lemmatized_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...,father dysfunct selfish drag kid dysfunct run,father dysfunctional selfish drag kid dysfunct...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...,thank lyft credit cant use caus dont offer whe...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty,bihday majesti,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time ur,model u take u time ur,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation,factsguid societi motiv,factsguide society motivation


In [75]:
df.sample(frac=1).head(10)

Unnamed: 0,tweet,clean_text,stemmed_text,lemmatized_text
22023,@user almost as exciting as #cf16 @user ðb...,almost exciting cf16 business cards arrived ca...,almost excit cf16 busi card arriv campaignfest...,almost exciting cf16 business card arrive camp...
12005,dream fm #inshot #girls #cute #summer #blur #s...,dream fm inshot girls cute summer blur sun fun...,dream fm inshot girl cute summer blur sun fun ...,dream fm inshot girl cute summer blur sun fun ...
23062,join us for a &amp;#039;christmas in july&amp;...,join us amp039christmas julyamp039 happy hour ...,join us amp039christma julyamp039 happi hour f...,join u amp039christmas julyamp039 happy hour f...
11034,bigman got us tickets to see @user ðð¼ð...,bigman got us tickets see cantwait,bigman got us ticket see cantwait,bigman get u ticket see cantwait
28810,the reason #millennials arenÂt at #work fro...,reason millennials aren t work keith breene ch...,reason millenni aren t work keith breen challe...,reason millennials aren t work keith breene ch...
12105,that moment when the #roti does exactly what...,moment roti exactly supposed flame smallthings...,moment roti exactli suppos flame smallth 17 mi...,moment roti exactly suppose flame smallthings ...
18698,so ð#admire you all the moreð@user @use...,admire more user kindness friendship compassi...,admir more user kind friendship compass love,admire more user kindness friendship compassio...
15656,@user @user this joker doesn't craft policy. h...,joker doesnt craft policy spitballs punchdrunk...,joker doesnt craft polici spitbal punchdrunk c...,joker doesnt craft policy spitballs punchdrunk...
3877,@user live right noww!!!!!!!!!!!! #vocal voya...,live right noww vocal voyage 24pm gmt sweet re...,live right noww vocal voyag 24pm gmt sweet reg...,live right noww vocal voyage 24pm gmt sweet re...
5974,repostðºsoffierose ðð #thankyou #love ...,repost soffierose thankyou follow share coming...,repost soffieros thankyou follow share comings...,repost soffierose thankyou follow share coming...


# Spelling Correction

In [113]:
!pip install pyspellchecker

Collecting pyspellchecker
  Obtaining dependency information for pyspellchecker from https://files.pythonhosted.org/packages/e1/d2/c7e3b3a61a34b9320399fa731d1f9f0c73db8a1f28c6764e9e11efa68a29/pyspellchecker-0.8.1-py3-none-any.whl.metadata
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
   ---------------------------------------- 0.0/6.8 MB ? eta -:--:--
   - -------------------------------------- 0.2/6.8 MB 5.4 MB/s eta 0:00:02
   -- ------------------------------------- 0.4/6.8 MB 4.7 MB/s eta 0:00:02
   --- ------------------------------------ 0.6/6.8 MB 4.3 MB/s eta 0:00:02
   ---- ----------------------------------- 0.8/6.8 MB 4.8 MB/s eta 0:00:02
   ----- ---------------------------------- 1.0/6.8 MB 4.3 MB/s eta 0:00:02
   ------- -------------------------------- 1.2/6.8 MB 4.5 MB/s eta 0:00:02
   -------- ------------------------------- 1.4/6.8 MB 4.7 MB/s eta 0:00:02
   --------- ------------------

In [135]:
text = 'natur is a beuty'

In [136]:
from spellchecker import SpellChecker
spell = SpellChecker()

def correct_spellings(text):
    corrected_text = []
    misspelled_text = spell.unknown(text.split())
    # print(misspelled_text)
    for word in text.split():
        if word in misspelled_text:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
            
    return " ".join(corrected_text)

In [137]:
correct_spellings(text)

'nature is a beauty'