<a href="https://www.kaggle.com/code/iqmansingh/getting-started-with-nlp?scriptVersionId=135703733" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<img src="https://cdn.discordapp.com/attachments/1111599839663370271/1124998618529681428/NLP-Banner.jpg">

# **Getting Started with NLP**

---

### This notebook contains various types of Text Preprocessing Techniques used for NLP like
 1. Tokenization
 2. Stemming
 3. Lemmatization
 4. Vectorization
  - sklearn.feature_extraction.text.CountVectorizer
  - sklearn.feature_extraction.text.TfidfVectorizer
  - Gensim.Word2Vec
  - tf.keras.layers.Embedding

In [204]:
import numpy as np
import pandas as pd 
import tensorflow as tf
import datetime
import warnings
import nltk
import random
import re
import sklearn
import zipfile
import gensim
import matplotlib.pyplot as plt
import seaborn as sns

nltk.download('punkt',download_dir="/kaggle/working/")
nltk.download('wordnet',download_dir="/kaggle/working/")
nltk.download('stopwords',download_dir="/kaggle/working/")
nltk.data.path.append('/kaggle/working/')

with zipfile.ZipFile("/kaggle/working/corpora/wordnet.zip", 'r') as zip_f:
    zip_f.extractall("/kaggle/working/corpora/")
    
warnings.filterwarnings("ignore")
pd.plotting.register_matplotlib_converters()
%matplotlib inline
plt.style.use('dark_background')

[nltk_data] Downloading package punkt to /kaggle/working/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /kaggle/working/...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /kaggle/working/...
[nltk_data]   Package stopwords is already up-to-date!


In [205]:
df = pd.read_csv("/kaggle/input/ted-ultimate-dataset/2020-05-01/ted_talks_en.csv")
df.sort_values(by="views",ascending=False,inplace=True)

In [206]:
# Tim Urban: "Inside the mind of a master procrastinator" Speech                                            
speech = df.iloc[6].transcript
speech[0:100]

'So in college, I was a government major, which means I had to write a lot of papers. Now, when a nor'

---
# 1. Tokenization


# 1.1 Sentence Tokenization 

In [207]:
sentences = nltk.sent_tokenize(speech)
sentences[:5]

['So in college, I was a government major, which means I had to write a lot of papers.',
 'Now, when a normal student writes a paper, they might spread the work out a little like this.',
 'So, you know — (Laughter) you get started maybe a little slowly, but you get enough done in the first week that, with some heavier days later on, everything gets done, things stay civil.',
 '(Laughter) And I would want to do that like that.',
 'That would be the plan.']

# 1.2 Word Tokenization 

In [208]:
words = nltk.word_tokenize(speech)
len(words)

2769

In [209]:
for i in range(random.randint(1,50),random.randint(100,200)):
    print(words[i],end=" ")

when a normal student writes a paper , they might spread the work out a little like this . So , you know — ( Laughter ) you get started maybe a little slowly , but you get enough done in the first week that , with some heavier days later on , everything gets done , things stay civil . ( Laughter ) And I would want to do that like that . That would be the plan . I would have it all ready to go , but then , actually , the paper would come along , and then I would kind of do this . ( Laughter ) And that would happen every single paper . But then came my 90-page senior thesis , a paper you 're supposed to spend a year on . And I knew for a paper like that , my normal work flow was not an option . It was way too big a project 

---

# 2. Stemming vs Lemmatization


# 2.1 Stemming

In [210]:
stopwords = nltk.corpus.stopwords.words("english")
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [211]:
stemmer = nltk.PorterStemmer()
stemmedSentences = []

for j in range(len(sentences)):
    words = nltk.word_tokenize(sentences[j])
#     words = [re.sub('[!,*)@#%(&$_?.^]',"",i).lower() for i in words]
    words = [re.sub("[^a-zA-Z0-9]","",i).lower().lstrip() for i in words]
    words = [stemmer.stem(i) for i in words if i not in stopwords]
    stemmedSentences.append(" ".join(words))
stemmedSentences[:5]

['colleg  govern major  mean write lot paper ',
 ' normal student write paper  might spread work littl like ',
 ' know   laughter  get start mayb littl slowli  get enough done first week  heavier day later  everyth get done  thing stay civil ',
 ' laughter  would want like ',
 'would plan ']

# 2.2 Lemmatization 

In [212]:
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatizedSentences = []

for j in range(len(sentences)):
    words = nltk.word_tokenize(sentences[j])
#     words = [re.sub("[!,*)@#%(&$_?.:’'^]","",i).lower() for i in words]
    words = [re.sub("[^a-zA-Z0-9]","",i).lower().strip() for i in words]
    words = [lemmatizer.lemmatize(i) for i in words if i not in stopwords]
    lemmatizedSentences.append(" ".join(words))
lemmatizedSentences[:5]

['college  government major  mean write lot paper ',
 ' normal student writes paper  might spread work little like ',
 ' know   laughter  get started maybe little slowly  get enough done first week  heavier day later  everything get done  thing stay civil ',
 ' laughter  would want like ',
 'would plan ']

# 2.3 Comparing Stemming vs Lemmatization

In [213]:
for i in range(5,10):
    print(stemmedSentences[i])
    print(lemmatizedSentences[i])
    print("-"*50)

would readi go   actual  paper would come along  would kind 
would ready go   actually  paper would come along  would kind 
--------------------------------------------------
 laughter  would happen everi singl paper 
 laughter  would happen every single paper 
--------------------------------------------------
came 90page senior thesi  paper suppos spend year 
came 90page senior thesis  paper supposed spend year 
--------------------------------------------------
knew paper like  normal work flow option 
knew paper like  normal work flow option 
--------------------------------------------------
way big project 
way big project 
--------------------------------------------------


---

# 3. Vectorization


# 3.1 Bag of Words (CountVectorizer)
### - sklearn.feature_extraction.text.CountVectorizer

In [214]:
# Frequncy BoW
countVectorizer = sklearn.feature_extraction.text.CountVectorizer(max_features=2000)
X = countVectorizer.fit_transform(lemmatizedSentences).toarray() 
X.shape
# 20 - no of sentences
# 78 - no of features

(142, 482)

In [215]:
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# 3.2 TF-IDF (Term Frequency * Inverse Document Frequency)
### - sklearn.feature_extraction.text.TfidfVectorizer

In [216]:
tfidfVecorier = sklearn.feature_extraction.text.TfidfVectorizer(max_features=2000)
X = tfidfVecorier.fit_transform(lemmatizedSentences).toarray() 
X.shape

(142, 482)

In [217]:
print(X)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# 3.3 Word2Vec

In [239]:
# speech1 = re.sub("[^a-zA-Z0-9\s]","",speech).lower()
wordVecSentences = []

for j in range(len(sentences)):
    words = nltk.word_tokenize(re.sub("[^a-zA-Z0-9\s]","",sentences[j]))
    words = [i.lower().strip() for i in words if i not in stopwords]
    wordVecSentences.append(words)
[print(wordVecSentences[i]) for i in range(3)];

['so', 'college', 'i', 'government', 'major', 'means', 'i', 'write', 'lot', 'papers']
['now', 'normal', 'student', 'writes', 'paper', 'might', 'spread', 'work', 'little', 'like']
['so', 'know', 'laughter', 'get', 'started', 'maybe', 'little', 'slowly', 'get', 'enough', 'done', 'first', 'week', 'heavier', 'days', 'later', 'everything', 'gets', 'done', 'things', 'stay', 'civil']


In [219]:
word2Vec = gensim.models.Word2Vec(wordVecSentences,min_count=1)
print("Length of Word2Vec Vocab:",len(word2Vec.wv))

Length of Word2Vec Vocab: 575


In [223]:
word2Vec.wv["good"]

array([-8.7385764e-03,  7.6911743e-03,  8.0137365e-03, -1.0687785e-03,
        7.0834076e-03,  8.4049255e-03,  6.3676531e-03,  7.0108664e-03,
        8.0570821e-03, -9.5732817e-03, -7.2213686e-03,  9.5317587e-03,
       -5.6686443e-03, -2.2825995e-03, -3.3824297e-03,  4.5296713e-03,
       -3.3953134e-03, -1.8759486e-03, -1.1732690e-03,  1.3023549e-03,
        2.5288290e-03,  8.8041816e-03, -5.3585088e-03,  2.6844721e-03,
       -9.7650746e-03, -7.9475138e-03,  2.5814411e-03,  6.6914572e-03,
        8.8532967e-03,  5.7035079e-03, -6.2186299e-03, -8.9407517e-03,
        3.0737192e-05, -7.7307997e-03, -7.9747047e-03, -4.4920738e-04,
       -8.9009283e-03, -8.2602882e-04, -1.4954179e-03, -7.7554099e-03,
       -2.2529138e-03, -4.9651666e-03, -4.2590206e-03,  7.6394891e-03,
       -8.3245551e-03, -3.6238070e-04,  7.5763999e-03, -9.8938541e-03,
        5.9386245e-03, -9.6813478e-03, -4.4315946e-03, -2.0397895e-03,
       -6.7910412e-03,  2.8875035e-03,  8.5233096e-03,  7.3832534e-03,
      

In [221]:
word2Vec.wv.similar_by_word("good")[:5]

[('slowly', 0.32574841380119324),
 ('across', 0.30761581659317017),
 ('enough', 0.282128244638443),
 ('whatever', 0.25818419456481934),
 ('ready', 0.2548826336860657)]

# 3.4 AvgWord2Vec

In [222]:
word2Vec.wv.index_to_key[:5]

['i', 'and', 'laughter', 'now', 'like']

In [224]:
print("so" in word2Vec.wv.index_to_key)

True


In [253]:
def avgWord2Vec(sent):
    meanSent = [np.mean(word2Vec.wv[word]) for word in sent if (word in word2Vec.wv.index_to_key)]
    return meanSent

In [254]:
avgWord2Vec(["good"])

[-0.00035779606]

In [265]:
avgW2VSentences = []
for i in range(len(wordVecSentences)):
    avgW2VSentences.append(avgWord2Vec(wordVecSentences[i]))
    
[print(avgW2VSentences[i]) for i in range(1,10,5)];

[0.00065491156, 8.164659e-05, 0.00026728242, 1.4213135e-05, 0.00066849, -0.00014551873, -0.0006034722, 0.00082181784, 0.00043954133, -0.00019839092]
[-1.8426916e-05, 0.00067272916, 0.00013808568, -8.467185e-06, 0.0002796575, 0.00019812353, 0.00066849]


# 3.5 Word Embedding
### - Keras OneHot removes Special Chars and Lowers the Text 

In [229]:
#One Hot Representation
VOCAB_SIZE = 10000

oneHot = [tf.keras.preprocessing.text.one_hot(i,VOCAB_SIZE) for i in sentences]
print(oneHot[:2])

[[936, 336, 1359, 3489, 9436, 5133, 3473, 6153, 1483, 4018, 3489, 4249, 636, 8187, 5133, 1061, 4251, 5988], [4126, 2224, 5133, 7387, 8066, 6581, 5133, 8127, 9527, 2532, 8957, 2110, 7917, 3504, 5133, 1024, 1946, 8645]]


In [230]:
#Padding the Vectors
MAXLEN = 50

paddedVecs = tf.keras.utils.pad_sequences(oneHot,padding="pre",maxlen=MAXLEN)
print(paddedVecs[:2])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0  936  336 1359 3489 9436 5133 3473 6153 1483 4018
  3489 4249  636 8187 5133 1061 4251 5988]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0 4126 2224 5133 7387 8066 6581 5133 8127 9527 2532
  8957 2110 7917 3504 5133 1024 1946 8645]]


In [231]:
#Embedding Matrix
DIMENSION = 100

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(VOCAB_SIZE,DIMENSION,input_length=MAXLEN))
model.compile(optimizer="adam",loss="mse")
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 100)           1000000   
                                                                 
Total params: 1,000,000
Trainable params: 1,000,000
Non-trainable params: 0
_________________________________________________________________


In [232]:
embeddedVecs = model.predict(paddedVecs)
print(embeddedVecs[0].shape)
# 50 = no of input words
# 100 = no of features in Embedding Matrix

(50, 100)
