In [74]:
!pip install spacy==3.0
!pip install nltk



In [4]:
import pandas as pd

# **Prepare Data**

In [8]:
#assign ID to each token
sentences = ["I Love NLP", 
             "NLP is a branch of AI", 
             "AI is a branch of ComputerScience", 
             "AI is a new electicity"]

tokenized_sentences = [[t for t in sentence.split()] for sentence in sentences]

vocabulary = set([w for s in tokenized_sentences for w in s])

[[w, i] for i,w in enumerate(vocabulary)]

[['Love', 0],
 ['of', 1],
 ['new', 2],
 ['is', 3],
 ['NLP', 4],
 ['AI', 5],
 ['I', 6],
 ['electicity', 7],
 ['a', 8],
 ['branch', 9],
 ['ComputerScience', 10]]

# **One-Hot encoding**

In [9]:
def onehot_encode(tokenized_sentence):
    return [1 if w in tokenized_sentence else 0 for w in vocabulary]

onehot = [onehot_encode(tokenized_sentence) for tokenized_sentence in tokenized_sentences]

for (sentence, oh) in zip(sentences, onehot):
    print("%s: %s" % (oh, sentence))

[1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]: I Love NLP
[0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0]: NLP is a branch of AI
[0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1]: AI is a branch of ComputerScience
[0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0]: AI is a new electicity


In [10]:
pd.DataFrame(onehot, columns=vocabulary)

Unnamed: 0,Love,of,new,is,NLP,AI,I,electicity,a,branch,ComputerScience
0,1,0,0,0,1,0,1,0,0,0,0
1,0,1,0,1,1,1,0,0,1,1,0
2,0,1,0,1,0,1,0,0,1,1,1
3,0,0,1,1,0,1,0,1,1,0,0


# **Out of Vocabulary**

In [18]:
##Since these words are not part of our vocab , all elements of vector are marked as 0
onehot_encode("I likes outdoor games".split())

[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]

# **CountVectorizer**

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [33]:
more_sentences = sentences + ["I likes outdoor games more than indoor games",
                              "My friend like horror movies"]
pd.DataFrame(more_sentences)

Unnamed: 0,0
0,I Love NLP
1,NLP is a branch of AI
2,AI is a branch of ComputerScience
3,AI is a new electicity
4,I likes outdoor games more than indoor games
5,My friend like horror movies


In [34]:
cv.fit(more_sentences)

CountVectorizer()

In [35]:
print(cv.get_feature_names())

['ai', 'branch', 'computerscience', 'electicity', 'friend', 'games', 'horror', 'indoor', 'is', 'like', 'likes', 'love', 'more', 'movies', 'my', 'new', 'nlp', 'of', 'outdoor', 'than']




In [36]:
#CountVectorizer transform  documents to the vector representation
#count how many times each vocab word occurs in a sentences
dt = cv.transform(more_sentences)

In [37]:
pd.DataFrame(dt.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,ai,branch,computerscience,electicity,friend,games,horror,indoor,is,like,likes,love,more,movies,my,new,nlp,of,outdoor,than
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0
2,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
3,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,2,0,1,0,0,1,0,1,0,0,0,0,0,1,1
5,0,0,0,0,1,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0


**Cosine similarity between two sentences on CountVectorize vector**

In [51]:
from sklearn.metrics.pairwise import cosine_similarity

sent_1_index=1
sent_2_index=2

print("1. " ,more_sentences[sent_1_index])
print("2. " ,more_sentences[sent_2_index])

print("Cosine Similarity between two sentences : " ,cosine_similarity(dt[sent_1_index], dt[sent_2_index]))

1.  NLP is a branch of AI
2.  AI is a branch of ComputerScience
Cosine Similarity between two sentences :  [[0.8]]


In [52]:
len(more_sentences)

6

In [54]:
#Cosin similarity of each sentence with all other sentences
# Note similarity with itself is highest ( value 1).
# 1= similarity is highest , 0=lowest 
pd.DataFrame(cosine_similarity(dt, dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.316228,0.0,0.0,0.0,0.0
1,0.316228,1.0,0.8,0.447214,0.0,0.0
2,0.0,0.8,1.0,0.447214,0.0,0.0
3,0.0,0.447214,0.447214,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0
5,0.0,0.0,0.0,0.0,0.0,1.0


# **TF/IDF**

 
1.   One-hot encoding : It check word occurance in a sentence or not ,marked with 0 and 1

2.   CountVectorize : It checks word frequency in a sentence 

3.   TF/IDF : Not only check word occurence and frequency but also calculate weitage. It gives less weitage to more frequent word across the docs and give more weitage to the words occurs in a specific doc

In [55]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
tfidf_dt = tfidf.fit_transform(dt)

In [57]:
pd.DataFrame(tfidf_dt.toarray(), columns=cv.get_feature_names_out ())

Unnamed: 0,ai,branch,computerscience,electicity,friend,games,horror,indoor,is,like,likes,love,more,movies,my,new,nlp,of,outdoor,than
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.773262,0.0,0.0,0.0,0.0,0.634086,0.0,0.0,0.0
1,0.401324,0.475352,0.0,0.0,0.0,0.0,0.0,0.0,0.401324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.475352,0.475352,0.0,0.0
2,0.380907,0.451168,0.550195,0.0,0.0,0.0,0.0,0.0,0.380907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.451168,0.0,0.0
3,0.402494,0.0,0.0,0.581376,0.0,0.0,0.0,0.0,0.402494,0.0,0.0,0.0,0.0,0.0,0.0,0.581376,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.333333,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.333333,0.333333
5,0.0,0.0,0.0,0.0,0.447214,0.0,0.447214,0.0,0.0,0.447214,0.0,0.0,0.0,0.447214,0.447214,0.0,0.0,0.0,0.0,0.0


**Cosine similarity on TF/IDF vector**

In [58]:
# cosine similarity on TF/IDF vector
pd.DataFrame(cosine_similarity(tfidf_dt, tfidf_dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.301414,0.0,0.0,0.0,0.0
1,0.301414,1.0,0.734661,0.323061,0.0,0.0
2,0.0,0.734661,1.0,0.306626,0.0,0.0
3,0.0,0.323061,0.306626,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0
5,0.0,0.0,0.0,0.0,0.0,1.0


In [67]:
df_text=pd.DataFrame(more_sentences,columns=["Text"])
df_text

Unnamed: 0,Text
0,I Love NLP
1,NLP is a branch of AI
2,AI is a branch of ComputerScience
3,AI is a new electicity
4,I likes outdoor games more than indoor games
5,My friend like horror movies


# **PERFORMING LINGUISTIC ANALYSIS**

Reducing feature by  
  
*   removing STOP words
*   extract Lemmas 
*   extract NOUN_ADJ_VERB




In [68]:
## reducing features by removing STOP words , set min frequency and n-gram 

from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2), min_df=2)
dt = tfidf.fit_transform(df_text["Text"])
print(dt.shape)
print(dt.data.nbytes)
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,3), min_df=2)
dt = tfidf.fit_transform(df_text["Text"])
print(dt.shape)
print(dt.data.nbytes)

(6, 3)
56
(6, 3)
56


  % sorted(inconsistent)
  % sorted(inconsistent)


In [None]:
!python -m spacy download en_core_web_sm

In [84]:
from tqdm.auto import tqdm
import spacy


nlp = spacy.load('en_core_web_sm')

nouns_adjectives_verbs = ["NOUN", "PROPN", "ADJ", "ADV", "VERB"]
for i, row in tqdm(df_text.iterrows(), total=len(df_text)):
    doc = nlp(str(row["Text"]))
    df_text.at[i, "lemmas"] = " ".join([token.lemma_ for token in doc])
    df_text.at[i, "nav"] = " ".join([token.lemma_ for token in doc if token.pos_ in nouns_adjectives_verbs])

  0%|          | 0/6 [00:00<?, ?it/s]

In [85]:
df_text.head()

Unnamed: 0,Text,lemmas,nav
0,I Love NLP,I love NLP,love NLP
1,NLP is a branch of AI,NLP be a branch of ai,NLP branch ai
2,AI is a branch of ComputerScience,AI be a branch of ComputerScience,AI branch ComputerScience
3,AI is a new electicity,AI be a new electicity,AI new electicity
4,I likes outdoor games more than indoor games,I like outdoor game more than indoor game,like outdoor game more indoor game


In [88]:
# TF/IDF on lemms insted of original text
tfidf = TfidfVectorizer(stop_words=stopwords)
dt = tfidf.fit_transform(df_text["lemmas"].map(str))
dt

  % sorted(inconsistent)


<6x14 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [89]:
# TF/IDF on nav insted of original text
tfidf = TfidfVectorizer(stop_words=stopwords)
dt = tfidf.fit_transform(df_text["nav"].map(str))
dt

  % sorted(inconsistent)


<6x14 sparse matrix of type '<class 'numpy.float64'>'
	with 18 stored elements in Compressed Sparse Row format>

# **Fiding most similar words**

In [102]:
tfidf_word = TfidfVectorizer(stop_words=stopwords, min_df=2)
dt_word = tfidf_word.fit_transform(df_text["Text"])

  % sorted(inconsistent)


In [103]:
r = cosine_similarity(dt_word.T, dt_word.T)
np.fill_diagonal(r, 0)

In [104]:
voc = tfidf_word.get_feature_names_out ()
size = r.shape[0] # quadratic
for index in np.argsort(r.flatten())[::-1][0:40]:
    a = int(index/size)
    b = index%size
    if a > b:  # avoid repetitions
        print('"%s" related to "%s"' % (voc[a], voc[b]))

"branch" related to "ai"
"nlp" related to "branch"
"nlp" related to "ai"
