In [3]:
texts=["CANADA","Canada","canadA","canada"]
lower_words=[word.lower() for word in texts]
lower_words


['canada', 'canada', 'canada', 'canada']

## Stemming

In [44]:
import nltk
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer

# init stemmer
porter_stemmer=PorterStemmer()

In [4]:
# stem connect variations
words=["connect","connected","connection","connections","connects"]
stemmed_words=[porter_stemmer.stem(word=word) for word in words]

stemdf= pd.DataFrame({'original_word': words,'stemmed_word': stemmed_words})
stemdf

Unnamed: 0,original_word,stemmed_word
0,connect,connect
1,connected,connect
2,connection,connect
3,connections,connect
4,connects,connect


In [5]:
# stem trouble variations
words=["trouble","troubled","troubles","troublemsome"]
stemmed_words=[porter_stemmer.stem(word=word) for word in words]

stemdf= pd.DataFrame({'original_word': words,'stemmed_word': stemmed_words})
stemdf

Unnamed: 0,original_word,stemmed_word
0,trouble,troubl
1,troubled,troubl
2,troubles,troubl
3,troublemsome,troublemsom


## Lemmatization

In [6]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# init lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USX28939\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
#lemmatize trouble variations
words=["trouble","troubling","troubled","troubles",]
lemmatized_words=[lemmatizer.lemmatize(word=word,pos='v') for word in words]
lemmatizeddf= pd.DataFrame({'original_word': words,'lemmatized_word': lemmatized_words})
lemmatizeddf=lemmatizeddf[['original_word','lemmatized_word']]
lemmatizeddf

Unnamed: 0,original_word,lemmatized_word
0,trouble,trouble
1,troubling,trouble
2,troubled,trouble
3,troubles,trouble


In [8]:
#lemmatize goose variations
words=["goose","geese"]
lemmatized_words=[lemmatizer.lemmatize(word=word,pos='n') for word in words]
lemmatizeddf= pd.DataFrame({'original_word': words,'lemmatized_word': lemmatized_words})
lemmatizeddf=lemmatizeddf[['original_word','lemmatized_word']]
lemmatizeddf

Unnamed: 0,original_word,lemmatized_word
0,goose,goose
1,geese,goose


## Stop Word Removal

In [9]:
stopwords=['this','that','and','a','we','it','to','is','of','up','need']
text="this is a text full of content and we need to clean it up"

In [10]:
words=text.split(" ")
shortlisted_words=[]

#remove stop words
for w in words:
    if w not in stopwords:
        shortlisted_words.append(w)
    else:
        shortlisted_words.append("W")

print("original sentence = ",text)    
print("sentence with stop words removed= ",' '.join(shortlisted_words))

original sentence =  this is a text full of content and we need to clean it up
sentence with stop words removed=  W W W text full W content W W W W clean W W


## Noise Removal

In [11]:
import nltk
import pandas as pd
import re
from nltk.stem import PorterStemmer

porter_stemmer=PorterStemmer()

In [12]:
# stem raw words with noise
raw_words=["..trouble..","trouble<","trouble!","<a>trouble</a>",'1.trouble']
stemmed_words=[porter_stemmer.stem(word=word) for word in raw_words]
stemdf= pd.DataFrame({'raw_word': raw_words,'stemmed_word': stemmed_words})
stemdf

Unnamed: 0,raw_word,stemmed_word
0,..trouble..,..trouble..
1,trouble<,trouble<
2,trouble!,trouble!
3,<a>trouble</a>,<a>trouble</a>
4,1.trouble,1.troubl


In [13]:
def scrub_words(text):
    """Basic cleaning of texts."""
    
    # remove html markup
    text=re.sub("(<.*?>)","",text)
    
    #remove non-ascii and digits
    text=re.sub("(\\W|\\d)"," ",text)
    
    #remove whitespace
    text=text.strip()
    return text

In [14]:
# stem words already cleaned
cleaned_words=[scrub_words(w) for w in raw_words]
cleaned_stemmed_words=[porter_stemmer.stem(word=word) for word in cleaned_words]
stemdf= pd.DataFrame({'raw_word': raw_words,'cleaned_word':cleaned_words,'stemmed_word': cleaned_stemmed_words})
stemdf=stemdf[['raw_word','cleaned_word','stemmed_word']]
stemdf

Unnamed: 0,raw_word,cleaned_word,stemmed_word
0,..trouble..,trouble,troubl
1,trouble<,trouble,troubl
2,trouble!,trouble,troubl
3,<a>trouble</a>,trouble,troubl
4,1.trouble,trouble,troubl


## Cosine Similarity

In [39]:
#https://www.machinelearningplus.com/nlp/cosine-similarity/
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [18]:
# Define the documents
doc_trump = "Mr. Trump became president after winning the political election. Though he lost the support of some republican friends, Trump is friends with President Putin"

doc_election = "President Trump says Putin had no political interference is the election outcome. He says it was a witchhunt by political parties. He claimed President Putin is a friend who had nothing to do with the election"

doc_putin = "Post elections, Vladimir Putin became President of Russia. President Putin had served as the Prime Minister earlier in his political career"

documents = [doc_trump, doc_election, doc_putin]

In [28]:
# Create the Document Term Matrix
count_vectorizer = CountVectorizer(stop_words='english')
sparse_matrix = count_vectorizer.fit_transform(documents)
# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
doc_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=['doc_trump', 'doc_election', 'doc_putin'])
df

Unnamed: 0,career,claimed,earlier,election,elections,friend,friends,interference,lost,minister,...,putin,republican,russia,says,served,support,trump,vladimir,winning,witchhunt
doc_trump,0,0,0,1,0,0,2,0,1,0,...,1,1,0,0,0,1,2,0,1,0
doc_election,0,1,0,2,0,1,0,1,0,0,...,2,0,0,2,0,0,1,0,0,1
doc_putin,1,0,1,0,1,0,0,0,0,1,...,2,0,1,0,1,0,0,1,0,0


In [31]:
# Create the Document Term Matrix

#could have used the TfidfVectorizer() instead of CountVectorizer(), 
#because it would have downweighted words that occur frequently across docuemnts.

count_vectorizer = TfidfVectorizer(stop_words='english')
sparse_matrix = count_vectorizer.fit_transform(documents)
# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
doc_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=['doc_trump', 'doc_election', 'doc_putin'])
df

Unnamed: 0,career,claimed,earlier,election,elections,friend,friends,interference,lost,minister,...,putin,republican,russia,says,served,support,trump,vladimir,winning,witchhunt
doc_trump,0.0,0.0,0.0,0.203368,0.0,0.0,0.53481,0.0,0.267405,0.0,...,0.157934,0.267405,0.0,0.0,0.0,0.267405,0.406737,0.0,0.267405,0.0
doc_election,0.0,0.241982,0.0,0.368067,0.0,0.241982,0.0,0.241982,0.0,0.0,...,0.285837,0.0,0.0,0.483963,0.0,0.0,0.184033,0.0,0.0,0.241982
doc_putin,0.287012,0.0,0.287012,0.0,0.287012,0.0,0.0,0.0,0.0,0.287012,...,0.339028,0.0,0.287012,0.0,0.287012,0.0,0.0,0.287012,0.0,0.0


In [34]:
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances

In [33]:
print(cosine_similarity(df, df))

[[1.         0.33027897 0.18740386]
 [0.33027897 1.         0.24226661]
 [0.18740386 0.24226661 1.        ]]


In [35]:
print(cosine_distances(df,df))

[[0.         0.66972103 0.81259614]
 [0.66972103 0.         0.75773339]
 [0.81259614 0.75773339 0.        ]]


## Soft Cosine

we need to consider the semantic meaning should be considered. That is, words similar in meaning should be treated as similar. For Example, ‘President’ vs ‘Prime minister’, ‘Food’ vs ‘Dish’, ‘Hi’ vs ‘Hello’ should be considered similar.

In [36]:
# Define the documents
doc_soup = "Soup is a primarily liquid food, generally served warm or hot (but may be cool or cold), that is made by combining ingredients of meat or vegetables with stock, juice, water, or another liquid. "

doc_noodles = "Noodles are a staple food in many cultures. They are made from unleavened dough which is stretched, extruded, or rolled flat and cut into one of a variety of shapes."

doc_dosa = "Dosa is a type of pancake from the Indian subcontinent, made from a fermented batter. It is somewhat similar to a crepe in appearance. Its main ingredients are rice and black gram."

documents = [doc_trump, doc_election, doc_putin, doc_soup, doc_noodles, doc_dosa]

In [4]:
import gensim
# upgrade gensim if you can't import softcossim
from gensim.matutils import softcossim 
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess
print(gensim.__version__)
#> '3.6.0'

# Download the FastText model
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')



3.6.0


To compute soft cosines, you need the dictionary (a map of word toTo compute soft cosines, you need the dictionary (a map of word to unique id), the corpus (word counts) for each sentence and the similarity matrix. unique id), 

In [38]:
# Prepare a dictionary and a corpus.
dictionary = corpora.Dictionary([simple_preprocess(doc) for doc in documents])

# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

# Convert the sentences into bag-of-words vectors.
sent_1 = dictionary.doc2bow(simple_preprocess(doc_trump))
sent_2 = dictionary.doc2bow(simple_preprocess(doc_election))
sent_3 = dictionary.doc2bow(simple_preprocess(doc_putin))
sent_4 = dictionary.doc2bow(simple_preprocess(doc_soup))
sent_5 = dictionary.doc2bow(simple_preprocess(doc_noodles))
sent_6 = dictionary.doc2bow(simple_preprocess(doc_dosa))

sentences = [sent_1, sent_2, sent_3, sent_4, sent_5, sent_6]

  if np.issubdtype(vec.dtype, np.int):


In [41]:
# Compute soft cosine similarity
print(softcossim(sent_1, sent_2, similarity_matrix))

0.5842469768347521


In [46]:
col  = ["doc_trump","doc_election","doc_putin","doc_soup","doc_noodles","doc_dosa"]

def create_soft_cossim_matrix(sentences):
    len_array = np.arange(len(sentences))
    xx, yy = np.meshgrid(len_array, len_array)
    cossim_mat = pd.DataFrame([[round(softcossim(sentences[i],sentences[j], similarity_matrix) ,2) for i, j in zip(x,y)] for y, x in zip(xx, yy)],
                             columns=col, index = col )
    return cossim_mat

create_soft_cossim_matrix(sentences)

Unnamed: 0,doc_trump,doc_election,doc_putin,doc_soup,doc_noodles,doc_dosa
doc_trump,1.0,0.58,0.56,0.28,0.34,0.4
doc_election,0.58,1.0,0.54,0.25,0.31,0.43
doc_putin,0.56,0.54,1.0,0.19,0.25,0.36
doc_soup,0.28,0.25,0.19,1.0,0.5,0.38
doc_noodles,0.34,0.31,0.25,0.5,1.0,0.56
doc_dosa,0.4,0.43,0.36,0.38,0.56,1.0


## TF-IDF

In [47]:
S1 = "The car is driven on the road."
S2 = "The truck is driven on the highway."

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [53]:
vec = TfidfVectorizer()
out = vec.fit_transform([S1,S2]) # see it pass as a list
print(out)

  (0, 6)	0.6043795515372431
  (0, 0)	0.42471718586982765
  (0, 3)	0.30218977576862155
  (0, 1)	0.30218977576862155
  (0, 4)	0.30218977576862155
  (0, 5)	0.42471718586982765
  (1, 6)	0.6043795515372431
  (1, 3)	0.30218977576862155
  (1, 1)	0.30218977576862155
  (1, 4)	0.30218977576862155
  (1, 7)	0.42471718586982765
  (1, 2)	0.42471718586982765


In [56]:
vec.get_feature_names()
#so the output above is document number and word index in the get feature names

['car', 'driven', 'highway', 'is', 'on', 'road', 'the', 'truck']

In [59]:
type(out)

scipy.sparse.csr.csr_matrix

In [63]:
doc_term_matrix = out.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=vec.get_feature_names(), 
                  index=["S1","S2"])
df

Unnamed: 0,car,driven,highway,is,on,road,the,truck
S1,0.424717,0.30219,0.0,0.30219,0.30219,0.424717,0.60438,0.0
S2,0.0,0.30219,0.424717,0.30219,0.30219,0.0,0.60438,0.424717


In [8]:
corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
   
          #https://towardsdatascience.com/understanding-feature-engineering-part-3-traditional-methods-for-text-data-f6f7d70acd41
]

# Count Vector

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk as nk

  return f(*args, **kwds)


In [3]:
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(corpus)
cv_matrix = cv_matrix.toarray()
cv_matrix

array([[1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 0, 0],
       [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
        0, 2, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
        0, 0, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0,
        0, 2, 0, 0, 0, 0],
       [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
        2, 2, 0, 0, 1, 2],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0,
        0, 2, 0, 0, 0, 0]], dtype=int64)

In [9]:
t = nk.bigrams(corpus)
print(t)

<generator object bigrams at 0x000001C874552D00>
