In [22]:
import re
import string

import numpy as np

In [23]:
# Toy corpus: every line of the literal is one document.
# The literal starts with a newline, so the first split item is empty and is dropped.
corpus = """
Simple example with Cats and Mouse
Another simple example with dogs and cats
Another simple example with mouse and cheese
""".splitlines()[1:]

In [24]:
# Show the parsed corpus: a list with one string per document.
corpus

['Simple example with Cats and Mouse',
 'Another simple example with dogs and cats',
 'Another simple example with mouse and cheese']

In [25]:
# cleaning (lower-casing) and tokenizing
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Lower-case each of the three documents, then split it into word tokens.
l_A, l_B, l_C = [word_tokenize(corpus[i].lower()) for i in range(3)]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
# Bag of words: the shared vocabulary is the union of all tokens seen in any document.
word_set = set(l_A) | set(l_B) | set(l_C)

# Every document gets its own counter over the full vocabulary, starting at zero,
# so words a document does not contain are still present with count 0.
word_dict_A = dict.fromkeys(word_set, 0)
word_dict_B = dict.fromkeys(word_set, 0)
word_dict_C = dict.fromkeys(word_set, 0)

# Tally the occurrences of each token in its own document's counter.
for tokens, counts in ((l_A, word_dict_A), (l_B, word_dict_B), (l_C, word_dict_C)):
    for word in tokens:
        counts[word] += 1

In [27]:
# Inspect the word counts for document A; vocabulary words it lacks stay at 0.
word_dict_A

{'dogs': 0,
 'cats': 1,
 'simple': 1,
 'cheese': 0,
 'example': 1,
 'another': 0,
 'and': 1,
 'with': 1,
 'mouse': 1}

In [28]:
def compute_tf(word_dict, l):
    """Compute the term frequency of every vocabulary word for one document.

    Args:
        word_dict: Mapping of word -> raw count of that word in the document.
        l: The document's token list; its length is the normalising total.

    Returns:
        A dict mapping each key of ``word_dict`` to ``count / len(l)``.
        For an empty document every frequency is ``0.0`` (instead of raising
        ``ZeroDivisionError``).
    """
    sum_nk = len(l)
    if sum_nk == 0:
        # No tokens at all: nothing can have a non-zero frequency.
        return {word: 0.0 for word in word_dict}
    return {word: count / sum_nk for word, count in word_dict.items()}
  
# Term frequencies for each document: pair every count dict with its token list.
tf_A, tf_B, tf_C = [
    compute_tf(counts, tokens)
    for counts, tokens in (
        (word_dict_A, l_A),
        (word_dict_B, l_B),
        (word_dict_C, l_C),
    )
]

In [29]:
# Term frequencies for document A (count divided by the document's token total).
tf_A

{'dogs': 0.0,
 'cats': 0.16666666666666666,
 'simple': 0.16666666666666666,
 'cheese': 0.0,
 'example': 0.16666666666666666,
 'another': 0.0,
 'and': 0.16666666666666666,
 'with': 0.16666666666666666,
 'mouse': 0.16666666666666666}

In [30]:
# Term frequencies for document B (count divided by the document's token total).
tf_B

{'dogs': 0.14285714285714285,
 'cats': 0.14285714285714285,
 'simple': 0.14285714285714285,
 'cheese': 0.0,
 'example': 0.14285714285714285,
 'another': 0.14285714285714285,
 'and': 0.14285714285714285,
 'with': 0.14285714285714285,
 'mouse': 0.0}

In [31]:
# Term frequencies for document C (count divided by the document's token total).
tf_C

{'dogs': 0.0,
 'cats': 0.0,
 'simple': 0.14285714285714285,
 'cheese': 0.14285714285714285,
 'example': 0.14285714285714285,
 'another': 0.14285714285714285,
 'and': 0.14285714285714285,
 'with': 0.14285714285714285,
 'mouse': 0.14285714285714285}

In [32]:
def compute_idf(strings_list):
    """Compute the inverse document frequency of every word.

    Args:
        strings_list: List of per-document dicts mapping word -> raw count.
            A word counts as present in a document when its count is > 0.

    Returns:
        A dict mapping each word to ``np.log(n / df)``, where ``n`` is the
        number of documents and ``df`` is the number of documents containing
        the word. Words that occur in no document get ``0.0`` instead of
        triggering a division by zero. An empty ``strings_list`` yields ``{}``.
        Keys follow first-seen order across the input dicts, so words missing
        from the first dict are handled too.
    """
    n = len(strings_list)

    # Document frequency: in how many documents does each word appear?
    doc_freq = {}
    for word_counts in strings_list:
        for word, count in word_counts.items():
            doc_freq.setdefault(word, 0)
            if count > 0:
                doc_freq[word] += 1

    idf = {}
    for word, df in doc_freq.items():
        # df == 0 means the word is in the vocabulary but in no document;
        # n / 0 is undefined, so such a word is given zero weight.
        idf[word] = np.log(n / float(df)) if df > 0 else np.float64(0.0)
    return idf
    
# IDF is computed once over all documents' count dicts, then displayed.
count_dicts = [word_dict_A, word_dict_B, word_dict_C]
idf = compute_idf(count_dicts)
idf

{'dogs': np.float64(1.0986122886681098),
 'cats': np.float64(0.4054651081081644),
 'simple': np.float64(0.0),
 'cheese': np.float64(1.0986122886681098),
 'example': np.float64(0.0),
 'another': np.float64(0.4054651081081644),
 'and': np.float64(0.0),
 'with': np.float64(0.0),
 'mouse': np.float64(0.4054651081081644)}

In [33]:
def compute_tf_idf(tf, idf):
    """Weight each term frequency by that term's inverse document frequency.

    Args:
        tf: Mapping of word -> term frequency for one document.
        idf: Mapping of word -> inverse document frequency; must contain
            every key of ``tf``.

    Returns:
        A dict with the keys of ``tf`` mapped to ``tf[word] * idf[word]``.
    """
    return {term: freq * idf[term] for term, freq in tf.items()}
    
# Combine each document's term frequencies with the shared IDF table.
tf_idf_A, tf_idf_B, tf_idf_C = [
    compute_tf_idf(doc_tf, idf) for doc_tf in (tf_A, tf_B, tf_C)
]

In [34]:
# TF-IDF weights for document A (term frequency times IDF).
tf_idf_A

{'dogs': np.float64(0.0),
 'cats': np.float64(0.06757751801802739),
 'simple': np.float64(0.0),
 'cheese': np.float64(0.0),
 'example': np.float64(0.0),
 'another': np.float64(0.0),
 'and': np.float64(0.0),
 'with': np.float64(0.0),
 'mouse': np.float64(0.06757751801802739)}

In [35]:
# TF-IDF weights for document B (term frequency times IDF).
tf_idf_B

{'dogs': np.float64(0.15694461266687282),
 'cats': np.float64(0.05792358687259491),
 'simple': np.float64(0.0),
 'cheese': np.float64(0.0),
 'example': np.float64(0.0),
 'another': np.float64(0.05792358687259491),
 'and': np.float64(0.0),
 'with': np.float64(0.0),
 'mouse': np.float64(0.0)}

In [36]:
# TF-IDF weights for document C (term frequency times IDF).
tf_idf_C

{'dogs': np.float64(0.0),
 'cats': np.float64(0.0),
 'simple': np.float64(0.0),
 'cheese': np.float64(0.15694461266687282),
 'example': np.float64(0.0),
 'another': np.float64(0.05792358687259491),
 'and': np.float64(0.0),
 'with': np.float64(0.0),
 'mouse': np.float64(0.05792358687259491)}

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


# Documents to cluster: every line of the literal is one document.
# The literal starts with a newline, so the first split item is empty and is dropped.
all_text = """
 Google and Facebook are strangling the free press to death. Democracy is the loserGoogle an 
Your 60-second guide to security stuff Google touted today at Next '18
A Guide to Using Android Without Selling Your Soul to Google
Review: Lenovo’s Google Smart Display is pretty and intelligent
Google Maps user spots mysterious object submerged off the coast of Greece - and no-one knows what it is
Android is better than IOS
In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency
is a numerical statistic that is intended to reflect
how important a word is to a document in a collection or corpus.
It is often used as a weighting factor in searches of information retrieval
text mining, and user modeling. The tf-idf value increases proportionally
to the number of times a word appears in the document
and is offset by the frequency of the word in the corpus
""".splitlines()[1:]

# Preprocessing (tokenizing itself is left to the vectorizer)
def preprocessing(line):
    """Normalise one document before it is tokenized.

    Lower-cases the text and replaces every ASCII punctuation character
    (``string.punctuation``) with a single space.

    The previous pattern ``[{}]`` was an unfilled ``str.format`` placeholder
    and therefore only matched the literal characters ``{`` and ``}``.

    Args:
        line: Raw document text.

    Returns:
        The lower-cased text with punctuation replaced by spaces.
    """
    line = line.lower()
    # re.escape keeps characters such as ']', '^', '-' and '\' literal inside
    # the character class.
    line = re.sub(r"[{}]".format(re.escape(string.punctuation)), " ", line)
    return line

In [38]:
# Fit TF-IDF on the documents; `preprocessing` is passed as the vectorizer's
# preprocessor, so each document is run through it before tokenization.
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessing)
tfidf = tfidf_vectorizer.fit_transform(all_text)

# K-means starts from random centroids: without a fixed seed the cluster labels
# can change (or swap) between runs. n_init is set explicitly because its
# default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42).fit(tfidf)

In [21]:
# Map unseen sentences into the fitted TF-IDF space, then ask the fitted
# k-means model which cluster each one falls into.
lines_for_predicting = [
    "tf and idf is awesome!",
    "Selling Your Soul to Google",
]
tfidf_for_predicting = tfidf_vectorizer.transform(lines_for_predicting)
kmeans.predict(tfidf_for_predicting)


array([1, 0], dtype=int32)