In [1]:
from string import punctuation

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer,
                                             TfidfVectorizer)

In [4]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to C:\Users\Rafail
[nltk_data]     Gabdullin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Rafail
[nltk_data]     Gabdullin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Rafail
[nltk_data]     Gabdullin\AppData\Roaming\nltk_data...


True

In [5]:
def vectorize(tokens):
    ''' This function takes list of words in a sentence as input
    and returns a vector of size of filtered_vocab.It puts 0 if the
    word is not present in tokens and count of token if present.'''
    vector = []
    for w in filtered_vocab:
        vector.append(tokens.count(w))
    return vector


def unique(sequence):
    '''This functions returns a list in which the order remains
    same and no item repeats.Using the set() function does not
    preserve the original ordering,so i didnt use that instead'''
    seen = set()
    return [x for x in sequence if not (x in seen or seen.add(x))]

In [6]:
with open('./quora.txt', encoding='utf-8') as f:
    data = list(f)

tokenizer = WordPunctTokenizer()
data_tok = [tokenizer.tokenize(row.lower()) for row in data]

In [7]:
sub_data_tok = data_tok[:100]

pre_dict = []
for s in sub_data_tok:
    pre_dict.extend(s)
vocab = unique(pre_dict)

filtered_vocab = []
for w in vocab:
    if w not in stopwords.words('english') and w not in punctuation:
        filtered_vocab.append(w)

vectors = [vectorize(s) for s in sub_data_tok]

data_len = len(sub_data_tok)
cosines = {}
for i in range(data_len):
    for j in range(i + 1, data_len):
        cosines[i, j] = (np.dot(vectors[i], vectors[j]) /
                         np.linalg.norm(vectors[i]) / np.linalg.norm(vectors[j]))
max(cosines.values())
closest_pair = max(cosines, key=cosines.get)
for x in closest_pair:
    print(sub_data_tok[x])

['what', 'does', 'entertainment', 'mean', 'for', 'you', '?']
['what', 'does', '"', 'las', 'vegas', '"', 'mean', '?']


In [9]:
text = ["kolkata big city india trade", "mumbai financial capital india",
        "delhi capital india", "kolkata capital colonial times",
        "bangalore tech hub india software",
        "mumbai hub trade commerce stock exchange",
        "kolkata victoria memorial", "delhi india gate",
        "mumbai gate way india trade business", "delhi red fort india",
        "kolkata metro oldest india",
        "delhi metro largest metro network india"
        ]

count = CountVectorizer()
word_count = count.fit_transform(text)
print(word_count)
print(count.get_feature_names())
print(word_count.toarray())

  (0, 14)	1
  (0, 1)	1
  (0, 4)	1
  (0, 13)	1
  (0, 26)	1
  (1, 13)	1
  (1, 18)	1
  (1, 9)	1
  (1, 3)	1
  (2, 13)	1
  (2, 3)	1
  (2, 7)	1
  (3, 14)	1
  (3, 3)	1
  (3, 5)	1
  (3, 25)	1
  (4, 13)	1
  (4, 0)	1
  (4, 24)	1
  (4, 12)	1
  (4, 22)	1
  (5, 26)	1
  (5, 18)	1
  (5, 12)	1
  (5, 6)	1
  :	:
  (6, 14)	1
  (6, 27)	1
  (6, 16)	1
  (7, 13)	1
  (7, 7)	1
  (7, 11)	1
  (8, 13)	1
  (8, 26)	1
  (8, 18)	1
  (8, 11)	1
  (8, 28)	1
  (8, 2)	1
  (9, 13)	1
  (9, 7)	1
  (9, 21)	1
  (9, 10)	1
  (10, 14)	1
  (10, 13)	1
  (10, 17)	1
  (10, 20)	1
  (11, 13)	1
  (11, 7)	1
  (11, 17)	2
  (11, 15)	1
  (11, 19)	1
['bangalore', 'big', 'business', 'capital', 'city', 'colonial', 'commerce', 'delhi', 'exchange', 'financial', 'fort', 'gate', 'hub', 'india', 'kolkata', 'largest', 'memorial', 'metro', 'mumbai', 'network', 'oldest', 'red', 'software', 'stock', 'tech', 'times', 'trade', 'victoria', 'way']
[[0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0

In [12]:
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count)
df_idf = pd.DataFrame(tfidf_transformer.idf_,
                      index=count.get_feature_names(), columns=['idf_weights'])

df_idf.sort_values(by=['idf_weights'])

# tfidf
tf_idf_vector = tfidf_transformer.transform(word_count)
feature_names = count.get_feature_names()

In [13]:
# "mumbai financial capital india"
first_document_vector = tf_idf_vector[1]
df_tfifd = pd.DataFrame(first_document_vector.T.todense(),
                        index=feature_names, columns=["tfidf"])
df_tfifd.sort_values(by=["tfidf"], ascending=False)

Unnamed: 0,tfidf
financial,0.653123
capital,0.495483
mumbai,0.495483
india,0.287095
bangalore,0.0
memorial,0.0
victoria,0.0
trade,0.0
times,0.0
tech,0.0


In [14]:
# quora TF-IDF
count = CountVectorizer(vocabulary=filtered_vocab)
word_count = count.fit_transform([' '.join(s) for s in sub_data_tok])
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count)
tf_idf_vector = tfidf_transformer.transform(word_count)
vectors = tf_idf_vector.toarray()

In [15]:
data_len = len(sub_data_tok)
cosines = {}
for i in range(data_len):
    for j in range(i + 1, data_len):
        cosines[i, j] = (np.dot(vectors[i], vectors[j]) /
                         np.linalg.norm(vectors[i]) / np.linalg.norm(vectors[j]))
max(cosines.values())
closest_pair = max(cosines, key=cosines.get)
for x in closest_pair:
    print(sub_data_tok[x])

['what', 'is', 'your', 'review', 'of', 'osquery', '?']
['what', 'is', 'your', 'review', 'of', 'ipad', 'mini', '2', '?']
