<a href="https://colab.research.google.com/github/Loki-33/Stuffs/blob/main/clustering_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import paired_distances
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

In [None]:
# Load the Wikipedia people corpus; later cells read its 'name' and 'text' columns.
dataset = pd.read_csv('people_wiki.csv')

In [None]:
# Bag-of-words representation: learn the vocabulary from the article text and
# build a sparse document-term matrix of raw term counts (one row per article).
vect = CountVectorizer()
word_weight = vect.fit_transform(dataset['text'])

In [None]:
# Nearest-neighbour search over the count vectors using cosine distance.
nn = NearestNeighbors(metric='cosine')
nn.fit(word_weight)
# Index label of the Barack Obama row; it is used below as a row position in
# `word_weight` -- assumes the frame still has its default 0..n-1 index.
obama_index = dataset[dataset['name'] == 'Barack Obama'].index[0]
# NOTE: kneighbors returns (distances, neighbour row indices), so `metrics`
# holds row indices rather than metric values. The query article itself comes
# back as the closest neighbour (distance ~0).
distances, metrics = nn.kneighbors(word_weight[obama_index], n_neighbors=10)

In [None]:
# Tabulate the query result: one row per neighbour with its distance and row id.
neigbors = pd.DataFrame({'distance':distances.flatten(), 'id':metrics.flatten()})

In [None]:
# Attach article names to the neighbour ids and list them closest-first.
nearest_info = (
    dataset
    .merge(neigbors, left_index=True, right_on='id')
    .sort_values(by='distance')
    .loc[:, ['id', 'name', 'distance']]
)
nearest_info

Unnamed: 0,id,name,distance
0,35817,Barack Obama,1.44329e-15
1,24478,Joe Biden,0.1224048
2,25798,Sandro Petrone,0.1357289
3,28447,George W. Bush,0.1394896
4,20389,Ribal al-Assad,0.1421223
5,18873,Edward McCrorie,0.1457754
6,31404,David Floyd Lambertson,0.1484614
7,19983,Peter Paret,0.1485327
8,48163,Irving Petlin,0.1519704
9,13120,Vincent Obsitnik,0.1524563


In [None]:
def upack_word_weight(vect, word_weight):
  """Turn a sparse document-term matrix into one ``Counter`` per document.

  Parameters
  ----------
  vect : fitted vectorizer
      Any object exposing ``get_feature_names_out()``; entry ``j`` of the
      returned names labels column ``j`` of ``word_weight``.
  word_weight : scipy sparse matrix, shape (n_docs, n_features)
      Per-document term weights (raw counts or TF-IDF values).

  Returns
  -------
  list of collections.Counter
      Element ``i`` maps every term stored in row ``i`` to its weight.
  """
  feature_names = np.asarray(vect.get_feature_names_out())
  # The indptr-based row slicing below is only meaningful for CSR storage.
  # Converting first keeps the result correct for CSC/COO input as well and
  # costs nothing when the matrix is already CSR.
  csr = word_weight.tocsr()
  data, indices, indptr = csr.data, csr.indices, csr.indptr

  word_weight_list = []
  # Consecutive indptr entries delimit the stored entries of one row.
  for start, stop in zip(indptr[:-1], indptr[1:]):
    terms = feature_names[indices[start:stop]]
    word_weight_list.append(Counter(dict(zip(terms, data[start:stop]))))
  return word_weight_list

In [None]:
# Store each article's word -> count Counter as a new column (adds the column
# to `dataset` in place), then preview the first rows.
dataset['word_weight'] = upack_word_weight(vect, word_weight)
dataset.head(3)

Unnamed: 0,URI,name,text,word_weight
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'digby': 1, 'morrell': 5, 'born': 1, '10': 1,..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'is': 2, 'with': 2, 'the': 2, 'and': 10, 'in'..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'is': 7, 'who': 1, 'with': 4, 'the': 12, 'and..."


In [None]:
def get_top_words(dataset, name, column_name, top_n=None):
  """Return the highest-weighted words of one article as a table.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Must contain a ``'name'`` column and the column ``column_name``.
  name : str
      Value of the ``'name'`` column identifying the article; the first
      matching row is used.
  column_name : str
      Column whose cells are word -> weight mappings (``Counter`` or ``dict``).
  top_n : int, optional
      Number of words to keep. ``None`` (the default) keeps every word.

  Returns
  -------
  pandas.DataFrame
      Columns ``'word'`` and ``'weight'``, ordered by decreasing weight.

  Raises
  ------
  IndexError
      If no row of ``dataset`` has the requested ``name``.
  """
  matches = dataset.loc[dataset['name'] == name, column_name]
  if matches.empty:
    raise IndexError(f'no article named {name!r} in dataset')

  # Positional access picks the first match even when index labels repeat;
  # wrapping in Counter lets plain dict cells work too.
  word_weights = Counter(matches.iloc[0])

  # most_common(None) already returns every entry, so top_n passes straight through.
  return pd.DataFrame(word_weights.most_common(top_n), columns=['word', 'weight'])

In [None]:
# Full word -> count tables (no top_n limit) for the two articles being compared.
words_obama = get_top_words(dataset, name = 'Barack Obama', column_name = 'word_weight')
words_biden = get_top_words(dataset, name = 'Joe Biden', column_name = 'word_weight')

In [None]:
# Inner-join the two tables on 'word' so only words present in both articles
# remain, then label the two weight columns by person.
words_combined = pd.merge(words_obama, words_biden, on='word')
words_combined = words_combined.rename(columns={'weight_x':'Obama', 'weight_y':'Biden'})

words_combined.head(6)

Unnamed: 0,word,Obama,Biden
0,the,40,33
1,in,30,16
2,and,21,19
3,of,18,12
4,to,14,11
5,his,11,5


In [None]:
def has_words(word_weight_vec, common_words):
  """Tell whether a document contains every word in ``common_words``.

  Parameters
  ----------
  word_weight_vec : mapping
      Word -> weight mapping of a single document; only its keys are used.
  common_words : set
      Words that must all be present.

  Returns
  -------
  bool
      True when ``common_words`` is a subset of the document's words.
  """
  return common_words.issubset(word_weight_vec.keys())

In [None]:
# Top-5 words shared by both articles. They are taken from the joined
# Obama/Biden table rather than from Obama's table alone, so a word that only
# one of the two articles uses can never be counted as "common".
common_words = set(words_combined['word'].head(5))

# Flag every article whose vocabulary contains all of those words, then count them.
dataset['has_top_words'] = dataset['word_weight'].apply(has_words, args=(common_words,))
dataset['has_top_words'].sum()

np.int64(56066)

In [None]:
# TF-IDF

In [None]:
# Toy corpus for the TF-IDF walkthrough: two short documents plus a third that
# combines them, giving a vocabulary of seven distinct terms.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'
])


In [None]:
# Dense term-frequency matrix for the toy corpus (one row per document).
# NOTE: this rebinds `vect`, replacing the vectorizer fitted on the Wikipedia text.
vect= CountVectorizer()
tf = vect.fit_transform(docs).toarray()
tf

array([[0, 1, 1, 1, 0, 1, 0],
       [0, 1, 0, 0, 1, 1, 1],
       [1, 2, 1, 1, 1, 2, 1]])

In [None]:
# Term -> column index mapping learned by the vectorizer for the toy corpus.
vect.vocabulary_

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}

In [None]:
# Manual TF-IDF for the first document:
#   idf(t) = ln(n_docs / df(t)) + 1,   tf-idf(t) = tf(t) * idf(t)
# where df(t) is the number of documents that contain term t.
n_docs = len(docs)
df = np.count_nonzero(tf, axis=0)
idf = 1 + np.log(n_docs / df)
tf_idf = idf * tf[0]
print(tf_idf, end='\n\n')

# Reference result from scikit-learn: unsmoothed idf and no normalisation,
# which matches the manual formula above.
tfidf = TfidfTransformer(norm=None, use_idf=True, smooth_idf=False)
doc_tfidf = tfidf.fit_transform(tf).toarray()
print(doc_tfidf[0])

assert np.allclose(tf_idf, doc_tfidf[0])

[0.         1.         1.40546511 1.40546511 0.         1.
 0.        ]

[0.         1.         1.40546511 1.40546511 0.         1.
 0.        ]


In [None]:
# The problem with the calculation above is that terms which appear more often
# are favoured, making them look more important than they really are.
# The fix is simply to normalize the TF-IDF vector.

In [None]:
# Manual L2 normalisation: divide the first document's TF-IDF vector by its
# Euclidean length.
tf_norm = tf_idf / np.sqrt((tf_idf ** 2).sum())
print(tf_norm, end='\n\n')

# Reference result from scikit-learn with norm='l2'.
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=False)
doc_tfidf = tfidf.fit_transform(tf).toarray()
print(doc_tfidf[0])

assert np.allclose(tf_norm, doc_tfidf[0])


[0.         0.40993715 0.57615236 0.57615236 0.         0.40993715
 0.        ]

[0.         0.40993715 0.57615236 0.57615236 0.         0.40993715
 0.        ]


In [None]:
# Reload the corpus from disk, rebinding `dataset` to a fresh frame.
# NOTE(review): columns added to the previous frame by earlier cells
# ('word_weight', 'has_top_words') are not part of a freshly read frame, yet a
# saved output further down still lists them -- the notebook was presumably run
# out of order; confirm with Restart & Run All.
dataset=pd.read_csv('people_wiki.csv')

In [None]:
# Preview the first rows of the frame.
dataset.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [None]:
# TF-IDF vectorizer with scikit-learn's default settings.
tfidf_vect = TfidfVectorizer()

In [None]:
# Learn the vocabulary and idf from the article text and build the sparse
# TF-IDF document-term matrix (one row per article).
tfidf_weights = tfidf_vect.fit_transform(dataset['text'])

In [None]:
# Brute-force nearest-neighbour index over the TF-IDF vectors, using cosine distance.
nn_cosine = NearestNeighbors(metric='cosine', algorithm='brute')
nn_cosine.fit(tfidf_weights)

In [None]:
# Index label of the Barack Obama row; it is used as a row position in
# `tfidf_weights` -- assumes the frame has its default 0..n-1 index.
obama_index = dataset[dataset['name'] == 'Barack Obama'].index[0]
# Distances and row indices of the 100 closest articles (the query article included).
cosine, indices = nn_cosine.kneighbors(tfidf_weights[obama_index], n_neighbors=100)

In [None]:
# Tabulate the query result: one row per neighbour with its cosine distance and row id.
neigbors_cosine = pd.DataFrame({'cosine':cosine.flatten(), 'id':indices.flatten()})

In [None]:
# Attach article names to the TF-IDF neighbour ids and order them closest-first.
nearest_info = (
    dataset
    .merge(neigbors_cosine, left_index=True, right_on='id')
    .sort_values(by='cosine')
    .loc[:, ['id', 'name', 'cosine']]
)

In [None]:
# Show the five closest articles under TF-IDF cosine distance.
nearest_info.head()

Unnamed: 0,id,name,cosine
0,35817,Barack Obama,0.0
1,24478,Joe Biden,0.570781
2,57108,Hillary Rodham Clinton,0.615934
3,38376,Samantha Power,0.624993
4,38714,Eric Stern (politician),0.649765


In [None]:
def upack_word_weight(vect, word_weight):
  """Turn a sparse document-term matrix into one ``Counter`` per document.

  This re-defines the helper of the same name from earlier in the notebook;
  from this cell on, this definition is the one in effect.

  Parameters
  ----------
  vect : fitted vectorizer
      Any object exposing ``get_feature_names_out()``; entry ``j`` of the
      returned names labels column ``j`` of ``word_weight``.
  word_weight : scipy sparse matrix, shape (n_docs, n_features)
      Per-document term weights (raw counts or TF-IDF values).

  Returns
  -------
  list of collections.Counter
      Element ``i`` maps every term stored in row ``i`` to its weight.
  """
  feature_names = np.asarray(vect.get_feature_names_out())
  # The indptr-based row slicing below is only meaningful for CSR storage.
  # Converting first keeps the result correct for CSC/COO input as well and
  # costs nothing when the matrix is already CSR.
  csr = word_weight.tocsr()
  data, indices, indptr = csr.data, csr.indices, csr.indptr

  word_weight_list = []
  # Consecutive indptr entries delimit the stored entries of one row.
  for start, stop in zip(indptr[:-1], indptr[1:]):
    terms = feature_names[indices[start:stop]]
    word_weight_list.append(Counter(dict(zip(terms, data[start:stop]))))
  return word_weight_list

In [None]:
# Store each article's word -> TF-IDF weight Counter as a new column (adds the
# column to `dataset` in place), then preview the first rows.
dataset['tfidf_weight'] = upack_word_weight(tfidf_vect, tfidf_weights)
dataset.head(3)

Unnamed: 0,URI,name,text,word_weight,has_top_words,tfidf_weight
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'digby': 1, 'morrell': 5, 'born': 1, '10': 1,...",True,"{'digby': 0.09377484096114971, 'morrell': 0.51..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'is': 2, 'with': 2, 'the': 2, 'and': 10, 'in'...",True,"{'is': 0.018289389212318905, 'with': 0.0214013..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'is': 7, 'who': 1, 'with': 4, 'the': 12, 'and...",True,"{'is': 0.08336282198056562, 'who': 0.022133364..."


In [None]:
# Full word -> TF-IDF weight tables for the two articles being compared.
words_obama = get_top_words(dataset, 'Barack Obama', 'tfidf_weight')
words_biden = get_top_words(dataset, 'Joe Biden', 'tfidf_weight')

# Inner-join on 'word' so only words present in both articles remain, then
# label the two weight columns by person.
words_combined = pd.merge(words_obama, words_biden, on='word')
words_combined = words_combined.rename(columns={'weight_x': 'Obama', 'weight_y': 'Biden'})
words_combined.head(6)


Unnamed: 0,word,Obama,Biden
0,obama,0.365018,0.174794
1,the,0.279323,0.248287
2,act,0.249089,0.167737
3,in,0.209673,0.120486
4,iraq,0.151809,0.040891
5,and,0.146739,0.143045


In [None]:
# Five highest-ranked words shared by the two articles under TF-IDF weighting.
common_words = set(words_combined['word'].iloc[:5])
print('top 5 common words: ', common_words)

# Flag every article whose TF-IDF vocabulary contains all of those words
# (this overwrites any existing 'has_top_words' column), then report the count.
dataset['has_top_words'] = dataset['tfidf_weight'].apply(
    lambda weights: has_words(weights, common_words)
)
print('number of articles that also contain the common words: ', dataset['has_top_words'].sum())


top 5 common words:  {'obama', 'in', 'the', 'iraq', 'act'}
number of articles that also contain the common words:  5
