# Import Libraries

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plot
%matplotlib inline

# Load Data

In [2]:
# Read the people table from CSV and preview the first rows
# (the preview shows URI, name and text columns).
wiki_csv_path = 'people_wiki.csv'
people = pd.read_csv(wiki_csv_path)
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


# CountVectorizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from every article's text and build the sparse
# document-term count matrix `f` in a single pass.
article_texts = people['text']
cv = CountVectorizer()
f = cv.fit_transform(article_texts)

In [4]:
# Total occurrences of every vocabulary term across the corpus: the column
# sums of the document-term matrix, flattened to a plain list of ints.
# np.asarray(...).ravel() works whether .sum() returns a 1 x N matrix or a
# 1-D array.
feature_count = np.asarray(f.sum(axis=0)).ravel().tolist()

# Vocabulary terms, aligned position-by-position with feature_count.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement. .tolist() keeps the original
# list-of-str type.
feature_names = cv.get_feature_names_out().tolist()

## Word Count for Elton John

In [5]:
# Fit a separate vectorizer on Elton John's article only, so its vocabulary
# contains just the words that occur in that one text.
ej_text = people.loc[people['name'] == 'Elton John', 'text']
cv_ej = CountVectorizer()
f_ej = cv_ej.fit_transform(ej_text)

In [6]:
# Index label of Elton John's row in `people`.
index_ej = people[people['name']=='Elton John'].index.values[0]

# Per-word counts in Elton John's article: column sums of the single-document
# count matrix, flattened to a plain list of ints.
feature_count2 = np.asarray(f_ej.sum(axis=0)).ravel().tolist()

# Words aligned position-by-position with feature_count2.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement. .tolist() keeps the original
# list-of-str type.
feature_name2 = cv_ej.get_feature_names_out().tolist()

In [7]:
# Pair each word with its count and list the most frequent words first
word_counts_ej = pd.DataFrame({'Feature': feature_name2, 'Count': feature_count2})
word_counts_ej.sort_values(by='Count', ascending=False)

Unnamed: 0,Feature,Count
227,the,27
135,in,18
42,and,15
176,of,13
118,has,9
...,...,...
97,events,1
99,fellow,1
100,fifty,1
101,fight,1


# TF-IDF Vectorizer

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn vocabulary and IDF weights from every article's text and produce the
# sparse TF-IDF matrix (one row per article) in a single pass.
article_texts = people['text']
tf = TfidfVectorizer()
tf_matrix = tf.fit_transform(article_texts)

## TF-IDF DataFrame

In [9]:
# All vocabulary terms, aligned with the columns of tf_matrix.
# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement. .tolist() keeps the original
# list-of-str type.
feature_ej_name = tf.get_feature_names_out().tolist()

# Find EJ's row. This is the index label; it is used as a row position below,
# which assumes `people` still has the default 0..n-1 index.
doc = people[people.name == 'Elton John'].index.values[0]

# EJ's TF-IDF row (1 x vocabulary size, sparse)
row_ej = tf_matrix[doc, :]

# nonzero() keeps only the columns (terms) that actually occur in EJ's article
feature_ej_indice = row_ej.nonzero()[1]

# Read every score in one vectorised step instead of performing one sparse
# scalar lookup per term, then pair each column index with its score.
scores_ej = row_ej.toarray().ravel()[feature_ej_indice]
tfidf_scores = list(zip(feature_ej_indice, scores_ej))

# Dictionary mapping each term to a one-element list holding its TF-IDF score
# (the list becomes the single 'tfidf' column below)
d = {feature_ej_name[i]: [s] for i, s in tfidf_scores}

# Visualize in a DataFrame, highest score first
pd.DataFrame.from_dict(d, orient='index', columns=['tfidf']).sort_values(by='tfidf', ascending=False)

Unnamed: 0,tfidf
the,0.243684
billboard,0.192207
john,0.188958
elton,0.184686
furnish,0.181221
...,...
who,0.017235
which,0.015950
first,0.015302
from,0.011675


## Cosine Distance

In [11]:
# Row indices: Elton John's was already located (`doc`); look up the first
# matching row for Victoria Beckham and Paul McCartney the same way.
indice_ej = doc
indice_vb, indice_pm = (
    people[people.name == person].index.values[0]
    for person in ('Victoria Beckham', 'Paul McCartney')
)

# Pull each person's TF-IDF row out of the matrix
tfidf_ej, tfidf_vb, tfidf_pm = (
    tf_matrix[row_index, :]
    for row_index in (indice_ej, indice_vb, indice_pm)
)

In [12]:
# Cosine distance from Elton John's TF-IDF row to Victoria Beckham's and then
# to Paul McCartney's, printed in that order.
from sklearn.metrics.pairwise import cosine_distances
for other_tfidf in (tfidf_vb, tfidf_pm):
    print(cosine_distances(tfidf_ej, other_tfidf))

[[0.85192118]]
[[0.69231325]]


# Nearest Neighbors

In [23]:
from sklearn.neighbors import NearestNeighbors
def get_closest_neighbor(name, matrix, n_neighbors=5):
    """Return the articles closest to `name` by cosine distance.

    Parameters
    ----------
    name : str
        Value to look up in ``people['name']``; the first matching row is used.
    matrix : sparse matrix or 2-D array
        Feature matrix whose i-th row describes the i-th row of ``people``.
    n_neighbors : int, default 5
        Number of rows to return. The query article is itself a row of
        `matrix`, so it is normally among the results.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` and ``distance``, in the order returned by
        ``kneighbors``.

    Raises
    ------
    IndexError
        If no row of ``people`` has the requested name.

    Notes
    -----
    The nearest-neighbour model is refitted on `matrix` on every call.
    """
    # Work with row *positions* throughout: matrix rows and the indices
    # returned by kneighbors are positional, so the lookup must be as well
    # (an index label would only coincide with the position for a default
    # 0..n-1 index).
    matches = np.flatnonzero((people.name == name).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no article named {name!r} in `people`")
    row = matches[0]

    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine').fit(matrix)

    # matrix[[row]] keeps the query two-dimensional (1 x n_features) for both
    # sparse and dense inputs.
    distances, indices = knn.kneighbors(matrix[[row]])

    # Translate neighbour positions back to article names
    positions = indices.ravel()
    return pd.DataFrame({
        'name': people['name'].to_numpy()[positions],
        'distance': distances.ravel(),
    })

## CountVectorizer Matrix

In [24]:
# Neighbours of Elton John using raw word counts (CountVectorizer matrix `f`)
get_closest_neighbor(name='Elton John', matrix=f)

Unnamed: 0,name,distance
0,Elton John,2.442491e-15
1,Cliff Richard,0.1687792
2,Sandro Petrone,0.171841
3,Rod Stewart,0.1744907
4,Roger Daltrey,0.184013


## TF-IDF Matrix

In [25]:
# Neighbours of Elton John using TF-IDF weights (`tf_matrix`)
get_closest_neighbor(name='Elton John', matrix=tf_matrix)

Unnamed: 0,name,distance
0,Elton John,0.0
1,Rod Stewart,0.589361
2,Phil Collins,0.633658
3,Adele,0.636524
4,Sting (musician),0.642397


## Other People

In [26]:
# Neighbours of Victoria Beckham using raw word counts (CountVectorizer matrix `f`)
get_closest_neighbor(name='Victoria Beckham', matrix=f)

Unnamed: 0,name,distance
0,Victoria Beckham,3.330669e-16
1,Mary Fitzgerald (artist),0.2115428
2,Adrienne Corri,0.2185431
3,Beverly Jane Fry,0.2218932
4,Raman Mundair,0.2224486


In [28]:
# Neighbours of Victoria Beckham using TF-IDF weights (`tf_matrix`)
get_closest_neighbor(name='Victoria Beckham', matrix=tf_matrix)

Unnamed: 0,name,distance
0,Victoria Beckham,0.0
1,David Beckham,0.546477
2,Mel B,0.718422
3,Stephen Dow Beckham,0.745956
4,Hilary Alexander,0.751848
