# K Nearest Neighbors - Document Similarity with Wikipedia Articles

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.neighbors import NearestNeighbors

import sklearn.metrics.pairwise as smp

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 200)

sns.set(context='notebook', style='whitegrid')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [4]:
rnd_seed=23
np.random.seed(rnd_seed)

## Load text data - from wikipedia: pages on people

In [5]:
people_df = pd.read_csv('data/people_wiki.csv')

Data contains:  link to wikipedia article, name of person, text of article.

In [6]:
people_df.head(5)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former australian rules footballer who played with the kangaroos and carlton in the australian football league aflfrom western australia morrell played his ...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from university of chicago in 1973 after studying psychiatry pharmacology and ophthalmology he is a full professor and vicechair of the department of psychia...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player who has been active in canadas blues scene since 1982 hailing from vancouver he crossed tens of thousands of miles playing club dates and festivals i...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower austria austria on 18 january 1942 is an austrian publisher and critic in the fields of science fiction and the fantasticrottensteiner studied journ...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn better known by his stagename genka is an estonian rapper and record producergenka started rapping in 1996 along with revo and dj paul oja who was gen...


In [7]:
# Drop the URI column, its redundant here
people_df = people_df.drop('URI', axis=1)

## Preparing a corpus for Training

Here we will be using here is called a **tf-idf** model. In this kind of model we simplify documents to a multi-set of tf-idf scores.

In [8]:
import re, nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

In [9]:
def tokenize(text):
    # remove non letters
    text = re.sub("[^a-zA-Z]", " ", text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # stem
    stems = stem_tokens(tokens, stemmer)
    return stems

In [10]:
tfidf_vectorizer = TfidfVectorizer(analyzer = 'word', tokenizer=tokenize, lowercase=True, stop_words='english', max_features=200)

In [11]:
# The method fit_transform does two functions: 
# First, it fits the model and learns the vocabulary
# second, it transforms corpus data into tf-idf feature vectors
tfidf_matrix_sparse = tfidf_vectorizer.fit_transform(people_df.text)

In [12]:
tfidf_matrix_dense = tfidf_matrix_sparse.toarray()
tfidf_matrix_dense.shape

(59071, 200)

In [13]:
# Take a look at how the Document Tf-Idf Matrix looks
tfidf_matrix_df = pd.DataFrame(tfidf_matrix_dense, columns=tfidf_vectorizer.get_feature_names())
tfidf_matrix_df.head(5)

Unnamed: 0,academi,age,album,american,appear,appoint,april,art,artist,assist,associ,attend,august,australian,author,award,band,base,becam,becom,befor,began,best,board,book,...,state,studi,success,team,televis,th,thi,time,titl,took,tour,unit,univers,use,wa,went,win,women,won,work,world,write,writer,year,york
0,0.0,0.057292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061127,0.0,0.0,0.0,0.219382,0.0,0.0,0.0,0.0,0.0,0.0,0.094004,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038692,0.0,0.0,0.0,0.0,0.0,0.0,0.101884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.306564,0.0,0.0,0.0,0.0,0.0,0.125706,0.0,0.0,0.0,0.0,0.221496,0.0,0.066202,0.0,0.0,0.0,0.0,0.318631,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.135355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111055,0.0,0.0,0.0,0.0,0.276082,0.431636,0.0,0.0,0.0,0.0,0.0,0.413871,0.0,0.0,...,0.088235,0.0,0.0,0.107634,0.0,0.0,0.0,0.083847,0.0,0.0,0.135139,0.102138,0.0,0.0,0.176629,0.0,0.0,0.0,0.097528,0.070843,0.0,0.0,0.0,0.137813,0.0
3,0.0,0.0,0.0,0.059951,0.0,0.0,0.0,0.0,0.0,0.0,0.076806,0.0,0.0,0.0,0.246295,0.063646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.300572,...,0.061024,0.07071,0.0,0.0,0.0,0.0,0.064487,0.0,0.0,0.0,0.0,0.070639,0.051088,0.0,0.091618,0.0,0.0,0.0,0.0,0.244976,0.0,0.0,0.188993,0.190623,0.076131
4,0.0,0.0,0.573702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.065869,0.0,0.0,0.0,0.19093,0.0,0.0,0.0,0.124774,0.088211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.097354,0.0


In [14]:
len(people_df), tfidf_matrix_df.shape

(59071, (59071, 200))

In [15]:
tf_idf = list()
for i in range(0,len(people_df)):
    tf_idf_dict = tfidf_matrix_df.iloc[i].to_dict()
    tf_idf_dict = {key: value for key, value in tf_idf_dict.items() if value != 0}
    #print(d)
    tf_idf.append(tf_idf_dict)

In [16]:
people_df['tf-idf'] = pd.Series(tf_idf)
people_df.head(10)

Unnamed: 0,name,text,tf-idf
0,Digby Morrell,digby morrell born 10 october 1979 is a former australian rules footballer who played with the kangaroos and carlton in the australian football league aflfrom western australia morrell played his ...,"{'earli': 0.0552094798347, 'australian': 0.219382423277, 'colleg': 0.0479716014887, 'senior': 0.185626141662, 'continu': 0.057845346429, 'age': 0.0572923127699, 'leagu': 0.231441809379, 'wa': 0.10..."
1,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from university of chicago in 1973 after studying psychiatry pharmacology and ophthalmology he is a full professor and vicechair of the department of psychia...,"{'senior': 0.201026068744, 'depart': 0.199270049089, 'wa': 0.0662020050995, 'hi': 0.0773488588959, 'decemb': 0.184051953875, 'later': 0.148814745234, 'current': 0.148594748223, 'institut': 0.18132..."
2,Harpdog Brown,harpdog brown is a singer and harmonica player who has been active in canadas blues scene since 1982 hailing from vancouver he crossed tens of thousands of miles playing club dates and festivals i...,"{'life': 0.130137710934, 'record': 0.104192849533, 'year': 0.137812960645, 'team': 0.107634001772, 'tour': 0.13513926372, 'canadian': 0.307682621666, 'band': 0.431636379971, 'wa': 0.176629267923, ..."
3,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower austria austria on 18 january 1942 is an austrian publisher and critic in the fields of science fiction and the fantasticrottensteiner studied journ...,"{'award': 0.0636460836029, 'scienc': 0.528632589589, 'receiv': 0.0686029514842, 'start': 0.0767714149972, 'year': 0.190622993055, 'english': 0.0916530332522, 'writer': 0.188993395905, 'produc': 0...."
4,G-Enka,henry krvits born 30 december 1974 in tallinn better known by his stagename genka is an estonian rapper and record producergenka started rapping in 1996 along with revo and dj paul oja who was gen...,"{'record': 0.220811181725, 'start': 0.392081377651, 'year': 0.0973535862686, 'tour': 0.190929676096, 'known': 0.0706705102733, 'band': 0.101638600989, 'wa': 0.12477413294, 'hi': 0.10933736208, 'ne..."
5,Sam Henderson,sam henderson born october 18 1969 is an american cartoonist writer and expert on american comedy historyhenderson was born in woodstock new york he attended boiceville new yorks onteora high scho...,"{'featur': 0.111015940278, 'award': 0.169426751904, 'busi': 0.124430950953, 'colleg': 0.191388471528, 'year': 0.0634301294032, 'titl': 0.116151145793, 'known': 0.0920898712299, 'th': 0.10662067524..."
6,Aaron LaCrate,aaron lacrate is an american music producer recording artist dj fashion designer of milkcrate records and milkcrate clothing and a film director he was born in baltimore maryland and grew up in hi...,"{'director': 0.0611677305962, 'life': 0.154447880126, 'start': 0.0658708264155, 'base': 0.075422683661, 'tour': 0.160383740068, 'began': 0.132439482398, 'intern': 0.0568822834962, 'star': 0.156673..."
7,Trevor Ferguson,trevor ferguson aka john farrow born 11 november 1947 is a canadian novelist who lives in hudson quebec he is the author of nine novels and four plays he has been called canadas best novelist both...,"{'return': 0.11248729529, 'receiv': 0.0966868930486, 'year': 0.0335822740556, 'befor': 0.0496399325215, 'writer': 0.199771116199, 'began': 0.0543862006647, 'histori': 0.124951731933, 'high': 0.057..."
8,Grant Nelson,grant nelson born 27 april 1971 in london also known as wishdokta bump flex and nng is an english dj remixer and record producernelson is heralded as one of the godfathers of uk garage due to his ...,"{'earli': 0.235933104672, 'head': 0.131425168128, 'start': 0.109451954229, 'team': 0.106127684317, 'known': 0.197281123172, 'began': 0.110031716272, 'produc': 0.352476970997, 'hous': 0.25658457619..."
9,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes professor of humane letters at cornell university and is appointed in the departments of english and comparative literature she taught previously at yale...,"{'univers': 0.412277899318, 'receiv': 0.110723579909, 'depart': 0.2967263414, 'critic': 0.308246860449, 'english': 0.147925879742, 'appoint': 0.140313593964, 'work': 0.0790771469022, 'write': 0.14..."


## Train a KNN model

In [17]:
from sklearn.neighbors import NearestNeighbors
import sklearn.metrics.pairwise as smp
# Create and train the Logistic NearestNeighbors Classifier

# euclidean distance metrics
neigh = NearestNeighbors(n_neighbors=5)
neigh.fit(tfidf_matrix_sparse)

# cosine similarity does not work with sparse matrix, so use people_text_tfidfm.toarray()
#neigh = NearestNeighbors(n_neighbors=5, metric=smp.cosine_similarity)
#neigh.fit(people_text_tfidfm.toarray()

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

### Generate Predictions for some personas:

In [18]:
def get_closest_neighs(name):
    person = people_df[people_df['name'] == name]
    person_tfid_matrix = tfidf_matrix_sparse.getrow(person.index.get_values()[0])
    distances, closer_to_person = neigh.kneighbors(person_tfid_matrix, 5, return_distance=True)
    
    #cosine similarity or ball tree does not woark with sparse data, so use person_tfid_matrix.toarray()
    #closer_to_person = neigh.kneighbors(person_tfid_matrix.toarray(), 5, return_distance=True) 
    
    result = people_df.iloc[closer_to_person[0]][['name', 'text']].copy()
    result.reset_index(drop=True, inplace=True)
    result['distances'] = pd.Series(distances[0])
    return result

**People close to president Obama:**

In [19]:
get_closest_neighs('Barack Obama')

Unnamed: 0,name,text,distances
0,Barack Obama,barack hussein obama ii brk husen bm born august 4 1961 is the 44th and current president of the united states and the first african american to hold the office born in honolulu hawaii obama is a ...,0.0
1,Charles Levin (judge),charles leonard levin born april 28 1926 in detroit michigan was a michigan jurist he served as a michigan court of appeals judge from 1966 to 1972 and as a justice of the michigan supreme court f...,0.756101
2,Mitch Daniels,mitchell elias mitch daniels jr born april 7 1949 is an american academic administrator and former politician who was governor of indiana from 2005 to 2013 he is a member of the republican party s...,0.75745
3,Dan Quayle,james danforth dan quayle kwel born february 4 1947the 44th vice president of the united states served with president george h w bush 19891993 he served as a us representative and us senator from ...,0.761354
4,Robbie Wills,former state representative robert d robbie wills jr born may 27 1968 conway arkansas is a retired american politician and attorney residing in conway arkansas he is the former speaker of the hous...,0.793353


**People close to singer Taylor Swift:**

In [20]:
get_closest_neighs('Taylor Swift')

Unnamed: 0,name,text,distances
0,Taylor Swift,taylor alison swift born december 13 1989 is an american singersongwriter raised in wyomissing pennsylvania swift moved to nashville tennessee at the age of 14 to pursue a career in country music ...,0.0
1,LeAnn Rimes,margaret leann rimes cibrian born august 28 1982 known professionally as leann rimes is an american country and pop singer known for her rich vocals rimes rose to stardom at age 13 following the r...,0.539559
2,Ricky Martin,enrique martn morales born december 24 1971 commonly known as ricky martin is a puerto rican singer actor and author martin began his career at age twelve with the allboy pop group menudo after fi...,0.5579
3,Chely Wright,richell rene chely wright li rat born october 25 1970 is an american country music singer and gay rights activist on the strength of her debut album in 1994 the academy of country music acm named ...,0.585367
4,Amy Grant,amy lee grant born november 25 1960 is an american singersongwriter musician author media personality and actress best known for performing christian music she has been referred to as the queen of...,0.603065


**People close to actor and politician Arnold Schwarzenegger:**

In [21]:
get_closest_neighs('Arnold Schwarzenegger')

Unnamed: 0,name,text,distances
0,Arnold Schwarzenegger,arnold alois schwarzenegger wrtsnr german anlt als vatsn born july 30 1947 is an austrianborn american actor producer activist businessman investor writer philanthropist former professional bodybu...,0.0
1,Robert W. Naylor,robert wesley naylor born january 21 1944 is an american lawyer and politician he is a former california state assemblyman who represented the san francisco bay areas 20th assembly district from 1...,0.925038
2,Bruce McPherson,bruce a mcpherson born january 7 1944 is a california politician who was the 30th california secretary of state sworn in march 30 2005 he was nominated to replace former secretary of state kevin s...,0.940729
3,Said Zahari,said zahari was a one time editorinchief of the malaylanguage newspaper utusan melayu and an advocate of unbiased press freedom although he currently resides in malaysia with his family he has ins...,0.942187
4,Shirley Horton,shirley horton born c 1952 is a us politician having served as a republican member of the california state assembly horton represented the 78th assembly district which includes southern san diego ...,0.945193
