In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Load some text data: Wikipedia pages about people

In [2]:
# Read the Wikipedia people dump; the relative path means the CSV must sit in the
# notebook's working directory.
people = pd.read_csv("people_wiki.csv")

In [3]:
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [4]:
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59071 entries, 0 to 59070
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URI     59071 non-null  object
 1   name    59071 non-null  object
 2   text    59071 non-null  object
dtypes: object(3)
memory usage: 1.4+ MB


In [5]:
len(people)

59071

# Explore the dataset and check out the text it contains

In [6]:
# Boolean mask selecting the row(s) whose name is exactly "Barack Obama".
is_obama = people["name"] == "Barack Obama"
obama = people[is_obama]

In [33]:
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [34]:
obama.text.item()

'barack hussein obama ii brk husen bm born august 4 1961 is the 44th and current president of the united states and the first african american to hold the office born in honolulu hawaii obama is a graduate of columbia university and harvard law school where he served as president of the harvard law review he was a community organizer in chicago before earning his law degree he worked as a civil rights attorney and taught constitutional law at the university of chicago law school from 1992 to 2004 he served three terms representing the 13th district in the illinois senate from 1997 to 2004 running unsuccessfully for the united states house of representatives in 2000in 2004 obama received national attention during his campaign to represent illinois in the united states senate with his victory in the march democratic party primary his keynote address at the democratic national convention in july and his election to the senate in november he began his presidential campaign in 2007 and afte

In [35]:
# Select George Clooney's article by exact name match.
clooney = people.loc[people["name"] == "George Clooney"]

In [36]:
clooney.text.item()

'george timothy clooney born may 6 1961 is an american actor writer producer director and activist he has received three golden globe awards for his work as an actor and two academy awards one for acting and the other for producingclooney made his acting debut on television in 1978 and later gained wide recognition in his role as dr doug ross on the longrunning medical drama er from 1994 to 1999 for which he received two emmy award nominations while working on er he began attracting a variety of leading roles in films including the superhero film batman robin 1997 and the crime comedy out of sight 1998 in which he first worked with a director who would become a longtime collaborator steven soderbergh in 1999 clooney took the lead role in three kings a wellreceived war satire set during the gulf warin 2001 clooneys fame widened with the release of his biggest commercial success the heist comedy oceans eleven the first of the film trilogy a remake of the 1960 film with frank sinatra as d

# Get the word counts for the Obama article

In [7]:
# Take an independent copy of Obama's row so it is decoupled from `people`.
obama = people.loc[people["name"].eq("Barack Obama")].copy()

In [8]:
from collections import Counter

import nltk

# Tokenise the article text first, then tally how often each token occurs.
obama_tokens = nltk.word_tokenize(obama.text.item())
word_counts = Counter(obama_tokens)

## Sort the word counts for the Obama article

In [9]:
# One row per distinct word, indexed by the word, holding its frequency in the article.
obama_word_count_table = (
    pd.Series(word_counts, name='count')
    .rename_axis('word')
    .to_frame()
)

In [11]:
obama_word_count_table.sort_values(by='count', ascending=False)

Unnamed: 0_level_0,count
word,Unnamed: 1_level_1
the,40
in,30
and,21
of,18
to,14
...,...
laureateduring,1
two,1
years,1
into,1


* The problem here is that stop words ("the", "in", "and", ...) come out as the most important words, simply because they are the most frequent!

# Compute TF-IDF for the corpus

In [12]:
# I'm doing this part with turicreate, because open-source libraries such as scikit-learn
# try to do everything in memory, and that kept ending in a MemoryError for me
import turicreate as tc

In [13]:
# Load the same CSV into a turicreate SFrame (column types are inferred on read).
tpeople = tc.SFrame("people_wiki.csv")

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [14]:
# Adds a `word_counts` column to `tpeople`; the preview below shows each entry as a
# dict mapping a word to its count in that row's `text`.
tpeople['word_counts'] = tc.text_analytics.count_words(tpeople['text'])

In [15]:
tpeople.head()

URI,name,text,word_counts
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'melbourne': 1.0, 'parade': 1.0, ..."
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...,"{'time': 1.0, 'each': 1.0, 'hour': 1.0, ..."
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...,"{'society': 1.0, 'hamilton': 1.0, 'to': ..."
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...,"{'kurdlawitzpreis': 1.0, 'awarded': 1.0, '2004': ..."
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'curtis': 1.0, '2007': 1.0, 'cent': 1.0, ..."
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...,"{'asses': 1.0, 'sic': 1.0, 'toilets': 1.0, ..."
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...,"{'streamz': 1.0, 'including': 1.0, ..."
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...,"{'concordia': 1.0, 'creative': 1.0, ..."
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...,"{'heavies': 1.0, 'new': 1.0, 'brand': 1.0, ..."
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...,"{'2002': 1.0, 'harvard': 1.0, 'twentieth': 1.0, ..."


# Examine the TF-IDF for the Obama article

In [18]:
# TF-IDF weights computed from the per-article word counts; further down a single
# article's entry is pulled out and used as a word -> score mapping.
tfidf = tc.text_analytics.tf_idf(tpeople['word_counts'])

In [19]:
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [20]:
# Look the Obama entry up by name instead of hard-coding its row number (35817),
# which was copied from the pandas display and silently breaks if the CSV is
# reordered or refreshed.
# `tfidf` was computed from `tpeople['word_counts']`, so it has one entry per
# `tpeople` row and can be filtered with a boolean SArray built from `tpeople`.
# `[0]` takes the single match and raises IndexError if the name is not found.
obama_tfidf = tfidf[tpeople['name'] == "Barack Obama"][0]

In [21]:
# Turn the word -> score mapping into a one-column table, highest TF-IDF first.
obama_tfidf_table = pd.Series(obama_tfidf, name='tfidf').to_frame()
obama_tfidf_table.sort_values(by='tfidf', ascending=False)

Unnamed: 0,tfidf
obama,43.295653
act,27.678223
iraq,17.747379
control,14.887061
law,14.722936
...,...
is,0.055233
a,0.039334
in,0.028962
and,0.015648


## Calculate the TF-IDF using Sklearn

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Drop English stop words so common function words do not dominate the scores.
tfidf_vect = TfidfVectorizer(stop_words='english')

# Learn vocabulary and idf from the dataset
# (term_matrix gets one row per article, in the row order of `people`).
term_matrix = tfidf_vect.fit_transform(people.text)

# Re-use the fitted vocabulary/idf to vectorise just the Obama article.
doc_matrix = tfidf_vect.transform(obama.text)

In [23]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installs.
if hasattr(tfidf_vect, "get_feature_names_out"):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()

# One row per vocabulary term with Obama's TF-IDF weight, highest first.
# `doc_matrix` holds a single document, so flattening it yields one value per term.
pd.DataFrame(
    data=doc_matrix.toarray().flatten(),
    index=feature_names,
    columns=['tfidf']
).sort_values('tfidf', ascending=False)

Unnamed: 0,tfidf
obama,0.413495
act,0.282170
iraq,0.171970
law,0.163903
control,0.149369
...,...
equerry,0.000000
equavalent,0.000000
equatorzipser,0.000000
equatorial,0.000000


# Manually compute distance between a few people

In [28]:
# People whose articles are compared in this section.
names = [
    "Bill Clinton",
    "David Beckham",
    "Barack Obama",
]

In [29]:
people[people.name.isin(names)]

Unnamed: 0,URI,name,text
23386,<http://dbpedia.org/resource/David_Beckham>,David Beckham,david robert joseph beckham obe bkm born 2 may...
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...
36452,<http://dbpedia.org/resource/Bill_Clinton>,Bill Clinton,william jefferson bill clinton born william je...


In [37]:
# Derive each person's row number from the data instead of copying it off the table
# above, so the cells below keep working if the CSV is reordered or refreshed.
# `people` has a default RangeIndex (see the `people.info()` output), so an index
# label is also the row position, and `term_matrix` was built from `people.text` in
# that same order. `[0]` raises IndexError if a name is missing; `int(...)` keeps
# the variables plain Python ints as before.
beckham_idx = int(people.index[people.name == "David Beckham"][0])
obama_idx = int(people.index[people.name == "Barack Obama"][0])
clinton_idx = int(people.index[people.name == "Bill Clinton"][0])

## Is Obama closer to Clinton than to Beckham?

In [36]:
from sklearn.metrics.pairwise import cosine_distances

* In this case, the lower the distance is, the closer they are!

In [46]:
# distance between Obama and Clinton
cosine_distances(term_matrix[obama_idx], term_matrix[clinton_idx])

array([[0.81103282]])

In [47]:
# distance between Obama and Beckham
cosine_distances(term_matrix[obama_idx], term_matrix[beckham_idx])

array([[0.97443419]])

# Build a nearest neighbor model for document retrieval

* `sklearn.neighbors` provides functionality for unsupervised and supervised neighbors-based learning methods.

In [49]:
from sklearn.neighbors import NearestNeighbors

In [None]:
# Spell out scikit-learn's defaults rather than relying on them implicitly:
# 5 neighbours, Minkowski distance with p=2 (i.e. Euclidean).
nbrs = NearestNeighbors(n_neighbors=5, metric="minkowski", p=2)
nbrs.fit(term_matrix)

## Applying the nearest-neighbors model for retrieval

In [84]:
# Query the fitted model with Obama's TF-IDF row. `kneighbors` returns the distances
# to the nearest articles and their row positions; Obama himself comes first at 0.
distances, indices = nbrs.kneighbors(term_matrix[obama_idx])

In [85]:
# Bill Clinton is missing from the top 5. The metric is probably not the reason:
# NearestNeighbors() defaults to Euclidean distance and TfidfVectorizer L2-normalises
# each row by default, so Euclidean distance = sqrt(2 * cosine distance) and both
# metrics rank neighbours identically. Clinton's cosine distance of 0.811 corresponds
# to a Euclidean distance of about 1.27, just beyond the 5th neighbour (1.2225).
# NOTE(review): the difference from the graphlab result presumably comes from a
# different TF-IDF weighting there - confirm.
# Column position 1 of `people` is `name`.
people.iloc[indices[0], [1]]

Unnamed: 0,name
35817,Barack Obama
24478,Joe Biden
38376,Samantha Power
57108,Hillary Rodham Clinton
38714,Eric Stern (politician)


In [87]:
distances[0]

array([0.        , 1.16514466, 1.2073695 , 1.21963978, 1.22250901])

## Other examples of document retrieval

In [116]:
# Index label(s) of Taylor Swift's article, used to pick her row of `term_matrix`.
swift_idx = people.index[people.name == "Taylor Swift"]

# Nearest articles to hers, then the matching rows of `people`.
distances, indices = nbrs.kneighbors(term_matrix[swift_idx])
people.iloc[indices[0]]

Unnamed: 0,URI,name,text
54264,<http://dbpedia.org/resource/Taylor_Swift>,Taylor Swift,taylor alison swift born december 13 1989 is a...
317,<http://dbpedia.org/resource/Carrie_Underwood>,Carrie Underwood,carrie marie underwood born march 10 1983 is a...
9379,<http://dbpedia.org/resource/Al_Swift>,Al Swift,allan byron swift born september 12 1935 an em...
25403,<http://dbpedia.org/resource/Ed_Sheeran>,Ed Sheeran,edward christopher ed sheeran born 17 february...
19943,<http://dbpedia.org/resource/Tim_McGraw>,Tim McGraw,samuel timothy tim mcgraw born may 1 1967 is a...


In [117]:
# Find Angelina Jolie's article and pull out her TF-IDF row.
is_jolie = people.name == "Angelina Jolie"
jolie_idx = people[is_jolie].index
jolie_row = term_matrix[jolie_idx]

# Retrieve her nearest neighbours and show the corresponding articles.
distances, indices = nbrs.kneighbors(jolie_row)
people.iloc[indices[0]]

Unnamed: 0,URI,name,text
39521,<http://dbpedia.org/resource/Angelina_Jolie>,Angelina Jolie,angelina jolie doli johlee born angelina jolie...
24426,<http://dbpedia.org/resource/Brad_Pitt>,Brad Pitt,william bradley brad pitt born december 18 196...
16625,<http://dbpedia.org/resource/Keith_Jolie>,Keith Jolie,keith jolie is a canadian singersongwriter he ...
21644,<http://dbpedia.org/resource/Jodie_Foster>,Jodie Foster,alicia christian foster born november 19 1962 ...
34756,<http://dbpedia.org/resource/Maggie_Smith>,Maggie Smith,dame margaret natalie maggie smith ch dbe born...


In [115]:
# Index label(s) of Arnold Schwarzenegger's article.
arnold_idx = people.loc[people.name == "Arnold Schwarzenegger"].index

# Query the neighbour model with his TF-IDF row and list the closest articles.
distances, indices = nbrs.kneighbors(term_matrix[arnold_idx])
nearest_positions = indices[0]
people.iloc[nearest_positions]

Unnamed: 0,URI,name,text
16018,<http://dbpedia.org/resource/Arnold_Schwarzene...,Arnold Schwarzenegger,arnold alois schwarzenegger wrtsnr german anlt...
58965,<http://dbpedia.org/resource/Bonnie_Garcia>,Bonnie Garcia,bonnie garcia is a california politician she w...
35293,<http://dbpedia.org/resource/Paul_Grant_(bodyb...,Paul Grant (bodybuilder),paul grant 1943 2003 was a welsh body builder ...
47709,<http://dbpedia.org/resource/Gray_Davis>,Gray Davis,joseph graham gray davis jr december 26 1942 i...
8050,<http://dbpedia.org/resource/James_Tramel>,James Tramel,james tramel born c 1967 is an episcopal pries...
