In [1]:
import pandas as pd 

# Load some text data from Wikipedia (pages on people)

In [2]:
people = pd.read_csv('people_wiki.csv')

In [3]:
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [4]:
len(people)

59071

# Explore the dataset and check out the text it contains

In [5]:
# Keep only the row(s) whose name is exactly 'Barack Obama' (still a DataFrame).
obama = people.loc[people['name'] == 'Barack Obama']

In [6]:
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [7]:
print(obama['text'])

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object


In [8]:
# Keep only the row(s) whose name is exactly 'George Clooney' (still a DataFrame).
clooney = people.loc[people['name'] == 'George Clooney']

In [9]:
# Show full cell contents instead of truncating long strings.
# `None` means "no limit"; the old `-1` sentinel was deprecated in pandas 1.0
# and is rejected by later releases.
pd.set_option('display.max_colwidth', None)

In [10]:
clooney['text']

38514    george timothy clooney born may 6 1961 is an american actor writer producer director and activist he has received three golden globe awards for his work as an actor and two academy awards one for acting and the other for producingclooney made his acting debut on television in 1978 and later gained wide recognition in his role as dr doug ross on the longrunning medical drama er from 1994 to 1999 for which he received two emmy award nominations while working on er he began attracting a variety of leading roles in films including the superhero film batman robin 1997 and the crime comedy out of sight 1998 in which he first worked with a director who would become a longtime collaborator steven soderbergh in 1999 clooney took the lead role in three kings a wellreceived war satire set during the gulf warin 2001 clooneys fame widened with the release of his biggest commercial success the heist comedy oceans eleven the first of the film trilogy a remake of the 1960 film with frank sina

# Get the word counts for the Obama article

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Alternative kept for reference: drop English stop words before counting.
#cv = CountVectorizer(stop_words='english')
# Bag-of-words term counts over the whole corpus, one row per row of `people`.
# `f` is reused further down as the count-based input to get_closest_neighs.
cv = CountVectorizer()
f = cv.fit_transform(people['text'])

In [15]:
# Fit a separate vectorizer on the Obama row alone, so its vocabulary and
# counts cover only that one article.
cv_bo = CountVectorizer()
f_bo = cv_bo.fit_transform(obama['text'])

In [16]:
# Vocabulary terms, in the column order of the count matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement.
feature_name = cv_bo.get_feature_names_out()
# Summing down the rows collapses the matrix to a single 1 x n_terms row;
# `.tolist()[0]` turns that row into a plain list of per-term counts.
feature_count = f_bo.sum(axis=0).tolist()[0]

# One row per term, most frequent first.
pd.DataFrame({'Feature': feature_name, 'Count': feature_count}).sort_values(by=['Count'], ascending=False)

Unnamed: 0,Feature,Count
242,the,40
115,in,30
28,and,21
162,of,18
245,to,14
...,...,...
111,husen,1
112,hussein,1
113,ii,1
116,inaugurated,1


# Compute TF-IDF for the corpus

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weights for the whole corpus, one row per row of `people`.
tf = TfidfVectorizer()
tfidf_matrix = tf.fit_transform(people['text'])
# Terms in column order of `tfidf_matrix`, so features_names[i] is column i.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement.
features_names = tf.get_feature_names_out()

## Examine TF-IDF for the Obama article

In [18]:
# Index label of the Obama row; it is used below as the row number in the
# TF-IDF matrix.
doc = people.loc[people['name'] == 'Barack Obama'].index[0]
# .nonzero() returns (row indices, column indices) of the nonzero entries;
# for a single row only the column indices [1] are of interest.
feature_index = tfidf_matrix.getrow(doc).nonzero()[1]

In [19]:
# Pair each nonzero column index with its TF-IDF score in the Obama row.
tfidf_scores = zip(feature_index, [tfidf_matrix[doc, col] for col in feature_index])

# Map term -> [score]; the one-element list becomes the single 'tfidf' column
# when the dict is loaded with orient='index'.
d = {features_names[col]: [score] for col, score in tfidf_scores}

# One row per term, highest TF-IDF score first.
df = (
    pd.DataFrame.from_dict(d, orient='index', columns=['tfidf'])
    .sort_values(by='tfidf', ascending=False)
)

In [20]:
df

Unnamed: 0,tfidf
obama,0.365018
the,0.279323
act,0.249089
in,0.209673
iraq,0.151809
...,...
is,0.014350
new,0.013177
which,0.012341
that,0.011600


## Examine TF-IDF for the Clooney article

In [21]:
# Index label of the Clooney row; it is used below as the row number in the
# TF-IDF matrix.
doc2 = people.loc[people['name'] == 'George Clooney'].index[0]

# Column indices of the terms with a nonzero TF-IDF score in that row.
feature_index2 = tfidf_matrix.getrow(doc2).nonzero()[1]

# Pair each of those column indices with its score.
tfidf_scores2 = zip(feature_index2, [tfidf_matrix[doc2, col] for col in feature_index2])

# Map term -> [score]; the one-element list becomes the single 'tfidf' column
# when the dict is loaded with orient='index'.
d2 = {features_names[col]: [score] for col, score in tfidf_scores2}

# One row per term, highest TF-IDF score first.
df2 = (
    pd.DataFrame.from_dict(d2, orient='index', columns=['tfidf'])
    .sort_values(by='tfidf', ascending=False)
)

In [22]:
df2

Unnamed: 0,tfidf
the,0.313680
clooney,0.307745
thriller,0.211806
drama,0.157204
actor,0.141867
...,...
american,0.019064
who,0.017115
from,0.011594
born,0.011365


# Is Obama closer to Clinton than to Beckham?

In [23]:
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

# Index labels of the three people (used as row numbers in tfidf_matrix).
index_obama = people.loc[people.name == 'Barack Obama'].index[0]
index_clinton = people.loc[people.name == 'Bill Clinton'].index[0]
index_becham = people.loc[people.name == 'David Beckham'].index[0]

# TF-IDF row vector of each person, in the same order as above.
X1, X2, X3 = (
    tfidf_matrix[row, :] for row in (index_obama, index_clinton, index_becham)
)

In [24]:
# Cosine distance: Obama vs Clinton on the first line, Obama vs Beckham on the second.
print(cosine_distances(X1, X2), cosine_distances(X1, X3), sep='\n')

[[0.67497775]]
[[0.8420454]]


In [25]:
# Cosine similarity: Obama vs Clinton on the first line, Obama vs Beckham on the second.
print(cosine_similarity(X1, X2), cosine_similarity(X1, X3), sep='\n')

[[0.32502225]]
[[0.1579546]]


# Apply nearest neighbors for retrieval of Wikipedia articles

## Use NN model for retrieval... for example, who is closest to Obama?

In [26]:
from sklearn.neighbors import NearestNeighbors

# Construct a function to find closest people
def get_closest_neighs(name, matrix, n_neighbors=5):
    """Return the articles whose feature vectors are closest to `name`'s.

    Parameters
    ----------
    name : str
        Value to look up in the ``name`` column of the notebook-level
        ``people`` frame. If several rows match, the first one is used.
    matrix : sparse matrix
        Feature matrix with one row per row of ``people``, in the same order
        (e.g. the CountVectorizer matrix ``f`` or ``tfidf_matrix``).
    n_neighbors : int, default 5
        Number of neighbours to return. The query article is itself part of
        the fitted data, so it normally comes back first at distance ~0.

    Returns
    -------
    pandas.DataFrame
        Columns ``distance`` (cosine distance) and ``name``, in the order
        returned by ``kneighbors``.

    Raises
    ------
    IndexError
        If no row of ``people`` has the given name.
    """
    # Look the article up by *position*, not by index label: the matrix is
    # addressed positionally, and the two only coincide for a default index.
    matches = (people['name'] == name).values.nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"no article named {name!r} in people")
    row = matches[0]

    # Fit on the full matrix, then query with the single row of interest.
    knn_model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine').fit(matrix)

    # .getrow(row) -> copy of that row as a 1 x n_features matrix
    # .kneighbors(...) -> (distances, row positions), each of shape (1, n_neighbors)
    distances, indices = knn_model.kneighbors(matrix.getrow(row))

    # Translate row positions back into names; .ravel() flattens the (1, k) arrays.
    names_similar = people['name'].values[indices.ravel()]

    return pd.DataFrame({'distance': distances.ravel(), 'name': names_similar})

In [27]:
print(get_closest_neighs('Barack Obama', tfidf_matrix))

   distance                     name
0  0.000000  Barack Obama           
1  0.570781  Joe Biden              
2  0.615934  Hillary Rodham Clinton 
3  0.624993  Samantha Power         
4  0.649765  Eric Stern (politician)


In [28]:
print(get_closest_neighs('Barack Obama', f))

       distance            name
0  1.443290e-15  Barack Obama  
1  1.224048e-01  Joe Biden     
2  1.357289e-01  Sandro Petrone
3  1.394896e-01  George W. Bush
4  1.421223e-01  Ribal al-Assad
