# Document retrieval project in Sklearn

# Fire up packages

In [3]:
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy
from sklearn.feature_extraction.text import CountVectorizer


# Load data

In [4]:
people = pd.read_csv('people_wiki.csv')

In [5]:
people.head(10)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...
5,<http://dbpedia.org/resource/Sam_Henderson>,Sam Henderson,sam henderson born october 18 1969 is an ameri...
6,<http://dbpedia.org/resource/Aaron_LaCrate>,Aaron LaCrate,aaron lacrate is an american music producer re...
7,<http://dbpedia.org/resource/Trevor_Ferguson>,Trevor Ferguson,trevor ferguson aka john farrow born 11 novemb...
8,<http://dbpedia.org/resource/Grant_Nelson>,Grant Nelson,grant nelson born 27 april 1971 in london also...
9,<http://dbpedia.org/resource/Cathy_Caruth>,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes pro...


In [8]:
people[people['name']== 'Digby Morrell']['text'][0]

'digby morrell born 10 october 1979 is a former australian rules footballer who played with the kangaroos and carlton in the australian football league aflfrom western australia morrell played his early senior football for west perth his 44game senior career for the falcons spanned 19982000 and he was the clubs leading goalkicker in 2000 at the age of 21 morrell was recruited to the australian football league by the kangaroos football club with its third round selection in the 2001 afl rookie draft as a forward he twice kicked five goals during his time with the kangaroos the first was in a losing cause against sydney in 2002 and the other the following season in a drawn game against brisbaneafter the 2003 season morrell was traded along with david teague to the carlton football club in exchange for corey mckernan he played 32 games for the blues before being delisted at the end of 2005 he continued to play victorian football league vfl football with the northern bullants carltons vfla

In [10]:
def print_word_count_table(text):
    vectorizer = CountVectorizer()
    # call `fit` to build the vocabulary
    vectorizer.fit(text)

    # call `transform` to convert text to a bag of words
    x = vectorizer.transform(text)

    # CountVectorizer uses a sparse array to save memory, but it's easier in this assignment to 
    # convert back to a "normal" numpy array
    x = x.toarray()

    word_count = { word[0]: word[1] for word in zip(vectorizer.get_feature_names(), x[0]) }
    word_count_table = pd.DataFrame.from_dict(word_count, orient='index').reset_index()
    word_count_table.columns = ['word', 'count']
    print(word_count_table.sort_values('count', ascending=False).head(10))

def print_tfidf_table(tfidfvectorizer, Xtfidf, index):
    feature_names = tfidfvectorizer.get_feature_names()
    feature_index = Xtfidf[index,:].nonzero()[1]
    tfidf_scores = zip(feature_index, [Xtfidf[index, x] for x in feature_index])
    dict_tfidf = { w: s for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores] }
    tfidf_table = pd.DataFrame.from_dict(dict_tfidf, orient='index').reset_index()
    tfidf_table.columns = ['word', 'tfidf']
    print(tfidf_table.sort_values('tfidf', ascending=False).head(10))
    
def person(name):
    return int(people[people['name']==name].index.tolist()[0])

In [34]:
# Obama text word count
print_word_count_table(people[people['name']=='Barack Obama']['text'])

      word  count
242    the     40
115     in     30
28     and     21
162     of     18
245     to     14
106    his     11
160  obama      9
18     act      8
104     he      7
30      as      6


# NLP for clustering: create tfidf column in data frame

** The next step will be creating the feature matrix by using tfidf techniques**

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(stop_words='english')
features=tfidf.fit_transform(people['text'])

** Then the feature for clustering is created. We can use it to fit in machine learning model**

# Implementing clustering techniques

## Cosine Distance Examining

In [12]:
from sklearn.metrics.pairwise import cosine_distances as CD

** Let us check the Cosine distance of some pairs of people**

In [13]:
obama=people[people['name']=='Barack Obama'].index.tolist()[0]
beckham=people[people['name']=='David Beckham'].index.tolist()[0]
clinton=people[people['name']=='Bill Clinton'].index.tolist()[0]
swift=people[people['name']=='Taylor Swift'].index.tolist()[0]

In [14]:
# Obama tfidf
print_tfidf_table(tfidf, features, obama)

            word     tfidf
131        obama  0.413495
137          act  0.282170
115         iraq  0.171970
97           law  0.163903
145      control  0.149369
167      ordered  0.138633
98      military  0.135368
67    democratic  0.129792
180  involvement  0.124821
164     response  0.124821


In [15]:
print ('Cosine distance between Obama and Beckham is '+' '+ str(CD(features[obama],features[beckham])))
print ('Cosine distance between Obama and Clinton is '+' '+ str(CD(features[obama],features[clinton])))
print ('Cosine distance between Obama and Swift is '+' '+ str(CD(features[obama],features[swift])))

Cosine distance between Obama and Beckham is  [[0.97443419]]
Cosine distance between Obama and Clinton is  [[0.81103282]]
Cosine distance between Obama and Swift is  [[0.96917793]]


** Smaller cosine distance represents more obvious similarity. We can see from the above example that the features really make sense!**

## Searching for nearest neighnours

### K-Nearest-Neighbours

In [16]:
from sklearn.neighbors import NearestNeighbors
knn=NearestNeighbors(n_neighbors=20,algorithm='brute',metric='cosine')

In [17]:
knn_fit=knn.fit(features)

In [19]:
query_name=int(person('Barack Obama'))

In [20]:
Obama_Neighbours=knn_fit.kneighbors(features[query_name])

In [21]:
Result= pd.DataFrame({'Index':Obama_Neighbours[1].tolist()[0]})
Result['Name']=Result['Index'].apply(lambda x: people['name'][x])
Result['Cosine Similariry']=Obama_Neighbours[0].tolist()[0]
Result['Cosine Distance']=Result['Index'].apply(lambda x: CD(features[Result['Index'][0]],features[x]))
Result

Unnamed: 0,Index,Name,Cosine Similariry,Cosine Distance
0,35817,Barack Obama,0.0,[[0.0]]
1,24478,Joe Biden,0.678781,[[0.6787810413312336]]
2,38376,Samantha Power,0.728871,[[0.7288705514270959]]
3,57108,Hillary Rodham Clinton,0.743761,[[0.743760600963479]]
4,38714,Eric Stern (politician),0.747264,[[0.7472641367735133]]
5,46140,Robert Gibbs,0.764069,[[0.7640685263313408]]
6,18827,Henry Waxman,0.772595,[[0.7725948408589108]]
7,44681,Jesse Lee (politician),0.774599,[[0.7745985570739599]]
8,6796,Eric Holder,0.779121,[[0.7791208721788061]]
9,2412,Joe the Plumber,0.78326,[[0.7832599382080224]]


**The result of clustering analysis should be judged by people's experience. In this case, we can see that the result does make sense**

**Since we are making a text retrieval system, I will combine the above steps together in a function so the search for similar articles will be more convenient.**

In [22]:
def knn_query(name,neighbours=20):
    name_index=int(people[people['name']==name].index.tolist()[0])
    knn=NearestNeighbors(n_neighbors=neighbours,algorithm='brute',metric='cosine')
    knn_fit=knn.fit(features)
    knn_result=knn_fit.kneighbors(features[name_index])
    Result= pd.DataFrame({'Index':knn_result[1].tolist()[0]})
    Result['Name']=Result['Index'].apply(lambda x: people['name'][x])
    Result['Cosine Similariry']=knn_result[0].tolist()[0]
    Result['Cosine Distance']=Result['Index'].apply(lambda x: CD(features[Result['Index'][0]],features[x]))
    return Result

In [23]:
knn_query('David Beckham',10)

Unnamed: 0,Index,Name,Cosine Similariry,Cosine Distance
0,23386,David Beckham,0.0,[[0.0]]
1,50411,Victoria Beckham,0.57542,[[0.5754200370497651]]
2,24913,Bobby Charlton,0.735121,[[0.7351208613168277]]
3,53393,Steven Gerrard,0.73981,[[0.7398098085569265]]
4,43981,Fernando Torres,0.755536,[[0.7555361125003]]
5,26762,Wayne Rooney,0.758539,[[0.758538879071492]]
6,43098,Kim Milton Nielsen,0.770575,[[0.7705754199670645]]
7,38672,Shay Given,0.772076,[[0.7720759038420171]]
8,24258,Sol Campbell,0.776365,[[0.7763649806702079]]
9,3031,Michael Owen,0.778567,[[0.7785674587794118]]


**The KNN model really makes sense. Next, I will try other two models and compare the result of clustering analysis**

### Try more cases in order to justify the reasonability of the method

In [24]:
knn_query('Barack Obama')

Unnamed: 0,Index,Name,Cosine Similariry,Cosine Distance
0,35817,Barack Obama,0.0,[[0.0]]
1,24478,Joe Biden,0.678781,[[0.6787810413312336]]
2,38376,Samantha Power,0.728871,[[0.7288705514270959]]
3,57108,Hillary Rodham Clinton,0.743761,[[0.743760600963479]]
4,38714,Eric Stern (politician),0.747264,[[0.7472641367735133]]
5,46140,Robert Gibbs,0.764069,[[0.7640685263313408]]
6,18827,Henry Waxman,0.772595,[[0.7725948408589108]]
7,44681,Jesse Lee (politician),0.774599,[[0.7745985570739599]]
8,6796,Eric Holder,0.779121,[[0.7791208721788061]]
9,2412,Joe the Plumber,0.78326,[[0.7832599382080224]]


In [25]:
knn_query('Taylor Swift')

Unnamed: 0,Index,Name,Cosine Similariry,Cosine Distance
0,54264,Taylor Swift,0.0,[[0.0]]
1,317,Carrie Underwood,0.699749,[[0.6997489648714519]]
2,9379,Al Swift,0.70538,[[0.7053803353918273]]
3,25403,Ed Sheeran,0.712744,[[0.7127442732690743]]
4,19943,Tim McGraw,0.716746,[[0.7167455781247595]]
5,29297,Kelly Clarkson,0.719027,[[0.7190271148022463]]
6,27793,Adele,0.719748,[[0.7197480804862004]]
7,52794,Bill Swift,0.72597,[[0.7259703628160138]]
8,1341,Dolly Parton,0.728522,[[0.728521988236071]]
9,35807,Joss Stone,0.7298,[[0.7297995535147463]]


**We can find that the knn method really makes sense. Also, the speed of the whole procedure is fast enough, which can provide a satisfied result.**