# 文字前處理

## load data

In [1]:
import turicreate as tc

In [2]:
people = tc.SFrame('people_wiki.sframe')

In [3]:
people.head(5)

URI,name,text
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [4]:
clooney = people[people["name"] == "George Clooney"]
clooney

URI,name,text
<http://dbpedia.org/resou rce/George_Clooney> ...,George Clooney,george timothy clooney born may 6 1961 is an ...


In [5]:
clooney['text']

dtype: str
Rows: ?
['george timothy clooney born may 6 1961 is an american actor writer producer director and activist he has received three golden globe awards for his work as an actor and two academy awards one for acting and the other for producingclooney made his acting debut on television in 1978 and later gained wide recognition in his role as dr doug ross on the longrunning medical drama er from 1994 to 1999 for which he received two emmy award nominations while working on er he began attracting a variety of leading roles in films including the superhero film batman robin 1997 and the crime comedy out of sight 1998 in which he first worked with a director who would become a longtime collaborator steven soderbergh in 1999 clooney took the lead role in three kings a wellreceived war satire set during the gulf warin 2001 clooneys fame widened with the release of his biggest commercial success the heist comedy oceans eleven the first of the film trilogy a remake of the 1960 film wit

## word_count_table 

In [6]:
obama = people[people['name'] == "Barack Obama"]
obama['word_count'] = tc.text_analytics.count_words(obama['text'])
obama

URI,name,text,word_count
<http://dbpedia.org/resou rce/Barack_Obama> ...,Barack Obama,barack hussein obama ii brk husen bm born august ...,"{'cuba': 1.0, 'relations': 1.0, ..."


In [7]:
print(type(obama[['word_count']]))
print(type(obama['word_count'])) #become array
obama[['word_count']]  #keep the type in SFrame

<class 'turicreate.data_structures.sframe.SFrame'>
<class 'turicreate.data_structures.sarray.SArray'>


word_count
"{'cuba': 1.0, 'relations': 1.0, ..."


https://apple.github.io/turicreate/docs/api/generated/turicreate.SFrame.stack.html

In [8]:
word_count_table = obama[['word_count']].stack('word_count',
                                             new_column_name = ['word', 'count'])

#The stack works only for columns of dict, list, or array type
#for dict type
#two new columns are created : one column holds the key and another column holds the value.
word_count_table.tail(5)

word,count
receive,1.0
response,3.0
ii,1.0
act,8.0
rights,1.0


In [9]:
word_count_table.sort("count", ascending=False).head(5)

word,count
the,40.0
in,30.0
and,21.0
of,18.0
to,14.0


## compute tf-idf for the corpus

In [10]:
# build word_count column
people['word_count'] = tc.text_analytics.count_words(people['text'])
people.head(2)

URI,name,text,word_count
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'melbourne': 1.0, 'college': 1.0, 'para ..."
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...,"{'time': 1.0, 'each': 1.0, 'rhythms': 1.0, ..."


In [11]:
#tf-idf analysis for all data
tfidf = tc.text_analytics.tf_idf(people['word_count'])

In [12]:
# add as a column
people['tfidf'] = tfidf

In [13]:
# give the result of tf-idf for obama
Barack_Obama = people[people['name'] == "Barack Obama"]
Barack_Obama['tfidf'].stack().sort("X2",ascending=False)

X1,X2
obama,43.2956530720749
act,27.67822262297991
iraq,17.747378587965535
control,14.887060845181308
law,14.722935761763422
ordered,14.533373950913514
military,13.115932778499417
involvement,12.784385241175055
response,12.784385241175055
democratic,12.410688697332166


## computing distance

In [14]:
people.head(1)

URI,name,text,word_count
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'melbourne': 1.0, 'college': 1.0, 'para ..."

tfidf
"{'melbourne': 3.8914310119380633, ..."


In [15]:
# is obama closer to person_A or person_B

clinton = people[people['name'] =='Bill Clinton']
Morrell = people[people['name'] =='Digby Morrell'] #澳洲前足球員
print("clinton v.s. Morrell")
print(tc.distances.cosine(clinton['tfidf'][0],Morrell['tfidf'][0]))
print(' ')
print("clinton v.s. Barack_Obama")
print(tc.distances.cosine(clinton['tfidf'][0],Barack_Obama['tfidf'][0]))
print(' ')
print("Morrell v.s. Barack_Obama")
print(tc.distances.cosine(Morrell['tfidf'][0],Barack_Obama['tfidf'][0]))
print(' ')
print("the distance between clinton and Barack_Obama is the shortest")
print("so we can know that Clinton is closer to Obama,compared to the other two guys")

clinton v.s. Morrell
0.9840604838818148
 
clinton v.s. Barack_Obama
0.8339854936884277
 
Morrell v.s. Barack_Obama
0.9917749608949761
 
the distance between clinton and Barack_Obama is the shortest
so we can know that Clinton is closer to Obama,compared to the other two guys


## build knn_model

In [16]:
knn_model = tc.nearest_neighbors.create(people, features=['tfidf'],label='name', distance='cosine')


In [17]:
knn_model.query(Barack_Obama)  #放入dataset

query_label,reference_label,distance,rank
0,Barack Obama,0.0,1
0,Joe Biden,0.703138676733575,2
0,Samantha Power,0.7429819023278823,3
0,Hillary Rodham Clinton,0.7583583978869676,4
0,Eric Stern (politician),0.7705612276009975,5


https://apple.github.io/turicreate/docs/api/generated/turicreate.nearest_neighbors.NearestNeighborsModel.query.html

## other examples

In [18]:
jeter = people[people['name']=="Derek Jeter"]
knn_model.query(jeter)

query_label,reference_label,distance,rank
0,Derek Jeter,1.1102230246251563e-16,1
0,Alex Rodriguez,0.7440620226254882,2
0,Alfonso Soriano,0.7499882210138276,3
0,Mark Newman (baseball),0.75237301334487,4
0,Reggie Jackson,0.7786521125286558,5


In [19]:
arod = people[people['name']=='Alex Rodriguez']
knn_model.query(arod)

query_label,reference_label,distance,rank
0,Alex Rodriguez,0.0,1
0,John Rodriguez (baseball),0.6436622210720697,2
0,Alfonso Soriano,0.7167665052708209,3
0,Eladio Rodriguez,0.7279523802694244,4
0,Maggie Rodriguez,0.7300771384909863,5


## search for tennis player

In [20]:
people['tennis'] = people['word_count'].apply(lambda x : x.get('tennis') )

In [21]:
tennis = people[people['tennis'] >= 1]

len(tennis)



433

In [22]:
#find an interested tennis player named, Stan Wawrinka
Wawrinka = people[people['name'] == 'Stan Wawrinka']
knn_model.query(Wawrinka)

query_label,reference_label,distance,rank
0,Stan Wawrinka,0.0,1
0,Tom%C3%A1%C5%A1 Berdych,0.6289856059729872,2
0,Jo-Wilfried Tsonga,0.7434750231547309,3
0,Daniel Nestor,0.7490523489749115,4
0,Yann Marti,0.757875109292687,5


## find if novak djkovijic in the data

In [29]:
tennis['novak'] = tennis['word_count'].apply(lambda name : name.get('Novak'))
ifnovak = tennis[tennis['novak'] > 0]
len(ifnovak)

0

novak djkovijic is not in this dataset