## Agenda
- Bag-of-words (BOW) representation of text
- Finding keywords using TF-IDF vectors
- Finding document similarity using cosine similarity

In [1]:
from sklearn.feature_extraction import text

In [2]:
# Toy corpus: four short documents that differ only in their last word.
corpus = [
    "This is description one",
    "This is description two",
    "This is description three",
    "This is description four",
]

In [3]:
# Unigram bag-of-words vectorizer with default settings.
# The `input` parameter only selects how documents are read ('content',
# 'filename' or 'file'); the documents themselves are passed to
# fit_transform, so the corpus must not be given here.
cv=text.CountVectorizer()

In [4]:
# Learn the vocabulary from the corpus and build the document-term count matrix.
bow=cv.fit_transform(corpus)

In [5]:
# Show the sparse-matrix summary (shape, dtype, number of stored elements).
bow

<4x7 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [6]:
bow.toarray() ### dense form ## unigram representation

array([[1, 0, 1, 1, 1, 0, 0],
       [1, 0, 1, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 1, 0],
       [1, 1, 1, 0, 1, 0, 0]])

In [8]:
cv.get_feature_names_out()

array(['description', 'four', 'is', 'one', 'this', 'three', 'two'],
      dtype=object)

In [9]:
import pandas as pd

# Tabular view of the unigram counts: one row per document, one column per
# vocabulary term.
pd.DataFrame(
    data=bow.toarray(),
    columns=cv.get_feature_names_out(),
)

Unnamed: 0,description,four,is,one,this,three,two
0,1,0,1,1,1,0,0
1,1,0,1,0,1,0,1
2,1,0,1,0,1,1,0
3,1,1,1,0,1,0,0


In [10]:
# Vectorizer that counts unigrams, bigrams and trigrams (ngram_range=(1,3)).
# `input` is left at its default: it selects how documents are read, and the
# corpus itself is supplied to fit_transform.
cv2=text.CountVectorizer(ngram_range=(1,3))

In [11]:
# Document-term count matrix over the 1- to 3-gram vocabulary.
bow2=cv2.fit_transform(corpus)

In [12]:
pd.DataFrame(bow2.toarray(),columns=cv2.get_feature_names_out())

Unnamed: 0,description,description four,description one,description three,description two,four,is,is description,is description four,is description one,is description three,is description two,one,this,this is,this is description,three,two
0,1,0,1,0,0,0,1,1,0,1,0,0,1,1,1,1,0,0
1,1,0,0,0,1,0,1,1,0,0,0,1,0,1,1,1,0,1
2,1,0,0,1,0,0,1,1,0,0,1,0,0,1,1,1,1,0
3,1,1,0,0,0,1,1,1,1,0,0,0,0,1,1,1,0,0


In [13]:
#### How to compute tfidf vectors #####
# TfidfVectorizer is used like CountVectorizer but yields TF-IDF weights
# instead of raw counts. `input` is left at its default; the corpus is passed
# to fit_transform.
tfidf=text.TfidfVectorizer()
tfidf_matrix=tfidf.fit_transform(corpus)

In [14]:
pd.DataFrame(tfidf_matrix.toarray(),columns=tfidf.get_feature_names_out())

Unnamed: 0,description,four,is,one,this,three,two
0,0.387139,0.0,0.387139,0.74187,0.387139,0.0,0.0
1,0.387139,0.0,0.387139,0.0,0.387139,0.0,0.74187
2,0.387139,0.0,0.387139,0.0,0.387139,0.74187,0.0
3,0.387139,0.74187,0.387139,0.0,0.387139,0.0,0.0


Data: the dataset used in the rest of this notebook can be downloaded from [this link](https://drive.google.com/file/d/11jIco80zFY_nEfVApqo-DD4UXx8J5wGs/view?usp=sharing).

In [19]:
import os

# Location of the TED transcripts CSV. Set the TED_TRANSCRIPTS_CSV environment
# variable to point at your own copy of the file; the original author's local
# path is kept only as a fallback so existing setups keep working.
path = os.environ.get('TED_TRANSCRIPTS_CSV') or '/Users/gunnvantsaini/Documents/Data/Kaggle/ted-data/transcripts.csv'

In [20]:
# Read the transcripts CSV at `path` into a DataFrame.
transcripts=pd.read_csv(path)

In [21]:
transcripts.head(2)

Unnamed: 0,transcript,url
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...


In [22]:
transcripts['transcript'].iloc[0]

'Good morning. How are you?(Laughter)It\'s been great, hasn\'t it? I\'ve been blown away by the whole thing. In fact, I\'m leaving.(Laughter)There have been three themes running through the conference which are relevant to what I want to talk about. One is the extraordinary evidence of human creativity in all of the presentations that we\'ve had and in all of the people here. Just the variety of it and the range of it. The second is that it\'s put us in a place where we have no idea what\'s going to happen, in terms of the future. No idea how this may play out.I have an interest in education. Actually, what I find is everybody has an interest in education. Don\'t you? I find this very interesting. If you\'re at a dinner party, and you say you work in education — Actually, you\'re not often at dinner parties, frankly.(Laughter)If you work in education, you\'re not asked.(Laughter)And you\'re never asked back, curiously. That\'s strange to me. But if you are, and you say to somebody, you

In [23]:
transcripts.shape

(2467, 2)

In [24]:
transcripts['url'].iloc[0]

'https://www.ted.com/talks/ken_robinson_says_schools_kill_creativity\n'

In [25]:
def get_title(x):
    """Return the last path segment of a talk URL, used as the talk title.

    Surrounding whitespace (the raw URLs end with a newline) and any trailing
    slash are removed before splitting, so 'https://host/talks/foo/\\n' gives
    'foo' rather than an empty string.

    Parameters
    ----------
    x : str
        URL of the talk.

    Returns
    -------
    str
        The text after the final '/' with surrounding whitespace stripped.
    """
    return x.strip().rstrip("/").split("/")[-1].strip()
# Derive a title for every talk from the last segment of its URL.
transcripts['title']=transcripts['url'].map(get_title)

In [26]:
transcripts.head(2)

Unnamed: 0,transcript,url,title
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,ken_robinson_says_schools_kill_creativity
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,al_gore_on_averting_climate_crisis


In [27]:
#### TF-IDF vectorizer for the TED transcripts, with English stop words removed.
# `input` is left at its default: it selects how documents are read
# ('content', 'filename' or 'file'); the transcripts are passed to fit_transform.
tfidf=text.TfidfVectorizer(stop_words='english')

In [28]:
# Fit the vocabulary on all transcripts and vectorize them: one row per talk.
tfidf_matrix=tfidf.fit_transform(transcripts['transcript'].tolist())

In [29]:
tfidf_matrix.shape

(2467, 58489)

In [30]:
# Vocabulary terms; used below to turn column positions of tfidf_matrix into words.
col_names=tfidf.get_feature_names_out()

In [31]:
import numpy as np
# Make sure col_names is a NumPy array so it can be indexed with an array of
# positions (col_names[idx]) when the keywords are extracted.
col_names=np.array(col_names)

In [32]:
%%time
key_words=[]
for i in tfidf_matrix.toarray():
    idx=i.argsort()[-5:]
    key_words.append(col_names[idx])

CPU times: user 1.92 s, sys: 243 ms, total: 2.16 s
Wall time: 2.39 s


In [33]:
# key_words holds one array of five terms per talk, in the same row order as transcripts.
transcripts['keywords']=key_words

In [34]:
transcripts.head(5)

Unnamed: 0,transcript,url,title,keywords
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,ken_robinson_says_schools_kill_creativity,"[think, said, laughter, gillian, education]"
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,al_gore_on_averting_climate_crisis,"[nashville, laughter, slideshow, carbon, tipper]"
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...,david_pogue_says_simplicity_sells,"[apple, features, laughter, software, microsoft]"
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...,majora_carter_s_tale_of_urban_renewal,"[waterfront, community, environmental, south, ..."
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...,hans_rosling_shows_the_best_stats_you_ve_ever_...,"[income, world, africa, data, countries]"


In [35]:
##### Find similar documents using tfidf representation #####
tfidf_matrix.shape

(2467, 58489)

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
# Cosine similarity between every pair of rows (talks) of tfidf_matrix.
sim_matrix=cosine_similarity(tfidf_matrix)

In [38]:
sim_matrix.shape

(2467, 2467)

In [39]:
sim_matrix #### How to find top-5 similar documents to each doc?

array([[1.        , 0.15310631, 0.17626477, ..., 0.07427816, 0.1408498 ,
        0.06360819],
       [0.15310631, 1.        , 0.16020169, ..., 0.03288679, 0.11310243,
        0.06340374],
       [0.17626477, 0.16020169, 1.        , ..., 0.06118662, 0.14853416,
        0.05664366],
       ...,
       [0.07427816, 0.03288679, 0.06118662, ..., 1.        , 0.0549175 ,
        0.04427896],
       [0.1408498 , 0.11310243, 0.14853416, ..., 0.0549175 , 1.        ,
        0.07149283],
       [0.06360819, 0.06340374, 0.05664366, ..., 0.04427896, 0.07149283,
        1.        ]])

In [40]:
transcripts.iloc[0]['title']

'ken_robinson_says_schools_kill_creativity'

In [41]:
transcripts.iloc[1968]['title']

'sakena_yacoobi_how_i_stopped_the_taliban_from_shutting_down_my_school'

In [42]:
# Row positions of the five talks most similar to talk 0, computed from its
# similarity row instead of being pasted in as literal numbers.
# argsort is ascending and the last position is expected to be talk 0 itself
# (similarity 1.0), so [-6:-1] selects the five entries just below it.
idx=sim_matrix[0].argsort()[-6:-1].tolist()
transcripts.iloc[idx]['title'].tolist()

['ricardo_semler_how_to_run_a_company_with_almost_no_rules',
 'rory_bremner_s_one_man_world_summit',
 'sakena_yacoobi_how_i_stopped_the_taliban_from_shutting_down_my_school',
 'sir_ken_robinson_bring_on_the_revolution',
 'ken_robinson_how_to_escape_education_s_death_valley']

In [44]:
# For every talk, collect the titles of its five most similar *other* talks
# (ordered from least to most similar).
top_list=[]
for doc_no, sims in enumerate(sim_matrix):
    order=sims.argsort()
    # Remove the talk itself by position rather than assuming it sorts last:
    # an exact duplicate transcript, floating-point rounding or an all-zero
    # row can put another index in the final slot.
    idx=order[order!=doc_no][-5:]
    top_k=transcripts.iloc[idx]['title'].tolist()
    top_list.append(top_k)

In [45]:
# top_list holds one list of five titles per talk, in the same row order as transcripts.
transcripts['similar_talks']=top_list

In [46]:
transcripts.head(2)

Unnamed: 0,transcript,url,title,keywords,similar_talks
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,ken_robinson_says_schools_kill_creativity,"[think, said, laughter, gillian, education]",[ricardo_semler_how_to_run_a_company_with_almo...
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,al_gore_on_averting_climate_crisis,"[nashville, laughter, slideshow, carbon, tipper]","[david_carson_on_design, rory_bremner_s_one_ma..."


In [47]:
transcripts.iloc[0]['similar_talks']

['ricardo_semler_how_to_run_a_company_with_almost_no_rules',
 'rory_bremner_s_one_man_world_summit',
 'sakena_yacoobi_how_i_stopped_the_taliban_from_shutting_down_my_school',
 'sir_ken_robinson_bring_on_the_revolution',
 'ken_robinson_how_to_escape_education_s_death_valley']