In [1]:
import pandas as pd
import gensim

In [2]:
# Relative path prefix of the directory holding the input parquet file.
IN_DATA_DIR = "../../data/raw/"
# Name of the parquet file read from IN_DATA_DIR.
INPUT_FILE_NAME = 'cleaned.parquet'
# Path prefix used by the (currently commented-out) to_parquet exports.
OUT_DATA_DIR = "../../data/processed/"

In [3]:
# Load the input table from IN_DATA_DIR + INPUT_FILE_NAME.
df = pd.read_parquet(IN_DATA_DIR + INPUT_FILE_NAME)
# NOTE(review): the result of this expression is discarded; only the last
# expression of a cell is displayed.
df.head()
# Number of rows loaded.
len(df)

2386

In [4]:
def count_tags(tag_column):
    """Count how often each tag occurs in a column of comma-separated tags.

    Every cell is lower-cased and split on commas; each individual tag is
    stripped of surrounding whitespace so that ``'a, b'``, ``'a ,b'`` and
    ``'a,b'`` all contribute the same two tags. Empty tags (from empty
    cells, trailing commas or ``',,'``) and non-string cells (e.g. NaN/None)
    are ignored instead of being counted as a tag named ``''``.

    Parameters
    ----------
    tag_column : pandas.Series
        Series whose string cells hold comma-separated tags.

    Returns
    -------
    tag_counts : pandas.Series
        Unnamed-input ``value_counts()`` result: index is the tag, value is
        its number of occurrences, sorted by descending count.
    tag_list : list
        The tags in the same order as ``tag_counts.index``.
    """
    all_tags = []
    for cell in tag_column:
        if not isinstance(cell, str):
            # Missing / non-text cells carry no tags.
            continue
        for raw_tag in cell.lower().split(','):
            tag = raw_tag.strip()
            if tag:
                all_tags.append(tag)

    # dtype=object keeps an empty result well-defined (no float64 fallback).
    all_tags_w_dup = pd.Series(all_tags, dtype=object)

    tag_counts = all_tags_w_dup.value_counts()
    tag_list = list(tag_counts.index)
    return tag_counts, tag_list

In [5]:
# split_tags = tags.str.split(',')
# tag_counts_per_talk = split_tags.apply(len)

# Occurrence count of every tag in the raw `tags` column, plus the list of
# tags in the same order as the counts index.
tag_counts, tag_list = count_tags(df['tags'])

In [6]:
tag_counts

technology           695
science              522
global issues        483
culture              470
design               400
tedx                 398
business             329
entertainment        285
health               226
innovation           212
education            206
art                  204
society              202
social change        198
communication        185
politics             183
future               181
creativity           174
biology              174
humanity             164
collaboration        163
environment          155
medicine             154
economics            154
brain                148
activism             147
community            136
invention            136
history              135
children             135
                    ... 
3d printing            3
ptsd                   3
syria                  2
cello                  2
anthropocene           2
arts                   2
novel                  2
driverless cars        2
ted residency          2


In [7]:
# Keep only tags whose count exceeds 2% of the number of talks.
tag_cutoff = int(0.02*len(df.index))

# Filter the counts Series directly and name the resulting column 0
# explicitly. Building pd.DataFrame(tag_counts) and indexing column 0 only
# works when the value_counts() result is unnamed; newer pandas names it
# 'count', which made the old `[0]` lookup raise KeyError.
squashed_tags = tag_counts[tag_counts > tag_cutoff].to_frame(name=0)
squash_list = list(squashed_tags.index.values)


In [8]:
squashed_tags

Unnamed: 0,0
technology,695
science,522
global issues,483
culture,470
design,400
tedx,398
business,329
entertainment,285
health,226
innovation,212


In [9]:
# Remove TED-branded tags (e.g. 'tedx', 'ted fellows') from the retained tags.
# Match on the prefix rather than on any substring, so ordinary tags that
# merely contain the letters "ted" (e.g. 'united states') are not removed.
ted_tags = [tag for tag in squash_list if tag.startswith('ted')]
print(ted_tags)
squashed_tags = squashed_tags.drop(labels=ted_tags)
squash_list = list(squashed_tags.index.values)

['tedx', 'ted fellows', 'ted brain trust']


In [10]:
# NOTE(review): bare expression that is not the last line of the cell, so
# nothing is displayed for it.
squashed_tags
# Tags left after the frequency cutoff and the TED-tag removal.
print(squash_list)

['technology', 'science', 'global issues', 'culture', 'design', 'business', 'entertainment', 'health', 'innovation', 'education', 'art', 'society', 'social change', 'communication', 'politics', 'future', 'creativity', 'biology', 'humanity', 'collaboration', 'environment', 'medicine', 'economics', 'brain', 'activism', 'community', 'invention', 'history', 'children', 'health care', 'music', 'women', 'cities', 'storytelling', 'war', 'animals', 'leadership', 'engineering', 'nature', 'identity', 'computers', 'psychology', 'humor', 'life', 'performance', 'exploration', 'africa', 'data', 'photography', 'medical research', 'inequality', 'personal growth', 'neuroscience', 'government', 'climate change', 'visualizations', 'internet', 'architecture', 'sustainability', 'oceans', 'disease', 'happiness', 'green', 'biotech', 'potential', 'work', 'physics', 'media', 'film', 'violence', 'evolution', 'mind', 'big problems', 'writing', 'motivation', 'philosophy', 'entrepreneur', 'live music', 'biodiversi

In [11]:
def squashing (x,squash_list):
    """Keep only the tags of ``x`` that appear in ``squash_list``.

    ``x`` is lower-cased and split on commas; every tag is stripped of
    surrounding whitespace before the lookup, so spacing variants such as
    ``'a ,b'`` or ``'a,  b'`` are matched just like ``'a, b'``.

    Parameters
    ----------
    x : str
        Comma-separated tags of one row. A non-string value (e.g. None/NaN)
        is treated as "no tags".
    squash_list : iterable of str
        Tags that are allowed to remain.

    Returns
    -------
    str
        The retained tags joined with ``','`` in their original order, or
        ``''`` when nothing is retained.
    """
    if not isinstance(x, str):
        return ''
    # Set lookup instead of scanning the list once per tag.
    allowed = set(squash_list)
    final_tags = []
    for raw_tag in x.lower().split(','):
        tag = raw_tag.strip()
        if tag in allowed:
            final_tags.append(tag)
    return ','.join(final_tags)

In [12]:
# Stage 1: keep only the frequent (non-TED) tags of each talk and drop the
# talks that end up with no tag at all.
df1 = df.copy()
df1['squash_tags'] = df1['tags'].map(lambda tags: squashing(tags, squash_list))
df1 = df1.loc[df1['squash_tags'] != '']

In [13]:
# Sanity check: show rows whose squash_tags value is missing.
# Comparing with `== None` or `== float('nan')` is evaluated element-wise and
# is False for every row, so such a check can never find anything; isna()
# is the reliable test for missing values.
df1[df1['squash_tags'].isna()]

Unnamed: 0,speaker,headline,description,duration,tags,transcript,WC,clean_transcript,clean_transcript_string,squash_tags


In [14]:
# NOTE(review): import placed mid-notebook rather than in the first import cell.
import spacy
# spaCy pipeline applied to each retained tag below; the resulting objects are
# compared pairwise with .similarity().
nlp = spacy.load('en_core_web_lg')

In [15]:
# Run every retained tag through the spaCy pipeline, preserving squash_list order.
nlp_list = [nlp(tag_text) for tag_text in squash_list]
# print(nlp_list)

In [16]:
# Two tags are considered similar when their spaCy similarity exceeds this value.
SIM_THRESHOLD = 0.65

simlist = []                                  # [tag, earlier_tag, similarity] per similar pair
tagsimcount = [0]*len(squash_list)            # number of similar partners, aligned with squash_list
tagsimdict = {el: [] for el in squash_list}   # tag -> list of tags similar to it

for outer in range(len(nlp_list)):
    # Only compare with earlier tags so every unordered pair is scored once.
    for inner in range(outer):
        sim = nlp_list[outer].similarity(nlp_list[inner])
        if sim > SIM_THRESHOLD:
            ow = squash_list[outer]
            iw = squash_list[inner]
            simlist.append([ow, iw, sim])
            tagsimcount[outer] += 1
            tagsimcount[inner] += 1
            # Record the link in both directions.
            tagsimdict[ow].append(iw)
            tagsimdict[iw].append(ow)

# Drop tags without any similar partner; the remaining keys keep squash_list order.
tagsimdict = {tag: partners for tag, partners in tagsimdict.items() if partners}

# print(simlist)
# print(tagsimcount)
# print(tagsimdict)
sim_tags = list(tagsimdict.keys())

In [17]:
# Plan: replace similar tags with one representative tag.
# GENERALISE: 'education','teaching' -> 'education'
# REMAIN: 'africa', 'asia' -> 'africa', 'asia'

# Tags containing 'ted' were already removed from squash_list in an earlier cell.
# Per-tag count of similar partners (aligned with squash_list) and the
# tag -> similar-tags mapping.
print(tagsimcount)
print(tagsimdict)

[1, 4, 3, 1, 0, 0, 0, 3, 3, 0, 1, 2, 5, 0, 0, 1, 1, 6, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 0, 1, 5, 1, 2, 3, 1, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 4, 0, 4, 0, 0, 0, 0, 5, 0, 0, 2, 0, 0, 1, 0, 2, 0, 0, 1, 0]
{'technology': ['innovation'], 'science': ['biology', 'psychology', 'physics', 'cognitive science'], 'global issues': ['social change', 'climate change', 'global development'], 'culture': ['society'], 'health': ['health care', 'medical research', 'mental health'], 'innovation': ['technology', 'creativity', 'global development'], 'art': ['photography'], 'society': ['culture', 'social change'], 'social change': ['global issues', 'society', 'personal growth', 'climate change', 'global development'], 'future': ['potential'], 'creativity': ['innovation'], 'biology': ['science', 'neuroscience', 'physics', 'genetics', 'cognitive science', 'ecology'], 'medicine': ['medical research'], 'health care'

In [18]:
def sim_counts(a):
    """Order tag ``a`` and its similar tags with the most frequent one first.

    Looks up the occurrence count of ``a`` and of every tag listed for it in
    the module-level ``tagsimdict`` (counts come from the module-level
    ``tag_counts``). The tag with the highest count becomes the "major"; on a
    tie the earliest candidate (``a`` first, then the similar tags in stored
    order) wins.

    Returns a list ``[major, *others]`` where ``others`` are the remaining
    candidates in their original order.
    """
    own_count = tag_counts.at[a]
    partners = tagsimdict[a]

    candidates = [a, *partners]
    counts = [own_count] + [tag_counts.at[word] for word in partners]

    # list.index returns the first position of the maximum, i.e. the earliest candidate on ties.
    major = candidates[counts.index(max(counts))]
    candidates.remove(major)
    return [major] + candidates

In [19]:
# Tags that have at least one similar partner, in squash_list order.
sim_tags_ordered = [tag for tag in squash_list if tag in sim_tags]
# print(sim_tags_ordered)
# Build a reversed *copy*: assigning the same list and calling .reverse() on
# it would also reverse sim_tags_ordered, because both names would refer to
# one list object.
sim_tags_reversed = sim_tags_ordered[::-1]
print(sim_tags_reversed)
print(len(sim_tags),len(sim_tags_reversed))

['policy', 'illness', 'poverty', 'ecology', 'cognitive science', 'global development', 'mental health', 'genetics', 'biodiversity', 'live music', 'philosophy', 'mind', 'physics', 'potential', 'disease', 'climate change', 'government', 'neuroscience', 'personal growth', 'inequality', 'medical research', 'photography', 'life', 'psychology', 'music', 'health care', 'medicine', 'biology', 'creativity', 'future', 'social change', 'society', 'art', 'innovation', 'health', 'culture', 'global issues', 'science', 'technology']
39 39


In [20]:
# Build major-tag -> [tags to be replaced by it], walking sim_tags_reversed.
final_replacement = {}
for tag in sim_tags_reversed:
    # Work on a copy: extending tagsimdict[tag] itself would permanently grow
    # the similarity lists, so re-running this cell would give a different result.
    sims = list(tagsimdict[tag])
    #check if words that main tag is similar too have similar words too
    # `sims` is extended in place while being iterated, so tags pulled in from
    # an absorbed group are themselves checked against final_replacement.
    for sim in sims:
        if sim in final_replacement:
            # Absorb the group already built for `sim` into the current tag.
            absorbed = final_replacement.pop(sim)
            sims += absorbed
    #remove duplicates
    sims = list(dict.fromkeys(sims))
    final_replacement[tag] = sims
print(final_replacement)

{'biodiversity': ['ecology', 'biology', 'biodiversity'], 'government': ['policy', 'government'], 'inequality': ['poverty', 'inequality'], 'life': ['mind', 'life'], 'music': ['live music', 'music'], 'future': ['potential', 'future'], 'art': ['photography', 'art'], 'culture': ['society', 'culture', 'social change', 'global issues', 'personal growth', 'climate change', 'global development', 'innovation', 'technology', 'creativity'], 'global issues': ['social change', 'climate change', 'global development'], 'science': ['biology', 'psychology', 'physics', 'cognitive science', 'science', 'neuroscience', 'genetics', 'ecology', 'philosophy', 'medical research', 'health', 'medicine', 'health care', 'mental health', 'illness', 'disease'], 'technology': ['innovation']}


In [21]:
print(final_replacement.keys())

dict_keys(['biodiversity', 'government', 'inequality', 'life', 'music', 'future', 'art', 'culture', 'global issues', 'science', 'technology'])


In [22]:
def replacing (x, replacement=None):
    """Replace minor tags in ``x`` by the major tag(s) they belong to.

    Parameters
    ----------
    x : str
        Comma-separated tags of one row.
    replacement : dict, optional
        Mapping ``major tag -> list of tags it replaces``. Defaults to the
        module-level ``final_replacement`` (looked up at call time), which is
        the behaviour of the original one-argument call.

    Returns
    -------
    str
        Comma-joined tags after replacement, de-duplicated while keeping the
        first occurrence order. A tag that is itself a major is kept as is;
        a tag listed under several majors is replaced by all of them; a tag
        listed under none is kept unchanged.
    """
    if replacement is None:
        replacement = final_replacement
    tags = x.replace(', ', ',').lower().strip()
    final_tags = []
    for tag in tags.split(','):
        if tag in replacement:
            #tag is a major
            final_tags.append(tag)
            continue
        #tag is a minor: collect every major whose group lists it
        majors = [major for major, minors in replacement.items() if tag in minors]
        # No major found -> the tag stays as it is.
        final_tags.extend(majors if majors else [tag])
    # dict.fromkeys removes duplicates but preserves order.
    final_tags = list(dict.fromkeys(final_tags))
    return ','.join(final_tags)

In [23]:
# Stage 2: merge similar tags into their major tag and drop talks left
# with an empty tag string.
df2 = df1.copy()
df2['squash2_tags'] = df2['squash_tags'].map(replacing)
df2 = df2.loc[df2['squash2_tags'] != '']

In [24]:
# Sanity check: show rows whose squash2_tags value is missing.
# NaN never compares equal to anything (including itself), so an
# `== float('nan')` mask is False for every row; isna() is the reliable test.
df2[df2['squash2_tags'].isna()]

Unnamed: 0,speaker,headline,description,duration,tags,transcript,WC,clean_transcript,clean_transcript_string,squash_tags,squash2_tags


In [25]:
# Recount the tags of the squash2_tags column (after similar tags were merged).
s_tag_counts,s_tag_list = count_tags(df2['squash2_tags'])

In [26]:
print(len(s_tag_list))
s_tag_counts

72


culture           1106
science            868
technology         787
global issues      679
design             400
business           329
entertainment      285
art                261
future             218
biodiversity       215
education          206
communication      185
politics           183
humanity           164
collaboration      163
life               156
environment        155
economics          154
brain              148
activism           147
community          136
invention          136
children           135
history            135
inequality         128
music              126
women              115
cities             113
storytelling       112
war                108
                  ... 
visualizations      80
internet            77
architecture        76
sustainability      74
oceans              73
happiness           71
green               71
biotech             70
work                69
film                66
media               66
violence            65
evolution  

In [27]:
# Keep only merged tags whose count exceeds 5% of the remaining talks.
tag_cutoff3 = int(0.05*len(df2.index))

# Filter the counts Series directly and name the column 0 explicitly; the
# pd.DataFrame(counts)[0] form breaks when value_counts() names its result.
squashed_tags3 = s_tag_counts[s_tag_counts > tag_cutoff3].to_frame(name=0)
squash_list3 = list(squashed_tags3.index.values)
print(len(squash_list3))
squashed_tags3

26


Unnamed: 0,0
culture,1106
science,868
technology,787
global issues,679
design,400
business,329
entertainment,285
art,261
future,218
biodiversity,215


In [28]:
# Stage 3: restrict every talk to the tags in squash_list3, drop talks with
# nothing left, then recount.
df3 = df2.copy()
df3['squash3_tags'] = df3['squash2_tags'].map(lambda tags: squashing(tags, squash_list3))
df3 = df3.loc[df3['squash3_tags'] != '']

s3_tag_counts, s3_tag_list = count_tags(df3['squash3_tags'])
s3_tag_counts

culture          1106
science           868
technology        787
global issues     679
design            400
business          329
entertainment     285
art               261
future            218
biodiversity      215
education         206
communication     185
politics          183
humanity          164
collaboration     163
life              156
environment       155
economics         154
brain             148
activism          147
invention         136
community         136
history           135
children          135
inequality        128
music             126
dtype: int64

In [29]:
# df3[df3['squash3_tags']==df3['squash3_tags'][149]]
# df3[df3['squash3_tags']==float('nan')]
# df3[df3['speaker']=='Norman Foster']

In [30]:
# Keep only merged tags that occur more than 200 times.
tag_cutoff4 = 200

# Filter the counts Series directly and name the column 0 explicitly; the
# pd.DataFrame(counts)[0] form breaks when value_counts() names its result.
squashed_tags4 = s_tag_counts[s_tag_counts > tag_cutoff4].to_frame(name=0)
squash_list4 = list(squashed_tags4.index.values)
print(len(squash_list4))
squashed_tags4

11


Unnamed: 0,0
culture,1106
science,868
technology,787
global issues,679
design,400
business,329
entertainment,285
art,261
future,218
biodiversity,215


In [31]:
# Stage 4: restrict every talk to the tags in squash_list4, drop talks with
# nothing left, then recount.
df4 = df3.copy()
df4['squash4_tags'] = df4['squash3_tags'].map(lambda tags: squashing(tags, squash_list4))
df4 = df4.loc[df4['squash4_tags'] != '']

s4_tag_counts, s4_tag_list = count_tags(df4['squash4_tags'])
s4_tag_counts

culture          1106
science           868
technology        787
global issues     679
design            400
business          329
entertainment     285
art               261
future            218
biodiversity      215
education         206
dtype: int64

In [32]:
def cutting(x,cutlist):
    """Return ``x`` unchanged if it carries at least one tag from ``cutlist``.

    Parameters
    ----------
    x : str
        Comma-separated tags of one row. (A non-string container is checked
        by plain membership.)
    cutlist : iterable of str
        Tags of interest.

    Returns
    -------
    The original ``x`` when any tag of ``cutlist`` is one of its tags,
    otherwise ``''``.

    Tags are compared as whole tags: the string is split on commas and each
    piece is stripped. A plain ``tag in x`` on the joined string would be a
    substring test, so e.g. ``'art'`` would wrongly match ``'earth'``.
    """
    if isinstance(x, str):
        present = {piece.strip() for piece in x.split(',')}
    else:
        present = x
    if any(tag in present for tag in cutlist):
        return x
    return ''

In [33]:
# A talk is kept in the cut-down frame only if it has at least one of these tags.
required_tags = ['entertainment', 'art', 'future', 'biodiversity', 'education']

df4_cut = df4.copy()
df4_cut['squash4_tags'] = df4_cut['squash4_tags'].map(lambda tags: cutting(tags, required_tags))
df4_cut = df4_cut.loc[df4_cut['squash4_tags'] != '']
df4_cut


Unnamed: 0,speaker,headline,description,duration,tags,transcript,WC,clean_transcript,clean_transcript_string,squash_tags,squash2_tags,squash3_tags,squash4_tags
4,Chris Bangle,Great cars are great art,American designer Chris Bangle explains his ph...,0:20:04,"cars,industrial design,transportation,inventio...","0:12\r\r\rWhat I want to talk about is, as bac...",3781.0,"[want, talk, background, idea, car, art, actua...",want talk background idea car art actually mea...,"invention,design,technology,business,art","invention,design,technology,business,art","invention,design,technology,business,art","design,technology,business,art"
5,Craig Venter,Sampling the ocean's DNA,Genomics pioneer Craig Venter takes a break fr...,0:16:51,"biotech,invention,oceans,genetics,DNA,biology,...","0:11\r\r\rAt the break, I was asked by several...",2548.0,"[break, ask, people, comment, age, debate, com...",break ask people comment age debate comment un...,"biotech,invention,oceans,genetics,biology,scie...","biotech,invention,oceans,science,biodiversity,...","invention,science,biodiversity,technology","science,biodiversity,technology"
6,David Pogue,Simplicity sells,New York Times columnist David Pogue takes aim...,0:21:26,"simplicity,computers,software,interface design...","0:12\r\r\r(Music: ""The Sound of Silence,""\rSim...",3584.0,"[music, sound, silence, simon, garfunkel, hell...",music sound silence simon garfunkel hello voic...,"computers,music,media,entertainment,performanc...","computers,music,media,entertainment,performanc...","music,entertainment,technology","entertainment,technology"
8,Dean Kamen,To invent is to give,Inventor Dean Kamen lays out his argument for ...,0:20:07,"robots,cars,industrial design,transportation,i...","0:11\r\r\rAs you pointed out, every time you c...",3278.0,"[point, time, come, learn, morning, world, exp...",point time come learn morning world expert gue...,"robots,invention,education,innovation,social c...","robots,invention,education,culture,technology,...","invention,education,culture,technology,global ...","education,culture,technology,global issues,sci..."
13,Golan Levin,Software (as) art,Engineer and artist Golan Levin pushes the bou...,0:14:53,"invention,software,music,entertainment,perform...",0:13\r\r\rImagine spending seven years at MIT ...,442.0,"[imagine, spend, seven, year, mit, research, l...",imagine spend seven year mit research laborato...,"invention,music,entertainment,performance,tech...","invention,music,entertainment,performance,tech...","invention,music,entertainment,technology,art","entertainment,technology,art"
15,Janine Benyus,Biomimicry's surprising lessons from nature's ...,In this inspiring talk about recent developmen...,0:23:19,"biomimicry,DNA,evolution,biology,fish,science,...",0:11\r\r\rIt is a thrill to be here at a confe...,3373.0,"[thrill, conference, devote, inspire, nature, ...",thrill conference devote inspire nature imagin...,"evolution,biology,science,environment,animals,...","evolution,biodiversity,science,environment,ani...","biodiversity,science,environment,design,techno...","biodiversity,science,design,technology"
16,Kevin Kelly,How technology evolves,"Tech enthusiast Kevin Kelly asks ""What does te...",0:20:00,"philosophy,evolution,culture,choice,history,sc...","0:11\r\r\rI don't know about you, but I haven'...",3379.0,"[know, figure, exactly, technology, mean, life...",know figure exactly technology mean life spend...,"philosophy,evolution,culture,history,science,f...","science,evolution,culture,history,future,techn...","science,culture,history,future,technology","science,culture,future,technology"
18,Mena Trott,Meet the founder of the blog revolution,"The founding mother of the blog revolution, Mo...",0:16:46,"software,culture,design,entertainment,storytel...","0:11\r\r\rOver the past couple of days,\ras I'...",3157.0,"[past, couple, day, prepare, speech, nervous, ...",past couple day prepare speech nervous go stag...,"culture,design,entertainment,storytelling,busi...","culture,design,entertainment,storytelling,busi...","culture,design,entertainment,business,communic...","culture,design,entertainment,business"
19,Michael Shermer,Why people believe weird things,Why do people see the Virgin Mary on a cheese ...,0:13:25,"faith,illusion,culture,religion,science,entert...","0:11\r\r\rI'm Michael Shermer,\rdirector of th...",2399.0,"[michael, shermer, director, skeptic, society,...",michael shermer director skeptic society publi...,"culture,religion,science,entertainment","culture,religion,science,entertainment","culture,science,entertainment","culture,science,entertainment"
20,Peter Gabriel,Fight injustice with raw video,Musician and activist Peter Gabriel shares his...,0:14:08,"TED Brain Trust,film,culture,music,activism,so...","0:11\r\r\rI love trees, and I'm very lucky,\rb...",1779.0,"[love, tree, lucky, live, near, wonderful, arb...",love tree lucky live near wonderful arboretum ...,"film,culture,music,activism,social change,stor...","film,culture,music,activism,global issues,stor...","culture,music,activism,global issues,collabora...","culture,global issues,art"


In [34]:
# Recount the tags of the talks kept in df4_cut.
s4cut_tag_counts,s4cut_tag_list = count_tags(df4_cut['squash4_tags'])


In [35]:
s4cut_tag_counts

culture          507
science          385
technology       367
entertainment    285
art              261
global issues    249
future           218
biodiversity     215
education        206
design           201
business          98
dtype: int64

In [36]:
# Keep only merged tags that occur more than 300 times.
tag_cutoff5 = 300

# Filter the counts Series directly and name the column 0 explicitly; the
# pd.DataFrame(counts)[0] form breaks when value_counts() names its result.
squashed_tags5 = s_tag_counts[s_tag_counts > tag_cutoff5].to_frame(name=0)
squash_list5 = list(squashed_tags5.index.values)
print(len(squash_list5))
squashed_tags5

6


Unnamed: 0,0
culture,1106
science,868
technology,787
global issues,679
design,400
business,329


In [37]:
# Stage 5: restrict every talk to the tags in squash_list5, drop talks with
# nothing left, then recount.
df5 = df4.copy()
df5['squash5_tags'] = df5['squash4_tags'].map(lambda tags: squashing(tags, squash_list5))
df5 = df5.loc[df5['squash5_tags'] != '']

s5_tag_counts, s5_tag_list = count_tags(df5['squash5_tags'])
s5_tag_counts

culture          1106
science           868
technology        787
global issues     679
design            400
business          329
dtype: int64

In [38]:
# Renumber the rows of every filtered frame 0..n-1, discarding the old labels.
df1 = df1.reset_index(drop=True)
df2 = df2.reset_index(drop=True)
df3 = df3.reset_index(drop=True)
df4 = df4.reset_index(drop=True)
# reindex(range(n)) would select rows *by label*: labels missing after the
# filtering become all-NaN rows and rows labelled >= n are dropped.
# reset_index renumbers the existing rows instead, like for df1..df4.
df5 = df5.reset_index(drop=True)

In [39]:
# df1.to_parquet(OUT_DATA_DIR+'cleaned_squashed1.parquet')
# df2.to_parquet(OUT_DATA_DIR+'cleaned_squashed2.parquet')
# df3.to_parquet(OUT_DATA_DIR+'cleaned_squashed3.parquet')
# df4.to_parquet(OUT_DATA_DIR+'cleaned_squashed4.parquet')
# df5.to_parquet(OUT_DATA_DIR+'cleaned_squashed5.parquet')

In [40]:
df3.iloc[140:150]

Unnamed: 0,speaker,headline,description,duration,tags,transcript,WC,clean_transcript,clean_transcript_string,squash_tags,squash2_tags,squash3_tags
140,Theo Jansen,"My creations, a new form of life",Artist Theo Jansen demonstrates the amazingly ...,0:08:13,"demo,science and art,biomechanics,animals,desi...",0:11\r\r\rI would like to tell you about a pro...,910.0,"[like, tell, project, start, year, ago, make, ...",like tell project start year ago make new form...,"demo,animals,design,entertainment,creativity,t...","demo,animals,design,entertainment,culture,tech...","design,entertainment,culture,technology,art"
141,Steven Pinker,The surprising decline in violence,Steven Pinker charts the decline of violence f...,0:19:15,"TED Brain Trust,sociology,culture,media,war,vi...","0:11\r\r\rImages like this, from the Auschwitz...",3096.0,"[image, like, auschwitz, concentration, camp, ...",image like auschwitz concentration camp sear c...,"culture,media,war,violence,global issues,business","culture,media,war,violence,global issues,business","culture,global issues,business"
142,Steven Pinker,What our language habits reveal,In an exclusive preview of his book <i>The Stu...,0:17:27,"TED Brain Trust,psychology,culture,language,sc...","0:12\r\r\rThis is a picture of Maurice Druon,\...",2981.0,"[picture, maurice, druon, honorary, perpetual,...",picture maurice druon honorary perpetual secre...,"psychology,culture,language,science","science,culture,language","science,culture"
143,Hod Lipson,"Building ""self-aware"" robots",Hod Lipson demonstrates a few of his cool litt...,0:06:18,"robots,cognitive science,demo,AI,evolution,des...","0:11\r\r\rSo, where are the robots?\rWe've bee...",1121.0,"[robot, tell, year, come, soon, soon, cook, cl...",robot tell year come soon soon cook clean buy ...,"robots,cognitive science,demo,evolution,design...","robots,science,demo,evolution,design,technology","science,design,technology"
144,Stephen Petranek,10 ways the world could end,How might the human race end? Stephen Petranek...,0:29:42,"solar system,space,humanity,future,climate cha...",0:11\r\r\rThe advances that have taken place i...,4810.0,"[advance, take, place, astronomy, cosmology, b...",advance take place astronomy cosmology biology...,"space,humanity,future,climate change,global is...","space,humanity,future,culture,global issues,te...","humanity,future,culture,global issues,technology"
145,Zeresenay Alemseged,The search for humanity's roots,Paleoanthropologist Zeresenay Alemseged looks ...,0:15:51,"human origins,anthropology,paleontology,Africa...",0:11\r\r\rI have 18 minutes to tell you what h...,2523.0,"[minute, tell, happen, past, million, year, ri...",minute tell happen past million year right com...,"africa,exploration,humanity,science,global issues","africa,exploration,humanity,science,global issues","humanity,science,global issues"
146,Jeff Skoll,My journey into movies that matter,Film producer Jeff Skoll (An Inconvenient Trut...,0:15:31,"philanthropy,film,social change,entertainment,...",0:11\r\r\rI've actually been waiting by the ph...,2666.0,"[actually, wait, phone, ted, year, fact, ready...",actually wait phone ted year fact ready talk e...,"film,social change,entertainment,global issues...","film,culture,global issues,entertainment,business","culture,global issues,entertainment,business"
147,Deborah Scranton,An Iraq war movie crowd-sourced from soldiers,Filmmaker Deborah Scranton talks about and sho...,0:17:36,"film,war,entertainment,storytelling,global iss...","0:12\r\r\rThree years ago, I got a phone call,...",2959.0,"[year, ago, get, phone, base, early, film, off...",year ago get phone base early film offer embed...,"film,war,entertainment,storytelling,global iss...","film,war,entertainment,storytelling,global iss...","entertainment,global issues,technology"
148,John Maeda,Designing for simplicity,The MIT Media Lab's John Maeda lives at the in...,0:15:59,"simplicity,design,technology,art",0:11\r\r\rOn simplicity. What a great way to s...,3188.0,"[simplicity, great, way, start, watch, trend, ...",simplicity great way start watch trend book li...,"design,technology,art","design,technology,art","design,technology,art"
150,Sugata Mitra,Kids can teach themselves,"Speaking at LIFT 2007, Sugata Mitra talks abou...",0:20:59,"education,culture,cities,children,teaching,glo...","0:11\r\r\rI have a tough job to do.\rYou know,...",3251.0,"[tough, job, know, look, profile, audience, co...",tough job know look profile audience connotati...,"education,culture,cities,children,global issues","education,culture,cities,children,global issues","education,culture,children,global issues"


In [53]:
# Scratch example: a three-row frame with the middle row (label 1) dropped,
# leaving index labels 0 and 2.
d = pd.DataFrame(['a','b','c'])
d = d.drop(1)

In [55]:
# Renumber the remaining rows from 0 and discard the old labels.
d = d.reset_index(drop=True)

In [56]:
d

Unnamed: 0,0
0,a
1,c


In [57]:
df3.iloc[145:150]

Unnamed: 0,speaker,headline,description,duration,tags,transcript,WC,clean_transcript,clean_transcript_string,squash_tags,squash2_tags,squash3_tags
145,Zeresenay Alemseged,The search for humanity's roots,Paleoanthropologist Zeresenay Alemseged looks ...,0:15:51,"human origins,anthropology,paleontology,Africa...",0:11\r\r\rI have 18 minutes to tell you what h...,2523.0,"[minute, tell, happen, past, million, year, ri...",minute tell happen past million year right com...,"africa,exploration,humanity,science,global issues","africa,exploration,humanity,science,global issues","humanity,science,global issues"
146,Jeff Skoll,My journey into movies that matter,Film producer Jeff Skoll (An Inconvenient Trut...,0:15:31,"philanthropy,film,social change,entertainment,...",0:11\r\r\rI've actually been waiting by the ph...,2666.0,"[actually, wait, phone, ted, year, fact, ready...",actually wait phone ted year fact ready talk e...,"film,social change,entertainment,global issues...","film,culture,global issues,entertainment,business","culture,global issues,entertainment,business"
147,Deborah Scranton,An Iraq war movie crowd-sourced from soldiers,Filmmaker Deborah Scranton talks about and sho...,0:17:36,"film,war,entertainment,storytelling,global iss...","0:12\r\r\rThree years ago, I got a phone call,...",2959.0,"[year, ago, get, phone, base, early, film, off...",year ago get phone base early film offer embed...,"film,war,entertainment,storytelling,global iss...","film,war,entertainment,storytelling,global iss...","entertainment,global issues,technology"
148,John Maeda,Designing for simplicity,The MIT Media Lab's John Maeda lives at the in...,0:15:59,"simplicity,design,technology,art",0:11\r\r\rOn simplicity. What a great way to s...,3188.0,"[simplicity, great, way, start, watch, trend, ...",simplicity great way start watch trend book li...,"design,technology,art","design,technology,art","design,technology,art"
150,Sugata Mitra,Kids can teach themselves,"Speaking at LIFT 2007, Sugata Mitra talks abou...",0:20:59,"education,culture,cities,children,teaching,glo...","0:11\r\r\rI have a tough job to do.\rYou know,...",3251.0,"[tough, job, know, look, profile, audience, co...",tough job know look profile audience connotati...,"education,culture,cities,children,global issues","education,culture,cities,children,global issues","education,culture,children,global issues"
