In [1]:
import pandas as pd
import gensim

In [2]:
# Relative directory that holds both the input and the output parquet file.
DATA_DIR = "../../data/raw/"
# File read into `df` by pd.read_parquet below.
INPUT_FILE_NAME = 'cleaned.parquet'
# File written by df.to_parquet once the 'squash_tags' column has been added.
OUTPUT_FILE_NAME = 'cleaned_squashed.parquet'

In [3]:
# Load the cleaned talks table and preview its first rows.
df = pd.read_parquet(DATA_DIR + INPUT_FILE_NAME)
df.head()

Unnamed: 0,speaker,headline,description,duration,tags,transcript,WC,clean_transcript,clean_transcript_string
0,Al Gore,Averting the climate crisis,With the same humor and humanity he exuded in ...,0:16:17,"cars,alternative energy,culture,politics,scien...","0:14\r\r\rThank you so much, Chris.\rAnd it's ...",2281.0,"[thank, chris, truly, great, honor, opportunit...",thank chris truly great honor opportunity come...
1,Amy Smith,Simple designs to save a life,Fumes from indoor cooking fires kill more than...,0:15:06,"MacArthur grant,simplicity,industrial design,a...","0:11\r\r\rIn terms of invention,\rI'd like to ...",2687.0,"[term, invention, like, tell, tale, favorite, ...",term invention like tell tale favorite project...
2,Ashraf Ghani,How to rebuild a broken state,Ashraf Ghani's passionate and powerful 10-minu...,0:18:45,"corruption,poverty,economics,investment,milita...","0:12\r\r\rA public, Dewey long ago observed,\r...",2506.0,"[public, dewey, long, ago, observe, constitute...",public dewey long ago observe constitute discu...
3,Burt Rutan,The real future of space exploration,"In this passionate talk, legendary spacecraft ...",0:19:37,"aircraft,flight,industrial design,NASA,rocket ...","0:11\r\r\rI want to start off by saying, Houst...",3092.0,"[want, start, say, houston, problem, enter, se...",want start say houston problem enter second ge...
4,Chris Bangle,Great cars are great art,American designer Chris Bangle explains his ph...,0:20:04,"cars,industrial design,transportation,inventio...","0:12\r\r\rWhat I want to talk about is, as bac...",3781.0,"[want, talk, background, idea, car, art, actua...",want talk background idea car art actually mea...


In [4]:
# Normalise each talk's comma-separated tag string: remove the space that
# follows a comma, lower-case everything and trim the ends of the string.
tags = (
    df['tags']
    .str.replace(', ', ',')
    .str.lower()
    .str.strip()
)

# Per-talk list of tags and the number of tags each talk carries.
split_tags = tags.str.split(',')
tag_counts_per_talk = split_tags.map(len)

# Flatten all talks into one long list of tags (duplicates kept) ...
joined_tags = tags.str.cat(sep=',').split(',')
all_tags_w_dup = pd.Series(joined_tags)

# ... and count how often every distinct tag occurs.
tag_counts = all_tags_w_dup.value_counts()


In [None]:
# Show how often each tag occurs across all talks.
print(tag_counts)

In [None]:
def tokenise(documents):
    """Yield one list of tokens per input string.

    Every string is handed to ``gensim.utils.simple_preprocess``; that call
    decides how the text is split and which tokens are kept.

    Parameters
    ----------
    documents : iterable of str
        Raw text documents (in this notebook: one tag string per talk).

    Yields
    ------
    list of str
        The tokens produced for the corresponding document.
    """
    for string in documents:
        yield gensim.utils.simple_preprocess(string)


# One document per talk. The values are taken straight from the Series rather
# than joined on '_' and split on '_' again: that round trip would cut a
# talk's tag string in two if any tag ever contained an underscore.
# Missing values are dropped, exactly as the join used to skip them.
tag_documents = list(tokenise(tags.dropna().tolist()))

# Convert every 'clean_transcript' cell with .tolist() so the corpus is a
# plain Python list with one entry per talk.
# NOTE(review): presumably each cell is a numpy array of tokens - confirm.
transcript_documents = [tokens.tolist() for tokens in df['clean_transcript']]

In [None]:
# Word2Vec over the per-talk tag documents, followed by an explicit
# 10-epoch train() pass over the same documents.
tag_model = gensim.models.Word2Vec(
    tag_documents,
    size=150,
    window=5,
    min_count=2,
    workers=4,
)
tag_model.train(
    tag_documents,
    total_examples=len(tag_documents),
    epochs=10,
)

# Same hyper-parameters, this time fitted on the transcript token lists.
transcript_model = gensim.models.Word2Vec(
    transcript_documents,
    size=150,
    window=5,
    min_count=2,
    workers=4,
)
transcript_model.train(
    transcript_documents,
    total_examples=len(transcript_documents),
    epochs=10,
)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

def tsne_plot(model,squash):
    """Project a Word2Vec model's word vectors to 2-D with t-SNE and plot them.

    Parameters
    ----------
    model : gensim Word2Vec model
        Trained model. The vocabulary is read from ``model.wv.vocab`` and the
        vectors are looked up through ``model.wv``.
    squash : bool
        If truthy, only vocabulary words that also occur in the module-level
        ``squash_list`` are plotted, so ``squash_list`` has to exist before
        the function is called this way. If falsy, every vocabulary word is
        plotted.

    Returns
    -------
    None
        The annotated scatter plot is displayed with ``plt.show()``.
    """
    if squash:
        # Built only on this branch so squash=False keeps working even when
        # squash_list has not been defined yet; a set makes the membership
        # test constant-time.
        keep = set(squash_list)
        labels = [word for word in model.wv.vocab if word in keep]
    else:
        labels = list(model.wv.vocab)

    # Look vectors up through model.wv: indexing the model object itself is a
    # deprecated shortcut for the same lookup.
    tokens = [model.wv[word] for word in labels]

    # random_state is fixed so repeated runs start from the same seed.
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=3500, random_state=32)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One scatter call per point (as before) so every word gets its own
    # colour from the colour cycle, with the word written next to it.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [5]:
# A tag is kept only when it occurs more often than 1% of the number of talks
# (rounded down).
tag_cutoff = int(0.01*len(df.index))

# Filter the counts Series directly instead of wrapping it in a DataFrame and
# selecting column 0. The column label pandas derives for a value_counts()
# result depends on the pandas version (0 for an unnamed Series, 'count' from
# pandas 2.0 on), so `squashed_tags[0]` can raise KeyError.
# to_frame(name=0) keeps the single column labelled 0, as it was before.
squashed_tags = tag_counts[tag_counts > tag_cutoff].to_frame(name=0)
squash_list = list(squashed_tags.index.values)


In [6]:
# Show the tags that passed the cutoff together with their counts.
print(squashed_tags)

                     0
technology         695
science            522
global issues      483
culture            470
design             400
tedx               398
business           329
entertainment      285
health             226
innovation         212
education          206
art                204
society            202
social change      198
communication      185
politics           183
future             181
biology            174
creativity         174
humanity           164
collaboration      163
environment        155
economics          154
medicine           154
brain              148
activism           147
community          136
invention          136
history            135
children           135
...                ...
philanthropy        32
sports              31
algorithm           30
investment          29
gaming              29
feminism            29
disability          29
plants              28
statistics          28
microbiology        28
success             28
money      

In [8]:
def squashing(x, allowed_tags=None):
    """Reduce a talk's tag string to the tags that are being kept.

    The string is normalised the same way as when the tag counts are built
    (the space after a comma is removed, the text is lower-cased and the ends
    of the whole string are trimmed), split on commas, and filtered.

    Parameters
    ----------
    x : str
        Comma-separated tags of one talk. Anything that is not a string
        (for example a missing value) yields an empty result instead of
        raising.
    allowed_tags : iterable of str, optional
        Tags to keep. Defaults to the module-level ``squash_list``, which
        must then already be defined.

    Returns
    -------
    str
        The kept tags in their original order, joined with ','. Empty when
        no tag is kept.
    """
    if not isinstance(x, str):
        return ''

    allowed = squash_list if allowed_tags is None else allowed_tags
    if not isinstance(allowed, (set, frozenset)):
        # Constant-time membership tests instead of scanning a list per tag.
        allowed = set(allowed)

    tags = x.replace(', ', ',').lower().strip()
    return ','.join(tag for tag in tags.split(',') if tag in allowed)

In [10]:
# Derive the reduced tag string for every talk, then write the whole frame
# (including the new column) to the output parquet file.
df['squash_tags'] = df['tags'].map(squashing)
df.to_parquet(DATA_DIR + OUTPUT_FILE_NAME)

In [None]:
# Normalise the squashed tag strings the same way the raw tags are normalised.
squash_tags = df['squash_tags'].str.replace(', ', ',').str.lower().str.strip()
# One document per talk, taken straight from the Series: joining on '_' and
# splitting on '_' again would break a document apart if a tag ever contained
# an underscore. Missing values are dropped, as the join used to skip them.
squash_tag_documents = list(tokenise(squash_tags.dropna().tolist()))

squash_tag_model = gensim.models.Word2Vec(squash_tag_documents,size=150,window=5,min_count=2,workers=4)
# total_examples has to describe the corpus that is actually passed to
# train(); it previously used the length of the unrelated tag_documents list.
squash_tag_model.train(squash_tag_documents,total_examples=len(squash_tag_documents),epochs=10)


In [None]:
# Plot the tag model twice: first every vocabulary word, then only the
# vocabulary words that also appear in squash_list.
tsne_plot(tag_model,False)
tsne_plot(tag_model,True)

In [None]:
# Filtered plot for the model trained on the squashed tags.
tsne_plot(squash_tag_model,True)

In [None]:
# df[df['tags']=='']

In [None]:
# df.to_pickle("./squashed_processed_data.pkl")

In [None]:
# df['tags']

In [None]:
# Ten nearest neighbours of a probe word in the tag embedding.
w = 'alternative'
tag_model.wv.most_similar(positive=w,topn=10)
# transcript_model.wv.most_similar(positive=w,topn=10)

In [None]:
import spacy
# Load the 'en_core_web_lg' spaCy pipeline; it is applied to every kept tag
# below so that tags can be compared with .similarity().
nlp = spacy.load('en_core_web_lg')

In [None]:
# Run every kept tag through the spaCy pipeline. The order of squash_list is
# preserved, so position k in nlp_list belongs to squash_list[k].
nlp_list = [nlp(tag) for tag in squash_list]

In [None]:
# Compare every unordered pair of kept tags exactly once (inner index strictly
# below outer index) and record the pairs whose similarity exceeds 0.7.
simlist = []                                 # [tag, other tag, similarity] per pair
tagsimcount = [0 for _ in squash_list]       # number of similar partners per tag
tagsimdict = {el: [] for el in squash_list}  # tag -> tags it is similar to

for outer, outer_doc in enumerate(nlp_list):
    for inner in range(outer):
        sim = outer_doc.similarity(nlp_list[inner])
        if sim > 0.7:
            ow = squash_list[outer]
            iw = squash_list[inner]
            simlist.append([ow, iw, sim])
            # The relation is symmetric: update both members of the pair.
            tagsimcount[outer] += 1
            tagsimcount[inner] += 1
            tagsimdict[ow].append(iw)
            tagsimdict[iw].append(ow)

print(simlist)
print(tagsimcount)
print(tagsimdict)

In [None]:
# replace tags
# SPLIT UP: 'women in business' -> 'women','business'
# GENERALISE: 'education','teaching' -> 'education'
# REMAIN: 'africa', 'asia' -> 'africa', 'asia'

In [None]:
# Inspect which kept tags were paired with 'teaching' (similarity above 0.7).
tagsimdict['teaching']

In [None]:
# Kept tags that consist of more than one word, i.e. contain a space.
longtags = [t for t in squash_list if ' ' in t]
print(longtags)

In [None]:
# For each multi-word tag, collect the words that are kept tags in their own
# right. When none of its words qualifies, the original tag is kept unchanged.
potential = []
for lt in longtags:
    known_words = [word for word in lt.split() if word in squash_list]
    potential.append(known_words if known_words else lt)
print(potential)