In [None]:
import pandas as pd
import gensim

In [None]:
# Relative directory holding the parquet files. It is concatenated directly
# with the file names below, so the trailing slash is required.
DATA_DIR = "../../data/raw/"
# Parquet file that is read into `df`.
INPUT_FILE_NAME = 'cleaned.parquet'
# Intended output file name; the only `to_parquet` call that uses it is
# currently commented out further down.
OUTPUT_FILE_NAME = 'cleaned_squashed.parquet'

In [None]:
# Load the cleaned table. DATA_DIR already ends with a slash, so joining the
# two constants back to back gives the full path.
df = pd.read_parquet(f"{DATA_DIR}{INPUT_FILE_NAME}")
df.head()

In [None]:
# Normalise every row's tag string: drop the space after commas, lower-case,
# and trim surrounding whitespace.
tags = (
    df['tags']
    .str.replace(', ', ',')
    .str.lower()
    .str.strip()
)

# Per-row list of tags and the number of tags each row carries.
split_tags = tags.str.split(',')
tag_counts_per_talk = split_tags.apply(len)

# Flatten all rows into one long sequence of tags (duplicates kept) by joining
# the rows with commas and splitting the result again.
joined_tags = tags.str.cat(sep=',').split(',')
all_tags_w_dup = pd.Series(joined_tags)

# Number of occurrences of each distinct tag.
tag_counts = all_tags_w_dup.value_counts()


In [None]:
# Show how many times each normalised tag occurs across all rows.
print(tag_counts)

In [None]:
def tokenise(documents):
    """Lazily tokenise an iterable of strings.

    Args:
        documents: iterable of strings, one per document.

    Yields:
        For each input string, the token list returned by
        ``gensim.utils.simple_preprocess``.
    """
    for string in documents:
        yield gensim.utils.simple_preprocess(string)


# One document per row of `tags`. The rows are taken directly instead of being
# joined with '_' and split again, because that round-trip would cut any tag
# string containing an underscore into several documents. Missing values are
# dropped, which matches what `str.cat` did (it omits them from the join).
tag_documents = list(tokenise(tags.dropna().tolist()))

# Convert every transcript entry to a plain Python list.
# Assumes each `clean_transcript` value exposes `.tolist()` (e.g. a numpy
# array of tokens) - TODO confirm against the cleaning step.
transcript_documents = [tokens.tolist() for tokens in df['clean_transcript']]

In [None]:
# NOTE(review): gensim trains a Word2Vec model during construction when a
# corpus is passed, so each explicit train() call below appears to add further
# epochs on top of that initial pass - confirm this is intended.

# Embedding learned from the tag documents.
tag_model = gensim.models.Word2Vec(
    tag_documents,
    size=150,
    window=5,
    min_count=2,
    workers=4,
)
tag_model.train(
    tag_documents,
    total_examples=len(tag_documents),
    epochs=10,
)

# Embedding learned from the transcript documents, same hyper-parameters.
transcript_model = gensim.models.Word2Vec(
    transcript_documents,
    size=150,
    window=5,
    min_count=2,
    workers=4,
)
transcript_model.train(
    transcript_documents,
    total_examples=len(transcript_documents),
    epochs=10,
)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

def tsne_plot(model, squash, squash_words=None):
    """Project a Word2Vec model's word vectors to 2-D with t-SNE and plot them.

    Args:
        model: trained gensim Word2Vec model. The words come from
            ``model.wv.vocab`` and the vectors from ``model.wv[word]``.
        squash: when truthy, only words contained in ``squash_words`` are
            plotted; otherwise the whole vocabulary is plotted.
        squash_words: optional collection of words to keep when ``squash`` is
            truthy. When omitted, the notebook-level ``squash_list`` is used,
            which must then be defined before this function is called.

    Returns:
        None. The figure is rendered with ``plt.show()``.

    Raises:
        ValueError: if no vocabulary word is selected for plotting.
    """
    if squash:
        keep = squash_list if squash_words is None else squash_words
        labels = [word for word in model.wv.vocab if word in keep]
    else:
        labels = list(model.wv.vocab)

    if not labels:
        raise ValueError("tsne_plot: no vocabulary words selected for plotting")

    # Index through `model.wv`: indexing the model object itself is deprecated
    # in gensim 3.x and removed in 4.0.
    tokens = [model.wv[word] for word in labels]

    # Fixed random_state keeps the layout the same from run to run.
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=3500, random_state=32)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One scatter call per point, as before, so every point keeps its own
    # colour from matplotlib's colour cycle.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [None]:
# Minimum frequency for a tag to be kept: 1% of the number of rows in df,
# truncated to an integer.
tag_cutoff = int(0.01*len(df.index))

# Filter the counts Series directly. Wrapping it in a DataFrame and selecting
# column 0 only works while value_counts() returns an unnamed Series; from
# pandas 2.0 the result is named 'count', so column 0 raises KeyError.
squashed_tags = tag_counts[tag_counts > tag_cutoff].to_frame()

# Tags occurring more than `tag_cutoff` times; used to filter tags later on.
squash_list = list(squashed_tags.index.values)


In [None]:
# Show the tags that passed the frequency cutoff together with their counts.
print(squashed_tags)

In [None]:
def squashing(x, allowed_tags=None):
    """Reduce a comma-separated tag string to the tags that are allowed.

    The input is normalised the same way as the tag counts are built: the space
    after each comma is removed, the text is lower-cased, and surrounding
    whitespace is trimmed. Order and duplicates of the kept tags are preserved.

    Args:
        x: raw comma-separated tag string. Any non-string value (for example
            ``None`` or NaN from a missing cell) yields an empty result.
        allowed_tags: optional collection of tags to keep. When omitted, the
            notebook-level ``squash_list`` is used.

    Returns:
        The kept tags joined with ``','``, or ``''`` when none remain.
    """
    if not isinstance(x, str):
        return ''
    allowed = squash_list if allowed_tags is None else allowed_tags
    normalised = x.replace(', ', ',').lower().strip()
    return ','.join(tag for tag in normalised.split(',') if tag in allowed)

In [None]:
# Build the reduced tag string for every row by passing each original tag
# string through squashing().
df['squash_tags'] = df['tags'].map(squashing)
# df.to_parquet(DATA_DIR+OUTPUT_FILE_NAME)

In [None]:
# df[df['squash_tags']=='']
# Print 30% of the row count.
# NOTE(review): presumably a reference figure to compare with the number of
# rows whose squash_tags is empty (the commented-out filter above) - confirm.
print(0.3*len(df))

In [None]:
# Normalise the reduced tag strings the same way as the original tags.
squash_tags = df['squash_tags'].str.replace(', ', ',').str.lower().str.strip()

# One document per row, taken directly from the Series. Joining the rows with
# '_' and splitting again would cut any tag containing an underscore into
# several documents.
squash_tag_documents = list(tokenise(squash_tags.dropna().tolist()))

squash_tag_model = gensim.models.Word2Vec(squash_tag_documents,size=150,window=5,min_count=2,workers=4)
# total_examples must describe the corpus actually being trained on; it used
# to be taken from `tag_documents`, a different corpus.
squash_tag_model.train(squash_tag_documents,total_examples=len(squash_tag_documents),epochs=10)


In [None]:
# Plot the tag model twice: first the whole vocabulary, then only the words
# that appear in squash_list.
for only_squashed in (False, True):
    tsne_plot(tag_model, only_squashed)

In [None]:
# Plot the model trained on the reduced tags, restricted to squash_list words.
# NOTE(review): the model vocabulary consists of simple_preprocess tokens,
# while squash_list holds whole tag strings, so tags containing spaces may
# never match this filter - confirm.
tsne_plot(squash_tag_model,True)