# Idea:
Our solution: LDA + BERT based embeddings of noun phrases and verbs :
- Each noun phrase and verb in the texts is  transformed to embedding vector using Universal Sentence Encoder (transformer based on BERT)
- Embedding vectors from (a) are clustered (HDBSCAN + UNET)
- Words/phrases with embedding vectors closest to the centers of resulting clusters form key word/phrase
- Each text in the training sample is converted to collection of key-phrases by replacing its noun phrases and verbs with keyword/phrases and deleting other words
- LDA is performed on the transformed texts


**Reference:**<br>
- Daniel Cer, Yinfei Yang, Sheng-yi Kong, Nan Hua, Nicole Limtiaco, Rhomni St. John, Noah Constant, Mario Guajardo-Céspedes, Steve Yuan, Chris Tar, Yun-Hsuan Sung, Brian Strope, Ray Kurzweil. **Universal Sentence Encoder.** *arXiv:1803.11175, 2018.*
- McInnes, L, Healy, J, **UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction**, *ArXiv e-prints 1802.03426, 2018*

# Load data and python libraries

In [1]:
# data processing libraries
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# display wider columns in pandas data frames where necessary
pd.set_option('max_colwidth',150)

import tensorflow as tf
print("TensorFlow version:", tf.__version__)

import tensorflow_hub as hub
#Load the Universal Sentence Encoder's TF Hub module
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
model = hub.load(module_url)
print ("module %s loaded" % module_url)

import umap
import hdbscan

TensorFlow version: 2.2.0
module https://tfhub.dev/google/universal-sentence-encoder-large/5 loaded


In [2]:
df_train = pd.read_csv("./transition_files/train.tsv", sep='\t')
print("df_train.shape:", df_train.shape)
print("df_train.shape:",df_train.columns)

df_train.shape: (33982, 12)
df_train.shape: Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas'],
      dtype='object')


# Getting text clusters through sentence embedding comparison

In [3]:
def get_embeddings(input):
    return model(input)

In [4]:
def get_word_embeddings(df_data, column = "word", N_batches=1):
    #split data into N batches
    N = N_batches

    part = int(len(df_data)/N)
    print(N, "batches with", part + 1, column + "s each")

    #get embeddings for each N words
    index = 0
    batch_num = 0
    list_dfs = []

    while index < len(df_data): 
        df_tmp = df_data.iloc[index : index + part].copy()
        df_tmp = df_tmp.reset_index(drop=True)
        print ("Batch number:", batch_num + 1, "out of ", N)

        df_batch_embeddings = pd.DataFrame(get_embeddings(list(df_tmp[column])).numpy())

        num_embeddings = df_batch_embeddings.shape[1]
        columns = ["emb_" + str(i) for i in range(512)]
        df_tmp[columns] = df_batch_embeddings

        list_dfs.append(df_tmp)
        batch_num = batch_num + 1
        index = index + part

    #concatinate batches into single dataset
    df_emb = pd.concat(list_dfs)

    return df_emb

In [5]:
df_train['noun_phrases'] = df_train['noun_phrases'].str[2:-2]
df_train['noun_phrases'] = df_train['noun_phrases'].str.lower().str.split("', '")
df_train['noun_phrases'].head()

0    [rise, big emerging economy, china, india, steady march, globalisation, surge, number, people, business, tourism, result, demand, visa, unpreceden...
1    [pfizer, commitment, corporate social responsibility csr, drugs giant talk, responsibility, society, world, access, product, work, ngos, global he...
2    [week, federal reserve, interest rate, time, year, world, central bank, rate, recent year, long spell, course, chart, outcome, americas rate rise,...
3    [cruise line, wave, year, nearly, holiday, sea, result, december 18th carnival, worlds largest operator, global market, fullyear earning, demand, ...
4    [investors, calendar year, buoyant mood, unexpected event, consensus, respect, view, investor, market price, column, potential surprise, definitio...
Name: noun_phrases, dtype: object

In [6]:
all_NPs = list(df_train['noun_phrases'])
all_NPs = [np for l in all_NPs for np in l if len(np)>0]
all_NPs[:5], len(all_NPs)

(['rise', 'big emerging economy', 'china', 'india', 'steady march'], 1417049)

In [7]:
df_train['list_of_verb_lemmas'].iloc[0]

'[emerging, led, wanting, travel, granted, Upgrade, travel, apply, submit, streamline, scrap]'

In [8]:
df_train['list_of_verb_lemmas'] = df_train['list_of_verb_lemmas'].str[2:-2]
df_train['list_of_verb_lemmas'] = df_train['list_of_verb_lemmas'].str.lower().str.split(", ")
df_train['list_of_verb_lemmas'].head()

0                                                               [merging, led, wanting, travel, granted, upgrade, travel, apply, submit, streamline, scra]
1    [rided, embracing, insists, gain, strengthen, improve, deterred, seeking, intends, shift, domiciled, rejoiced, saved, paid, outraged, promised, im...
2    [aised, ended, celebrate, tried, lift, forced, reverse, cut, help, understand, upgrade, strike, wish, save, spend, try, escape, slashing, encourag...
3      [race, booked, improve, announced, control, demand, peaking, piling, based, got, moving, upgrade, increase, announced, establish, aimed, based, ad]
4    [tart, caught, proved, reflected, like, suggest, judged, betting, expect, upgrade, weakens, having, pushed, tighten, buy, priced, doubt, tighten, ...
Name: list_of_verb_lemmas, dtype: object

In [9]:
all_Vs = list(df_train['list_of_verb_lemmas'])
all_Vs = [v for l in all_Vs for v in l if len(v)>0]
all_Vs[:5], len(all_Vs)

(['merging', 'led', 'wanting', 'travel', 'granted'], 675330)

In [10]:
all_words =  list(set(all_NPs + all_Vs))
len(set(all_words))

419327

In [11]:
df_words = pd.DataFrame({'word': all_words})
df_words.head()

Unnamed: 0,word
0,ar art installation
1,st louis
2,passive investment vehicle
3,nicotinefueled train
4,outrageous controversial statement


In [12]:
%%time
#creating word2vec matrix
df_w2v = get_word_embeddings(df_words, column = "word", N_batches=100)
df_w2v.head()

100 batches with 4194 words each
Batch number: 1 out of  100
Batch number: 2 out of  100
Batch number: 3 out of  100
Batch number: 4 out of  100
Batch number: 5 out of  100
Batch number: 6 out of  100
Batch number: 7 out of  100
Batch number: 8 out of  100
Batch number: 9 out of  100
Batch number: 10 out of  100
Batch number: 11 out of  100
Batch number: 12 out of  100
Batch number: 13 out of  100
Batch number: 14 out of  100
Batch number: 15 out of  100
Batch number: 16 out of  100
Batch number: 17 out of  100
Batch number: 18 out of  100
Batch number: 19 out of  100
Batch number: 20 out of  100
Batch number: 21 out of  100
Batch number: 22 out of  100
Batch number: 23 out of  100
Batch number: 24 out of  100
Batch number: 25 out of  100
Batch number: 26 out of  100
Batch number: 27 out of  100
Batch number: 28 out of  100
Batch number: 29 out of  100
Batch number: 30 out of  100
Batch number: 31 out of  100
Batch number: 32 out of  100
Batch number: 33 out of  100
Batch number: 34 ou

Unnamed: 0,word,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_502,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511
0,ar art installation,0.020122,0.000388,0.012315,0.029161,-0.001987,-0.042893,-0.069303,-0.028578,-0.031996,...,0.003335,-0.017962,0.055528,-0.029352,0.046558,0.058642,0.027662,0.008687,0.004643,0.015045
1,st louis,0.025534,-0.046714,-0.019601,0.01069,0.000144,-0.040848,-0.023721,-0.039049,-0.056522,...,-0.062257,-0.069166,0.062389,-0.046099,-0.073728,0.037204,-0.011648,-0.01984,-0.031511,-0.064533
2,passive investment vehicle,0.031301,-0.088394,-0.007285,0.043246,0.006554,-0.005797,-0.048646,0.060434,0.053056,...,-0.045909,0.051535,-0.043772,-0.007596,0.085432,0.065079,0.022551,-0.070039,0.006904,-0.003364
3,nicotinefueled train,0.063141,0.028098,-0.080064,0.013365,0.008811,-0.047909,-0.034389,0.037728,0.027899,...,-0.011127,-0.027788,0.025181,-0.000268,0.000478,-0.014033,0.01618,0.03393,-0.026782,0.01801
4,outrageous controversial statement,-0.034301,-0.025041,-0.005895,0.019051,0.020005,0.038791,-0.044979,0.06052,-0.018211,...,-0.040841,-0.068408,0.037497,-0.004308,0.010036,0.020996,0.03082,0.062054,0.017476,0.065568


In [13]:
df_w2v.iloc[::150001]

Unnamed: 0,word,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_502,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511
0,ar art installation,0.020122,0.000388,0.012315,0.029161,-0.001987,-0.042893,-0.069303,-0.028578,-0.031996,...,0.003335,-0.017962,0.055528,-0.029352,0.046558,0.058642,0.027662,0.008687,0.004643,0.015045
3246,preacher,-0.019613,-0.003985,0.020316,-0.021083,0.007729,0.035804,0.052691,-0.028713,0.005826,...,-0.032107,0.020066,-0.045432,-0.023615,-0.03004,0.028765,-0.031556,0.043194,-0.001436,-0.047522
2299,pixabaythe centers,0.114876,0.00298,-0.048121,0.031062,0.019793,-0.031369,-0.051993,-0.012182,-0.071958,...,0.011845,-0.012947,0.008947,-0.029392,0.051362,0.036043,0.023112,0.007506,0.024925,-0.047059


# Dimensionality reduction 
UMAP https://github.com/lmcinnes/umap

In [14]:
columns = ["emb_" + str(i) for i in range(512)]
embeddings = df_w2v[columns].values
embeddings.shape

(419327, 512)

In [15]:
%%time
umap_embeddings = umap.UMAP(n_neighbors=15, 
                            n_components=5, 
                            metric='cosine').fit_transform(embeddings)
umap_embeddings.shape

CPU times: user 57min 30s, sys: 1min 8s, total: 58min 39s
Wall time: 6min 3s


(419327, 5)

# Clustering

In [16]:
cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                          metric='euclidean',                      
                          cluster_selection_method='eom').fit(umap_embeddings)

In [17]:
#cluster labels
labels = cluster.labels_
labels.shape

(419327,)

In [18]:
#number of clusters (key-words/phrases)
print("Number of clusters:",labels.max() + 1)

Number of clusters: 4297


# Prepare data for LDA

##### get cluster label as most frequent word/phrase of the cluster

In [105]:
df_tmp = df_w2v[['word']].copy()
df_tmp['cluster_number'] = labels
df_tmp.head()

Unnamed: 0,word,cluster_number
0,ar art installation,-1
1,st louis,-1
2,passive investment vehicle,3067
3,nicotinefueled train,2663
4,outrageous controversial statement,2020


In [106]:
df_all_words = pd.DataFrame({'word': all_NPs + all_Vs})
df_all_words = df_all_words.merge(df_tmp, on='word', how='inner')
print(df_all_words.shape)
df_all_words.head()

(2092379, 2)


Unnamed: 0,word,cluster_number
0,rise,-1
1,rise,-1
2,rise,-1
3,rise,-1
4,rise,-1


In [107]:
#label cluster with noise words as "noise"
def get_label(n,w):
    if n == -1:
        return "noise"
    else:
        return w

df_all_words['cluster_label'] = df_all_words.apply(lambda x: get_label(x.cluster_number, x.word), axis=1) 
df_all_words.iloc[::222222]

Unnamed: 0,word,cluster_number,cluster_label
0,rise,-1,noise
222222,detroit,3330,detroit
444444,president trump,2999,president trump
666666,business community,1960,business community
888888,annual shindig,-1,noise
1111110,short distance,2389,short distance
1333332,blue origin ceo bob smith,-1,noise
1555554,direct report,501,direct report
1777776,needed,3183,needed
1999998,monitors,2088,monitors


In [108]:
df_all_words['word_frequency'] = df_all_words.groupby(['cluster_number', 
                                                       'cluster_label'])['word'].transform("count")
df_all_words['word_max_frequency'] = df_all_words.groupby(['cluster_number'])['word_frequency'].transform("max")
df_all_words.iloc[::222222]

Unnamed: 0,word,cluster_number,cluster_label,word_frequency,word_max_frequency
0,rise,-1,noise,744651,744651
222222,detroit,3330,detroit,85,85
444444,president trump,2999,president trump,276,850
666666,business community,1960,business community,4,487
888888,annual shindig,-1,noise,744651,744651
1111110,short distance,2389,short distance,6,208
1333332,blue origin ceo bob smith,-1,noise,744651,744651
1555554,direct report,501,direct report,3,58
1777776,needed,3183,needed,812,2756
1999998,monitors,2088,monitors,40,340


In [109]:
pd.DataFrame(df_all_words.describe(percentiles=[0.01,0.1,0.20,0.3,0.4,0.5,0.75,0.95,0.99])).T

Unnamed: 0,count,mean,std,min,1%,10%,20%,30%,40%,50%,75%,95%,99%,max
cluster_number,2092379.0,1561.009356,1543.147129,-1.0,-1.0,-1.0,-1.0,-1.0,381.0,1223.0,2929.0,4211.0,4296.0,4297.0
word_frequency,2092379.0,265606.304395,356089.683416,1.0,1.0,2.0,23.0,123.0,367.0,861.0,744651.0,744651.0,744651.0,744651.0
word_max_frequency,2092379.0,265844.97292,355915.140473,1.0,8.0,62.0,163.0,351.0,622.0,1211.0,744651.0,744651.0,744651.0,744651.0


In [110]:
df_all_words['word_max_frequency'] = df_all_words.groupby(['cluster_number'])['word_frequency'].transform("max")

df_tmp_noise = pd.DataFrame(df_all_words[df_all_words['cluster_label'] == "noise"])
df_tmp_other = df_all_words[df_all_words['cluster_label'] != "noise"]
print(len(df_tmp_noise),len(df_tmp_other))
df_noise.head()

744734 1347645


Unnamed: 0,word,cluster_number,cluster_label,word_frequency,word_max_frequency
0,rise,-1,noise,744651,744651
1,rise,-1,noise,744651,744651
2,rise,-1,noise,744651,744651
3,rise,-1,noise,744651,744651
4,rise,-1,noise,744651,744651


In [111]:
df_tmp = df_tmp_other[df_tmp_other['word_max_frequency'] == df_tmp_other['word_frequency']]
df_tmp = df_tmp.groupby('cluster_number')['cluster_label'].last().reset_index()
print(df_tmp.shape)
df_tmp.iloc[::1000]

(4298, 2)


Unnamed: 0,cluster_number,cluster_label
0,0,gopro
1000,1000,data center
2000,2000,nazis
3000,3000,earnings report
4000,4000,point


In [112]:
df_word_clusters = df_all_words[['word', 'cluster_number']]
df_word_clusters = df_word_clusters.merge(df_tmp, on='cluster_number', how='left')

df_word_clusters['cluster_label'] = df_word_clusters['cluster_label'].fillna("noise")

print(df_word_clusters.shape)
df_word_clusters.iloc[::222222].T

(2092379, 3)


Unnamed: 0,0,222222,444444,666666,888888,1111110,1333332,1555554,1777776,1999998
word,rise,detroit,president trump,business community,annual shindig,short distance,blue origin ceo bob smith,direct report,needed,monitors
cluster_number,-1,3330,2999,1960,-1,2389,-1,501,3183,2088
cluster_label,noise,detroit,trump,community,noise,distance,noise,direct,need,monitor


In [113]:
#create word -> cluster_name series
del df_word_clusters['cluster_number']
df_word_clusters = df_word_clusters.drop_duplicates()

word_cluster_label = df_word_clusters.set_index("word")
print(len(word_cluster_label))
word_cluster_label.head()

419327


Unnamed: 0_level_0,cluster_label
word,Unnamed: 1_level_1
rise,noise
big emerging economy,economy
china,noise
india,noise
steady march,noise


# Next:
- replacing texts with list of key-words
- running LDA


https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6