# Idea:
Our solution: LDA + keywords from clusters of BERT based embeddings of noun phrases and verbs :
- Each noun phrase and verb in the texts is  transformed to embedding vector using Universal Sentence Encoder (transformer based on BERT)
- Embedding vectors from (a) are clustered (HDBSCAN + UNET)
- Words/phrases with embedding vectors closest to the centers of resulting clusters form key word/phrase
- Each text in the training sample is converted to collection of key-phrases by replacing its noun phrases and verbs with keyword/phrases and deleting other words
- LDA is performed on the transformed texts


**Reference:**<br>
- Daniel Cer, Yinfei Yang, Sheng-yi Kong, Nan Hua, Nicole Limtiaco, Rhomni St. John, Noah Constant, Mario Guajardo-Céspedes, Steve Yuan, Chris Tar, Yun-Hsuan Sung, Brian Strope, Ray Kurzweil. **Universal Sentence Encoder.** *arXiv:1803.11175, 2018.*
- McInnes, L, Healy, J, **UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction**, *ArXiv e-prints 1802.03426, 2018*

# Load data and python libraries

In [14]:
# data processing libraries
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# display wider columns in pandas data frames where necessary
pd.set_option('max_colwidth',150)

import tensorflow as tf
print("TensorFlow version:", tf.__version__)

import tensorflow_hub as hub
#Load the Universal Sentence Encoder's TF Hub module
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
model = hub.load(module_url)
print ("module %s loaded" % module_url)

import umap
import hdbscan

import pickle

TensorFlow version: 2.2.0
module https://tfhub.dev/google/universal-sentence-encoder-large/5 loaded


In [15]:
df_train = pd.read_csv("./data/train_grouped.tsv", sep="\t")
print("df_train.shape:", df_train.shape)
print("df_train.shape:",df_train.columns)

df_train.shape: (33982, 16)
df_train.shape: Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3'],
      dtype='object')


# Getting text clusters through sentence embedding comparison

In [16]:
def get_embeddings(input):
    return model(input)

In [17]:
def get_word_embeddings(df_data, column = "word", N_batches=1):
    #split data into N batches
    N = N_batches

    part = int(len(df_data)/N)
    print(N, "batches with", part + 1, column + "s each")

    #get embeddings for each N words
    index = 0
    batch_num = 0
    list_dfs = []

    while index < len(df_data): 
        df_tmp = df_data.iloc[index : index + part].copy()
        df_tmp = df_tmp.reset_index(drop=True)
        print ("Batch number:", batch_num + 1, "out of ", N)

        df_batch_embeddings = pd.DataFrame(get_embeddings(list(df_tmp[column])).numpy())

        num_embeddings = df_batch_embeddings.shape[1]
        columns = ["emb_" + str(i) for i in range(512)]
        df_tmp[columns] = df_batch_embeddings

        list_dfs.append(df_tmp)
        batch_num = batch_num + 1
        index = index + part

    #concatinate batches into single dataset
    df_emb = pd.concat(list_dfs)

    return df_emb

In [18]:
df_train['noun_phrases'] = df_train['noun_phrases'].str[2:-2]
df_train['noun_phrases'] = df_train['noun_phrases'].str.lower().str.split("', '")
df_train['noun_phrases'].head()

0    [rise, big emerging economy, china, india, steady march, globalisation, surge, number, people, business, tourism, result, demand, visa, unpreceden...
1    [pfizer, commitment, corporate social responsibility csr, drugs giant talk, responsibility, society, world, access, product, work, ngos, global he...
2    [week, federal reserve, interest rate, time, year, world, central bank, rate, recent year, long spell, course, chart, outcome, americas rate rise,...
3    [cruise line, wave, year, nearly, holiday, sea, result, december 18th carnival, worlds largest operator, global market, fullyear earning, demand, ...
4    [investors, calendar year, buoyant mood, unexpected event, consensus, respect, view, investor, market price, column, potential surprise, definitio...
Name: noun_phrases, dtype: object

In [19]:
all_NPs = list(df_train['noun_phrases'])
all_NPs = [np for l in all_NPs for np in l if len(np)>0]
all_NPs[:5], len(all_NPs)

(['rise', 'big emerging economy', 'china', 'india', 'steady march'], 1417049)

In [20]:
df_train['list_of_verb_lemmas'].iloc[0]

'[emerging, led, wanting, travel, granted, Upgrade, travel, apply, submit, streamline, scrap]'

In [21]:
df_train['list_of_verb_lemmas'] = df_train['list_of_verb_lemmas'].str[2:-2]
df_train['list_of_verb_lemmas'] = df_train['list_of_verb_lemmas'].str.lower().str.split(", ")
df_train['list_of_verb_lemmas'].head()

0                                                               [merging, led, wanting, travel, granted, upgrade, travel, apply, submit, streamline, scra]
1    [rided, embracing, insists, gain, strengthen, improve, deterred, seeking, intends, shift, domiciled, rejoiced, saved, paid, outraged, promised, im...
2    [aised, ended, celebrate, tried, lift, forced, reverse, cut, help, understand, upgrade, strike, wish, save, spend, try, escape, slashing, encourag...
3      [race, booked, improve, announced, control, demand, peaking, piling, based, got, moving, upgrade, increase, announced, establish, aimed, based, ad]
4    [tart, caught, proved, reflected, like, suggest, judged, betting, expect, upgrade, weakens, having, pushed, tighten, buy, priced, doubt, tighten, ...
Name: list_of_verb_lemmas, dtype: object

In [22]:
all_Vs = list(df_train['list_of_verb_lemmas'])
all_Vs = [v for l in all_Vs for v in l if len(v)>0]
all_Vs[:5], len(all_Vs)

(['merging', 'led', 'wanting', 'travel', 'granted'], 675330)

In [23]:
all_words =  list(set(all_NPs + all_Vs))
len(set(all_words))

419327

In [24]:
df_words = pd.DataFrame({'word': all_words})
df_words.head()

Unnamed: 0,word
0,english navy
1,affordable iphone xr
2,conley
3,nsa hacking team
4,driverless


In [25]:
%%time
#creating word2vec matrix
df_w2v = get_word_embeddings(df_words, column = "word", N_batches=100)
df_w2v.head()

100 batches with 4194 words each
Batch number: 1 out of  100
Batch number: 2 out of  100
Batch number: 3 out of  100
Batch number: 4 out of  100
Batch number: 5 out of  100
Batch number: 6 out of  100
Batch number: 7 out of  100
Batch number: 8 out of  100
Batch number: 9 out of  100
Batch number: 10 out of  100
Batch number: 11 out of  100
Batch number: 12 out of  100
Batch number: 13 out of  100
Batch number: 14 out of  100
Batch number: 15 out of  100
Batch number: 16 out of  100
Batch number: 17 out of  100
Batch number: 18 out of  100
Batch number: 19 out of  100
Batch number: 20 out of  100
Batch number: 21 out of  100
Batch number: 22 out of  100
Batch number: 23 out of  100
Batch number: 24 out of  100
Batch number: 25 out of  100
Batch number: 26 out of  100
Batch number: 27 out of  100
Batch number: 28 out of  100
Batch number: 29 out of  100
Batch number: 30 out of  100
Batch number: 31 out of  100
Batch number: 32 out of  100
Batch number: 33 out of  100
Batch number: 34 ou

Unnamed: 0,word,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_502,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511
0,english navy,0.019252,0.056447,-0.013358,0.032645,0.050376,-0.137254,-0.014414,-0.051146,0.01326,...,-0.053418,0.002012,0.104392,-0.024585,-0.007928,0.035448,0.011937,0.013509,-0.004595,0.003696
1,affordable iphone xr,-0.01707,0.066874,-0.045078,-0.001512,0.035852,-0.063014,0.00425,0.021522,0.120475,...,-0.025386,0.000343,-0.035876,-0.007965,0.030047,-0.041157,0.025264,-0.02475,-0.00667,-0.018801
2,conley,-0.086235,0.052709,-0.020092,-0.017369,0.010625,-0.070125,0.03213,-0.012201,0.005054,...,-0.054062,-0.049556,0.016414,0.009363,0.031734,0.021291,-0.007574,0.02839,-0.017262,0.011601
3,nsa hacking team,-0.007682,0.017487,0.066631,0.053021,-0.04503,0.017534,-0.015002,0.021589,-0.010603,...,-0.016597,0.027437,0.081878,-0.05794,-0.021009,0.067911,-0.003568,0.034031,0.059068,0.08852
4,driverless,0.000395,0.00978,0.045812,0.014513,0.029443,-0.05847,0.045797,0.006264,-4e-05,...,0.015909,-0.040082,0.045964,-0.007353,-0.003374,0.059233,0.005722,0.019076,0.002116,0.031821


In [26]:
df_w2v.iloc[::150001]

Unnamed: 0,word,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_502,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511
0,english navy,0.019252,0.056447,-0.013358,0.032645,0.050376,-0.137254,-0.014414,-0.051146,0.01326,...,-0.053418,0.002012,0.104392,-0.024585,-0.007928,0.035448,0.011937,0.013509,-0.004595,0.003696
3246,insane smartphone growth,0.027154,0.035906,0.041326,0.001631,-0.040255,0.019108,0.020335,0.11342,0.153041,...,0.013288,0.045752,-0.036631,-0.006495,0.077364,0.004067,-0.098226,0.066084,0.060544,0.043139
2299,visceral,-0.029612,0.0005,0.036229,-0.062932,-0.003113,0.049451,0.071272,0.055529,0.000105,...,-0.070087,-0.008031,-0.024188,0.002269,-0.07114,0.006097,0.001805,0.010009,-0.053891,-0.005071


# Dimensionality reduction 
UMAP https://github.com/lmcinnes/umap

In [27]:
columns = ["emb_" + str(i) for i in range(512)]
embeddings = df_w2v[columns].values
embeddings.shape

(419327, 512)

In [28]:
%%time
umap_model = umap.UMAP(n_neighbors=15, 
                        n_components=5, 
                        metric='cosine').fit(embeddings)

CPU times: user 58min 18s, sys: 1min 8s, total: 59min 26s
Wall time: 6min 2s


In [29]:
%%time
umap_embeddings = umap_model.transform(embeddings)
umap_embeddings.shape

CPU times: user 1.63 s, sys: 588 ms, total: 2.22 s
Wall time: 2.22 s


(419327, 5)

In [30]:
with open('./transition_files/umap_model.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(umap_model, f, pickle.HIGHEST_PROTOCOL)

# Clustering

In [31]:
hdbscan_cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                          metric='euclidean',                      
                          cluster_selection_method='eom').fit(umap_embeddings)

In [32]:
#cluster labels
labels = hdbscan_cluster.labels_
labels.shape

(419327,)

In [33]:
#number of clusters (key-words/phrases)
print("Number of clusters:",labels.max() + 1)

Number of clusters: 4303


In [34]:
with open('./transition_files/hdbscan_cluster.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(hdbscan_cluster, f, pickle.HIGHEST_PROTOCOL)

# Prepare data for LDA

##### get cluster label as most frequent word/phrase of the cluster

In [35]:
df_tmp = df_w2v[['word']].copy()
df_tmp['cluster_number'] = labels
df_tmp.head()

Unnamed: 0,word,cluster_number
0,english navy,761
1,affordable iphone xr,2260
2,conley,-1
3,nsa hacking team,-1
4,driverless,-1


In [36]:
df_all_words = pd.DataFrame({'word': list(all_NPs + all_Vs),
                             'cluster_label': list(all_NPs + all_Vs)
                            })
df_all_words = df_all_words.merge(df_tmp, on='word', how='inner')
print(df_all_words.shape)

(2092379, 3)


In [37]:
df_all_words['word_frequency'] = df_all_words.groupby(['cluster_number', 
                                                       'cluster_label'])['word'].transform("count")
df_all_words['word_max_frequency'] = df_all_words.groupby(['cluster_number'])['word_frequency'].transform("max")
df_all_words.iloc[::222220]

Unnamed: 0,word,cluster_label,cluster_number,word_frequency,word_max_frequency
0,rise,rise,-1,795,9553
222220,detroit,detroit,3332,85,85
444440,president trump,president trump,3370,276,850
666660,hostility,hostility,1308,20,20
888880,ink,ink,879,29,29
1111100,school bus,school bus,3229,23,130
1333320,private new space company,private new space company,-1,1,9553
1555540,pterodactyl,pterodactyl,1564,1,41
1777760,needed,needed,3433,812,2756
1999980,monitors,monitors,-1,40,9553


In [38]:
pd.DataFrame(df_all_words.describe(percentiles=[0.01,0.1,0.20,0.3,0.4,0.5,0.75,0.95,0.99])).T

Unnamed: 0,count,mean,std,min,1%,10%,20%,30%,40%,50%,75%,95%,99%,max
cluster_number,2092379.0,1555.766894,1543.933302,-1.0,-1.0,-1.0,-1.0,-1.0,365.0,1205.0,3047.0,4209.0,4294.0,4302.0
word_frequency,2092379.0,840.679864,2361.341249,1.0,1.0,1.0,3.0,13.0,46.0,122.0,694.0,3290.0,10360.0,20130.0
word_max_frequency,2092379.0,4254.39878,4711.107384,1.0,8.0,61.0,165.0,365.0,658.0,1252.0,9553.0,9553.0,20130.0,20130.0


In [39]:
df_tmp_noise = pd.DataFrame(df_all_words[df_all_words['cluster_number'] == -1])
df_tmp_other = df_all_words[df_all_words['cluster_number'] != -1]
print(len(df_tmp_noise),len(df_tmp_other))
df_tmp_noise.head()

745515 1346864


Unnamed: 0,word,cluster_label,cluster_number,word_frequency,word_max_frequency
0,rise,rise,-1,795,9553
1,rise,rise,-1,795,9553
2,rise,rise,-1,795,9553
3,rise,rise,-1,795,9553
4,rise,rise,-1,795,9553


In [40]:
df_tmp = df_tmp_other[df_tmp_other['word_max_frequency'] == df_tmp_other['word_frequency']]
df_tmp = df_tmp.groupby('cluster_number')['cluster_label'].last().reset_index()
print(df_tmp.shape)
df_tmp.iloc[::1000]

(4303, 2)


Unnamed: 0,cluster_number,cluster_label
0,0,gopro
1000,1000,responsibility
2000,2000,export
3000,3000,lack
4000,4000,object


In [41]:
df_word_clusters = df_all_words[['word', 'cluster_number']]
print(df_word_clusters.shape)

df_word_clusters = df_word_clusters.drop_duplicates()
df_word_clusters = df_word_clusters.merge(df_tmp, on='cluster_number', how='left')
df_word_clusters['cluster_label'] = df_word_clusters['cluster_label'].fillna("noise")

print(df_word_clusters.shape)
df_word_clusters.iloc[::222222].T

(2092379, 2)
(419327, 3)


Unnamed: 0,0,222222
word,rise,um hand
cluster_number,-1,2819
cluster_label,noise,hand


In [42]:
s = df_word_clusters.groupby('cluster_label')['word'].count()
s.describe()

count      4304.000000
mean         97.427277
std        2518.074008
min          15.000000
25%          23.000000
50%          37.000000
75%          65.000000
max      164979.000000
Name: word, dtype: float64

In [43]:
s.tail()

cluster_label
zikacarrying mosquito     80
zombie                    35
zoo                       29
zoom                     151
zuckerberg                91
Name: word, dtype: int64

In [44]:
df_word_clusters[df_word_clusters['cluster_label'] == "zombie"]

Unnamed: 0,word,cluster_number,cluster_label
24011,zombie,2168,zombie
24016,corporate zombie,2168,zombie
24022,zombie firmscompanie,2168,zombie
76572,zombie outbreak,2168,zombie
176015,zombie virus,2168,zombie
182615,zombie star,2168,zombie
193144,zombie infectionsincluding prevention,2168,zombie
193150,zombie paper,2168,zombie
193152,actual fictional zombie literature,2168,zombie
199134,zombie army,2168,zombie


In [45]:
df_word_clusters.to_csv('./transition_files/word_cluster_label.csv', index=False)

***
# Replace text words with their cluster names (KeyWords)
excluding "noise"

In [46]:
df_train.columns

Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3'],
      dtype='object')

In [47]:
df_train['all_words'] = df_train['list_of_verb_lemmas'] + df_train['noun_phrases']

In [48]:
#delete "noise" 
print(len(df_word_clusters))
df_word_clusters = df_word_clusters[df_word_clusters['cluster_label'] != "nose"]
print(len(df_word_clusters))

419327
419288


In [49]:
word_cluster_label_dict =dict(zip(df_word_clusters['word'], df_word_clusters['cluster_label']))

In [50]:
df_train['all_key_words'] = df_train['all_words'].apply(lambda Wlist: 
                                                        [word_cluster_label_dict[w] for w in  Wlist
                                                                                    if w in word_cluster_label_dict
                                                        ])
df_train[['all_key_words', 'all_words']].head()

Unnamed: 0,all_key_words,all_words
0,"[bound, led, noise, travel, noise, upgrade, travel, noise, noise, noise, sgt, noise, economic, noise, noise, noise, noise, surge, number, people, ...","[merging, led, wanting, travel, granted, upgrade, travel, apply, submit, streamline, scra, rise, big emerging economy, china, india, steady march,..."
1,"[noise, noise, noise, gain, raised, noise, prevent, noise, noise, shift, noise, excited, save, noise, noise, promise, noise, getting, noise, tryin...","[rided, embracing, insists, gain, strengthen, improve, deterred, seeking, intends, shift, domiciled, rejoiced, saved, paid, outraged, promised, im..."
2,"[noise, end, noise, trying, lift, noise, reverse, cut, help, understand, upgrade, strike, wish, save, noise, trying, escape, noise, noise, pulled,...","[aised, ended, celebrate, tried, lift, forced, reverse, cut, help, understand, upgrade, strike, wish, save, spend, try, escape, slashing, encourag..."
3,"[race, booked, noise, noise, control, demand, noise, noise, noise, noise, moving, upgrade, noise, noise, making, target, noise, ad, cruise, wave, ...","[race, booked, improve, announced, control, demand, peaking, piling, based, got, moving, upgrade, increase, announced, establish, aimed, based, ad..."
4,"[noise, noise, noise, noise, noise, suggest, judgment, betting, expect, upgrade, noise, noise, noise, noise, buy, noise, believe, noise, noise, in...","[tart, caught, proved, reflected, like, suggest, judged, betting, expect, upgrade, weakens, having, pushed, tighten, buy, priced, doubt, tighten, ..."


In [51]:
with open('./transition_files/df_train_for_LDA.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(df_train, f, pickle.HIGHEST_PROTOCOL)