In [1]:
import re
import pandas as pd
import numpy as np
import os
from datetime import datetime
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
import gensim.corpora as corpora
from scipy.cluster import hierarchy as sch
from bertopic import BERTopic

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load data
lucy = pd.read_csv('/workspace/data/lucy_Ukraine.csv')
# Missing posts become empty strings so every row can go through the cleanup.
lucy['contents'] = lucy['contents'].fillna('')

# Filter
# (pattern, replacement) pairs, applied in this order to every document.
# The patterns are raw strings so that regex escapes such as \? or \( are not
# parsed as (invalid) Python string escapes; the compiled expressions match
# exactly the same characters as before.
CLEANUP_STEPS = (
    # punctuation and assorted symbols -> space
    (re.compile(r"""[-=+,#/\?:^$.@*"※~&%ㆍ!』\‘|\(\)\[\]\<\>`'…《\》]"""), " "),
    # runs of stand-alone Hangul consonants / vowels -> space
    (re.compile(r"([ㄱ-ㅎㅏ-ㅣ]+)"), " "),
    # hearts, emoji and typographic quotes -> space
    (re.compile(r"([♡❤✌❣♥ᆢ✊❤️✨⤵️☺️;”“]+)"), " "),
    # the literal marker "_x000D_" is removed outright
    (re.compile(r"_x000D_"), ""),
)


def clean_contents(text):
    """Apply every CLEANUP_STEPS substitution to `text` and collapse whitespace.

    Splitting on whitespace and re-joining with single spaces also removes
    newlines and leading/trailing blanks. That is why no separate newline
    replacement is needed: the former ``Series.replace("\\n", '')`` only
    matched cells whose whole value was a newline, and such a cell still
    comes out as an empty string here.

    Args:
        text: one document as a ``str``.

    Returns:
        The cleaned document as a ``str``.
    """
    for pattern, replacement in CLEANUP_STEPS:
        text = pattern.sub(replacement, text)
    return " ".join(text.split())


# One column-wise pass instead of four row-wise DataFrame.apply passes.
lucy['contents'] = lucy['contents'].map(clean_contents)
timestamps = lucy['date'].to_list()
contents_data = lucy['contents'].to_list()

In [2]:
# Fit BERTopic on the cleaned documents using the 'jhgan/ko-sbert-sts'
# embedding model. nr_topics=5 requests topic reduction; the log printed by
# this cell reports a reduction from 571 topics to 6.
# NOTE(review): no random seed is fixed anywhere in this cell, so the fitted
# topics may differ between runs — confirm whether reproducibility matters.
topic_model = BERTopic(min_topic_size=20, verbose=True, nr_topics=5, embedding_model='jhgan/ko-sbert-sts')
# `topics` / `prob` hold whatever fit_transform returns for each document.
topics, prob = topic_model.fit_transform(contents_data)

Batches:   0%|          | 0/1345 [00:00<?, ?it/s]

2023-01-13 06:40:30,768 - BERTopic - Transformed documents to Embeddings
2023-01-13 06:41:22,255 - BERTopic - Reduced dimensionality
2023-01-13 06:41:25,242 - BERTopic - Clustered reduced embeddings
2023-01-13 06:42:15,768 - BERTopic - Reduced number of topics from 571 to 6


In [166]:
# Hierarchical topics
def linkage_function(x):
    """Run SciPy single-linkage clustering on `x` with optimal leaf ordering."""
    return sch.linkage(x, method='single', optimal_ordering=True)


# Table of merges between topics, built with the linkage defined above.
hierarchical_topics = topic_model.hierarchical_topics(
    contents_data,
    linkage_function=linkage_function,
)

100%|██████████| 4/4 [00:00<00:00, 44.35it/s]


In [167]:
# Display the hierarchy figure for the `hierarchical_topics` table, 800 wide.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, width=800)

In [168]:
# NOTE(review): a `global` statement at cell (module) level has no effect —
# names assigned in a cell are already module globals. This cell can likely
# be deleted.
global hierarchical_topics

In [169]:
hierarchical_topics

Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance
3,8,위해_우리_있는_우크라이나_여러분,"[0, 1, 2, 3, 4]",5,우크라이나_러시아_미사일_키이우_드론,7,위해_우리_있는_여러분_위한,1.151412
2,7,위해_우리_있는_여러분_위한,"[0, 1, 2]",6,위해_우리_여러분_있는_위한,2,수출_달러_지난해_수출은_대비,1.128212
1,6,위해_우리_여러분_있는_위한,"[0, 1]",1,모든_있는_그리고_내가_우리,0,위해_여러분_위한_우리_통해,0.826611
0,5,우크라이나_러시아_미사일_키이우_드론,"[3, 4]",4,러시아_국방부는_우크라이나_러시아군_하이마스,3,우크라이나_키이우_드론_미사일_러시아,0.692167


In [177]:
# Column-oriented dict of the hierarchy table: {column: {row label: value}}.
h_dict = hierarchical_topics.to_dict()
# The three ID columns as plain lists, in the table's row order.
h_parent_id, h_childl_id, h_childr_id = (
    list(h_dict[column].values())
    for column in ('Parent_ID', 'Child_Left_ID', 'Child_Right_ID')
)

In [183]:
# One list per line: left children, right children, parents.
print(h_childl_id, h_childr_id, h_parent_id, sep="\n")

['5', '6', '1', '4']
['7', '2', '0', '3']
['8', '7', '6', '5']


In [184]:
# Child IDs that never occur as a parent ID. Left-hand children are scanned
# first, then right-hand children, each in the table's row order.
unique_id = [
    child_id
    for child_id in h_childl_id + h_childr_id
    if child_id not in h_parent_id
]

print(unique_id)

['1', '4', '2', '0', '3']


In [162]:
# Number of topics under every merge node, listed in row-label order.
# sort_index() makes that order explicit and no longer requires the labels
# to be exactly 0..n-1.
length_list = [len(topics) for topics in hierarchical_topics['Topics'].sort_index()]

# Distinct node sizes, smallest first. The level loop pairs
# Topic_length_list[i] with Future_Topic_length_list[i] as "this level" and
# "next level"; that pairing is only meaningful when the sizes are ascending,
# so sort them instead of relying on the order in which they were first seen.
Topic_length_list = sorted(set(length_list))

# Every size except the smallest, i.e. the "next level" of each entry above.
# Slicing also copes with an empty hierarchy, where min() would raise.
Future_Topic_length_list = Topic_length_list[1:]

In [163]:
def extract_child(input_df, topic_length, next_topic_length):
    """Split off the hierarchy rows whose node covers `topic_length` topics.

    Rows are selected by the length of their 'Topics' entry (any row sharing
    a 'Parent_ID' with such a row is selected too). Each selected row is
    unfolded into two rows, one per child, so the result has a single pair
    of child columns instead of separate left/right pairs.

    Args:
        input_df: hierarchy table with the columns 'Parent_ID', 'Parent_Name',
            'Topics', 'Child_Left_ID', 'Child_Left_Name', 'Child_Right_ID',
            'Child_Right_Name' and 'Distance'.
        topic_length: node size to extract; also the suffix of the child
            columns ("Child_ID<topic_length>", "Child_Name<topic_length>").
        next_topic_length: suffix of the columns that receive the selected
            rows' 'Parent_ID' / 'Parent_Name'.

    Returns:
        Tuple ``(child_data, parent_id, parent_name)``:
        ``child_data`` lists all right-hand children first, then all
        left-hand children, with columns
        [parent_id, parent_name, 'Topics', child_id, child_name, 'Distance']
        and a fresh 0..n-1 index; ``parent_id`` / ``parent_name`` are the
        names of its two parent columns.

    Side effects:
        Rebinds the module-level ``hierarchical_topics`` to ``input_df``
        without the extracted rows (index reset). Callers rely on this to
        peel the hierarchy one level per call.
    """
    global hierarchical_topics

    child_name = "Child_Name" + str(topic_length)
    child_id = "Child_ID" + str(topic_length)
    parent_name = "Child_Name" + str(next_topic_length)
    parent_id = "Child_ID" + str(next_topic_length)

    # Select by column name rather than by position (a[2]) and without
    # assuming that the row labels are exactly 0..n-1.
    has_target_size = input_df['Topics'].map(len) == topic_length
    target_parent_ids = input_df.loc[has_target_size, 'Parent_ID']
    is_extracted = input_df['Parent_ID'].isin(target_parent_ids)

    extracted = input_df[is_extracted]
    # The mask comes from input_df itself, so the rows removed here are always
    # the rows returned below, even if input_df is not the global table.
    hierarchical_topics = input_df[~is_extracted].reset_index(drop=True)

    halves = []
    for side in ('Right', 'Left'):
        side_id = 'Child_' + side + '_ID'
        side_name = 'Child_' + side + '_Name'
        half = extracted.loc[:, ['Parent_ID', 'Parent_Name', 'Topics', side_id, side_name, 'Distance']]
        halves.append(half.rename(columns={
            'Parent_ID': parent_id,
            'Parent_Name': parent_name,
            side_id: child_id,
            side_name: child_name,
        }))

    child_data = pd.concat(halves).reset_index(drop=True)

    return child_data, parent_id, parent_name

In [164]:
# Peel the hierarchy one level at a time. extract_child() rebinds the global
# `hierarchical_topics` to the rows that are left, so each iteration is handed
# the already-pruned table.
for length in range(len(Future_Topic_length_list)):
    level = Topic_length_list[length]
    level_children, level_parent_id, level_parent_name = extract_child(
        hierarchical_topics, level, Future_Topic_length_list[length]
    )
    # Published under dynamic names (child_data2, parent_id2, parent_name2, ...)
    # because later cells refer to them that way.
    globals()["child_data" + str(level)] = level_children
    globals()["parent_id" + str(level)] = level_parent_id
    globals()["parent_name" + str(level)] = level_parent_name

# The rows that remain sit at the top of the tree. extract_child() stores the
# parent of each level in the "Child_ID<k>" / "Child_Name<k>" columns of the
# next larger topic count k, so the children of the remaining rows belong in
# the columns of the largest count. The loop index `length` used previously
# is not a topic count (and is undefined when the loop body never runs).
root_level = str(max(Topic_length_list))

right_child = hierarchical_topics.loc[:,['Parent_ID','Parent_Name','Topics','Child_Right_ID','Child_Right_Name','Distance']]
left_child = hierarchical_topics.loc[:,['Parent_ID','Parent_Name','Topics','Child_Left_ID','Child_Left_Name','Distance']]

right_child = right_child.rename(columns={
    'Child_Right_Name': "Child_Name" + root_level,
    'Child_Right_ID': "Child_ID" + root_level,
})
left_child = left_child.rename(columns={
    'Child_Left_Name': "Child_Name" + root_level,
    'Child_Left_ID': "Child_ID" + root_level,
})

# Right-hand children first, then left-hand children, with a fresh index.
hierarchical_data = pd.concat([right_child,left_child]).reset_index(drop=True)


In [157]:
child_data2

Unnamed: 0,Child_ID3,Child_Name3,Topics,Child_ID2,Child_Name2,Distance
0,6,위해_우리_여러분_있는_위한,"[0, 1]",0,위해_여러분_위한_우리_통해,0.826611
1,5,우크라이나_러시아_미사일_키이우_드론,"[3, 4]",3,우크라이나_키이우_드론_미사일_러시아,0.692167
2,6,위해_우리_여러분_있는_위한,"[0, 1]",1,모든_있는_그리고_내가_우리,0.826611
3,5,우크라이나_러시아_미사일_키이우_드론,"[3, 4]",4,러시아_국방부는_우크라이나_러시아군_하이마스,0.692167


In [158]:
child_data3

Unnamed: 0,Child_ID5,Child_Name5,Topics,Child_ID3,Child_Name3,Distance
0,7,위해_우리_있는_여러분_위한,"[0, 1, 2]",2,수출_달러_지난해_수출은_대비,1.128212
1,7,위해_우리_있는_여러분_위한,"[0, 1, 2]",6,위해_우리_여러분_있는_위한,1.128212


In [156]:
hierarchical_data

Unnamed: 0,Parent_ID,Parent_Name,Topics,parent_id1,parent_name1,Distance
0,8,위해_우리_있는_우크라이나_여러분,"[0, 1, 2, 3, 4]",7,위해_우리_있는_여러분_위한,1.151412
1,8,위해_우리_있는_우크라이나_여러분,"[0, 1, 2, 3, 4]",5,우크라이나_러시아_미사일_키이우_드론,1.151412


In [None]:
# NOTE(review): this cell has no execution count and reads `parent_data`,
# which is not defined anywhere in the visible notebook — confirm where it
# comes from. `length` is presumably the variable left over from an earlier
# `for length in ...` loop.
#
# For every pair (row a of parent_data, row b of child_data3) whose
# "parent_name<length>" values are equal, hierarchical_data is rebuilt as
# parent_data stacked on child_data3 with row `a` removed. Because the frame
# is rebuilt on each match, only the last match is reflected in the result.
for a in range(len(parent_data[("parent_name" + str(length))])):
    for b in range(len(child_data3[("parent_name" + str(length))])):
        if parent_data[("parent_name" + str(length))][a] == child_data3[("parent_name" + str(length))][b]:
            hierarchical_data = pd.concat([parent_data,child_data3]).reset_index(drop=True)
            hierarchical_data = hierarchical_data.drop(index=a).reset_index(drop=True)
            # Copy the first row's root name / ID onto every row.
            hierarchical_data['Parent_Name'] = hierarchical_data['Parent_Name'][0]
            hierarchical_data['Parent_ID'] = hierarchical_data['Parent_ID'][0]


In [126]:
# Stack child_data2 below the rows collected so far and renumber the index.
# Columns present in only one of the two frames show up empty for the other
# frame's rows in the table displayed by the next cell.
hierarchical_data = pd.concat([hierarchical_data,child_data2]).reset_index(drop=True)
# Copy the first row's root name / ID onto every row.
hierarchical_data['Parent_Name'] = hierarchical_data['Parent_Name'][0]
hierarchical_data['Parent_ID'] = hierarchical_data['Parent_ID'][0]

In [127]:
hierarchical_data

Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_ID5,Child_Name5,Distance,Child_ID3,Child_Name3,Child_ID2,Child_Name2
0,8,위해_우리_있는_우크라이나_여러분,"[0, 1, 2, 3, 4]",5.0,우크라이나_러시아_미사일_키이우_드론,1.151412,,,,
1,8,위해_우리_있는_우크라이나_여러분,"[0, 1, 2]",7.0,위해_우리_있는_여러분_위한,1.128212,2.0,수출_달러_지난해_수출은_대비,,
2,8,위해_우리_있는_우크라이나_여러분,"[0, 1, 2]",7.0,위해_우리_있는_여러분_위한,1.128212,6.0,위해_우리_여러분_있는_위한,,
3,8,위해_우리_있는_우크라이나_여러분,"[0, 1]",,,0.826611,6.0,위해_우리_여러분_있는_위한,0.0,위해_여러분_위한_우리_통해
4,8,위해_우리_있는_우크라이나_여러분,"[3, 4]",,,0.692167,5.0,우크라이나_러시아_미사일_키이우_드론,3.0,우크라이나_키이우_드론_미사일_러시아
5,8,위해_우리_있는_우크라이나_여러분,"[0, 1]",,,0.826611,6.0,위해_우리_여러분_있는_위한,1.0,모든_있는_그리고_내가_우리
6,8,위해_우리_있는_우크라이나_여러분,"[3, 4]",,,0.692167,5.0,우크라이나_러시아_미사일_키이우_드론,4.0,러시아_국방부는_우크라이나_러시아군_하이마스


In [110]:
# NOTE(review): `parent_data`, `parent_name` and `parent_id` are never assigned
# at cell level in the visible notebook, so this cell relies on leftover
# kernel state — confirm where they come from before trusting a fresh run.

# Stack parent_data on child_data3 whenever a row of each shares the same
# value in the `parent_name` column, dropping the matching parent row `a`.
# The frame is rebuilt on every match, so only the last match is kept.
for a in range(len(parent_data[parent_name])):
    for b in range(len(child_data3[parent_name])):
        if parent_data[parent_name][a] == child_data3[parent_name][b]:
            hierarchical_data = pd.concat([parent_data,child_data3]).reset_index(drop=True)
            hierarchical_data = hierarchical_data.drop(index=a).reset_index(drop=True)
            hierarchical_data['Parent_Name'] = hierarchical_data['Parent_Name'][0]
            hierarchical_data['Parent_ID'] = hierarchical_data['Parent_ID'][0]

# Append the child_data2 rows and copy the root name / ID onto every row.
hierarchical_data = pd.concat([hierarchical_data,child_data2]).reset_index(drop=True)
hierarchical_data['Parent_Name'] = hierarchical_data['Parent_Name'][0]
hierarchical_data['Parent_ID'] = hierarchical_data['Parent_ID'][0]

# Child_ID3 values of the rows whose node covers exactly three topics.
# Columns are addressed by name ('Topics', 'Child_ID3') instead of by
# position (a[2], a[6]): positional lookups on a label-indexed row are
# deprecated in pandas and break as soon as the column order changes.
is_three_topic_row = hierarchical_data['Topics'].map(len) == 3
min_list = hierarchical_data.loc[is_three_topic_row, 'Child_ID3'].tolist()
print(min_list)
index_list = hierarchical_data[hierarchical_data['Child_ID3'].isin(min_list)].index

# Values of the first selected row, copied onto every selected row below.
# (`data_id` replaces a variable that was called `id` and shadowed the builtin.)
data = hierarchical_data.loc[index_list[0], parent_name]
data_id = hierarchical_data.loc[index_list[0], parent_id]
print(index_list)

# A single .loc assignment per column instead of chained indexing
# (df[col][i] = value), which may write to a temporary copy.
hierarchical_data.loc[index_list, parent_name] = data
hierarchical_data.loc[index_list, parent_id] = data_id

# 'Topics' entries of the selected rows that cover exactly three topics.
Topic_list = [
    topics
    for topics in hierarchical_data.loc[index_list, 'Topics']
    if len(topics) == 3
]

#hierarchical_data = hierarchical_data.drop(hierarchical_data[hierarchical_data['Topics'].isin(Topic_list)].index).reset_index(drop=True)

['2', '6']
Int64Index([1, 2, 3, 5], dtype='int64')


In [None]:
# NOTE(review): this cell was never executed. After `.T` the columns are the
# former row labels, so 'Topics' is presumably no longer a column and this
# call is likely to raise KeyError — confirm what was meant to be checked
# (duplicate 'Topics' rows?).
hierarchical_data.T.duplicated(['Topics'])

In [112]:
hierarchical_data

Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_ID5,Child_Name5,Distance,Child_ID3,Child_Name3,Child_ID2,Child_Name2
0,8,위해_우리_있는_우크라이나_여러분,"[0, 1, 2, 3, 4]",5.0,우크라이나_러시아_미사일_키이우_드론,1.151412,,,,
1,8,위해_우리_있는_우크라이나_여러분,"[0, 1, 2]",7.0,위해_우리_있는_여러분_위한,1.128212,2.0,수출_달러_지난해_수출은_대비,,
2,8,위해_우리_있는_우크라이나_여러분,"[0, 1, 2]",7.0,위해_우리_있는_여러분_위한,1.128212,6.0,위해_우리_여러분_있는_위한,,
3,8,위해_우리_있는_우크라이나_여러분,"[0, 1]",7.0,위해_우리_있는_여러분_위한,0.826611,6.0,위해_우리_여러분_있는_위한,0.0,위해_여러분_위한_우리_통해
4,8,위해_우리_있는_우크라이나_여러분,"[3, 4]",,,0.692167,5.0,우크라이나_러시아_미사일_키이우_드론,3.0,우크라이나_키이우_드론_미사일_러시아
5,8,위해_우리_있는_우크라이나_여러분,"[0, 1]",7.0,위해_우리_있는_여러분_위한,0.826611,6.0,위해_우리_여러분_있는_위한,1.0,모든_있는_그리고_내가_우리
6,8,위해_우리_있는_우크라이나_여러분,"[3, 4]",,,0.692167,5.0,우크라이나_러시아_미사일_키이우_드론,4.0,러시아_국방부는_우크라이나_러시아군_하이마스


In [None]:
# NOTE(review): this cell has no execution count and is a near-duplicate of
# the cell that follows it (same steps, different column / frame names).
# `parent_data`, `parent_name` are not assigned at cell level in the visible
# notebook, and no visible table has a 'Child_ID4' column next to
# child_data3 / child_data2 — likely a stale draft; confirm before reuse.

# Stack parent_data on child_data3 whenever a row of each shares the same
# value in the `parent_name` column, dropping the matching parent row `a`.
# parent_data2 is rebuilt on every match, so only the last match is kept.
for a in range(len(parent_data[parent_name])):
    for b in range(len(child_data3[parent_name])):
        if parent_data[parent_name][a] == child_data3[parent_name][b]:
            parent_data2 = pd.concat([parent_data,child_data3]).reset_index(drop=True)
            parent_data2 = parent_data2.drop(index=a).reset_index(drop=True)
            parent_data2['Parent_Name'] = parent_data2['Parent_Name'][0]
            parent_data2['Parent_ID'] = parent_data2['Parent_ID'][0]
            
# Append the child_data2 rows and copy the root name / ID onto every row.
parent_data2 = pd.concat([parent_data2,child_data2]).reset_index(drop=True)
parent_data2['Parent_Name'] = parent_data2['Parent_Name'][0]
parent_data2['Parent_ID'] = parent_data2['Parent_ID'][0]

# Collect the value at column position 6 for every row whose entry at column
# position 2 has length 4.
# NOTE(review): a[2] / a[6] are positional lookups on a label-indexed row;
# presumably 'Topics' and 'Child_ID4' — confirm and address them by name.
min_list = []

for i in range(len(parent_data2)):
    a = parent_data2.loc[i]
    if len(a[2]) == 4:
        min_list.append(a[6])
        
# Row labels whose 'Child_ID4' is one of the collected values.
index_list = parent_data2[parent_data2['Child_ID4'].isin(min_list)].index

# Values of the first selected row, copied onto every selected row below.
# NOTE(review): `id` shadows the Python builtin of the same name.
data = parent_data2['Child_Name5'][index_list[0]]
id =  parent_data2['Child_ID5'][index_list[0]]
Topic_list = []

# NOTE(review): df[col][i] = value is chained assignment and may write to a
# temporary copy; `.loc[i, col] = value` is the reliable form.
for i in index_list:
    parent_data2['Child_Name5'][i] = data
    parent_data2['Child_ID5'][i] = id
    if len(parent_data2['Topics'][i]) == 4:
        Topic_list.append(parent_data2['Topics'][i])

# Drop every row whose 'Topics' entry is one of the collected four-topic lists.
parent_data2 = parent_data2.drop(parent_data2[parent_data2['Topics'].isin(Topic_list)].index).reset_index(drop=True)

In [266]:
# NOTE(review): `parent_data`, `child_data_2` and `child_data` are not defined
# anywhere in the visible notebook, so this cell relies on leftover kernel
# state — confirm where they come from before trusting a fresh run.

# Stack parent_data on child_data_2 whenever a row of each shares the same
# 'Child_Name5' value, dropping the matching parent row `a`. parent_data2 is
# rebuilt on every match, so only the last match is kept.
for a in range(len(parent_data['Child_Name5'])):
    for b in range(len(child_data_2['Child_Name5'])):
        if parent_data['Child_Name5'][a] == child_data_2['Child_Name5'][b]:
            parent_data2 = pd.concat([parent_data,child_data_2]).reset_index(drop=True)
            parent_data2 = parent_data2.drop(index=a).reset_index(drop=True)
            parent_data2['Parent_Name'] = parent_data2['Parent_Name'][0]
            parent_data2['Parent_ID'] = parent_data2['Parent_ID'][0]

# Append the child_data rows and copy the root name / ID onto every row.
parent_data2 = pd.concat([parent_data2,child_data]).reset_index(drop=True)
parent_data2['Parent_Name'] = parent_data2['Parent_Name'][0]
parent_data2['Parent_ID'] = parent_data2['Parent_ID'][0]

# Child_ID4 values of the rows whose node covers exactly four topics.
# Columns are addressed by name ('Topics', 'Child_ID4') instead of by
# position (a[2], a[6]): positional lookups on a label-indexed row are
# deprecated in pandas and break as soon as the column order changes.
is_four_topic_row = parent_data2['Topics'].map(len) == 4
min_list = parent_data2.loc[is_four_topic_row, 'Child_ID4'].tolist()

index_list = parent_data2[parent_data2['Child_ID4'].isin(min_list)].index

# Values of the first selected row, copied onto every selected row below.
# (`data_id` replaces a variable that was called `id` and shadowed the builtin.)
data = parent_data2.loc[index_list[0], 'Child_Name5']
data_id = parent_data2.loc[index_list[0], 'Child_ID5']

# A single .loc assignment per column instead of chained indexing
# (df[col][i] = value), which may write to a temporary copy.
parent_data2.loc[index_list, 'Child_Name5'] = data
parent_data2.loc[index_list, 'Child_ID5'] = data_id

# 'Topics' entries of the selected rows that cover exactly four topics.
Topic_list = [
    topics
    for topics in parent_data2.loc[index_list, 'Topics']
    if len(topics) == 4
]

# Remove the four-topic rows now that their parent values have been copied
# onto their children. Every four-topic row is among the selected rows (its
# own Child_ID4 is in min_list), so filtering on the length is the same
# selection as matching against Topic_list, and it does not depend on isin()
# handling list-valued cells.
parent_data2 = parent_data2[~is_four_topic_row].reset_index(drop=True)

In [273]:
parent_data2

Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_ID5,Child_Name5,Distance,Child_ID4,Child_Name4,Child_ID,Child_Name
0,8,우크라이나_위해_우리_있는_여러분,"[0, 1, 2, 3, 4]",0,위해_여러분_우리_위한_시민,0.940946,,,,
1,8,우크라이나_위해_우리_있는_여러분,"[2, 3]",7,우크라이나_러시아_있다_미사일_키이우,0.865585,6.0,모든_있는_내가_하나님의_그리고,2.0,우크라이나_러시아_전쟁_러뽕_그냥
2,8,우크라이나_위해_우리_있는_여러분,"[1, 4]",7,우크라이나_러시아_있다_미사일_키이우,0.578441,5.0,우크라이나_러시아_미사일_있다_키이우,4.0,러시아_우크라이나_러시아군_국방부는_하이마스
3,8,우크라이나_위해_우리_있는_여러분,"[2, 3]",7,우크라이나_러시아_있다_미사일_키이우,0.865585,6.0,모든_있는_내가_하나님의_그리고,3.0,모든_내가_있는_하나님의_그리고
4,8,우크라이나_위해_우리_있는_여러분,"[1, 4]",7,우크라이나_러시아_있다_미사일_키이우,0.578441,5.0,우크라이나_러시아_미사일_있다_키이우,1.0,우크라이나_미사일_키이우_러시아_벨라루스


In [276]:
# NOTE(review): this import sits in the middle of the notebook; the other
# imports are in the first cell.
import plotly.express as px

# NOTE(review): `df` is assigned here but not used by the treemap below.
df=px.data.tips()

# Treemap of parent_data2, nested as
# "topic" -> Parent_Name -> Child_Name5 -> Child_Name4 -> Child_Name,
# with 'Distance' used both for the tile values and for the colour scale.
fig = px.treemap(parent_data2, path=[px.Constant("topic"), 'Parent_Name','Child_Name5','Child_Name4','Child_Name'], values='Distance',
                color='Distance',color_continuous_scale='RdBu')
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

In [181]:
# NOTE(review): loads the rows with year == 2007 from px.data.gapminder()
# into `df`; nothing in the visible topic analysis reads it — likely leftover
# scratch work that can be removed.
df = px.data.gapminder().query("year == 2007")

In [182]:
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
11,Afghanistan,Asia,2007,43.828,31889923,974.580338,AFG,4
23,Albania,Europe,2007,76.423,3600523,5937.029526,ALB,8
35,Algeria,Africa,2007,72.301,33333216,6223.367465,DZA,12
47,Angola,Africa,2007,42.731,12420476,4797.231267,AGO,24
59,Argentina,Americas,2007,75.320,40301927,12779.379640,ARG,32
...,...,...,...,...,...,...,...,...
1655,Vietnam,Asia,2007,74.249,85262356,2441.576404,VNM,704
1667,West Bank and Gaza,Asia,2007,73.422,4018332,3025.349798,PSE,275
1679,"Yemen, Rep.",Asia,2007,62.698,22211743,2280.769906,YEM,887
1691,Zambia,Africa,2007,42.384,11746035,1271.211593,ZMB,894
