In [1]:

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
import json
import re

tqdm.pandas()

In [31]:
# Directory holding the project's CSV inputs. Kept in one constant so running the
# notebook on another machine means editing a single line, not every path.
from pathlib import Path

DATA_DIR = Path('/Users/magz/Documents/workspace/asia3012')

# Article table; its column names are listed in the next cell's output.
original_df = pd.read_csv(DATA_DIR / 'dim-sim-newspaper-full-text-16jan-cleaned.csv')
# Per-document topic labels, indexed by the first CSV column. Later cells read
# its 'Topic' and 'Document' columns.
doc_label_df = pd.read_csv(DATA_DIR / 'doc-topic-label-31jan.csv', index_col=0)



In [20]:
# Inspect the column names available in the article table.
original_df.columns

Index(['Unnamed: 0', 'id', 'workAnnotationLabel', 'annotatedId', 'title',
       'formatDisplay', 'format', 'mainFormat', 'numFormats', 'itemMedia',
       'itemFormats', 'topItemFormats', 'primaryFormat', 'description',
       'yearRange', 'snippets', 'author', 'authors', 'workType',
       'thumbnailUrl', 'thumbnailIsCulturallySensitive', 'numItems',
       'numHoldings', 'onlineUrl', 'onlineText', 'languages', 'austLanguages',
       'firstArticlePublicationName', 'culturallySensitive',
       'firstAustraliansInd', 'correctableText', 'newspaper', 'abstrct',
       'date', 'page', 'numberOfCorrectors', 'correctedByMe', 'wordCount',
       'articleType', 'illustrated', 'cleanedAbstract', 'cleanedSnip',
       'cleanedTitle', 'year', 'text', 'cleanedText'],
      dtype='object')

In [32]:
# Article metadata columns copied from original_df onto the topic-label table.
ARTICLE_META_COLUMNS = ['id', 'title', 'abstrct', 'date', 'authors', 'year',
                        'articleType', 'newspaper', 'thumbnailUrl']

# Drop any copies left behind by a previous run of this cell before concatenating,
# so re-running the cell does not append a second set of identically named columns.
# On a fresh doc_label_df the drop is a no-op.
# concat(axis=1) lines the two frames up by index label.
# NOTE(review): assumes doc_label_df's index (first CSV column) matches
# original_df's default row index - confirm, otherwise rows are misaligned.
doc_label_df = pd.concat(
    [
        doc_label_df.drop(columns=ARTICLE_META_COLUMNS, errors='ignore'),
        original_df[ARTICLE_META_COLUMNS],
    ],
    axis=1,
)


In [6]:
from bertopic import BERTopic
from umap import UMAP
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
import ast
from scipy.cluster import hierarchy as sch

In [11]:
# Load the table that carries the text plus its pre-computed embedding per row.
bertopic_df = pd.read_csv('/Users/magz/Documents/workspace/asia3012/embedded_data_jan_20.csv',index_col=0)


def _parse_embedding(serialized):
    """Convert one CSV cell holding a Python-literal sequence into a float32 array."""
    return np.array(ast.literal_eval(serialized), dtype=np.float32)


bertopic_df['embedding'] = bertopic_df['embedding'].apply(_parse_embedding)
# Gather the per-row vectors into a single float32 array to hand to BERTopic.
embedding = np.array(bertopic_df['embedding'].to_list(), dtype=np.float32)

In [12]:
# Step 2 - Reduce the dimensionality of the embeddings loaded above to 5
# components using cosine distance; random_state is fixed for repeatable runs.
umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=0.0, metric='cosine', random_state=50)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=7, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Step 4 - Tokenize: drop English stop words; a token is a run of 3+ ASCII letters/digits
vectorizer_model = CountVectorizer(stop_words="english", token_pattern='[a-zA-Z0-9]{3,}')
# Step 5 - Class-based TF-IDF weighting of topic words
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
# Step 6 - Part-of-speech based topic representation
# NOTE(review): "en_core_web_sm" is presumably the spaCy pipeline name - confirm it is installed
pos_model = PartOfSpeech("en_core_web_sm")

# Assemble the pipeline: unigram topic words, 15 words kept per topic, and
# nr_topics='auto' so topics are reduced after clustering (see the log below).
topic_model = BERTopic(n_gram_range=(1, 1), verbose=True, nr_topics='auto',
                       top_n_words=15,
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       vectorizer_model=vectorizer_model,
                       ctfidf_model=ctfidf_model,
                       representation_model = pos_model,
                       )

# Fit on the cleaned text, supplying the pre-computed embeddings for the same rows.
topics, probs = topic_model.fit_transform(bertopic_df['cleanedTextBert'], embeddings=embedding)
print(topic_model.get_topic_info())

2024-05-13 00:01:04,503 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-13 00:01:09,970 - BERTopic - Dimensionality - Completed ✓
2024-05-13 00:01:09,971 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-13 00:01:09,991 - BERTopic - Cluster - Completed ✓
2024-05-13 00:01:09,992 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-13 00:02:12,210 - BERTopic - Representation - Completed ✓
2024-05-13 00:02:12,212 - BERTopic - Topic reduction - Reducing number of topics
2024-05-13 00:02:54,400 - BERTopic - Topic reduction - Reduced number of topics from 34 to 19


    Topic  Count                                   Name  \
0      -1    204               -1_ctns_time_bgs_betting   
1       0    159          0_yesterday_town_hall_members   
2       1     87                 1_start_2nd_second_ond   
3       2     82      2_restaurant_dishes_sauce_chicken   
4       3     33                 3_water_salt_oil_sauce   
5       4     22                 4_bitch_dog_puppy_best   
6       5     21  5_theatre_troupe_delightful_programme   
7       6     17             6_fat_foods_alcohol_energy   
8       7     16     7_trinder_cocktails_farewell_party   
9       8     13          8_frozen_foods_market_porkers   
10      9     13          9_middle_crushed_tahina_lemon   
11     10     13            10_fish_takeaway_chips_shop   
12     11     13           11_know_answers_snake_answer   
13     12     13          12_music_news_session_weather   
14     13     11    13_war_communists_communism_foreign   
15     14     11                    14_dim_sims_sim_cab 

In [13]:
# Compute the topic hierarchy from the fitted model, using the same documents
# that were passed to fit_transform.
hierarchical_topics = topic_model.hierarchical_topics(bertopic_df['cleanedTextBert'])
print(hierarchical_topics)

100%|██████████| 17/17 [01:17<00:00,  4.54s/it]

   Parent_ID                        Parent_Name  \
16        34   yesterday_town_hall_members_year   
15        33   yesterday_town_hall_members_year   
14        32    yesterday_hall_members_year_day   
13        31    yesterday_hall_members_year_day   
12        30   yesterday_town_hall_members_year   
11        29    yesterday_hall_members_year_day   
10        28         town_members_year_day_work   
9         27   yesterday_town_hall_members_year   
8         26              year_day_work_men_dim   
7         25  yesterday_town_hall_members_stall   
6         24         town_members_year_day_work   
5         23          members_year_day_work_men   
4         22    yesterday_town_members_year_day   
3         21            town_hall_year_day_work   
2         20  yesterday_town_hall_members_stall   
1         19          members_year_day_work_dim   
0         18           town_stall_year_day_work   

                                               Topics Child_Left_ID  \
16  [0, 1,




In [14]:
# Plot the topic hierarchy computed in the previous cell.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [33]:
# Topic id -> (top-level group, sub-category). This is the single source of truth
# for both levels, so a topic cannot be labelled 'Food' at the first level while
# missing from the second. Topic ids absent from this table fall back to
# ('Non-Food', 'Other').
_TOPIC_CATEGORY_PATHS = {
    # food
    3: ('Food', 'Recipe'),
    2: ('Food', 'Restaurant Review'),
    6: ('Food', 'Health & Diet'),
    8: ('Food', 'Food Product Trade'),
    9: ('Food', 'Middle Eastern Food'),
    10: ('Food', 'Fish & Chips'),
    17: ('Food', 'Food Guide'),
    # non-food
    0: ('Non-Food', 'Community Events'),
    16: ('Non-Food', 'Community Events'),
    1: ('Non-Food', 'Horse Race'),
    4: ('Non-Food', 'Dog Show'),
    5: ('Non-Food', 'Theatre & Entertainment'),
    7: ('Non-Food', 'Social Events'),
    15: ('Non-Food', 'Social Events'),
    11: ('Non-Food', 'Quiz'),
    12: ('Non-Food', 'Programme Schedule'),
    13: ('Non-Food', 'Government & Politics'),
    14: ('Non-Food', 'Local News'),
}


def create_list_from_topics(row):
    """Map a row's topic id to its two-level category path.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'Topic'`` (a DataFrame row or a dict).

    Returns
    -------
    list
        ``[group, sub_category]``. Topic ids with no entry in the table
        (for example ``-1`` or a missing value) give ``['Non-Food', 'Other']``.
    """
    group, sub_category = _TOPIC_CATEGORY_PATHS.get(row['Topic'], ('Non-Food', 'Other'))
    # Build a fresh list on every call: callers extend the result per row, so a
    # shared object must never be handed out.
    return [group, sub_category]

In [34]:
def create_cat_corpus_knowledge(row):
    """Append a third, keyword-derived category level for topics 16 and 0.

    For topic 16 the document text is searched for words starting with
    "mason", then "youngster". For topic 0 it is searched for "war nurse" or
    "dim sim do". All searches are case-insensitive. When no keyword matches,
    'Other' is appended. Rows of any other topic are returned without an
    extra level, and their 'Document' value is not read.

    Parameters
    ----------
    row : mapping-like
        Must provide ``'Topic'`` and ``'categories'`` (a list). For topics 0
        and 16 it must also provide ``'Document'`` as a string; ``re.search``
        raises ``TypeError`` for non-string text.

    Returns
    -------
    list
        A new list holding the extended category path. The list stored in
        ``row['categories']`` is left untouched.
    """
    # Work on a copy. The previous `row['categories'] += [...]` extended the very
    # list object stored in the caller's row, so the input was modified as a
    # side effect of computing the return value.
    categories = list(row['categories'])
    topic = row['Topic']

    if topic == 16:
        document = row['Document']
        if re.search(r'\bmason\w*', document, re.IGNORECASE):
            categories.append('Funeral for Chinese Mason')
        elif re.search(r'\byoungster\w*', document, re.IGNORECASE):
            categories.append("McKenzie's Chinese Dinner for Youngsters")
        else:
            categories.append('Other')
    elif topic == 0:
        document = row['Document']
        mentions_fundraiser = (
            re.search(r'\b\w*war nurse\w*', document, re.IGNORECASE)
            or re.search(r'\b\w*dim sim do\w*', document, re.IGNORECASE)
        )
        if mentions_fundraiser:
            categories.append('"Dim Sim Do" Fundraising for War Nurses')
        else:
            categories.append('Other')

    return categories

In [35]:
# First pass: derive the [group, sub-category] path from each row's topic id.
doc_label_df['categories'] = doc_label_df.apply(create_list_from_topics, axis=1)
# Second pass: add a keyword-based third level for topics 0 and 16. It must run
# after the first pass because it reads the 'categories' column created above.
doc_label_df['categories'] = doc_label_df.apply(create_cat_corpus_knowledge, axis=1)

In [36]:
# opt1 = ['Non-Food', 'Theatre & Entertainment', 'Other']
# id = 100
# doc_label_df.at[id, 'categories'] = opt1
# print(doc_label_df.iloc[[id]])

In [37]:
# Lists are unhashable, so freeze each category path as a tuple before counting.
doc_label_df['category_path'] = doc_label_df['categories'].apply(tuple)

# Tally the documents per distinct path and expose the result as a two-column
# frame: 'categories' (the path tuple) and 'count'.
path_counts = (
    doc_label_df['category_path']
    .value_counts()
    .reset_index()
    .set_axis(['categories', 'count'], axis=1)
)
print(path_counts)

                                           categories  count
0                                   (Non-Food, Other)    204
1                 (Non-Food, Community Events, Other)    139
2                              (Non-Food, Horse Race)     87
3                           (Food, Restaurant Review)     82
4                                      (Food, Recipe)     33
5                           (Non-Food, Social Events)     26
6                                (Non-Food, Dog Show)     22
7   (Non-Food, Community Events, "Dim Sim Do" Fund...     22
8                 (Non-Food, Theatre & Entertainment)     21
9                               (Food, Health & Diet)     17
10                        (Food, Middle Eastern Food)     13
11                               (Food, Fish & Chips)     13
12                         (Food, Food Product Trade)     13
13                                   (Non-Food, Quiz)     13
14                     (Non-Food, Programme Schedule)     13
15                  (Non

In [38]:
def parse_categories(category_counts):
    """Fold counted category paths into a nested dictionary.

    Parameters
    ----------
    category_counts : DataFrame
        Needs a 'categories' column of path sequences and a 'count' column.

    Returns
    -------
    dict
        One nested dict per path component. A node created as the final
        component of a path is ``{'_count': count}``.

    NOTE(review): a count is stored only when the final component is created.
    If that node already exists (one path is a prefix of another, or a path
    repeats), the later count is not recorded - confirm inputs never do this.
    """
    tree = {}
    paths_with_counts = zip(category_counts['categories'], category_counts['count'])
    for path, count in paths_with_counts:
        node = tree
        last_position = len(path) - 1
        for position, part in enumerate(path):
            if part not in node:
                # Only the final component carries the document count.
                node[part] = {'_count': count} if position == last_position else {}
            node = node[part]
    return tree

# Function to convert nested dictionary to the required JSON structure
def build_json_structure(tree, color_index=0, colors=["#fbc987", "#fada85","#fab285",]):
    """Convert the nested category dictionary into a list of chart nodes.

    Parameters
    ----------
    tree : dict
        Nested dictionary as produced by ``parse_categories``. A sub-dict
        containing the key ``'_count'`` is treated as a leaf.
    color_index : int
        Depth of the current level; selects the colour, wrapping around
        ``colors``.
    colors : sequence of str
        One colour per depth level. It is only read, never modified, so the
        list default is safe to share between calls.

    Returns
    -------
    list of dict
        Each node has ``name`` and ``itemStyle``, plus ``value`` for a leaf
        or ``children`` for an inner node.
    """
    result = []
    for name, sub_tree in tree.items():
        item = {
            "name": name,
            "itemStyle": {"color": colors[color_index % len(colors)]},
        }
        if '_count' in sub_tree:
            # Leaf node: carry the document count.
            item["value"] = sub_tree['_count']
        else:
            # Inner node: recurse with the same palette. The recursive call used
            # to omit `colors`, so a caller-supplied palette applied to the top
            # level only and deeper levels fell back to the default colours.
            item["children"] = build_json_structure(sub_tree, color_index + 1, colors)
        result.append(item)
    return result

# Parse the category paths into a hierarchical dictionary
# NOTE(review): the SyntaxError recorded as this cell's output does not appear to
# match the code shown here (its brackets are balanced); re-run to refresh it.
category_tree = parse_categories(path_counts)

# Convert the hierarchical dictionary to JSON structure
json_structure = build_json_structure(category_tree)
json_output = json.dumps(json_structure, indent=4)

# Print and save the output
# NOTE(review): a later cell writes a different structure to this same file name,
# so whichever cell runs last determines the file's content.
print(json_output)
with open('structured_web_data_input.json', 'w') as f:
    f.write(json_output)

SyntaxError: '[' was never closed (1048984495.py, line 14)

In [43]:
#doc_label_df.set_index('id', inplace=True)

# Columns written out for each article.
export_columns = ['categories','title','abstrct','date','authors','year', 'articleType', 'newspaper','thumbnailUrl']

# orient='index' produces one JSON object per row, keyed by the row's index label.
data_json = doc_label_df[export_columns].to_json(orient='index')

# Decode and re-encode through the json module before writing to disk.
keyed_articles = json.loads(data_json)
with open('keyed_article_data.json', 'w') as file:
    json.dump(keyed_articles, file)

In [47]:
# Turn the current index into an ordinary column.
# NOTE(review): this modifies doc_label_df in place, so each re-run of the cell
# inserts another index column - confirm whether the reset is still needed now
# that the set_index('id') line in the previous cell is commented out.
doc_label_df.reset_index(inplace=True)

# Flatten each category path into one comma-separated string so it can serve as
# a group key; values that are not lists are passed through unchanged.
doc_label_df['categories_string'] = doc_label_df['categories'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# Collect the 'id' values that share each category string.
grouped = doc_label_df.groupby('categories_string')['id'].apply(list)

# Convert the grouped data to a JSON string
categories_json = grouped.to_json()
print(categories_json)

{"Food, Fish & Chips":[254798369,254800021,201151665,120918161,190893279,267029509,267178808,196615330,196436190,196679969,249097984,196859078,140063550],"Food, Food Guide":[230403895,125630906,118106641,136294370,101967300,101982907,120857042,249094255],"Food, Food Product Trade":[217935714,74062147,267752217,60677801,142742249,137930515,267433093,72535102,131766056,51776971,106990537,110758876,267168944],"Food, Health & Diet":[231112898,69171835,225978325,71653984,44560491,263210773,255521652,140143302,254380359,255437456,259493441,267388332,267224038,230222376,259369483,267336881,267390139],"Food, Middle Eastern Food":[262706253,262518397,262528101,262743342,261327215,261329485,261453178,261456607,262131521,261658830,261559476,261460733,261547975],"Food, Recipe":[223810567,12238963,255428942,223896004,206136719,12450548,12117738,12145996,22555258,190783899,69106341,63052990,171265236,224930968,167976895,201355468,260547216,27188316,100183360,131252677,46448354,224553594,266345861,71

In [11]:
# Decode the serialized article table into Python objects.
# The guard keeps this cell re-runnable: the name is rebound from text to the
# decoded object, and calling json.loads on the decoded object raises TypeError.
if isinstance(data_json, (str, bytes, bytearray)):
    data_json = json.loads(data_json)

In [12]:
def parse_categories(categories):
    """Build a nested dictionary of category names from article records.

    NOTE: this redefinition replaces the count-based ``parse_categories``
    defined earlier in the notebook; the tree built here carries no counts.

    Parameters
    ----------
    categories : dict or iterable of records
        Either an iterable of records, or a dict whose values are the records
        (the shape produced by decoding ``to_json(orient='index')``). Each
        record must be indexable by ``'categories'`` and yield the path as a
        sequence of names.

    Returns
    -------
    dict
        ``{name: {child_name: {...}}}``; leaves are empty dicts.
    """
    # Iterating a dict yields its keys (strings), which made
    # `path['categories']` raise TypeError. The records are the values.
    records = categories.values() if isinstance(categories, dict) else categories

    tree = {}
    for record in records:
        current_level = tree
        for part in record['categories']:
            # Create the child level on first sight, then descend into it.
            current_level = current_level.setdefault(part, {})
    return tree

# Function to convert nested dictionary to the required JSON structure
def build_json_structure(tree, color_index=0, colors=["#3498db", "#e74c3c", "#2ecc71", "#f1c40f", "#9b59b6", "#34495e", "#1abc9c"]):
    """Turn the nested category dictionary into a list of ``name``/``children`` nodes.

    Every node gets a ``children`` list, and a node without children also gets
    ``"value": 1``. ``color_index`` is incremented on each recursion level, but
    no colour is written to the output; ``colors`` is not read.
    """
    nodes = []
    for label, subtree in tree.items():
        # An empty (falsy) subtree is a leaf and is not recursed into.
        if subtree:
            children = build_json_structure(subtree, color_index + 1)
        else:
            children = []

        node = {"name": label, "children": children}
        if not children:
            # Leaves carry a fixed weight of 1.
            node["value"] = 1
        nodes.append(node)
    return nodes

# Parse the decoded article records into a hierarchical dictionary
category_tree = parse_categories(data_json)

# Convert the hierarchical dictionary to JSON structure
json_structure = build_json_structure(category_tree)
json_output = json.dumps(json_structure, indent=4)

# Print the output, then save it.
# NOTE(review): this overwrites the 'structured_web_data_input.json' written by
# the earlier count-based cell - confirm which version the consumer expects.
print(json_output)
with open('structured_web_data_input.json', 'w') as f:
    f.write(json_output)

[
    {
        "name": "Non-Food",
        "children": [
            {
                "name": "Community Events",
                "children": [
                    {
                        "name": "Other",
                        "children": [],
                        "value": 1
                    },
                    {
                        "name": "Funeral for Chinese Mason",
                        "children": [],
                        "value": 1
                    },
                    {
                        "name": "McKenzie's Chinese Dinner for Youngsters",
                        "children": [],
                        "value": 1
                    },
                    {
                        "name": "\"Dim Sim Do\" Fundraising for War Nurses",
                        "children": [],
                        "value": 1
                    }
                ]
            },
            {
                "name": "Theatre & Entertainment",
                "child