In [44]:
# This notebook builds the tweet_chain model: every nested sub-tweet can be reached from its root tweet.
import pandas as pd
import bson

# Load the raw BSON dump: read the whole file into memory, then decode it.
bson_path = '../../data/raw/v_forest.bson'
with open(bson_path, mode='rb') as bson_file:
    raw_bytes = bson_file.read()
data = bson.decode_all(raw_bytes)

# Build the working DataFrame from the decoded records.
df = pd.DataFrame(data)

In [45]:
# Report whether the 'parent' column holds any non-missing value at all.
parent_all_missing = df['parent'].isna().all()
print(
    "The 'parent' column is entirely empty (all values are NaN)."
    if parent_all_missing
    else "The 'parent' column contains some non-empty values (not all values are NaN)."
)

The 'parent' column contains some non-empty values (not all values are NaN).


In [46]:
# Display the column labels of the loaded DataFrame.
df.columns

Index(['_id', 'author', 'parent', 'quoted', 'mentions', 'hashtags', 'depth',
       'node_type', 'quote_ancestors', 'tweet_ancestors', 'children',
       'tree_authors', 'tree_hashtags', 'tree_users', 'author_keynode',
       'hashtag_keynode', 'valid', 'created_at', 'created_at_dt',
       'display_text', 'display_text_range', 'extended_entities',
       'favourite_count', 'lang', 'quote_count', 'reply_count',
       'retweet_count', 'urls', 'text', 'child_nodes', 'influence_tweet',
       'influence_user', 'verified', 'deidentmentions', 'deidentscreenname',
       'deidentname', 'timebucket', 'horiz_offset', 'influence_tweet_factor',
       'left', 'vert_offset', 'location', 'vert_correct'],
      dtype='object')

In [47]:
# Restrict the dataset to rows whose 'lang' value is exactly 'en'.
is_english = df['lang'].eq('en')
df = df.loc[is_english]
print(df.shape)

(400174, 43)


In [48]:
# Inspect the time span covered by the data before filtering.
print(df['created_at_dt'].min())
# min = 2009-09-05 23:06:34
print(df['created_at_dt'].max())
# max = 2020-02-19 18:42:10

# Date window for the analysis; both bounds are inclusive.
# ref: https://www.aph.gov.au/About_Parliament/Parliamentary_Departments/Parliamentary_Library/pubs/rp/rp2122/201920AustralianBushfiresFAQupdate#_ftn1
start_date = pd.Timestamp('2019-04-01')
end_date = pd.Timestamp('2020-6-1')

# True for rows whose timestamp lies within [start_date, end_date].
mask = df['created_at_dt'].between(start_date, end_date)
df = df[mask]
print(df.shape)  # (397726, 43)

2009-09-05 23:06:34
2020-02-19 18:42:10
(397726, 43)


In [49]:
# Keywords used to select bushfire-related tweets.
bushfire_keywords = [
    "bushfiredisaster",
    "australfires",
    "fire",
    "australianfires",
    "bushfires",
    "australiaburns",
    "australianbushfiredisaster",
    "australianbushfires",
    "australfire",
    "australiaburning",
    "koala",
    "animal",
    "australianwildfires",
    "australianbushfire",
    "firefighter",
    "bushfireaustralia",
    "nswfires",
    "vicfires",
    "bushfirecrisis",
    "bushfiresaustralia",
    "bushfirecrisisaustralia",
]

# The keywords are joined into one regex alternation and matched
# case-insensitively anywhere in the text; rows with missing text are
# dropped (na=False). Because matching is by substring, "fire" alone already
# matches every keyword that contains it.
keyword_pattern = '|'.join(bushfire_keywords)
has_keyword = df['text'].str.contains(keyword_pattern, case=False, na=False)
df = df[has_keyword]

#### The filtering steps above are taken from Preprocess+LDA.ipynb

#### The tweet_chain model is built below

In [50]:
import pandas as pd
from collections import defaultdict
# Preprocess the data: map each parent tweet ID to the list of IDs of its
# direct replies. Tweets with a missing parent contribute no entry.
# Only the two needed columns are iterated; DataFrame.iterrows() would build a
# full Series for every row just to read these two fields.
replies_dict = defaultdict(list)
for tweet_id, parent_id in zip(df['_id'], df['parent']):
    if pd.notna(parent_id):
        replies_dict[parent_id].append(tweet_id)

# Helper function to build the nested reply chain below a tweet
def build_chain(tweet_id, replies=None):
    """Return the nested reply tree rooted at ``tweet_id``.

    Parameters
    ----------
    tweet_id :
        ID of the tweet whose replies should be collected.
    replies : mapping, optional
        Maps a tweet ID to a list of the IDs of its direct replies. Defaults
        to the module-level ``replies_dict``.

    Returns
    -------
    dict
        ``{reply_id: {nested_reply_id: {...}, ...}, ...}``; an empty dict when
        the tweet has no replies. Replies appear in the order they are listed
        in ``replies``.

    Notes
    -----
    The tree is built with an explicit stack instead of recursion, so very
    deep reply chains cannot exceed the interpreter's recursion limit.
    Lookups use ``.get`` so that a ``defaultdict`` passed as ``replies`` is
    not filled with an empty list for every leaf tweet visited.
    """
    if replies is None:
        replies = replies_dict

    chain = {}
    # Each stack entry pairs a tweet ID with the dict that receives its replies.
    pending = [(tweet_id, chain)]
    while pending:
        current_id, current_chain = pending.pop()
        for reply_id in replies.get(current_id, ()):
            nested = {}
            current_chain[reply_id] = nested
            pending.append((reply_id, nested))
    return chain

# Build the tweet chain for every root tweet (a tweet whose parent is missing),
# keyed by the root tweet's ID.
# Only the two needed columns are iterated; DataFrame.iterrows() would build a
# full Series for every row just to read these two fields.
# NOTE(review): a reply whose parent was removed by the earlier filters is
# neither a root nor reachable from one, so it never appears in tweet_chain.
# Confirm this is intended.
tweet_chain = {}
for tweet_id, parent_id in zip(df['_id'], df['parent']):
    if pd.isna(parent_id):  # Root tweet
        tweet_chain[tweet_id] = build_chain(tweet_id)

# Save the tweet_chain to a JSON file
import json
# indent=4 pretty-prints the nested structure in the output file.
chain_json_path = '../../data/processed/tweet_chain_optimized.json'
with open(chain_json_path, mode='w') as json_file:
    json.dump(tweet_chain, json_file, indent=4)

print("Tweet chain saved to tweet_chain_optimized.json")


Tweet chain saved to tweet_chain_optimized.json


In [90]:
# The code below only checks the correctness of the model; if you just need the .json file, you can skip the rest of this notebook.

import json

# Read the saved chains back from disk.
with open('../../data/processed/tweet_chain_optimized.json', 'r') as chain_file:
    tweet_chain_data = json.load(chain_file)

# Root tweet to inspect.
root_tweet_id = '1213330419044638722'

if root_tweet_id not in tweet_chain_data:
    print(f"Root Tweet ID {root_tweet_id} not found in the data.")
else:
    # Pretty-print the whole reply tree below this root.
    print(f"Root Tweet ID: {root_tweet_id}")
    root_chain = tweet_chain_data[root_tweet_id]
    print(json.dumps(root_chain, indent=4))


Root Tweet ID: 1213330419044638722
{
    "1213330772741910528": {},
    "1213331011565539329": {
        "1213338975101841408": {},
        "1213789095106764800": {}
    },
    "1213331183309713408": {},
    "1213331186723876865": {},
    "1213331517008539648": {},
    "1213332451008757760": {},
    "1213332516121088001": {},
    "1213332701140246529": {},
    "1213333162236903430": {
        "1213425350425735168": {},
        "1213487631943622656": {},
        "1213732636138426369": {}
    },
    "1213335394093191168": {},
    "1213335569956139013": {},
    "1213335778639527936": {
        "1213348813118722048": {},
        "1213643997148631041": {
            "1213841605368729605": {}
        }
    },
    "1213336001516457984": {
        "1213570587735416832": {}
    },
    "1213337720975577088": {},
    "1213337952362749952": {},
    "1213338081186631680": {},
    "1213338427187290112": {},
    "1213338765067862016": {},
    "1213339188474417152": {},
    "1213339451809624065": {},


In [88]:
def count_nested_replies(chain):
    """Count every reply contained in ``chain`` at any nesting level.

    ``chain`` is a nested dict of the form ``{reply_id: {nested replies}}``.
    Each key, at every level, counts as one reply; an empty dict yields 0.
    """
    total = 0
    # Sub-trees still to be counted.
    pending = [chain]
    while pending:
        subtree = pending.pop()
        total += len(subtree)
        pending.extend(subtree.values())
    return total

# Read the saved chains back from disk.
with open('../../data/processed/tweet_chain_optimized.json', 'r') as chain_file:
    tweet_chain_data = json.load(chain_file)

# Total number of nested replies under each root tweet.
reply_totals = {
    root_id: count_nested_replies(chain)
    for root_id, chain in tweet_chain_data.items()
}

# Keep the first root with the strictly largest total; if no root has any
# reply, the ID stays None and the count stays 0.
max_nested_count = 0
max_nested_id = None
for root_id, nested_count in reply_totals.items():
    if nested_count > max_nested_count:
        max_nested_id, max_nested_count = root_id, nested_count

# Print the result
print(f"The _id with the most nested replies is: {max_nested_id} with {max_nested_count} nested replies.")

The _id with the most nested replies is: 1213330419044638722 with 789 nested replies.


In [89]:
def find_max_depth(chain):
    """Return the number of nesting levels in ``chain``.

    ``chain`` is a nested dict of the form ``{reply_id: {nested replies}}``.
    An empty (falsy) chain has depth 0; a chain whose replies have no replies
    of their own has depth 1, and so on.
    """
    depth = 0
    # Non-empty nodes at the current level; each such level adds one to depth.
    frontier = [chain] if chain else []
    while frontier:
        depth += 1
        frontier = [
            child
            for node in frontier
            for child in node.values()
            if child
        ]
    return depth


import json

# Read the saved chains back from disk.
with open('../../data/processed/tweet_chain_optimized.json', 'r') as chain_file:
    tweet_chain_data = json.load(chain_file)

# Track the first root whose reply tree is strictly deeper than any seen so
# far; if every chain is empty, the ID stays None and the depth stays 0.
max_depth, max_depth_id = 0, None

for root_id in tweet_chain_data:
    depth = find_max_depth(tweet_chain_data[root_id])
    if depth <= max_depth:
        continue
    max_depth, max_depth_id = depth, root_id

# Print the result
print(f"The _id with the most levels of nested replies is: {max_depth_id} with {max_depth} levels of nesting.")


The _id with the most levels of nested replies is: 1213986231589060608 with 42 levels of nesting.
