In [56]:
# 完整提取整个数据（包含主推和回复和转推） 使用它与后表执行合并
import pandas as pd
import bson

# Load the data
with open('../../data/raw/v_forest.bson', 'rb') as f:
    data = bson.decode_all(f.read())
    
# Convert the data to a DataFrame
whole_data = pd.DataFrame(data)

In [57]:
# Only keep the English tweets
whole_data = whole_data[whole_data['lang'] == 'en']


# Zoom into the event date range
start_date = pd.Timestamp('2019-11-01') 
end_date = pd.Timestamp('2020-3-1')

mask = (whole_data['created_at_dt'] >= start_date) & (whole_data['created_at_dt'] <= end_date)
whole_data = whole_data.loc[mask]

In [59]:
whole_data_bushfire_related = whole_data.copy()

In [3]:
# Use bushfire keywords to filter out the bushfire-related tweets
# 暂时禁用
bushfire_keywords = [ 
    "bushfiredisaster", "australfires", "fire", "australianfires", "bushfires",
    "australiaburns", "australianbushfiredisaster", "australianbushfires",
    "australfire", "australiaburning", "koala", "animal", "australianwildfires",
    "australianbushfire", "firefighter", "bushfireaustralia", "nswfires", "vicfires",
    "bushfirecrisis", "bushfiresaustralia", "bushfirecrisisaustralia"
]

# filter out the bushfire-related tweets
whole_data_bushfire_related = whole_data[
    whole_data['text'].str.contains('|'.join(bushfire_keywords), case=False, na=False)
]

print(whole_data_bushfire_related.shape)

(217998, 43)


In [60]:
# Sentiment 赋值
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment['compound']  # return the compound score


whole_data_bushfire_related['sentiment'] = whole_data_bushfire_related['display_text'].apply(get_sentiment)
whole_data_bushfire_related[['display_text', 'sentiment']].head(5)

Unnamed: 0,display_text,sentiment
4689,"Oz gov't works 2 protect coal,argued 2b an exi...",-0.8225
4690,Digital welfare systems also drives up unemplo...,-0.636
4691,And they’ll blame the same ppl who lost their ...,-0.5994
4692,Fishery collapse ‘confirms Silent Spring pesti...,-0.4939
4693,"#GreatExtermination: Insects, Germany\nhttps:/...",0.0


In [5]:
whole_data_bushfire_related.info()

<class 'pandas.core.frame.DataFrame'>
Index: 217998 entries, 4707 to 490653
Data columns (total 44 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   _id                     217998 non-null  object        
 1   author                  217998 non-null  object        
 2   parent                  58692 non-null   object        
 3   quoted                  43434 non-null   object        
 4   mentions                217998 non-null  object        
 5   hashtags                217998 non-null  object        
 6   depth                   217998 non-null  int64         
 7   node_type               217998 non-null  object        
 8   quote_ancestors         217998 non-null  object        
 9   tweet_ancestors         217998 non-null  object        
 10  children                217998 non-null  int64         
 11  tree_authors            217998 non-null  object        
 12  tree_hashtags           217998 n

In [8]:
# save the data
whole_data_bushfire_related.to_csv('../../data/processed/whole_data(parent&child).csv', index=False)

In [29]:
from collections import defaultdict
import json

# whole_data提取出_id和parent列
id_data = whole_data[['_id', 'parent']]


# 确保 `_id` 和 `parent` 列都是字符串
id_data['_id'] = id_data['_id'].astype(str)
id_data['parent'] = id_data['parent'].astype(str)

# 构建回复字典 replies_dict
replies_dict = defaultdict(list)
for _, row in id_data.iterrows():
    parent_id = row['parent']
    if pd.notna(parent_id):
        replies_dict[parent_id].append(row['_id'])

# 找出所有 `parent` 中存在但 `_id` 中不存在的引用
parent_ids = set(id_data['parent'].dropna().unique())
tweet_ids = set(id_data['_id'])
missing_parents = parent_ids - tweet_ids

# 将缺失的 `parent` 引用作为新的父节点
for missing_parent in missing_parents:
    replies_dict[missing_parent] = []

# 构建推文链
def build_chain(tweet_id):
    chain = {}
    # Find replies to the given tweet_id using the pre-built replies_dict
    if tweet_id in replies_dict:
        for reply_id in replies_dict[tweet_id]:
            # Recursively build the chain for each reply
            chain[reply_id] = build_chain(reply_id)
    return chain

# 初始化 tweet_chain
tweet_chain = {}

# 遍历所有推文，构建所有根节点的链条
for _, row in id_data.iterrows():
    tweet_id = row['_id']
    parent_id = row['parent']
    
    # 如果没有 `parent`，或者 `parent` 在 `missing_parents` 中，将其视为根节点
    if pd.isna(parent_id) or parent_id in missing_parents:
        # Build the chain for this root tweet
        tweet_chain[tweet_id] = build_chain(tweet_id)

# 保存到 JSON 文件
with open('../../data/processed/tweet_chain_optimized.json', 'w') as f:
    json.dump(tweet_chain, f, indent=4)

print("Tweet chain saved to tweet_chain_optimized.json")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  id_data['_id'] = id_data['_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  id_data['parent'] = id_data['parent'].astype(str)


Tweet chain saved to tweet_chain_optimized.json


In [30]:
tweet_chain

{'1190059455649480704': {'1190494682913988610': {'1190561730897666048': {'1190958840373600256': {'1191675410347479040': {'1191909951305736192': {'1192430808897740800': {'1192712354040971264': {'1193099694932553728': {'1193656513895616512': {'1194068495585071104': {'1194375966111354881': {'1194737208789352449': {'1195085599985655808': {'1195488417708044288': {'1195494614695854080': {'1195848627006005248': {'1196170617990475776': {'1196735929932664833': {'1197112423309664257': {'1197438229437272064': {'1198207445769674754': {'1198927277150461953': {'1199266929912905728': {'1199459367860391936': {'1199824528672559104': {'1200194821148700673': {'1200373591306452992': {'1201264347692953600': {'1201642603642413056': {'1202173831311323137': {'1202778425913569280': {'1203155805937102848': {'1203636486517579776': {'1203997521296556037': {'1204354594735656960': {'1204721935315173376': {'1205085458583060481': {'1205463772468371456': {'1205808348928413696': {'1206162580508512257': {'12067230196592

In [40]:
df = pd.read_csv('../../data/processed/final_data.csv')
df.head()  

Unnamed: 0,_id,author,parent,quoted,mentions,hashtags,depth,node_type,quote_ancestors,tweet_ancestors,...,horiz_offset,influence_tweet_factor,left,vert_offset,location,vert_correct,cleaned_text,dominant_topic,sentiment,weighted_sentiment
0,1134575763334680576,108577207,,1.134424e+18,"['108577207', '21475927', '254515782', '341163...","['ausvotes2019', 'auspol']",0,Root,[],[],...,0.5,0.477121,False,0.795556,"Tasmania, Australia",True,methinks scottmorrisonmp olofdawson scottjakob...,1,0.5419,0.258552
1,1163265539247968256,3112695773,,,"['3112695773', '88593058']",[],0,Root,[],[],...,0.5,3.471585,False,0.99,Unknown,True,koala cut tie alan jones significant buyer med...,3,-0.5574,-1.935062
2,1164479471346257921,2960282202,,,"['3459051', '50393960', '17596622', '155065462...","['amazonfire', 'prayfortheamazon']",0,Root,[],[],...,0.5,0.30103,True,0.572222,Unknown,True,kindly use pay firefighter jet bombardier cl 1...,5,0.1027,0.030916
3,1165088032082604038,2233234848,,,['2233234848'],['amazonfires'],0,Root,[],[],...,0.5,0.477121,False,0.785556,Unknown,True,earth talk human done amazonfires,9,0.0,0.0
4,1170123597802872834,95802989,,,['95802989'],"['stanthorpe', 'qldfires', 'tenterfield', 'nsw...",0,Root,[],[],...,0.5,1.946943,True,0.755556,Australia,True,givit supporting service responding bushfire e...,5,0.7146,1.391286


In [41]:
ef = pd.read_csv('../../data/processed/final_data(main_topic).csv')
ef.head()

Unnamed: 0.1,Unnamed: 0,_id,created_at,display_text,sentiment,weighted_sentiment,dominant_topic,cleaned_text,influence_tweet_factor,media_urls,media_count,tweet_url,date,main_topic
0,0,1192617057394708480,2019-11-08 01:37:18,If that’s what the fires look like from up her...,0.6239,1.527748,6,fire look like nsw mid nth coast huge prayer t...,2.448706,['https://pbs.twimg.com/media/EI0G41MVAAAqVzx....,1,https://t.co/AlgiOgAHQI,2019-11-08,1
1,1,1192684541913100288,2019-11-08 06:05:28,Progression of major fires today across NSW #N...,0.0,0.0,6,progression major fire today across nsw nswfires,2.974742,['https://pbs.twimg.com/ext_tw_video_thumb/119...,1,https://t.co/KAZ3pW9i8O,2019-11-08,1
2,2,1192725213294804992,2019-11-08 08:47:05,#Smoke from #nswfires in the northeast is not ...,0.1154,0.270429,6,smoke nswfires northeast visible satellite ima...,2.343409,['https://pbs.twimg.com/media/EI1pQmDWkAIbDNG....,2,https://t.co/eNsAxzo6Nc,2019-11-08,8
3,3,1192766676854435840,2019-11-08 11:31:50,This shows the dangerous conditions that have ...,-0.743,-2.701052,6,show dangerous condition confronted firefighte...,3.635333,['https://pbs.twimg.com/ext_tw_video_thumb/119...,1,https://t.co/lIhnF8P1Qf,2019-11-08,1
4,4,1192932983407202304,2019-11-08 22:32:41,Incredible vision from @NSWRFS of a crew from ...,-0.0276,-0.078089,6,incredible vision nswrfs crew warringah hq hil...,2.829304,['https://pbs.twimg.com/ext_tw_video_thumb/119...,1,https://t.co/9YA4PxNqzW,2019-11-08,1


In [62]:
# df 保留id, created_at_dt, reply_count, retweet_count, favourite_count, quote_count，sentiment
df = df[['_id', 'created_at_dt', 'reply_count', 'retweet_count', 'favourite_count', 'quote_count', 'sentiment']]
# ef 保留id, main_topic
ef = ef[['_id', 'main_topic']]
# 合并两个表 以id为主键
topic_data = pd.merge(df, ef, on='_id')
topic_data.head()

Unnamed: 0,_id,created_at_dt,reply_count,retweet_count,favourite_count,quote_count,sentiment,main_topic
0,1192617057394708480,2019-11-08 01:37:18,0,280,458,0,0.6239,1
1,1192684541913100288,2019-11-08 06:05:28,0,609,673,0,0.0,1
2,1192725213294804992,2019-11-08 08:47:05,0,118,158,0,0.1154,8
3,1192766676854435840,2019-11-08 11:31:50,0,2879,5002,0,-0.743,1
4,1192932983407202304,2019-11-08 22:32:41,26,364,639,85,-0.0276,1


In [63]:
# 对于sentiment， 使用三态归类，大于0.05为正面，小于-0.05为负面，其余为中性 直接用文字表示 Positive, Negative, Neutral
topic_data['sentiment'] = topic_data['sentiment'].apply(lambda x: 'Positive' if x > 0.05 else 'Negative' if x < -0.05 else 'Neutral')
topic_data.head()

Unnamed: 0,_id,created_at_dt,reply_count,retweet_count,favourite_count,quote_count,sentiment,main_topic
0,1192617057394708480,2019-11-08 01:37:18,0,280,458,0,Positive,1
1,1192684541913100288,2019-11-08 06:05:28,0,609,673,0,Neutral,1
2,1192725213294804992,2019-11-08 08:47:05,0,118,158,0,Positive,8
3,1192766676854435840,2019-11-08 11:31:50,0,2879,5002,0,Negative,1
4,1192932983407202304,2019-11-08 22:32:41,26,364,639,85,Neutral,1


In [61]:
# whole_data_bushfire_related仅保留id, created_at_dt, reply_count, retweet_count, favourite_count, quote_coun, sentiment
whole_data_bushfire_related = whole_data_bushfire_related[['_id', 'created_at_dt', 'reply_count', 'retweet_count', 'favourite_count', 'quote_count', 'sentiment']]

In [46]:
whole_data_bushfire_related.info()

<class 'pandas.core.frame.DataFrame'>
Index: 217998 entries, 4707 to 490653
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   _id              217998 non-null  int64         
 1   created_at_dt    217998 non-null  datetime64[ns]
 2   reply_count      217998 non-null  int64         
 3   retweet_count    217998 non-null  int64         
 4   favourite_count  217998 non-null  int64         
 5   quote_count      217998 non-null  int64         
 6   sentiment        217998 non-null  object        
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 13.3+ MB


In [64]:
whole_data_bushfire_related['sentiment'] = pd.to_numeric(whole_data_bushfire_related['sentiment'], errors='coerce')

whole_data_bushfire_related['sentiment'] = whole_data_bushfire_related['sentiment'].apply(lambda x: 'Positive' if x > 0.05 else 'Negative' if x < -0.05 else 'Neutral')

In [65]:
# 将whole_data_bushfire_related的_id变为int64 
whole_data_bushfire_related['_id'] = whole_data_bushfire_related['_id'].astype('int64')

# 将topic_data的data_time变为datetime64
topic_data['created_at_dt'] = pd.to_datetime(topic_data['created_at_dt'])

In [66]:
print(whole_data_bushfire_related.info())

print(topic_data.info())

<class 'pandas.core.frame.DataFrame'>
Index: 396434 entries, 4689 to 490653
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   _id              396434 non-null  int64         
 1   created_at_dt    396434 non-null  datetime64[ns]
 2   reply_count      396434 non-null  int64         
 3   retweet_count    396434 non-null  int64         
 4   favourite_count  396434 non-null  int64         
 5   quote_count      396434 non-null  int64         
 6   sentiment        396434 non-null  object        
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 24.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157670 entries, 0 to 157669
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   _id              157670 non-null  int64         
 1   created_at_dt    157670 non-null  datetime64[ns]
 2   

In [67]:
# 执行两表合并 whole_data_bushfire_related 与 topic_data 以_id为主键
merged_df = pd.merge(whole_data_bushfire_related, topic_data, on=['_id', 'reply_count', 'retweet_count', 'favourite_count', 'quote_count', 'sentiment'], how='outer')

merged_df['main_topic'] = merged_df['main_topic'].fillna(100)
merged_df['main_topic'] = merged_df['main_topic'].astype('int64')
merged_df['sentiment'] = merged_df['sentiment'].astype('str')

# 合并created_at_dt_x和created_at_dt_y 
merged_df['created_at_dt_x'] = merged_df['created_at_dt_x'].fillna(merged_df['created_at_dt_y'])
merged_df = merged_df.drop(columns=['created_at_dt_y'])
merged_df = merged_df.rename(columns={'created_at_dt_x': 'created_at_dt'})

merged_df.info()

print(merged_df.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396434 entries, 0 to 396433
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   _id              396434 non-null  int64         
 1   created_at_dt    396434 non-null  datetime64[ns]
 2   reply_count      396434 non-null  int64         
 3   retweet_count    396434 non-null  int64         
 4   favourite_count  396434 non-null  int64         
 5   quote_count      396434 non-null  int64         
 6   sentiment        396434 non-null  object        
 7   main_topic       396434 non-null  int64         
dtypes: datetime64[ns](1), int64(6), object(1)
memory usage: 24.2+ MB
                   _id       created_at_dt  reply_count  retweet_count  \
0  1190059455649480704 2019-11-01 00:14:19            0              0   
1  1190060977754009600 2019-11-01 00:20:22            0              8   
2  1190063498971107328 2019-11-01 00:30:23            0   

In [68]:
import json
tweet_chain = json.load(open('../../data/processed/tweet_chain_optimized.json', 'r'))

# 使用chian来找到所有父推文下的子推文 并将这些子推文的main_topic赋值为父推文的main_topic
def update_main_topic(main_tweet_id, sub_tweet_dict, df):
    # 检查主推文是否在 DataFrame 中
    if main_tweet_id not in df['_id'].values:
        print(f"Warning: main_tweet_id {main_tweet_id} not found in DataFrame.")
        return

    # 获取主推文的 main_topic 值
    main_topic = df[df['_id'] == main_tweet_id]['main_topic'].values[0]
    
    # 遍历子推文
    for sub_tweet_id, sub_tweet_chain in sub_tweet_dict.items():
        if sub_tweet_id in df['_id'].values:
            # 更新子推文的 main_topic
            df.loc[df['_id'] == sub_tweet_id, 'main_topic'] = main_topic
        
            # 递归地更新嵌套的子推文
            update_main_topic(sub_tweet_id, sub_tweet_chain, df)
        else:
            print(f"Warning: sub_tweet_id {sub_tweet_id} not found in DataFrame.")

        
# 遍历 tweet_chain 并更新所有的子推文
for main_tweet_id, sub_tweet_chain in tweet_chain.items():
    update_main_topic(main_tweet_id, sub_tweet_chain, merged_df)

print("Main topics updated successfully!")

  if main_tweet_id not in df['_id'].values:




IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Main topics updated successfully!


In [69]:
# 计算main_topic的分布 按照主题数量排序
main_topic_distribution = merged_df['main_topic'].value_counts().sort_index()
print(main_topic_distribution)

main_topic
1       11412
2       28607
3       18827
4         563
5       29868
6        4474
7       28514
8       11667
9        9004
10      14734
100    238764
Name: count, dtype: int64


In [53]:
# 统计tweet_chain中的推文数量 包括主推文和子推文
tweet_chain_count = 0
for main_tweet_id, sub_tweet_chain in tweet_chain.items():
    tweet_chain_count += 1 + len(sub_tweet_chain)
print(f"Total number of tweets in tweet_chain: {tweet_chain_count}")

Total number of tweets in tweet_chain: 304072


In [70]:
# 对于主题=100的推文，将其删除
merged_df = merged_df[merged_df['main_topic'] != 100]

In [71]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 157670 entries, 117 to 392555
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   _id              157670 non-null  int64         
 1   created_at_dt    157670 non-null  datetime64[ns]
 2   reply_count      157670 non-null  int64         
 3   retweet_count    157670 non-null  int64         
 4   favourite_count  157670 non-null  int64         
 5   quote_count      157670 non-null  int64         
 6   sentiment        157670 non-null  object        
 7   main_topic       157670 non-null  int64         
dtypes: datetime64[ns](1), int64(6), object(1)
memory usage: 10.8+ MB


In [72]:
# 保存数据为json
merged_df.to_json('../../data/processed/topic_data(parent&child).json', orient='records')