In [1]:
import pandas as pd
import bson

#      ===== 读取数据 ===== 
# read the data
with open('../../data/raw/v_forest.bson', 'rb') as file:
    data = bson.decode_all(file.read())

# convert the data to a pandas dataframe
df = pd.DataFrame(data)

In [2]:
# 备份一个df_backup  可以使用df = df_backup.copy()来恢复原始数据     ===== 备份 =====
# 后期有一些需要修改数据的地方 深拷贝一个备份 方便回溯数据 不用再加载一次
# df_backup = df.copy() 

In [3]:
# df.head()  # Cehck structure of the data     ===== 数据结构 ===== 

In [4]:
# df.info()   # 此时数据应该是(490653,43) ===== 数据类型 ===== 

In [5]:
# Only keep the English tweets     ===== 语言 ===== Verified in MongoDB Compass👌
df = df[df['lang'] == 'en']
print(df.shape)  # 表示400174行, 43列

(400174, 43)


In [6]:
# 显示推文时间的最早和最晚时间    ===== 时间 =====  Verified in MongoDB Compass👌
print(df['created_at_dt'].min()) 
# min = 2009-09-05 23:06:34
print(df['created_at_dt'].max())
# max = 2020-02-19 18:42:10

# 根据题意 查询了政府网站对2019-2020山火的时间框架 山火时间为July 2019 to late February 2020 出于严谨考虑 时间框架向前推至2019-04-01 由于可能的情绪蔓延 框架向后推至2020-6-1 
# 注意：根据此考量 此时间节点后的推文 很可能是鞭尸的假新闻 
# ref: https://www.aph.gov.au/About_Parliament/Parliamentary_Departments/Parliamentary_Library/pubs/rp/rp2122/201920AustralianBushfiresFAQupdate#_ftn1
start_date = pd.Timestamp('2019-04-01')
end_date = pd.Timestamp('2020-6-1')

mask = (df['created_at_dt'] >= start_date) & (df['created_at_dt'] <= end_date)
df = df.loc[mask]
print(df.shape)  # (397726, 43)

2009-09-05 23:06:34
2020-02-19 18:42:10
(397726, 43)


In [7]:
# 忽略转发/回复的推文 只使用父推文 df['parent'] = null       ===== 父推文 =====  Verified in MongoDB Compass👌
df = df[df['parent'].isnull()]

print(df.shape)  # (190316, 43)

(190316, 43)


In [8]:
# 加载预处理资源 与数据处理本体分离 便于后期调整数据处理流程
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# 下载必要的NLTK资源
nltk.download('stopwords')
nltk.download('wordnet')

# 数据预处理函数
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhoujingfeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhoujingfeng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# 预处理函数 ===== 预处理 =====  
# 由于regex的运作方法在python和mongodb中不同 此步骤无法进行验证 清各位同仁仔细检查代码和输出
def preprocess(text):
    text = text.lower()  # 转换为小写
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # 移除URL
    text = re.sub(r'\W', ' ', text)  # 移除特殊字符
    text = text.split()  # 分词
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]  # 词形还原和去除停用词
    text = [word.strip() for word in text] # 去除文本前后空格
    return ' '.join(text)

# 保存文本到新列
df['cleaned_text'] = df['text'].apply(preprocess)

# 查看预处理后的数据
df[['text', 'cleaned_text']].head(10)

Unnamed: 0,text,cleaned_text
3159,https://t.co/9wQlvxcHcD Ro...,rock roll dream angelina jolie bad rock roll d...
3169,I'M SCREAMING!! 😭😭🤣🤣\nY'all need to see this 😭...,screaming need see
3195,🗣 Seriously. It's never been this easy to crea...,seriously never easy create great marketing video
3234,"This morning @ViktoriaRusso, @PatriciaAlves &a...",morning viktoriarusso patriciaalves amp announ...
3241,It was wonderful to welcome @KatyRobertson to ...,wonderful welcome katyrobertson mosman rowing ...
3265,Nearly half of Australian children don’t think...,nearly half australian child think bread amp c...
3323,.@HayleyDrubin gives us a visual representatio...,hayleydrubin give u visual representation carb...
3335,"Australia, where people can tether baby dugong...",australia people tether baby dugong use bait l...
3341,#PoliticsofCoal @AndreaDeschamps \nPart 1: Aus...,politicsofcoal andreadeschamps part 1 australi...
3350,Sport brings communities together. It also hel...,sport brings community together also help keep...


In [10]:
# 过滤掉纯广告推文 但留下以山火为题打广告的推文  ===== 广告 ===== 
# 定义广告关键词和山火关键词
ad_keywords = ['buy now', 'sale', 'discount', 'free shipping', 'deals', 'promotion', 
               'special offer', 'clearance', 'flash sale', 'exclusive deal', 'shop now', 
               'buy one get one free', 'best price', 'hot item', 'online store', 'save big', 
               'coupon code', 'percent off', 'markdown', 'bargain', 'hot deal', 'limited offer', 
               'sale ends soon', 'big savings', 'exclusive offer', 'limited stock', 'order now', 
               'act fast', 'free gift', 'lowest price', 'best deal', 'special price', 'hot sale', 
               'mega sale', 'discount code']

fire_keywords = ['burns', 'fire', 'dangerous', 'burn', 'burning', 'died', 'saved', 'bushfire', 
                   'wildfire', 'forest fire', 'flames', 'evacuate', 'evacuation', 'firefighter', 
                   'smoke', 'ash', 'firestorm', 'emergency', 'hazard', 'fire season', 'blaze', 
                   'scorching', 'inferno', 'emergency response', 'climate crisis', 
                   'climate emergency', 'natural disaster', 'red alert', 'alert', 'containment', 
                   'fire suppression', 'firefighting', 'wildfire season']

# 构建正则表达式
ad_pattern = r'\b(?:' + '|'.join(ad_keywords) + r')\b'   
fire_pattern = r'\b(?:' + '|'.join(fire_keywords) + r')\b'

# 使用正则表达式进行布尔索引
ad_mask = df['cleaned_text'].str.contains(ad_pattern)
non_ad_mask = df['cleaned_text'].str.contains(fire_pattern)

# 使用布尔索引过滤数据
filtered_df = df[ad_mask & ~non_ad_mask]  # 与非操作 找到不是关于山火的广告

# 从原始数据中删除广告推文
df = df[~df.index.isin(filtered_df.index)]

print(df.shape)   # 过滤掉了(493, 44)条广告推文 剩余（189823, 44）条推文


(189823, 44)


In [12]:
# 创建词典和语料库
from gensim import corpora, models

# 创建词典: 词典是一个将单词映射到整数id的映射 每一个单词都有一个唯一的id 用于创建语料库
dictionary = corpora.Dictionary([tweet.split() for tweet in df['cleaned_text']])

# 创建语料库: 语料库是一个将文档转换为词袋表示的对象 词袋是一个稀疏向量 其中每个单词的id映射到其在文档中的出现次数 
corpus = [dictionary.doc2bow(tweet.split()) for tweet in df['cleaned_text']]

In [13]:
'''
# 主题模型  ===== LDA =====  此任务无法在Jupyter中并发  在Mac M1上运行时长约1.5小时左右 (参数[2,30,2]) 验证可输出代码块10之后的csv文件 用.py格式运行多核并发LDA建模
# ref:https://wenku.csdn.net/column/68rabmq8w3#1.1%20 完整LDA介绍 
# ref:https://docs.pingcode.com/ask/174423.html 参数解释
from gensim.models import CoherenceModel

def train_lda_and_evaluate(num_topics):
    model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    coherencemodel = CoherenceModel(model=model, texts=[tweet.split() for tweet in df['cleaned_text']], dictionary=dictionary, coherence='c_v')
    coherence = coherencemodel.get_coherence()
    perplexity = model.log_perplexity(corpus)
    return num_topics, coherence, perplexity

# 调参 寻找最佳主题数K
start = 2
limit = 30
step = 2

best_num_topics = start
best_coherence = 0

for num_topics in range(start, limit + 1, step):
    num_topics, coherence, perplexity = train_lda_and_evaluate(num_topics)
    print(f"Number of Topics: {num_topics} \t Coherence Score: {coherence} \t Perplexity: {perplexity}")
    if coherence > best_coherence:
        best_coherence = coherence
        best_num_topics = num_topics

print(f"Best number of topics: {best_num_topics}")  # 一致性分数
print(f"Best Coherence Score: {best_coherence}")  # 困惑度

# Number of Topics: 2 	 Coherence Score: 0.3759011059947136 	 Perplexity: -8.296418882976667
# Number of Topics: 4 	 Coherence Score: 0.42184629557748243 	 Perplexity: -8.299548671751104
# Number of Topics: 6 	 Coherence Score: 0.43786540809139907 	 Perplexity: -8.510174269697803
# Number of Topics: 8 	 Coherence Score: 0.47176915985311024 	 Perplexity: -8.74394636272138
# Number of Topics: 10 	 Coherence Score: 0.4738715046477873 	 Perplexity: -9.159282468691316
# Number of Topics: 12 	 Coherence Score: 0.43104690782377725 	 Perplexity: -9.940450411910698
# Number of Topics: 14 	 Coherence Score: 0.41125086867564276 	 Perplexity: -10.647159945542178
# Number of Topics: 16 	 Coherence Score: 0.4120749905900342 	 Perplexity: -11.182660855673397
# Number of Topics: 18 	 Coherence Score: 0.4403819680385084 	 Perplexity: -11.831301903587304
# Number of Topics: 20 	 Coherence Score: 0.4282543571789759 	 Perplexity: -12.212385516719994
# Number of Topics: 22 	 Coherence Score: 0.42327749443398766 	 Perplexity: -12.60193505708384
# Number of Topics: 24 	 Coherence Score: 0.4176014410867455 	 Perplexity: -12.991686981669742
# Number of Topics: 26 	 Coherence Score: 0.3966323119642199 	 Perplexity: -13.359620649761812
# Number of Topics: 28 	 Coherence Score: 0.38242442503126844 	 Perplexity: -13.824911991294304
# Number of Topics: 30 	 Coherence Score: 0.4018442087621527 	 Perplexity: -14.1170189810078
# Best number of topics: 10
# Best Coherence Score: 0.4738715046477873
'''


'\n# 主题模型  ===== LDA =====  此任务无法在Jupyter中并发  在Mac M1上运行时长约1.5小时左右 (参数[2,30,2]) 验证可输出代码块10之后的csv文件 用.py格式运行多核并发LDA建模\n# ref:https://wenku.csdn.net/column/68rabmq8w3#1.1%20 完整LDA介绍 \n# ref:https://docs.pingcode.com/ask/174423.html 参数解释\nfrom gensim.models import CoherenceModel\n\ndef train_lda_and_evaluate(num_topics):\n    model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)\n    coherencemodel = CoherenceModel(model=model, texts=[tweet.split() for tweet in df[\'cleaned_text\']], dictionary=dictionary, coherence=\'c_v\')\n    coherence = coherencemodel.get_coherence()\n    perplexity = model.log_perplexity(corpus)\n    return num_topics, coherence, perplexity\n\n# 调参 寻找最佳主题数K\nstart = 2\nlimit = 30\nstep = 2\n\nbest_num_topics = start\nbest_coherence = 0\n\nfor num_topics in range(start, limit + 1, step):\n    num_topics, coherence, perplexity = train_lda_and_evaluate(num_topics)\n    print(f"Number of Topics: {num_topics} \t Coherence Score: {coherence

In [14]:
'''
# LDA调参 找到passes 和iterations  ===== LDA =====
# ！！不要运行这段代码 要跑8小时以上

# 定义参数搜索空间
passes_list = [20, 40, 60, 80]
iterations_list = [12, 14, 16]

# 初始化变量以存储最佳参数
best_coherence = -1
best_perplexity = float('inf')
best_passes = None
best_iterations = None

# 进行参数搜索
for passes in passes_list:
    for iterations in iterations_list:
        # 训练LDA模型
        lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=passes, iterations=iterations, eval_every=1)
        
        # 计算困惑度
        perplexity = lda_model.log_perplexity(corpus)
        
        # 计算主题一致性
        coherence_model_lda = CoherenceModel(model=lda_model, texts=[tweet.split() for tweet in df['cleaned_text']], dictionary=dictionary, coherence='c_v')
        coherence = coherence_model_lda.get_coherence()
        
        # 输出当前参数的评估结果
        print(f"Passes: {passes}, Iterations: {iterations}, Coherence: {coherence}, Perplexity: {perplexity}")
        
        # 更新最佳参数
        if coherence > best_coherence and perplexity < best_perplexity:
            best_coherence = coherence
            best_perplexity = perplexity
            best_passes = passes
            best_iterations = iterations

print(f"Best Passes: {best_passes}, Best Iterations: {best_iterations}, Best Coherence: {best_coherence}, Best Perplexity: {best_perplexity}")

# Passes: 20, Iterations: 12, Coherence: 0.45824873948325983, Perplexity: -9.03431114214111
# Passes: 20, Iterations: 14, Coherence: 0.44837592847592934, Perplexity: -9.12432583705112
# Passes: 20, Iterations: 16, Coherence: 0.48747396284574838, Perplexity: -9.15307253900221   ** 选定 **
# Passes: 40, Iterations: 12, Coherence: 0.48289085159458595, Perplexity: -9.083461884051694 
# Passes: 40, Iterations: 14, Coherence: 0.4273950074677269, Perplexity: -9.082347179210101
# Passes: 40, Iterations: 16, Coherence: 0.4712308458087272, Perplexity: -9.090992752020139
# Passes: 60, Iterations: 14, Coherence: 0.4525172695340438, Perplexity: -9.071471253067294
# Passes: 60, Iterations: 16, Coherence: 0.47780339081003687, Perplexity: -9.11608313556567
# Passes: 80, Iterations: 12, Coherence: 0.4611642988790317, Perplexity: -9.060177368492962
'''

'\n# LDA调参 找到passes 和iterations  ===== LDA =====\n# ！！不要运行这段代码 要跑8小时以上\n\n# 定义参数搜索空间\npasses_list = [20, 40, 60, 80]\niterations_list = [12, 14, 16]\n\n# 初始化变量以存储最佳参数\nbest_coherence = -1\nbest_perplexity = float(\'inf\')\nbest_passes = None\nbest_iterations = None\n\n# 进行参数搜索\nfor passes in passes_list:\n    for iterations in iterations_list:\n        # 训练LDA模型\n        lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=passes, iterations=iterations, eval_every=1)\n        \n        # 计算困惑度\n        perplexity = lda_model.log_perplexity(corpus)\n        \n        # 计算主题一致性\n        coherence_model_lda = CoherenceModel(model=lda_model, texts=[tweet.split() for tweet in df[\'cleaned_text\']], dictionary=dictionary, coherence=\'c_v\')\n        coherence = coherence_model_lda.get_coherence()\n        \n        # 输出当前参数的评估结果\n        print(f"Passes: {passes}, Iterations: {iterations}, Coherence: {coherence}, Perplexity: {perplexity}")\n        \n        # 更新最佳参数\

In [15]:
# 使用最佳参数训练最终LDA模型
final_lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=20, iterations=16, eval_every=180000)

# 输出最终模型的主题
for idx, topic in final_lda_model.print_topics(-1):
    print(f"Topic: {idx}\nWords: {topic}\n")
    
'''
Topic: 0
Words: 0.032*"1" + 0.027*"year" + 0.025*"2" + 0.024*"million" + 0.022*"000" + 0.020*"billion" + 0.020*"animal" + 0.018*"3" + 0.017*"5" + 0.014*"4"

Topic: 1
Words: 0.027*"bushfire" + 0.027*"vicfires" + 0.018*"info" + 0.017*"fire" + 0.015*"smoke" + 0.014*"south" + 0.013*"amp" + 0.013*"rain" + 0.012*"advice" + 0.011*"air"

Topic: 2
Words: 0.089*"australia" + 0.051*"koala" + 0.039*"australiabushfires" + 0.036*"australiaburning" + 0.035*"australiaonfire" + 0.030*"animal" + 0.030*"australiafires" + 0.028*"australianbushfiresdisaster" + 0.020*"australianfires" + 0.017*"rain"

Topic: 3
Words: 0.019*"like" + 0.016*"people" + 0.014*"know" + 0.013*"get" + 0.012*"one" + 0.010*"look" + 0.010*"right" + 0.010*"would" + 0.010*"think" + 0.009*"need"

Topic: 4
Words: 0.067*"auspol" + 0.032*"scottmorrisonmp" + 0.019*"australiaburns" + 0.017*"scottyfrommarketing" + 0.017*"government" + 0.015*"climateemergency" + 0.014*"morrison" + 0.013*"amp" + 0.011*"australian" + 0.011*"australianbushfiredisaster"

Topic: 5
Words: 0.063*"climate" + 0.041*"change" + 0.017*"bushfireaustralia" + 0.014*"amp" + 0.008*"climatechange" + 0.007*"science" + 0.007*"action" + 0.007*"fuel" + 0.006*"lie" + 0.006*"game"

Topic: 6
Words: 0.044*"help" + 0.021*"please" + 0.020*"australia" + 0.017*"donation" + 0.016*"australianbushfires" + 0.014*"support" + 0.014*"donate" + 0.014*"australiafires" + 0.013*"australianbushfiredisaster" + 0.013*"money"

Topic: 7
Words: 0.021*"thank" + 0.019*"firefighter" + 0.017*"amp" + 0.012*"thanks" + 0.011*"home" + 0.011*"australianfires" + 0.011*"work" + 0.010*"volunteer" + 0.009*"amazing" + 0.009*"people"

Topic: 8
Words: 0.044*"australia" + 0.026*"fire" + 0.017*"climatechange" + 0.016*"australiafires" + 0.014*"world" + 0.014*"bushfires" + 0.013*"climateemergency" + 0.013*"australianfires" + 0.012*"australianbushfiredisaster" + 0.011*"climatecrisis"

Topic: 9
Words: 0.037*"fire" + 0.015*"power" + 0.015*"nsw" + 0.014*"community" + 0.013*"water" + 0.012*"bushfire" + 0.011*"affected" + 0.011*"island" + 0.010*"service" + 0.010*"area"
'''

Topic: 0
Words: 0.025*"smoke" + 0.025*"bushfiresaustralia" + 0.024*"bushfires" + 0.018*"bushfirecrisisaustralia" + 0.018*"melbourne" + 0.017*"australianfires" + 0.016*"australia" + 0.015*"new" + 0.014*"nswfires" + 0.014*"australiafires"

Topic: 1
Words: 0.061*"auspol" + 0.028*"scottmorrisonmp" + 0.022*"scottyfrommarketing" + 0.020*"australiaburns" + 0.018*"morrison" + 0.016*"australianbushfiredisaster" + 0.012*"pm" + 0.010*"minister" + 0.009*"climateemergency" + 0.009*"scott"

Topic: 2
Words: 0.077*"australia" + 0.030*"australiafires" + 0.028*"australiaonfire" + 0.026*"australiaburning" + 0.024*"australianbushfiresdisaster" + 0.021*"australiabushfires" + 0.020*"fire" + 0.020*"australianfires" + 0.019*"koala" + 0.018*"rain"

Topic: 3
Words: 0.030*"climate" + 0.019*"change" + 0.018*"amp" + 0.018*"climatechange" + 0.014*"australia" + 0.014*"auspol" + 0.012*"climateemergency" + 0.011*"climatecrisis" + 0.010*"government" + 0.009*"australianbushfiredisaster"

Topic: 4
Words: 0.048*"info" + 0

'\nTopic: 0\nWords: 0.032*"1" + 0.027*"year" + 0.025*"2" + 0.024*"million" + 0.022*"000" + 0.020*"billion" + 0.020*"animal" + 0.018*"3" + 0.017*"5" + 0.014*"4"\n\nTopic: 1\nWords: 0.027*"bushfire" + 0.027*"vicfires" + 0.018*"info" + 0.017*"fire" + 0.015*"smoke" + 0.014*"south" + 0.013*"amp" + 0.013*"rain" + 0.012*"advice" + 0.011*"air"\n\nTopic: 2\nWords: 0.089*"australia" + 0.051*"koala" + 0.039*"australiabushfires" + 0.036*"australiaburning" + 0.035*"australiaonfire" + 0.030*"animal" + 0.030*"australiafires" + 0.028*"australianbushfiresdisaster" + 0.020*"australianfires" + 0.017*"rain"\n\nTopic: 3\nWords: 0.019*"like" + 0.016*"people" + 0.014*"know" + 0.013*"get" + 0.012*"one" + 0.010*"look" + 0.010*"right" + 0.010*"would" + 0.010*"think" + 0.009*"need"\n\nTopic: 4\nWords: 0.067*"auspol" + 0.032*"scottmorrisonmp" + 0.019*"australiaburns" + 0.017*"scottyfrommarketing" + 0.017*"government" + 0.015*"climateemergency" + 0.014*"morrison" + 0.013*"amp" + 0.011*"australian" + 0.011*"austral

In [18]:
# 输出模型 分配主题到df 输出数据到csv文件
# 分配主题 
def get_dominant_topic(model, corpus):
    dominant_topics = []
    for bow in corpus:
        topic_probs = model.get_document_topics(bow)
        dominant_topic = max(topic_probs, key=lambda x: x[1])[0]
        dominant_topics.append(dominant_topic)
    return dominant_topics

df['dominant_topic'] = get_dominant_topic(final_lda_model, corpus)


In [21]:
# 输出模型
final_lda_model.save('../../models/lda_model.gensim')
dictionary.save('../../data/processed/dictionary.gensim')
corpora.MmCorpus.serialize('../../data/processed/corpus.mm', corpus)

# 使用这个语法在jupyter框架外加载模型
# final_lda_model = models.LdaModel.load('../../models/lda_model.gensim')
# dictionary = corpora.Dictionary.load('../../data/processed/dictionary.gensim')
# corpus = corpora.MmCorpus('../../data/processed/corpus.mm')

In [None]:
# 输出数据到csv文件
df.to_csv('../../data/processed/tweets_with_topics.csv', index=False, encoding='utf-8-sig')

In [None]:
# 生成可视化HTML文件以查看关键topics
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# 准备pyLDAvis数据
lda_display = gensimvis.prepare(final_lda_model, corpus, dictionary)

# 显示可视化界面
pyLDAvis.display(lda_display)

# 保存可视化结果到HTML文件
pyLDAvis.save_html(lda_display, '../../backend/utils')


In [None]:
# 生成词图 有需要时可以使用
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# 自定义函数将所有词汇转换为大写
def to_upper_case(frequencies):
    return {word.upper(): freq for word, freq in frequencies.items()}


# 创建一个大图，将所有词云汇聚在一起
num_topics = final_lda_model.num_topics
fig, axes = plt.subplots(2, (num_topics + 1) // 2, figsize=(20, 10), sharex=True, sharey=True)

# 生成每个主题的词云并汇聚到一张图中
for i, ax in enumerate(axes.flatten()):
    if i < num_topics:
        topic_words = dict(final_lda_model.show_topic(i, 200))
        topic_words_upper = to_upper_case(topic_words)

        wordcloud = WordCloud(width=800, height=600,
                              background_color='white',
                              max_words=200,
                              contour_width=3,
                              contour_color='steelblue',
                              random_state=21,
                              max_font_size=110).generate_from_frequencies(topic_words_upper)

        ax.imshow(wordcloud, interpolation='bilinear')
        ax.set_title(f'Topic {i}', fontsize=16)
        ax.axis('off')
    else:
        ax.axis('off')

# 调整子图布局
plt.tight_layout()
plt.savefig('../../backend/utils/LDA_wordcloud.png', format='png', bbox_inches='tight')
plt.show()


In [None]:
# 从LDA_visualization.html文件 和LDA_wordcloud.png中 总结出以下和山火相关新闻关键词
# 使用这些关键词将推文进行过滤 保留与山火相关的推文 保存为tweets_bushfire_related_keywords.csv

# 加载数据
df = pd.read_csv('../../data/processed/tweets_with_topics.csv')

# From HTML file
bushfire_keywords = [ 
    "bushfiredisaster", "australfires", "fire", "australianfires", "bushfires",
    "australiaburns", "australianbushfiredisaster", "australianbushfires",
    "australfire", "australiaburning", "koala", "animal", "australianwildfires",
    "australianbushfire", "firefighter", "bushfireaustralia", "nswfires", "vicfires",
    "bushfirecrisis", "bushfiresaustralia", "bushfirecrisisaustralia"
]

# 使用关键词过滤推文
df_bushfire_related = df[
    df['text'].str.contains('|'.join(bushfire_keywords), case=False, na=False)
]
# 将过滤后的数据保存到新的CSV文件
output_path = '../../data/processed/tweets_bushfire_related_keywords.csv'
df_bushfire_related.to_csv(output_path, index=False, encoding='utf-8-sig')

print(len(df_bushfire_related))  # 158902


In [17]:
# 聚合location列统计每个地点的推文数   ===== 地点 =====   这段后期有需要再用
location_counts = df.groupby('location').size()
location_counts = location_counts.sort_values(ascending=False)
print(location_counts.head(10))

location
Unknown                       75217
Victoria, Australia           18388
New South Wales, Australia    18228
United States                 15909
Australia                     15617
United Kingdom                 8312
Queensland, Australia          6027
South Australia, Australia     4074
Canada                         3704
India                          3492
dtype: int64
