In [1]:
import pandas as pd
from gensim import corpora, models
from gensim.models import CoherenceModel

# Location of the filtered dataset that the rest of the notebook works on.
FILTERED_DATA_PATH = '../../data/processed/final_data(filtered).json'
df = pd.read_json(FILTERED_DATA_PATH)

In [2]:
# Tokenize every cleaned tweet once (whitespace split) and reuse the token lists
# for both the vocabulary and the bag-of-words corpus, instead of re-splitting
# all tweets separately for each structure.
tokenized_tweets = [tweet.split() for tweet in df['cleaned_text']]

# Create a dictionary (token <-> integer id mapping)
dictionary = corpora.Dictionary(tokenized_tweets)

# Create a corpus: one bag-of-words vector per tweet
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_tweets]


In [3]:
# Main-topic LDA. The settings (10 topics, 20 passes, 16 iterations) are the
# ones taken from the earlier pre_LDA model.
passes, iterations = 20, 16

lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    passes=passes,
    iterations=iterations,
    eval_every=1000000,  # NOTE(review): very large interval, presumably to skip periodic evaluation during training — confirm
    random_state=88,     # fixed seed
)

# Quality metrics for the trained model
perplexity = lda_model.log_perplexity(corpus)

coherence_texts = [tweet.split() for tweet in df['cleaned_text']]
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=coherence_texts,
    dictionary=dictionary,
    coherence='c_v',
)
coherence = coherence_model_lda.get_coherence()

print(f"Passes: {passes}, Iterations: {iterations}, Coherence: {coherence}, Perplexity: {perplexity}")


Passes: 20, Iterations: 16, Coherence: 0.4344320248678731, Perplexity: -8.955812934977232


In [4]:
# Dominant topic for each tweet
def get_dominant_topic(model, corpus):
    """Return the most probable topic of every document, numbered from 1.

    Args:
        model: Object exposing ``get_document_topics(bow)``, which returns
            ``(topic_id, probability)`` pairs for a single document.
        corpus: Iterable of documents in the form ``model`` expects.

    Returns:
        list: One entry per document: the id of its highest-probability topic
        plus one. When several topics tie, the first one returned wins.
    """
    def pick_topic(bow):
        best_pair = max(model.get_document_topics(bow), key=lambda pair: pair[1])
        return best_pair[0] + 1  # shift the 0-based topic id to 1-based

    return [pick_topic(bow) for bow in corpus]

# Store each tweet's dominant topic (1-based, see get_dominant_topic) in the 'main-topic' column.
df['main-topic'] = get_dominant_topic(lda_model, corpus)

In [5]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Build the pyLDAvis data for the trained main-topic model
lda_display = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary)

# Write the visualization to an HTML file
pyLDAvis.save_html(lda_display, 'LDA_new_visualization.html')

In [57]:
# Save the trained main-topic model, its dictionary and the bag-of-words corpus
# so that later cells can reload them instead of retraining.
# NOTE(review): the file names end in "_subtopics" although they are written under
# main_topics/ and hold the main-topic artifacts. The reload cell further down reads
# these exact names, so rename both places together if at all.
lda_model.save('../../models/main_topics/lda_model_subtopics')
dictionary.save('../../models/main_topics/dictionary_subtopics')
corpora.MmCorpus.serialize('../../models/main_topics/corpus_subtopics', corpus)

### LDA_new_visualization.html Results Analysis:
**when λ = 1 we have the following words:**

* Topic 1: fire, people, koala, australiafires, australianfires, like, one, u, australia, amp, need, world, get, water, know, make, much, comment, thing, animal, going, save, time, see, stop, think, could, country, really, many

* Topic 2: help, australia, australianbushfiredisaster, support, please, australiafires, donation, australianbushfires, donate, australian, relief, australianfires, team, money, charity, thank, thanks, amp, bushfire, affected, wildlife, bushfires, go, fund, fundraiser, community, fire, via, great, supporting

* Topic 3: auspol, australianbushfiredisaster, scottmorrisonmp, australiaburns, scottyfrommarketing, government, climateemergency, morrison, amp, australianfires, australiaburning, bushfirecrisis, australian, australianbushfiresdisaster, pm, coal, bushfirecrisisaustralia, minister, via, climatecrisis, scott, bushfires, go, australiafires, time, say, australianbushfires, stopadani, lnp, auspol2020

* Topic 4: climate, climatechange, change, australia, climateemergency, climatecrisis, australiafires, bushfires, amp, fire, australianfires, australian, action, climateaction, disaster, australiaburns, australianbushfires, global, crisis, emission, extinctionrebellion, world, time, drought, government, need, science, environment, impact, future

* Topic 5: australia, australianbushfiresdisaster, australiaburning, australiafires, australiaonfire, australiabushfires, australianfires, rain, australiaisburning, australianbushfires, australiaburns, australianbushfire, animal, heart, fire, love, god, prayforaustralia, save, australiafire, news, hope, happening, beautiful, bushfires, pray, picture, image, bushfiresaustralia, prayer

* Topic 6: fire, nsw, nswfires, south, bushfires, area, new, victoria, bushfire, latest, firefighter, state, nswrfs, service, coast, wale, game, update, news, road, thanks, via, burning, reduction, daily, local, forest, emergency, community, burn

* Topic 7: bushfireaustralia, smoke, day, rain, today, air, melbourne, australianfires, last, year, bushfires, australiafires, new, sydney, 2020, week, stream, bushfiresaustralia, australiaburns, australianbushfires, bushfire, australia, hour, canberra, bushfirecrisis, australian, quality, january, month, gt

* Topic 8: animal, koala, australia, wildlife, 1, million, kangaroo, billion, 000, fire, lost, bushfires, island, home, australiafires, 2, specie, life, australian, people, tree, 5, burnt, loss, habitat, dead, thousand, australianfires, killed, amp

* Topic 9: vicfires, bushfire, info, amp, advice, watch, act, river, nswfires, warning, safires, issued, school, emergency, community, farmer, abbeyard, flooding, south, creek, rd, mallacoota, 8km, ese, message, student, aboriginal, valley, information, east

* Topic 10: story, health, thread, roberthubel, medium, post, heavy, visit, page, incredible, news, organisation, insurance, social, flash, grief, online, ongoing, forget, sound, india, staff, check, comic, nasa, mental, firies, clean, lt, hospital

---
**when λ = 0.5 we have the following words:**
* Topic 1: people, like, koala, water, comment, thing, one, know, u, world, make, something, get, much, really, stop, think, fire, need, feel, lot, going, save, look, australiabushfire, raising, could, even, put, little

* Topic 2: help, support, donation, please, donate, relief, team, charity, australianbushfiredisaster, thank, money, fund, fundraiser, supporting, thanks, affected, australianwildfires, raise, donated, effort, australianbushfires, amazing, whogivesatruck, community, link, great, recovery, donating, australian, share

* Topic 3: auspol, scottmorrisonmp, scottyfrommarketing, australianbushfiredisaster, morrison, government, australiaburns, pm, minister, scott, coal, climateemergency, stopadani, lnp, auspol2020, climatecriminals, auspolsocorrupt, scomo, murdoch, prime, govt, scottyfommarketing, bushfirecrisis, geraldjeandelannoy, scottfrommarketing, morrisonfires, lnpfail, scomomustgo, katyrobertson, bushfirecrisisaustralia

* Topic 4: climate, climatechange, change, climatecrisis, climateemergency, action, climateaction, global, emission, extinctionrebellion, science, scientist, climatechangeisreal, disaster, industry, drought, christopherblombaker, robertmandyparker, fuel, arson, larsstrom, environment, arsonist, australia, natural, policy, indigenous, environmental, future, economy

* Topic 5: australia, australianbushfiresdisaster, australiaburning, australiaonfire, australiaisburning, australiabushfires, australianbushfire, rain, heart, australiafires, god, prayforaustralia, australiafire, happening, love, australianfires, pray, picture, prayer, beautiful, break, heartbreaking, image, australianbushfires, hope, save, australiaburns, thought, sad, hero

* Topic 6: fire, nsw, nswfires, south, victoria, latest, area, state, coast, wale, game, update, reduction, service, new, wombat, grass, daily, nswrfs, road, rf, letter, mountain, hazard, recover, region, royal, resident, commission, firefighter

* Topic 7: bushfireaustralia, smoke, air, melbourne, day, last, today, stream, 2020, sydney, hour, rain, canberra, quality, january, week, gt, jan, solidarity, year, morning, airquality, kangarooisland, melbournesmoke, fundraising, hail, night, ausopen, sky, city

* Topic 8: animal, million, koala, 1, kangaroo, billion, 000, wildlife, lost, island, specie, burnt, loss, habitat, dead, 2, thousand, killed, home, burned, survivor, 5, tree, died, native, destroyed, art, life, christelleshirleyhamelgarcia, 50

* Topic 9: vicfires, info, advice, watch, bushfire, act, river, safires, issued, school, farmer, abbeyard, flooding, rd, creek, mallacoota, 8km, warning, ese, amp, student, aboriginal, tourist, gippsland, fish, fall, thomasjosephflynndiaz, 22, project, myrtleford

* Topic 10: story, health, thread, roberthubel, heavy, visit, page, incredible, organisation, insurance, social, post, flash, grief, online, medium, sound, india, comic, nasa, mental, firies, clean, lt, excellent, hospital, code, v, item, sport

---
**when λ = 0 we have the following words:**
* Topic 1: thing, something, feel, australiabushfire, raising, difference, whole, buy, french, baby, living, camel, dog, special, killing, kill, saveaustralia, uk, war, cry, fridaysforfuture, bear, resilience, mother, evidence, rainforest, click, survive, sweet, wallaby

* Topic 2: support, donation, donate, relief, charity, thank, fund, fundraiser, supporting, australianwildfires, raise, donated, amazing, donating, link, whogivesatruck, share, proceeds, rt, learn, appeal, generozity, proud, auction, cross, wonderful, wire, canada, book, bid

* Topic 3: auspol, morrison, scottyfrommarketing, pm, minister, scott, stopadani, lnp, auspol2020, climatecriminals, auspolsocorrupt, scomo, murdoch, geraldjeandelannoy, scottfrommarketing, scottyfommarketing, prime, morrisonfires, lnpfail, scomomustgo, katyrobertson, larsfoster, leadership, scottmorrison, mine, federal, olofdawson, coalition, sackscomo, adani

* Topic 4: climate, change, climatechange, climateaction, global, emission, extinctionrebellion, science, scientist, climatechangeisreal, industry, christopherblombaker, robertmandyparker, larsstrom, environmental, arsonist, natural, economy, fossil, murdochspuppet, dismissthisgovernment, unprecedented, term, denier, denial, bullshitboy, warming, recognize, carbon, research

* Topic 5: australiaisburning, heart, god, prayforaustralia, australiafire, happening, pray, picture, prayer, break, landscape, globe, fake, surprise, n, hurt, praying, zoo, magritferguson, song, related, co, actforamazonia, stopbolsonaro, decided, ravine, dont, design, awful, taylor

* Topic 6: nsw, latest, coast, wale, game, update, reduction, wombat, grass, rf, letter, mountain, royal, region, hazard, recover, resident, 7news, wollemi, threatened, augustebaileys, rural, pine, greg, arsonemergency, ahead, crew, buffalo, chief, wwf

* Topic 7: bushfireaustralia, smoke, air, melbourne, last, stream, hour, canberra, quality, january, gt, jan, airquality, solidarity, morning, kangarooisland, melbournesmoke, fundraising, hail, ausopen, sky, city, tonight, dust, australianopen, temperature, goal, night, hot, c

* Topic 8: million, 000, lost, kangaroo, billion, island, specie, burnt, loss, thousand, killed, dead, habitat, burned, survivor, died, native, destroyed, art, christelleshirleyhamelgarcia, half, signed, saved, 25, hectare, raining, endangered, conservation, population, saving

* Topic 9: vicfires, info, advice, watch, river, safires, issued, school, farmer, abbeyard, flooding, rd, 8km, ese, mallacoota, student, aboriginal, tourist, gippsland, fish, fall, project, thomasjosephflynndiaz, 22, myrtleford, instagram, tamboon, 10km, bay, cann

* Topic 10: story, health, thread, roberthubel, heavy, visit, page, incredible, organisation, insurance, social, flash, grief, online, sound, india, comic, nasa, excellent, code, v, hospital, mental, lt, clean, firies, item, sport, blog, episode

#### These words can be summarized into the following main topics:
**Topic 1: Wildlife and Environmental Impact**

**Reasoning:** From λ=1, keywords such as “koala”, “animal”, “australiafires”, and “save” suggest a focus on the impact of bushfires on wildlife and the environment (note that “wildlife” itself is not among this topic's top words; it appears under Topics 2 and 8). This is supported by λ=0.5, which includes related terms like “koala”, “save”, and “raising”, and by λ=0, with terms like “resilience”, “wallaby”, and “rainforest”, indicating consistent concern about wildlife and ecological consequences.

**Topic 2: Fundraising and Community Support**

**Reasoning:** From λ=1, keywords such as “help”, “support”, “donation”, “charity”, “money”, “please”, “thank(s)”, “fund”, “fundraiser”, “community”, and “supporting” clearly define a theme of fundraising and community support for bushfire relief. This is reinforced by λ=0.5 and λ=0, where words like “donated” and “recovery” (and many similar words) are prominent, indicating ongoing efforts to raise funds and support affected communities.

**Topic 3: Political Criticism and Government Response**

**Reasoning:** From λ=1, keywords like “auspol”, “scottmorrisonmp”, “government”, “climateemergency”, “minister”, and “auspol2020” suggest a theme of political criticism and government response to the bushfires. λ=0.5 and λ=0 add similar terms (e.g. “scomomustgo”, “lnpfail”, “auspolsocorrupt”), highlighting strong public discontent with the government's actions.

**Topic 4: Climate Change Debate**

**Reasoning:** From λ=1, keywords like “climate”, “climatechange”, “climatecrisis”, and “climateemergency” point directly to a theme of climate change and its role in the bushfires. This is consistently echoed in λ=0.5 and λ=0, confirming the focus on the environmental debate surrounding climate change and its impacts.

**Topic 5: Emotional and Spiritual Reactions**

**Reasoning:** From λ=1, keywords like “prayforaustralia”, “save”, “god”, "pray" suggest a theme of Emotional and Spiritual Responses to the Bushfires. This is further supported by λ=0.5 and λ=0 with words like “prayer”, “hope” indicating a strong emotional and spiritual reaction from the public.

**Topic 6: Geographical and Location-Based Information**

**Reasoning:** From λ=1, keywords like “nsw”, “victoria”, “area”, “state”, and “coast” suggest a theme of geographical and location-based information related to the bushfires. This is confirmed by λ=0.5 and λ=0, with terms like “region” and “mountain” highlighting the strong association with specific locations and the impact of bushfires in those regions.

**Topic 7: Regional Air Quality and Environmental Conditions**

**Reasoning:** From λ=1, keywords like “smoke”, “air”, “melbourne”, “canberra”, “rain” suggest a theme of Air Quality and Environmental Conditions in Specific Regions. This is supported by λ=0.5 and λ=0, where terms like “airquality”, “melbournesmoke”, and “solidarity” emphasize concerns over air quality and environmental impacts, particularly in Melbourne and Canberra.

**Topic 8: Wildlife Devastation**

**Reasoning:** From λ=1, keywords like “animal”, “kangaroo”, “tree”, “billion”, “dead” suggest a theme of Devastation of Wildlife due to the bushfires. This is consistently supported by λ=0.5 and λ=0, with terms like “loss”, “habitat”, “died”, "destroyed", "life" highlighting the severe impact on wildlife. And from the LDA diagram, this topic has a strong association with topic 1.

**Topic 9: Emergency Information and Public Safety**

**Reasoning:** From λ=1, keywords like “vicfires”, “bushfire”, and “emergency” suggest a theme of emergency response and public safety in Victoria. This is supported by λ=0.5 and λ=0, with terms like “flooding”, “school”, and “warning” emphasizing the impact on local communities and infrastructure. The topic's position away from the cluster center in the LDA diagram may indicate a specialized focus on localized events.

**Topic 10: Could be "Health and Mental Well-being"**

**Reasoning:** From λ=1, keywords like "story," "health," "thread," "visit," and "grief" suggest a theme centered on personal narratives, mental health, and online discussions related to the bushfires. This theme is consistently supported by λ=0.5 and λ=0, with additional terms like "social," "online," "comic," and "medium" indicating the use of various media and platforms to share these stories. The distinct positioning of this topic away from others on the LDA diagram further emphasizes its unique focus on personal and reflective content.

In [2]:
# The previous session terminated unexpectedly, so the saved artifacts are reloaded here; this cell should be deleted later.
# When submitting, load the saved results instead of rerunning the training, because retraining may give different modeling results.
import pandas as pd
from gensim import corpora, models

# Reload the artifacts written by the "save the model/dictionary/corpus" cell.
# These are the same relative paths that cell writes to (and the same style as the
# data path below), replacing an absolute path that only exists on one machine.
lda_model = models.LdaModel.load('../../models/main_topics/lda_model_subtopics')
dictionary = corpora.Dictionary.load('../../models/main_topics/dictionary_subtopics')
corpus = corpora.MmCorpus('../../models/main_topics/corpus_subtopics')

df = pd.read_json('../../data/processed/final_data(filtered).json')

def get_dominant_topic(model, corpus):
    """Return the 1-based id of the most probable topic for each document.

    NOTE(review): this repeats the definition from the earlier "Dominant topic
    for each tweet" cell; it exists so this reload cell works on a fresh kernel.

    Args:
        model: Object exposing ``get_document_topics(bow)``, which returns
            ``(topic_id, probability)`` pairs for a single document.
        corpus: Iterable of documents in the form ``model`` expects.

    Returns:
        list: For every document, the id of its highest-probability topic plus
        one. On ties, the first topic returned by the model wins.
    """
    def probability(pair):
        return pair[1]

    return [
        max(model.get_document_topics(bow), key=probability)[0] + 1
        for bow in corpus
    ]

# Use the same 'main-topic' column name as the original labelling cell and as the
# default `main_topic_col` of secondary_topic_analysis; writing 'main_topic'
# (underscore) here would make that later groupby fail with a KeyError.
df['main-topic'] = get_dominant_topic(lda_model, corpus)

In [7]:
# Inspect the DataFrame after the dominant-topic column has been added.
df

Unnamed: 0,_id,created_at,display_text,sentiment,weighted_sentiment,dominant_topic,cleaned_text,influence_tweet_factor,media_urls,media_count,tweet_url,date,main-topic
0,1192617057394708480,2019-11-08 01:37:18,If that’s what the fires look like from up her...,0.6239,1.527748,6,fire look like nsw mid nth coast huge prayer t...,2.448706,[https://pbs.twimg.com/media/EI0G41MVAAAqVzx.jpg],1,https://t.co/AlgiOgAHQI,2019-11-08,1
1,1192684541913100288,2019-11-08 06:05:28,Progression of major fires today across NSW #N...,0.0000,0.000000,6,progression major fire today across nsw nswfires,2.974742,[https://pbs.twimg.com/ext_tw_video_thumb/1192...,1,https://t.co/KAZ3pW9i8O,2019-11-08,1
2,1192725213294804992,2019-11-08 08:47:05,#Smoke from #nswfires in the northeast is not ...,0.1154,0.270429,6,smoke nswfires northeast visible satellite ima...,2.343409,[https://pbs.twimg.com/media/EI1pQmDWkAIbDNG.p...,2,https://t.co/eNsAxzo6Nc,2019-11-08,8
3,1192766676854435840,2019-11-08 11:31:50,This shows the dangerous conditions that have ...,-0.7430,-2.701052,6,show dangerous condition confronted firefighte...,3.635333,[https://pbs.twimg.com/ext_tw_video_thumb/1192...,1,https://t.co/lIhnF8P1Qf,2019-11-08,1
4,1192932983407202304,2019-11-08 22:32:41,Incredible vision from @NSWRFS of a crew from ...,-0.0276,-0.078089,6,incredible vision nswrfs crew warringah hq hil...,2.829304,[https://pbs.twimg.com/ext_tw_video_thumb/1192...,1,https://t.co/9YA4PxNqzW,2019-11-08,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
157665,1220355953708548098,2020-01-23 14:41:47,Sometimes your company makes you proud by goin...,0.9136,1.146817,5,sometimes company make proud going beyond prod...,1.255273,[https://pbs.twimg.com/media/EO-TKuRU8AEVTpy.jpg],1,https://t.co/sIs3GLGBYe,2020-01-23,2
157666,1220367377793212416,2020-01-23 15:27:10,Me seeing the doomsday clock going to a 100 se...,-0.7351,-2.376015,2,seeing doomsday clock going 100 second austral...,3.232234,[https://pbs.twimg.com/ext_tw_video_thumb/1220...,1,https://t.co/JFAN5F9D55,2020-01-23,3
157667,1220420935326621699,2020-01-23 18:59:59,"""We need new words for collective grief of thi...",-0.4939,-0.148679,2,need new word collective grief scale firefeels...,0.301030,[],0,,2020-01-23,5
157668,1220423469529255937,2020-01-23 19:10:04,‘Shock and denial have turned to clawing grief...,-0.4939,-0.148679,3,shock denial turned clawing grief knotted pit ...,0.301030,[],0,,2020-01-23,8


In [6]:
# Save the main topic results
# NOTE(review): no `index=` argument is passed, so pandas' default index handling
# applies to the CSV — confirm that is what downstream readers of this file expect.
df.to_csv('../../data/processed/final_data(main_topic).csv')

In [31]:
# Secondary Topic Analysis
# Hold here, wait for UI's further instructions

from gensim.models import LdaModel


def secondary_topic_analysis(data, main_topic_col='main-topic', text_col='cleaned_text', num_topics=5,
                             topn=30, random_state=88):
    """Fit one sub-topic LDA model per main topic and print each sub-topic's top words.

    Args:
        data: DataFrame with one row per document.
        main_topic_col: Column holding the main-topic label used for grouping.
        text_col: Column holding the whitespace-separated cleaned text.
        num_topics: Number of sub-topics to fit inside every main topic.
        topn: Number of highest-probability words taken from each sub-topic.
        random_state: Seed passed to every sub-topic model.

    Returns:
        dict: ``{main_topic: {'model': ..., 'dictionary': ..., 'corpus': ...}}``,
        where ``model`` is the sub-topic LdaModel fitted on the documents of
        that main topic, and ``dictionary`` / ``corpus`` are the ones it was
        fitted with.
    """
    grouped_data = data.groupby(main_topic_col)[text_col].apply(list)
    results = {}

    for main_topic, texts in grouped_data.items():
        texts = [text.split() for text in texts]  # Split text into words
        dictionary = corpora.Dictionary(texts)  # Construct dictionary
        corpus = [dictionary.doc2bow(text) for text in texts]  # Construct corpus

        # LDA modeling for sub topics
        sub_lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                                 random_state=random_state)

        # Keep the model fitted for THIS main topic. Storing the notebook-level
        # `lda_model` here would hand every caller the main-topic model instead.
        results[main_topic] = {'model': sub_lda_model, 'dictionary': dictionary, 'corpus': corpus}

        print(f"\nMain Topic: {main_topic}")

        # The top words do not depend on lambda, so fetch them once per sub-topic.
        topic_words = [sub_lda_model.show_topic(i, topn=topn) for i in range(num_topics)]

        def score(word_prob, lambda_value):
            # NOTE(review): this blends the word's topic probability with its raw
            # document frequency (dictionary.dfs), which are on very different
            # scales, so it is not the relevance formula pyLDAvis applies for
            # lambda. Kept unchanged to preserve the printed rankings; revisit
            # before interpreting the lambda < 1 lists.
            word, prob = word_prob
            return lambda_value * prob + (1 - lambda_value) * dictionary.dfs[dictionary.token2id[word]]

        for lambda_value in [0, 0.5, 1]:  # Use previous method to get the top words
            top_words = []
            for words in topic_words:
                ranked = sorted(words, key=lambda x: score(x, lambda_value), reverse=True)
                top_words.append([word[0] for word in ranked])

            print(f"Lambda={lambda_value} Top {topn} Words for each sub-topic:")
            for i, words in enumerate(top_words):
                print(f"  Sub-Topic {i + 1}: {words}")
            print("\n")

    return results

results = secondary_topic_analysis(df)



Main Topic: 1
Lambda=0 Top 30 Words for each sub-topic:
  Sub-Topic 1: ['fire', 'nswfires', 'nsw', 'australiafires', 'bushfires', 'australia', 'bushfire', 'south', 'australianfires', 'new', 'vicfires', 'amp', 'auspol', 'victoria', 'australian', 'state', 'bushfirecrisis', 'road', 'across', 'grass', 'news', 'latest', 'australianbushfiredisaster', 'wildfire', 'forest', 'island', 'augustebaileys', 'bushfiresaustralia', 'tag', 'src']
  Sub-Topic 2: ['fire', 'nswfires', 'nsw', 'australiafires', 'bushfires', 'south', 'nswrfs', 'new', 'firefighter', 'vicfires', 'area', 'amp', 'emergency', 'australian', 'wale', 'road', 'update', 'day', 'mountain', 'bushfirecrisisaustralia', 'volunteer', 'green', 'valley', 'blue', 'creek', 'issued', 'incident', 'mount', 'buffalo', 'nug']
  Sub-Topic 3: ['fire', 'nswfires', 'nsw', 'australiafires', 'bushfires', 'australia', 'bushfire', 'south', 'australianfires', 'new', 'firefighter', 'amp', 'auspol', 'victoria', 'australian', 'bushfirecrisis', 'wale', 'via', 'c