In [1]:
import pandas as pd
df = pd.read_csv('data//All_tweets_public.csv')


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords if not already downloaded
nltk.download('stopwords')

# Custom stop words
custom_stopwords = set(stopwords.words('english'))
additional_stopwords = {'xexxa', 'ba', 'bwildfire', 'bcalifornia', 'itxexxs', 'bthe'}  # Add your custom stop words here
custom_stopwords.update(additional_stopwords)

# Function to preprocess the text data
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords including custom ones
    text = ' '.join([word for word in text.split() if word not in custom_stopwords])
    return text


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\msarv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Preprocess the 'Text' column
df['Processed_Text'] = df['Text'].apply(preprocess_text)


In [4]:
# Create a document-term matrix
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(df['Processed_Text'])


In [5]:

# Perform LDA topic modeling
lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(dtm)


In [None]:
# Function to display topics
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}:")
        print(", ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

# Display the top 10 words for each topic
no_top_words = 10
display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)

Topic 1:
wildfire california oregon trump bcalifornia science lost like relief climate
Topic 2:
wildfire smoke west air coast wildfires bwildfire quality bthe sky
Topic 3:
forest fires climate wildfire change management national years amp burning
Topic 4:
wildfire gender reveal california party oregon started starting county sparked
Topic 5:
wildfire like smoke forest people bi covid itxexxs know spread
