In [None]:
import os
from pathlib import Path

# --- ENVIRONMENT SWITCH ---
# True  -> running on a local machine with Google Drive Desktop mounted
# False -> running in the Google Colab cloud runtime
RUNNING_LOCALLY = True

if not RUNNING_LOCALLY:
    # Colab: Drive has to be mounted into the runtime before the project
    # root can be resolved.
    from google.colab import drive
    drive.mount('/content/drive')
    BASE_PATH = Path('/content/drive/MyDrive/AI Public Trust')
else:
    # Local: macOS mount point used by Google Drive Desktop
    BASE_PATH = Path('/Volumes/GoogleDrive/MyDrive/AI Public Trust')

# Project sub-directories shared across notebooks, all resolved under BASE_PATH.
# Raw inputs
twits_folder = BASE_PATH.joinpath('Raw Data/Twits/')
test_folder = BASE_PATH.joinpath('Raw Data/')
# Derived data sets
datasets_folder = BASE_PATH.joinpath('Data Sets')
cleanedds_folder = BASE_PATH.joinpath('Data Sets/Cleaned Data')
networks_folder = BASE_PATH.joinpath('Data Sets/Networks/')
# Reference material and trained models
literature_folder = BASE_PATH.joinpath('Literature/')
topic_models_folder = BASE_PATH.joinpath('Models/Topic Modeling/')


In [1]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import Phrases
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stopword corpus is available. Only download it when it
# is not already installed, so that re-running the notebook does not contact
# the NLTK server on every execution.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Sample data: four short documents used to demonstrate the pipeline
data = [
    "This is a sample document for topic modeling.",
    "Another document for topic modeling example.",
    "Gensim is a useful library for topic modeling.",
    "We will create bigrams for better topic modeling."
]

# Preprocess the data
def preprocess(texts):
    """
    Tokenize each document and remove English stopwords.

    Every item of `texts` is converted with `str()`, tokenized with gensim's
    `simple_preprocess`, and filtered against NLTK's English stopword list.

    Args:
        texts: Iterable of documents (any object `str()` accepts).

    Returns:
        A list with one list of remaining tokens per input document, in
        input order. An empty input yields an empty list.
    """
    # Use a set so each membership test is O(1); the list returned by NLTK
    # would otherwise be scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        for doc in texts
    ]

# Tokenize the sample documents and strip stopwords
data_processed = preprocess(data)

# Learn bigram statistics from the tokenized documents, then wrap the trained
# model in a Phraser that is applied to token lists later on.
# min_count and threshold decide which word pairs are merged; tune as needed.
# NOTE(review): in the recorded outputs data_bigrams is identical to
# data_processed, i.e. no pair was merged with these settings -- confirm
# whether that is intended for this sample.
bigram = Phrases(
    sentences=data_processed,
    min_count=2,
    threshold=10,
)
bigram_mod = gensim.models.phrases.Phraser(bigram)

def make_bigrams(texts):
    """
    Run every tokenized document through the module-level `bigram_mod`.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list holding, for each input document and in the same order,
        the result of `bigram_mod[document]`.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

# Apply the bigram model to every preprocessed document
data_bigrams = make_bigrams(data_processed)

# Create Dictionary: maps each token to an integer id
id2word = corpora.Dictionary(data_bigrams)

# Create Corpus: one bag-of-words (term frequency) entry per document
corpus = list(map(id2word.doc2bow, data_bigrams))

# Build LDA model
#   corpus / id2word  - bag-of-words documents and their id -> token mapping
#   num_topics        - number of topics to extract
#   random_state      - fixed seed for reproducibility
#   update_every      - update the model every `update_every` chunks
#   chunksize         - number of documents per training chunk
#   passes            - number of passes through the corpus during training
#   alpha             - document-topic density hyperparameter ('auto')
#   per_word_topics   - also compute the word-topic distribution
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    random_state=100,
    update_every=1,
    chunksize=10,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Print every topic together with its top weighted words
for idx, topic in lda_model.print_topics(num_topics=-1):
    print(f'Topic: {idx} \nWords: {topic}\n')


Topic: 0 
Words: 0.083*"topic" + 0.083*"modeling" + 0.083*"sample" + 0.083*"document" + 0.083*"example" + 0.083*"gensim" + 0.083*"library" + 0.083*"useful" + 0.083*"bigrams" + 0.083*"better"

Topic: 1 
Words: 0.187*"modeling" + 0.187*"topic" + 0.104*"useful" + 0.104*"library" + 0.104*"gensim" + 0.104*"sample" + 0.104*"document" + 0.021*"example" + 0.021*"bigrams" + 0.021*"create"

Topic: 2 
Words: 0.083*"topic" + 0.083*"modeling" + 0.083*"document" + 0.083*"gensim" + 0.083*"bigrams" + 0.083*"create" + 0.083*"library" + 0.083*"example" + 0.083*"sample" + 0.083*"useful"

Topic: 3 
Words: 0.173*"modeling" + 0.173*"topic" + 0.096*"document" + 0.096*"another" + 0.096*"better" + 0.096*"create" + 0.096*"bigrams" + 0.096*"example" + 0.019*"sample" + 0.019*"gensim"



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Inspect the tokenized, stopword-filtered documents returned by preprocess(data)
data_processed

[['sample', 'document', 'topic', 'modeling'],
 ['another', 'document', 'topic', 'modeling', 'example'],
 ['gensim', 'useful', 'library', 'topic', 'modeling'],
 ['create', 'bigrams', 'better', 'topic', 'modeling']]

In [6]:
# Inspect the trained Phrases model (the output is only the object's default repr)
bigram

<gensim.models.phrases.Phrases at 0x7836233964a0>

In [3]:
# Inspect the documents after the bigram model has been applied by make_bigrams
data_bigrams

[['sample', 'document', 'topic', 'modeling'],
 ['another', 'document', 'topic', 'modeling', 'example'],
 ['gensim', 'useful', 'library', 'topic', 'modeling'],
 ['create', 'bigrams', 'better', 'topic', 'modeling']]

In [4]:
# Inspect the gensim Dictionary built from data_bigrams (output is the object's default repr)
id2word

<gensim.corpora.dictionary.Dictionary at 0x7836233d5060>