In [None]:
# Download the ZIP file from Google Drive
!gdown --id 1muxtgFx3n0FNI3j9a_Y0sjpq_x612ull -O gensim_representation.zip

# Unzip the file
import zipfile
with zipfile.ZipFile("gensim_representation.zip", 'r') as zip_ref:
    zip_ref.extractall()

Downloading...
From: https://drive.google.com/uc?id=1muxtgFx3n0FNI3j9a_Y0sjpq_x612ull
To: /content/gensim_representation.zip
100% 25.7M/25.7M [00:00<00:00, 29.3MB/s]


In [None]:
# Download the ZIP file from Google Drive
!gdown --id 1JvhCzH4zgNzdvFu_9C-6yGEc1ELNy9-V -O toxic_gensim_representation.zip


# Unzip the file
import zipfile
with zipfile.ZipFile("toxic_gensim_representation.zip", 'r') as zip_ref:
    zip_ref.extractall()

Downloading...
From: https://drive.google.com/uc?id=1JvhCzH4zgNzdvFu_9C-6yGEc1ELNy9-V
To: /content/toxic_gensim_representation.zip
100% 1.80M/1.80M [00:00<00:00, 16.7MB/s]


In [None]:
from gensim.corpora import Dictionary
import joblib

# NOTE(review): joblib.load and Dictionary.load both unpickle their input,
# which can execute arbitrary code. Only run this cell on archives that come
# from a source you trust.

# Main artifacts: bag-of-words corpus and its id -> token dictionary
corpus = joblib.load('gensim_corpus.joblib')
dictionary = Dictionary.load('gensim_dictionary.dict')

# toxic_* artifacts: same pair of objects, loaded from the toxic_* files
toxic_corpus = joblib.load('toxic_gensim_corpus_bow.joblib')
toxic_dictionary = Dictionary.load('toxic_gensim_dictionary.dict')


In [None]:
def _bow_to_tokens(bow_docs, id2word):
    """Map every (token_id, count) pair of each document back to its token string.

    The count is ignored, so each pair contributes its token exactly once, in
    the order the pairs appear in the bag-of-words document.
    """
    token_docs = []
    for bow in bow_docs:
        token_docs.append([id2word[token_id] for token_id, _ in bow])
    return token_docs


# Token lists rebuilt from the bag-of-words corpora; passed as `texts` to the
# coherence computation further down.
texts = _bow_to_tokens(corpus, dictionary)
toxic_texts = _bow_to_tokens(toxic_corpus, toxic_dictionary)

In [None]:
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# Function to train and evaluate a single LDA model
def train_and_evaluate_lda(k, corpus, dictionary, texts, passes=5, iterations=10,
                           diversity_topn=10, random_state=42):
    """Train one LDA model with ``k`` topics and compute three quality metrics.

    Parameters
    ----------
    k : int
        Number of topics to fit; must be >= 1.
    corpus : iterable
        Bag-of-words corpus. It is used to train the model and is also the
        evaluation corpus for coherence and perplexity, so every metric is
        measured on the training data.
    dictionary : gensim.corpora.Dictionary
        Passed to ``LdaModel`` as ``id2word`` and to ``CoherenceModel``.
    texts : list of list of str
        Tokenized documents required by the ``'c_v'`` coherence measure.
    passes : int, default 5
        Forwarded to ``LdaModel``.
    iterations : int, default 10
        Forwarded to ``LdaModel``.
    diversity_topn : int, default 10
        Number of top words taken from each topic for the diversity score.
    random_state : int, default 42
        Seed forwarded to ``LdaModel``.

    Returns
    -------
    dict
        ``'k'``, ``'coherence'``, ``'perplexity'``, ``'diversity'`` and
        ``'lda_model'``. ``'perplexity'`` holds the raw return value of
        ``LdaModel.log_perplexity`` (a per-word likelihood bound, usually
        negative; higher is better), not the perplexity itself. gensim's
        perplexity estimate is ``2 ** (-bound)``.

    Raises
    ------
    ValueError
        If ``k`` or ``diversity_topn`` is smaller than 1.
    """
    # Fail fast: both values end up in the diversity denominator, and a
    # non-positive topic count cannot be trained anyway.
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k!r}")
    if diversity_topn < 1:
        raise ValueError(f"diversity_topn must be >= 1, got {diversity_topn!r}")

    print(f"\n=== Training LDA for k={k} ===")

    # Train LDA model
    print("Step 1/4: Training LDA model...")
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        passes=passes,
        iterations=iterations,
        random_state=random_state
    )
    print("LDA model training complete.")

    # Calculate Coherence
    print("Step 2/4: Calculating Coherence...")
    coherence_model = CoherenceModel(
        model=lda_model,
        corpus=corpus,
        texts=texts,  # Use the tokenized documents
        dictionary=dictionary,
        coherence='c_v'
    )
    coherence_score = coherence_model.get_coherence()
    print(f"Coherence score calculated: {coherence_score:.4f}")

    # Calculate Perplexity (see the Returns note: this is the log bound)
    print("Step 3/4: Calculating Perplexity...")
    perplexity = lda_model.log_perplexity(corpus)
    print(f"Perplexity calculated: {perplexity:.4f}")

    # Calculate Diversity: share of distinct words among the top
    # `diversity_topn` words of all k topics. num_words is passed explicitly
    # so the denominator cannot drift from what show_topics actually returns.
    print("Step 4/4: Calculating Diversity...")
    topics = lda_model.show_topics(num_topics=k, num_words=diversity_topn, formatted=False)
    unique_words = {
        word
        for _topic_id, top_words in topics
        for word, _weight in top_words
    }
    diversity = len(unique_words) / (k * diversity_topn)
    print(f"Diversity calculated: {diversity:.4f}")

    print(f"Finished for k={k}: Coherence={coherence_score:.4f}, Perplexity={perplexity:.4f}, Diversity={diversity:.4f}")

    return {
        'k': k,
        'coherence': coherence_score,
        'perplexity': perplexity,
        'diversity': diversity,
        'lda_model': lda_model
    }

In [None]:
# Fit and score the 5-topic model
results_k5 = train_and_evaluate_lda(
    k=5,
    corpus=corpus,
    dictionary=dictionary,
    texts=texts,
    passes=5,
    iterations=10,
)



=== Training LDA for k=5 ===
Step 1/4: Training LDA model...
LDA model training complete.
Step 2/4: Calculating Coherence...
Coherence score calculated: 0.7538
Step 3/4: Calculating Perplexity...
Perplexity calculated: -7.7267
Step 4/4: Calculating Diversity...
Diversity calculated: 0.8600
Finished for k=5: Coherence=0.7538, Perplexity=-7.7267, Diversity=0.8600


In [None]:
# Persist the 5-topic model. It is bound as `lda_model_k5` to match the naming
# of the other saved models (`lda_model_k10`, `toxic_lda_model_k5`, ...); the
# original `lda_model` name is kept as an alias for any later cell that uses it.
lda_model_k5 = lda_model = results_k5['lda_model']
lda_model_k5.save('lda_model_k5.gensim')


In [None]:
# Fit and score the 10-topic model
results_k10 = train_and_evaluate_lda(
    k=10,
    corpus=corpus,
    dictionary=dictionary,
    texts=texts,
    passes=5,
    iterations=10,
)



=== Training LDA for k=10 ===
Step 1/4: Training LDA model...
LDA model training complete.
Step 2/4: Calculating Coherence...
Coherence score calculated: 0.6798
Step 3/4: Calculating Perplexity...
Perplexity calculated: -7.7794
Step 4/4: Calculating Diversity...
Diversity calculated: 0.8300
Finished for k=10: Coherence=0.6798, Perplexity=-7.7794, Diversity=0.8300


In [None]:
# Keep a handle on the trained 10-topic model and write it to disk
lda_model_k10 = results_k10["lda_model"]
lda_model_k10.save("lda_model_k10.gensim")


### Toxic Section

In [None]:
# Fit and score the 10-topic model on the toxic_* corpus (20 iterations here)
toxic_results_k10 = train_and_evaluate_lda(
    k=10,
    corpus=toxic_corpus,
    dictionary=toxic_dictionary,
    texts=toxic_texts,
    passes=5,
    iterations=20,
)



=== Training LDA for k=10 ===
Step 1/4: Training LDA model...
LDA model training complete.
Step 2/4: Calculating Coherence...
Coherence score calculated: 0.4779
Step 3/4: Calculating Perplexity...
Perplexity calculated: -7.0605
Step 4/4: Calculating Diversity...
Diversity calculated: 0.9300
Finished for k=10: Coherence=0.4779, Perplexity=-7.0605, Diversity=0.9300


In [None]:
# Keep a handle on the trained 10-topic toxic_* model and write it to disk
toxic_lda_model_k10 = toxic_results_k10["lda_model"]
toxic_lda_model_k10.save("toxic_lda_model_k10.gensim")


In [None]:
# Fit and score the 5-topic model on the toxic_* corpus (20 iterations here)
toxic_results_k5 = train_and_evaluate_lda(
    k=5,
    corpus=toxic_corpus,
    dictionary=toxic_dictionary,
    texts=toxic_texts,
    passes=5,
    iterations=20,
)



=== Training LDA for k=5 ===
Step 1/4: Training LDA model...
LDA model training complete.
Step 2/4: Calculating Coherence...
Coherence score calculated: 0.4902
Step 3/4: Calculating Perplexity...
Perplexity calculated: -7.0486
Step 4/4: Calculating Diversity...
Diversity calculated: 0.9200
Finished for k=5: Coherence=0.4902, Perplexity=-7.0486, Diversity=0.9200


In [None]:
# Keep a handle on the trained 5-topic toxic_* model and write it to disk
toxic_lda_model_k5 = toxic_results_k5["lda_model"]
toxic_lda_model_k5.save("toxic_lda_model_k5.gensim")
