# In [None]:
# Script 1: Nested LDA Without Lemmatization

import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora, models
import nltk

# Ensure the NLTK resources used below are available locally.
# 'punkt_tab' is the tokenizer data word_tokenize loads on recent NLTK
# releases; 'punkt' is still fetched for older installations.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def process_csvs_for_nested_lda(files_info, output_csv_path, num_topics=5, num_subtopics=3, num_words=10):
    """Fit a two-level ("nested") LDA per CSV file and save all subtopics to one CSV.

    For every file, the ``text`` column is lower-cased, tokenised, and stripped
    of stop words and non-alphabetic tokens. A main LDA model is trained on
    the result, each document is assigned to its most probable main topic, and
    a second LDA model is trained on the documents of each main topic. The
    top words of every subtopic become one row of the output CSV.

    Args:
        files_info: Iterable of dicts with the keys ``'file_path'`` (CSV file
            to read; it must contain a ``text`` column) and ``'language'``
            (stop-word list name passed to ``stopwords.words``).
        output_csv_path: Path of the CSV that receives the combined results.
        num_topics: Number of main topics fitted per file.
        num_subtopics: Number of subtopics fitted inside each main topic.
        num_words: Number of top words reported per subtopic.

    Returns:
        None. The results are written to ``output_csv_path`` with the columns
        ``File``, ``Language``, ``Main Topic``, ``Subtopic`` and ``Words``.
        Files or main topics that contain no usable tokens are skipped with a
        printed notice instead of aborting the whole run.
    """
    columns = ['File', 'Language', 'Main Topic', 'Subtopic', 'Words']
    all_topics_data = []

    for file_info in files_info:
        file_path, language = file_info['file_path'], file_info['language']
        df = pd.read_csv(file_path)

        # A set makes the per-token stop-word test O(1) instead of a list scan.
        stop_words = set(stopwords.words(language))

        # NOTE(review): word_tokenize is called without a language argument,
        # so the tokenizer's default language is used for every file.
        texts = df['text'].dropna().astype(str).str.lower().apply(word_tokenize)
        texts = texts.apply(
            lambda tokens: [word for word in tokens if word not in stop_words and word.isalpha()]
        )

        dictionary = corpora.Dictionary(texts)
        if len(dictionary) == 0:
            # LdaModel cannot be trained on a vocabulary without terms.
            print(f"Skipping {file_path}: no usable tokens found in the 'text' column")
            continue
        corpus = [dictionary.doc2bow(text) for text in texts]

        lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=42)

        # Group the token lists by the main topic with the highest probability.
        topic_docs = {i: [] for i in range(num_topics)}
        for tokens, topic_probs in zip(texts, lda_model[corpus]):
            if not topic_probs:
                # No topic passed the model's minimum probability threshold.
                continue
            # max() keeps the first topic on ties, like a stable descending sort.
            dominant_topic = max(topic_probs, key=lambda pair: pair[1])[0]
            topic_docs[dominant_topic].append(tokens)

        for topic_num, docs in topic_docs.items():
            sub_dictionary = corpora.Dictionary(docs)
            if len(sub_dictionary) == 0:
                # A main topic may end up with no documents (or only empty
                # ones); a sub-model cannot be trained for it.
                print(f"Skipping main topic {topic_num + 1} of {file_path}: no documents with usable tokens")
                continue
            sub_corpus = [sub_dictionary.doc2bow(text) for text in docs]
            sub_lda_model = models.LdaModel(sub_corpus, num_topics=num_subtopics, id2word=sub_dictionary, passes=10, random_state=42)

            # Accumulate topics data for CSV output. Iterating over the topic
            # ids directly covers every subtopic, however many were requested.
            for subtopic_num in range(num_subtopics):
                words = ', '.join(word for word, _ in sub_lda_model.show_topic(subtopic_num, topn=num_words))
                all_topics_data.append({
                    'File': file_path,
                    'Language': language,
                    'Main Topic': topic_num + 1,
                    'Subtopic': subtopic_num + 1,
                    'Words': words
                })

    # Writing the result to a CSV file outside the loop; explicit columns keep
    # the header even when no rows were produced.
    topics_df = pd.DataFrame(all_topics_data, columns=columns)
    topics_df.to_csv(output_csv_path, index=False)
    print(f"All topics saved to {output_csv_path}")

# Input CSV files paired with the language of their text.
files_info = [
    dict(file_path=csv_name, language=csv_language)
    for csv_name, csv_language in (
        ('file1.csv', 'english'),
        ('file2.csv', 'russian'),
    )
]

# Run the nested LDA over every listed file and store the combined result.
output_csv_path = 'nested_lda_results.csv'
process_csvs_for_nested_lda(files_info=files_info, output_csv_path=output_csv_path)



# In [None]:
# Script 2: Nested LDA With Lemmatization

import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
import nltk

# Ensure the NLTK resources used below are available locally.
# 'punkt_tab' is the tokenizer data word_tokenize loads on recent NLTK
# releases; 'punkt' is still fetched for older installations.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

def process_csvs_for_nested_lda_with_lemmatization(files_info, output_csv_path, num_topics=5, num_subtopics=3, num_words=10):
    """Fit a two-level ("nested") LDA on lemmatized text and save all subtopics to one CSV.

    For every file, the ``text`` column is lower-cased, tokenised, stripped of
    stop words and non-alphabetic tokens, and the remaining tokens are passed
    through ``WordNetLemmatizer.lemmatize``. A main LDA model is trained on
    the result, each document is assigned to its most probable main topic, and
    a second LDA model is trained on the documents of each main topic. The
    top words of every subtopic become one row of the output CSV.

    Args:
        files_info: Iterable of dicts with the keys ``'file_path'`` (CSV file
            to read; it must contain a ``text`` column) and ``'language'``
            (stop-word list name passed to ``stopwords.words``).
        output_csv_path: Path of the CSV that receives the combined results.
        num_topics: Number of main topics fitted per file.
        num_subtopics: Number of subtopics fitted inside each main topic.
        num_words: Number of top words reported per subtopic.

    Returns:
        None. The results are written to ``output_csv_path`` with the columns
        ``File``, ``Language``, ``Main Topic``, ``Subtopic`` and ``Words``.
        Files or main topics that contain no usable tokens are skipped with a
        printed notice instead of aborting the whole run.
    """
    # NOTE(review): the same WordNet lemmatizer is applied to every file
    # regardless of its 'language'; confirm this is intended for non-English text.
    lemmatizer = WordNetLemmatizer()
    columns = ['File', 'Language', 'Main Topic', 'Subtopic', 'Words']
    all_topics_data = []

    for file_info in files_info:
        file_path, language = file_info['file_path'], file_info['language']
        df = pd.read_csv(file_path)

        # A set makes the per-token stop-word test O(1) instead of a list scan.
        stop_words = set(stopwords.words(language))

        # NOTE(review): word_tokenize is called without a language argument,
        # so the tokenizer's default language is used for every file.
        texts = df['text'].dropna().astype(str).str.lower().apply(word_tokenize)
        # Stop words are filtered on the surface form, before lemmatization.
        texts = texts.apply(
            lambda tokens: [
                lemmatizer.lemmatize(word)
                for word in tokens
                if word not in stop_words and word.isalpha()
            ]
        )

        dictionary = corpora.Dictionary(texts)
        if len(dictionary) == 0:
            # LdaModel cannot be trained on a vocabulary without terms.
            print(f"Skipping {file_path}: no usable tokens found in the 'text' column")
            continue
        corpus = [dictionary.doc2bow(text) for text in texts]

        lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=42)

        # Group the token lists by the main topic with the highest probability.
        topic_docs = {i: [] for i in range(num_topics)}
        for tokens, topic_probs in zip(texts, lda_model[corpus]):
            if not topic_probs:
                # No topic passed the model's minimum probability threshold.
                continue
            # max() keeps the first topic on ties, like a stable descending sort.
            dominant_topic = max(topic_probs, key=lambda pair: pair[1])[0]
            topic_docs[dominant_topic].append(tokens)

        for topic_num, docs in topic_docs.items():
            sub_dictionary = corpora.Dictionary(docs)
            if len(sub_dictionary) == 0:
                # A main topic may end up with no documents (or only empty
                # ones); a sub-model cannot be trained for it.
                print(f"Skipping main topic {topic_num + 1} of {file_path}: no documents with usable tokens")
                continue
            sub_corpus = [sub_dictionary.doc2bow(text) for text in docs]
            sub_lda_model = models.LdaModel(sub_corpus, num_topics=num_subtopics, id2word=sub_dictionary, passes=10, random_state=42)

            # Accumulate topics data for CSV output. Iterating over the topic
            # ids directly covers every subtopic, however many were requested.
            for subtopic_num in range(num_subtopics):
                words = ', '.join(word for word, _ in sub_lda_model.show_topic(subtopic_num, topn=num_words))
                all_topics_data.append({
                    'File': file_path,
                    'Language': language,
                    'Main Topic': topic_num + 1,
                    'Subtopic': subtopic_num + 1,
                    'Words': words
                })

    # Writing the result to a CSV file outside the loop; explicit columns keep
    # the header even when no rows were produced.
    topics_df = pd.DataFrame(all_topics_data, columns=columns)
    topics_df.to_csv(output_csv_path, index=False)
    print(f"All topics saved to {output_csv_path}")

# Input CSV files paired with the language of their text.
files_info = [
    dict(file_path=csv_name, language=csv_language)
    for csv_name, csv_language in (
        ('file1.csv', 'english'),
        ('file2.csv', 'russian'),
    )
]

# Run the lemmatized nested LDA over every listed file and store the combined result.
output_csv_path = 'nested_lda_lemmatized_results.csv'
process_csvs_for_nested_lda_with_lemmatization(files_info=files_info, output_csv_path=output_csv_path)
