In [None]:
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.models import Phrases
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Make sure the NLTK resources used below are available locally
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases read the Punkt tokenizer data from the 'punkt_tab'
# package instead of 'punkt'; fetching both keeps word_tokenize usable on
# either side of that change.
nltk.download('punkt_tab')

# Load the dataset
df = pd.read_csv('express_multiple.csv', encoding='utf-8')

# Select the target documents from the second column.
# Missing cells are replaced with '' first: astype(str) alone would turn NaN
# into the literal string 'nan', which would then be tokenized as a word.
# Rows are kept (not dropped) so the documents stay row-aligned with df.
texts = df.iloc[:, 1].fillna('').astype(str)

# Preprocess the text data
# Load French stopwords
stop_words = stopwords.words('french')
# preprocess() lowercases the text before the stopword check, so the elided
# article must be listed as 'l' to ever match; the original 'L' entry is kept
# so the list remains a superset of what it was.
stop_words.extend(['»', '«', '’', 'L', 'l', 'a'])

# Tokenize and remove stopwords
def preprocess(text):
    """Lowercase and tokenize ``text``, keeping alphabetic non-stopword tokens.

    Args:
        text: Raw document string.

    Returns:
        list[str]: Lowercased tokens, in order of appearance, for which
        ``str.isalpha()`` is true and which are not in the module-level
        ``stop_words`` collection.
    """
    # Build a set once per document: membership tests are then constant-time
    # instead of scanning the whole stopword list for every token.
    blocked = set(stop_words)

    # The pipeline filters French stopwords, so use the French Punkt model for
    # sentence splitting rather than word_tokenize's English default.
    tokens = word_tokenize(text.lower(), language='french')

    # NOTE(review): any token containing an apostrophe or hyphen fails
    # isalpha() and is dropped entirely - confirm this is intended for
    # elided French forms.
    return [word for word in tokens if word.isalpha() and word not in blocked]

# Turn every document into its list of cleaned tokens
tokenized_texts = texts.apply(preprocess)

# Learn which adjacent token pairs should be treated as phrases, then wrap
# the trained model in a Phraser for applying it to documents
bigram = Phrases(
    sentences=tokenized_texts,
    min_count=5,
    threshold=100,
)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Apply the bigram model to each document
def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phraser to each document.

    Args:
        texts: Iterable of tokenized documents.

    Returns:
        list: One ``bigram_mod[doc]`` result per input document, in order.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

# Run the phraser over the tokenized documents
bigram_texts = make_bigrams(texts=tokenized_texts)

# Topic-modeling inputs: the token <-> id dictionary, and each document
# converted to its bag-of-words representation
id2word = corpora.Dictionary(documents=bigram_texts)
corpus = list(map(id2word.doc2bow, bigram_texts))

# Train the LDA topic model on the bag-of-words corpus
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    alpha='auto',
    passes=10,
    chunksize=100,
    update_every=1,
    per_word_topics=True,
    random_state=100,  # fixed seed
)

# Show every topic (-1 requests all of them) together with its word summary
for idx, topic in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [None]:
# Function to print the number of documents in the corpus
def print_corpus_documents_count(corpus):
    """Print how many documents ``corpus`` contains.

    Args:
        corpus: Any sized collection of documents; its ``len()`` is reported.
    """
    n_documents = len(corpus)
    print(f"The number of documents in the corpus is: {n_documents}")
# Report the size of the bag-of-words corpus built above
print_corpus_documents_count(corpus=corpus)

In [None]:
# Function to print the topic distribution for each document
def print_topic_distribution_for_each_document(lda_model, corpus):
    """Print the topic distribution of every document, one line per document.

    Args:
        lda_model: Object providing ``get_document_topics(bow)``.
        corpus: Iterable of documents passed one at a time to the model.
    """
    position = 0
    for bow in corpus:
        distribution = lda_model.get_document_topics(bow)
        print(f"Document {position} topic distribution: {distribution}")
        position += 1

# Print the per-document topic distributions for the whole corpus
print_topic_distribution_for_each_document(lda_model=lda_model, corpus=corpus)

In [None]:
# Function to create a new CSV document with topic distributions
def append_topic_distribution_to_csv(lda_model, corpus, df, output_filename):
    """Save ``df`` plus one ``Topic_<id>`` probability column per topic as CSV.

    Args:
        lda_model: Object exposing
            ``get_document_topics(bow, minimum_probability=...)`` that returns
            ``(topic, probability)`` pairs for one document.
        corpus: Iterable of bag-of-words documents, one per row of ``df`` and
            in the same order.
        df: DataFrame whose rows correspond to the documents in ``corpus``.
            It is not modified.
        output_filename: Path of the CSV file to write (UTF-8, no index column).

    Raises:
        ValueError: If ``corpus`` and ``df`` do not have the same number of
            rows, since the topic columns could not be matched to documents.
    """
    # Get the topic distribution for each document
    topic_dist_list = [
        lda_model.get_document_topics(bow, minimum_probability=0) for bow in corpus
    ]

    # A silent outer join on mismatched lengths would attach topic rows to the
    # wrong documents (or to empty rows), so fail loudly instead.
    if len(topic_dist_list) != len(df):
        raise ValueError(
            f"corpus has {len(topic_dist_list)} documents but df has {len(df)} rows; "
            "they must match one-to-one"
        )

    # One row per document, one column per topic id
    topic_dist_df = pd.DataFrame([dict(doc) for doc in topic_dist_list])

    # Rename columns to reflect topic names
    topic_dist_df.columns = [f'Topic_{col}' for col in topic_dist_df.columns]

    # A topic absent from a document's distribution counts as probability 0.
    # Only the topic columns are filled, so missing values in the original
    # data are not overwritten with 0.
    topic_dist_df = topic_dist_df.fillna(0)

    # Combine by row position rather than by index label: df may carry a
    # non-default index, and the index is not written to the CSV anyway.
    merged_df = pd.concat(
        [df.reset_index(drop=True), topic_dist_df.reset_index(drop=True)],
        axis=1,
    )

    # Save to a new CSV file
    merged_df.to_csv(output_filename, index=False, encoding='utf-8')
    print(f"New CSV with topic distributions saved as '{output_filename}'")

# Write the original rows together with their topic probabilities to a new CSV
append_topic_distribution_to_csv(
    lda_model=lda_model,
    corpus=corpus,
    df=df,
    output_filename='express_multiple_with_topics.csv',
)