In [4]:
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.models import Phrases
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Ensure that the French stopwords are downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Load the dataset
df = pd.read_csv('express_multiple.csv', encoding='utf-8')

# Select the target documents from the second column
texts = df.iloc[:, 1].astype(str)

# Preprocess the text data
# Load French stopwords
stop_words = stopwords.words('french')
stop_words.extend(['»', '«', '’', 'L', 'a'])

# Tokenize and remove stopwords
def preprocess(text):
    return [word for word in word_tokenize(text.lower()) if word.isalpha() and word not in stop_words]

# Apply preprocessing to each document
tokenized_texts = texts.apply(preprocess)

# Build the bigram model
bigram = Phrases(tokenized_texts, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Apply the bigram model to each document
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

# Apply the bigram model to the tokenized texts
bigram_texts = make_bigrams(tokenized_texts)

# Create Dictionary and Corpus needed for Topic Modeling
id2word = corpora.Dictionary(bigram_texts)
corpus = [id2word.doc2bow(text) for text in bigram_texts]

# Build LDA model
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=20,
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

# Print the topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jerem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jerem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Topic: 0 
Words: 0.005*"eugène" + 0.005*"naviguer" + 0.003*"radeau" + 0.003*"corriveau" + 0.003*"misogynie" + 0.000*"dolan" + 0.000*"frères_sœurs" + 0.000*"charland" + 0.000*"maillot_bain" + 0.000*"dubé"
Topic: 1 
Words: 0.004*"inquiets" + 0.003*"sénatrices" + 0.002*"mofif" + 0.001*"forum_leaders" + 0.001*"immigrantes" + 0.001*"laïla" + 0.001*"quinquennal" + 0.001*"liaisons" + 0.001*"lussier" + 0.001*"chanteuse"
Topic: 2 
Words: 0.000*"plus" + 0.000*"canada" + 0.000*"aussi" + 0.000*"deux" + 0.000*"où" + 0.000*"français" + 0.000*"ontario" + 0.000*"services" + 0.000*"milliards" + 0.000*"francophones"
Topic: 3 
Words: 0.094*"femmes" + 0.019*"violence" + 0.014*"filles" + 0.013*"album" + 0.012*"hommes" + 0.007*"violence_faite" + 0.007*"cycle" + 0.007*"étudiantes" + 0.006*"féminine" + 0.006*"cheminement"
Topic: 4 
Words: 0.047*"haïti" + 0.015*"paix" + 0.013*"pays" + 0.011*"haïtiens" + 0.011*"sécurité" + 0.010*"ukraine" + 0.009*"crise" + 0.009*"haïtien" + 0.008*"mission" + 0.008*"humanitaire"

In [5]:
# Function to print the number of documents in the corpus
def print_corpus_documents_count(corpus):
    print(f"The number of documents in the corpus is: {len(corpus)}")
# Call the function to print the number of documents
print_corpus_documents_count(corpus)

The number of documents in the corpus is: 600


In [6]:
# Function to print the topic distribution for each document
def print_topic_distribution_for_each_document(lda_model, corpus):
    for i, row in enumerate(corpus):
        doc_topics = lda_model.get_document_topics(row)
        print(f"Document {i} topic distribution: {doc_topics}")

# Call the function to print the topic distribution for each document
print_topic_distribution_for_each_document(lda_model, corpus)

Document 0 topic distribution: [(8, 0.033576354), (9, 0.21930566), (10, 0.7447521)]
Document 1 topic distribution: [(8, 0.227497), (9, 0.74823517)]
Document 2 topic distribution: [(4, 0.060526922), (5, 0.060322363), (8, 0.17765626), (9, 0.42725593), (10, 0.012820101), (19, 0.26084876)]
Document 3 topic distribution: [(4, 0.090117596), (5, 0.11487318), (9, 0.39446643), (10, 0.1065389), (19, 0.29126847)]
Document 4 topic distribution: [(3, 0.025630837), (8, 0.72213364), (9, 0.22890723), (10, 0.01881466)]
Document 5 topic distribution: [(8, 0.06722723), (9, 0.73549175), (10, 0.061472617), (19, 0.13339427)]
Document 6 topic distribution: [(9, 0.89265525), (10, 0.016108464), (12, 0.048601728), (19, 0.025369428)]
Document 7 topic distribution: [(3, 0.033963483), (8, 0.46474242), (9, 0.44768062), (10, 0.024188755), (12, 0.013151686)]
Document 8 topic distribution: [(5, 0.27907035), (9, 0.5107835), (10, 0.17830426), (19, 0.02194306)]
Document 9 topic distribution: [(8, 0.033211574), (9, 0.3640

In [35]:
# Function to create a new CSV document with topic distributions
def append_topic_distribution_to_csv(lda_model, corpus, df, output_filename):
    # Get the topic distribution for each document
    topic_dist_list = [lda_model.get_document_topics(bow, minimum_probability=0) for bow in corpus]
    
    # Create a DataFrame
    topic_dist_df = pd.DataFrame([{topic: prob for topic, prob in doc} for doc in topic_dist_list])
    
    # Rename columns to reflect topic names
    topic_dist_df.columns = [f'Topic_{col}' for col in topic_dist_df.columns]
    
    # Merge with the original DataFrame
    merged_df = pd.concat([df, topic_dist_df], axis=1)
    
    # Fill NaN values with 0 (documents may not have all topics)
    merged_df = merged_df.fillna(0)
    
    # Save to a new CSV file
    merged_df.to_csv(output_filename, index=False, encoding='utf-8')
    print(f"New CSV with topic distributions saved as '{output_filename}'")

# Call the function to create a new CSV with topic distributions
append_topic_distribution_to_csv(lda_model, corpus, df, 'express_multiple_with_topics.csv')

New CSV with topic distributions saved as 'express_multiple_with_topics.csv'
