In [31]:
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.models import Phrases
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Ensure that the French stopwords are downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Load the dataset
df = pd.read_csv('express_multiple.csv', encoding='utf-8')

# Select the target documents from the second column
texts = df.iloc[:, 1].astype(str)

# Preprocess the text data
# Load French stopwords
stop_words = set(stopwords.words('french'))

# Tokenize and remove stopwords
def preprocess(text):
    return [word for word in word_tokenize(text.lower()) if word.isalpha() and word not in stop_words]

# Apply preprocessing to each document
tokenized_texts = texts.apply(preprocess)

# Build the bigram model
bigram = Phrases(tokenized_texts, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Apply the bigram model to each document
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

# Apply the bigram model to the tokenized texts
bigram_texts = make_bigrams(tokenized_texts)

# Create Dictionary and Corpus needed for Topic Modeling
id2word = corpora.Dictionary(bigram_texts)
corpus = [id2word.doc2bow(text) for text in bigram_texts]

# Build LDA model
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=5,
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

# Print the topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jerem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jerem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Topic: 0 
Words: 0.014*"plus" + 0.010*"a" + 0.008*"français" + 0.005*"comme" + 0.004*"tout" + 0.004*"view_all" + 0.003*"québec" + 0.003*"fait" + 0.003*"toronto" + 0.003*"vie"
Topic: 1 
Words: 0.012*"plus" + 0.008*"a" + 0.008*"français" + 0.006*"roman" + 0.005*"comme" + 0.004*"femmes" + 0.003*"tout" + 0.003*"view_all" + 0.003*"être" + 0.003*"langue_française"
Topic: 2 
Words: 0.018*"a" + 0.013*"plus" + 0.006*"canada" + 0.005*"gouvernement" + 0.004*"ontario" + 0.004*"cette" + 0.004*"aussi" + 0.004*"français" + 0.004*"médias" + 0.003*"toronto"
Topic: 3 
Words: 0.017*"a" + 0.012*"plus" + 0.012*"français" + 0.006*"francophones" + 0.005*"canada" + 0.004*"langues_officielles" + 0.004*"photo" + 0.004*"cette" + 0.004*"être" + 0.004*"faire"
Topic: 4 
Words: 0.018*"haïti" + 0.005*"pays" + 0.004*"paix" + 0.004*"haïtiens" + 0.003*"crise" + 0.003*"haïtien" + 0.003*"annik" + 0.003*"sécurité" + 0.002*"haïtienne" + 0.002*"depuis"


In [33]:
# Function to print the number of documents in the corpus
def print_corpus_documents_count(corpus):
    print(f"The number of documents in the corpus is: {len(corpus)}")
# Call the function to print the number of documents
print_corpus_documents_count(corpus)

The number of documents in the corpus is: 600


In [34]:
# Function to print the topic distribution for each document
def print_topic_distribution_for_each_document(lda_model, corpus):
    for i, row in enumerate(corpus):
        doc_topics = lda_model.get_document_topics(row)
        print(f"Document {i} topic distribution: {doc_topics}")

# Call the function to print the topic distribution for each document
print_topic_distribution_for_each_document(lda_model, corpus)

Document 0 topic distribution: [(0, 0.16333236), (1, 0.78668076), (2, 0.03463352), (3, 0.015275525)]
Document 1 topic distribution: [(0, 0.77178234), (2, 0.035547264), (3, 0.18846257)]
Document 2 topic distribution: [(0, 0.8390507), (2, 0.077514745), (3, 0.071998194), (4, 0.011177757)]
Document 3 topic distribution: [(0, 0.16915865), (2, 0.098451644), (3, 0.6229232), (4, 0.10936245)]
Document 4 topic distribution: [(0, 0.17747846), (2, 0.6098627), (3, 0.20761257)]
Document 5 topic distribution: [(2, 0.03887446), (3, 0.96080595)]
Document 6 topic distribution: [(3, 0.9922422)]
Document 7 topic distribution: [(0, 0.84072757), (3, 0.15836702)]
Document 8 topic distribution: [(1, 0.9979362)]
Document 9 topic distribution: [(0, 0.06474725), (2, 0.85136557), (3, 0.08374342)]
Document 10 topic distribution: [(0, 0.6306333), (1, 0.018095763), (3, 0.3482736)]
Document 11 topic distribution: [(1, 0.05639665), (3, 0.9423714)]
Document 12 topic distribution: [(0, 0.9980334)]
Document 13 topic dist

In [35]:
# Function to create a new CSV document with topic distributions
def append_topic_distribution_to_csv(lda_model, corpus, df, output_filename):
    # Get the topic distribution for each document
    topic_dist_list = [lda_model.get_document_topics(bow, minimum_probability=0) for bow in corpus]
    
    # Create a DataFrame
    topic_dist_df = pd.DataFrame([{topic: prob for topic, prob in doc} for doc in topic_dist_list])
    
    # Rename columns to reflect topic names
    topic_dist_df.columns = [f'Topic_{col}' for col in topic_dist_df.columns]
    
    # Merge with the original DataFrame
    merged_df = pd.concat([df, topic_dist_df], axis=1)
    
    # Fill NaN values with 0 (documents may not have all topics)
    merged_df = merged_df.fillna(0)
    
    # Save to a new CSV file
    merged_df.to_csv(output_filename, index=False, encoding='utf-8')
    print(f"New CSV with topic distributions saved as '{output_filename}'")

# Call the function to create a new CSV with topic distributions
append_topic_distribution_to_csv(lda_model, corpus, df, 'express_multiple_with_topics.csv')

New CSV with topic distributions saved as 'express_multiple_with_topics.csv'
