# In [None]:
import pandas as pd
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
import re

# Load the dataset to annotate (replace the file name with the actual input file)
input_csv = "Preprocessed_Dataset_Misinfo_TRUE.csv"

# Read the CSV and attach a "topics" column that starts out as an empty
# string on every row; it will hold the extracted topic keywords.
df = pd.read_csv(input_csv).assign(topics="")

def tokenize(text):
    """Turn *text* into a list of tokens for topic modeling.

    Delegates to gensim's ``simple_preprocess`` with its default settings
    and returns whatever token list that call produces.
    """
    tokens = simple_preprocess(text)
    return tokens

def extract_topic_names(topic_string):
    """Collect the double-quoted terms contained in *topic_string*.

    The terms are returned as a list of strings, without their quotes, in
    the order they appear. Each match is non-greedy, so it ends at the
    nearest closing quote. A string with no quoted term gives ``[]``.
    """
    quoted_term = r'"(.*?)"'
    return [match.group(1) for match in re.finditer(quoted_term, topic_string)]

# Number of LDA topics fitted for each row (adjust as needed) and the maximum
# number of distinct keywords stored per row.
num_topics = 5
max_keywords = 5

# Fit a separate LDA model on the "text" value of every row and store up to
# ``max_keywords`` distinct topic keywords in that row's "topics" cell.
# NOTE(review): each model is trained on a one-document corpus and no
# ``random_state`` is passed to LdaModel, so the keywords chosen for a row
# may differ between runs -- confirm this is acceptable.
for index, text in df["text"].items():
    # A missing value would otherwise be stringified to "nan" and produce a
    # bogus "nan" keyword; leave the row's "topics" cell empty instead.
    if pd.isna(text):
        continue

    # Tokenize the text for topic modeling
    tokenized_text = tokenize(str(text))
    if not tokenized_text:
        # Nothing left after tokenization: keep the empty "topics" value.
        continue

    # Create the dictionary and the corpus for the LDA model
    dictionary = corpora.Dictionary([tokenized_text])
    corpus = [dictionary.doc2bow(tokenized_text)]

    # Apply the LDA model
    lda_model = models.LdaModel(
        corpus, num_topics=num_topics, id2word=dictionary, passes=10
    )

    # Each entry of print_topics() is a pair whose second element is the
    # topic's textual description; the quoted terms in it are the keywords.
    topics = lda_model.print_topics()

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # stored keywords follow the order in which the model reported them
    # (a set would give an arbitrary order that can change between runs).
    unique_topic_names = dict.fromkeys(
        topic_name
        for _, topic_string in topics
        for topic_name in extract_topic_names(topic_string)
    )

    df.at[index, "topics"] = ", ".join(list(unique_topic_names)[:max_keywords])

# Save the output as a new CSV file
output_csv = "Top5-topic_modeling_Preprocessed_Dataset_Misinfo_TRUE.csv"
df.to_csv(output_csv, index=False)