In [1]:
# Import Libraries
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel

# Download NLTK Resources
# Fetch, in order, the data packages the NLTK stopword list, tokenizer and
# lemmatizer imported above rely on.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# Load the Data
df = pd.read_csv('npr.csv')
documents = df.loc[:, 'Article'].tolist()  # One entry per CSV row, taken from the 'Article' column

# Preprocess the Data
stop_words = {*stopwords.words('english')}  # English stopwords as a set for fast membership tests
lemmatizer = WordNetLemmatizer()  # Shared WordNet lemmatizer used by preprocess_text

def preprocess_text(text):
    """Tokenize, clean and lemmatize a single article.

    Args:
        text: Raw article text. Non-string values (for example the float NaN
            that pandas yields for an empty CSV cell) are treated as having
            no content.

    Returns:
        A list of lowercase tokens that are alphanumeric and not in
        ``stop_words``, each passed through ``lemmatizer.lemmatize``.
        An empty list is returned for non-string input.
    """
    if not isinstance(text, str):
        # Non-strings have no .lower(); produce no tokens instead of raising.
        return []
    # Single pass: drop punctuation-like tokens and stopwords, then lemmatize.
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]

# Tokenize and normalize every article
preprocessed_documents = list(map(preprocess_text, documents))

# Create document-term matrix
dictionary = corpora.Dictionary(preprocessed_documents)
# Keep only tokens found in at least 15 documents and in no more than 50% of them
dictionary.filter_extremes(no_below=15, no_above=0.5)
# One bag-of-words vector per preprocessed document
corpus = list(map(dictionary.doc2bow, preprocessed_documents))

# Run LDA
# Fit a 5-topic model, making 15 passes over the corpus
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, passes=15)

# Interpret Results
# Label each article with the id of its most probable topic. The bag-of-words
# vectors already stored in `corpus` are reused instead of being rebuilt with
# dictionary.doc2bow for every document.
article_labels = [
    max(lda_model.get_document_topics(bow), key=lambda topic_prob: topic_prob[1])[0]
    for bow in corpus
]

df_result = pd.DataFrame({"Article": documents, "Topic": article_labels})  # One row per article
print("Table with Articles and Topic:")
print(df_result)
print()

# Print top terms for each topic
print("Top Terms for Each Topic:")
# show_topics(formatted=False) returns (topic_id, [(word, weight), ...]) pairs,
# so the 'weight*"word" + ...' string does not have to be split apart again.
# num_topics=20, num_words=10 and log=True mirror what print_topics() uses.
for idx, word_weights in lda_model.show_topics(
    num_topics=20, num_words=10, log=True, formatted=False
):
    print(f"Topic {idx}:")
    for word, weight in word_weights:
        # Same rendering as before: quoted word, weight with three decimals.
        print(f'- "{word}" (weight: {weight:.3f})')
    print()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Table with Articles and Topic:
                                                 Article  Topic
0      In the Washington of 2016, even when the polic...      2
1        Donald Trump has used Twitter  —   his prefe...      2
2        Donald Trump is unabashedly praising Russian...      2
3      Updated at 2:50 p. m. ET, Russian President Vl...      2
4      From photography, illustration and video, to d...      3
...                                                  ...    ...
11987  The number of law enforcement officers shot an...      0
11988    Trump is busy these days with victory tours,...      2
11989  It’s always interesting for the Goats and Soda...      1
11990  The election of Donald Trump was a surprise to...      2
11991  Voters in the English city of Sunderland did s...      4

[11992 rows x 2 columns]

Top Terms for Each Topic:
Topic 0:
- "state" (weight: 0.012)
- "law" (weight: 0.010)
- "school" (weight: 0.009)
- "court" (weight: 0.007)
- "student" (weight: 0.007)
- "feder