
# Topic Modeling with Latent Dirichlet Allocation (LDA)
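This project applies topic modeling with Latent Dirichlet Allocation (LDA) to extract topics from a dataset of articles (`npr.csv`). The workflow covers loading and cleaning the data, vectorizing the text into a document-term matrix, training the LDA model, inspecting the top words of each topic, and assigning each article its dominant topic.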


In [None]:

# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
    

In [None]:

# Read the article corpus from the CSV file into a DataFrame.
npr = pd.read_csv(filepath_or_buffer='npr.csv')

# Preview the first five rows as the cell's output.
npr.head(n=5)
    

In [None]:

# Exploring the Dataset
# Take the first article by position (.iloc) rather than by index label, so the
# lookup does not raise KeyError when no row carries the label 0 (for example
# when this cell is re-run after rows have been dropped).
print("Sample Article:")
print(npr['Article'].iloc[0])

# Checking for Missing Values (count of nulls per column)
print(f"Missing Values: {npr.isnull().sum()}")

# Drop every row that contains a missing value. Reassigning instead of using
# inplace=True avoids mutating the loaded frame in place, and resetting the
# index keeps the row labels consecutive (0..n-1) after rows are removed.
npr = npr.dropna().reset_index(drop=True)
    

In [None]:

# Turn the articles into a bag-of-words document-term matrix.
# max_df=0.9 ignores terms found in more than 90% of the documents, min_df=2
# ignores terms found in fewer than 2 documents, and stop_words='english'
# removes scikit-learn's built-in English stop-word list.
cv = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.9,
)
dtm = cv.fit_transform(npr.loc[:, 'Article'])

# Report the matrix dimensions: (number of documents, vocabulary size).
print(f"Shape of DTM: {dtm.shape}")
    

In [None]:

# Training LDA Model
# Fit a 5-topic LDA model on the document-term matrix. The fixed random_state
# makes repeated runs produce the same fit.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(dtm)

# Extracting Topics
# The vocabulary does not change between topics, so fetch it once here instead
# of calling get_feature_names_out() again for every printed word.
feature_names = cv.get_feature_names_out()
n_top_words = 10  # how many words to show per topic

for i, topic in enumerate(lda.components_):
    # argsort() is ascending, so the last n_top_words indices belong to the
    # highest-weighted words; they are printed from lowest to highest weight.
    top_indices = topic.argsort()[-n_top_words:]
    print(f"Topic {i}:")
    print([feature_names[index] for index in top_indices])
    

In [None]:

# Compute the per-article topic weights, then label each article with the
# index of its highest-weighted topic.
topic_results = lda.transform(dtm)
npr['Topic'] = np.argmax(topic_results, axis=1)

# Preview the articles next to their assigned topic as the cell's output.
npr.loc[:, ['Article', 'Topic']].head()
    

In [None]:

# Bar chart of how many articles were assigned to each topic; the title is set
# directly on the Axes object that countplot returns.
sns.countplot(data=npr, x='Topic').set_title("Distribution of Topics Across Articles")
plt.show()
    


### Conclusions
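- The LDA model was fit with 5 topics, and each article was assigned to its single most probable (dominant) topic.
- Each topic is characterized by its 10 highest-weighted words, which must be read and interpreted manually to give the topic a meaningful label.
- Further fine-tuning of the preprocessing steps (for example the `max_df`/`min_df` thresholds) or changing the number of topics could improve the results.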
