# Assignment 2: Extracting Topics from the Documents

### Data Exploration

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.ldamodel import LdaModel
from gensim import corpora
import nltk
from nltk.corpus import stopwords

#### Loading the dataset and printing the statistics

In [2]:
file_path = r'E:\Online_Course\NLP\Projects\text_docs.xlsx'
text_docs = pd.read_excel(file_path)
total_rows = len(text_docs)

unique_documents = text_docs['text'].nunique()

print(f"Total rows: {total_rows}")
print(f"Unique documents: {unique_documents}")


Total rows: 10
Unique documents: 10


#### Checking and handling missing values while also cleaning the text column and adding insights

In [3]:
print(text_docs.isnull().sum())  
text_docs = text_docs.dropna()  
text_docs = text_docs.drop_duplicates(subset='text')  
text_docs['text'] = text_docs['text'].str.lower().str.strip()
text_docs['word_count'] = text_docs['text'].str.split().apply(len)
print(text_docs)

document_id    0
text           0
dtype: int64
   document_id                                               text  word_count
0            1  the stock market has been experiencing volatil...          12
1            2  the economy is growing, and businesses are opt...          11
2            3  climate change is a critical issue that needs ...          11
3            4  advances in artificial intelligence have revol...           8
4            5  the rise of electric vehicles is shaping the f...          13
5            6  healthcare is evolving with the introduction o...          11
6            7  the entertainment industry is shifting towards...           9
7            8  social media is influencing the way people int...          11
8            9  governments around the world are investing in ...          10
9           10  cybersecurity is an ongoing concern as digital...          11


#### Preprocessing the text
- Tokenize
- Removing Stop words
- Normalizing

In [6]:
stop_words = set(stopwords.words('english'))

text_docs['clean_text'] = text_docs['text'].str.lower().str.split()
text_docs['clean_text'] = text_docs['clean_text'].apply(lambda x: [word for word in x if word not in stop_words])
print(text_docs)


   document_id                                               text  word_count  \
0            1  the stock market has been experiencing volatil...          12   
1            2  the economy is growing, and businesses are opt...          11   
2            3  climate change is a critical issue that needs ...          11   
3            4  advances in artificial intelligence have revol...           8   
4            5  the rise of electric vehicles is shaping the f...          13   
5            6  healthcare is evolving with the introduction o...          11   
6            7  the entertainment industry is shifting towards...           9   
7            8  social media is influencing the way people int...          11   
8            9  governments around the world are investing in ...          10   
9           10  cybersecurity is an ongoing concern as digital...          11   

                                          clean_text  
0  [stock, market, experiencing, volatility, due,... 

### Task 2 Generate Topics using LDA
#### Creating a LDA (Latent Dirichlet Allocation) and displaying the top 5 words for each topic generated by the model

In [None]:
dictionary = corpora.Dictionary(text_docs['clean_text'])
corpus = [dictionary.doc2bow(doc) for doc in text_docs['clean_text']]
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, passes=10)
for idx, topic in lda_model.print_topics(num_words=5):
    print(f"Topic {idx}: {topic}")

Topic 0: 0.031*"influencing" + 0.031*"media" + 0.031*"world" + 0.031*"investing" + 0.031*"people"
Topic 1: 0.059*"digital" + 0.059*"cybersecurity" + 0.059*"platforms" + 0.059*"concern" + 0.059*"ongoing"
Topic 2: 0.059*"shaping" + 0.059*"future" + 0.059*"rise" + 0.059*"automobile" + 0.059*"vehicles"
Topic 3: 0.043*"climate" + 0.043*"immediate" + 0.043*"critical" + 0.043*"global" + 0.043*"issue"
Topic 4: 0.046*"towards" + 0.046*"streaming" + 0.046*"shifting" + 0.046*"evolving" + 0.046*"industry"


### Evaluation of the reults
- Topic 0:
    - Possible Theme: Social interactions, media influence, and communication dynamics.
- Topic 1:
    - Possible Theme: Financial instability, economic trends, and market uncertainties.
- Topic 2:
    - Possible Theme: Global challenges, environmental issues, or humanitarian concerns.
- Topic 3:
    - Possible Theme: Technology-related issues, digital safety, and cybersecurity concerns.
Topic 4:
    - Possible Theme: Global energy policies, media streaming platforms, or international affairs.
