In [1]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Load Dataset
df = pd.read_csv('abcnews-date-text.csv')

In [3]:
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [4]:
df.shape

(1082168, 2)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1082168 entries, 0 to 1082167
Data columns (total 2 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   publish_date   1082168 non-null  int64 
 1   headline_text  1082168 non-null  object
dtypes: int64(1), object(1)
memory usage: 16.5+ MB


In [6]:
df.isnull().sum()

publish_date     0
headline_text    0
dtype: int64

In [8]:
df.duplicated().sum()

0

In [10]:
documents_list=df['headline_text'].tolist()

In [11]:
# Initialize regex tokenizer
tokenizer = RegexpTokenizer(r'\w+')

# Vectorize document using TF-IDF
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range = (1,1),
                        tokenizer = tokenizer.tokenize)

# Fit and Transform the documents
train_data = tfidf.fit_transform(documents_list)

In [12]:
# Define the number of topics or components
num_components=5

# Create LDA object
model=LatentDirichletAllocation(n_components=num_components)

# Fit and Transform SVD model on data
lda_matrix = model.fit_transform(train_data)

# Get Components 
lda_components=model.components_

In [13]:
# Print the topics with their terms
terms = tfidf.get_feature_names()

for index, component in enumerate(lda_components):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:7]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)



Topic 0:  ['police', 'iraq', 'killed', 'abc', 'market', 'attack', 'new']
Topic 1:  ['man', 'interview', 'police', 'crash', 'charged', 'court', 'murder']
Topic 2:  ['council', 'water', 'new', 'govt', 'plan', 'health', 'urged']
Topic 3:  ['day', 'win', 'world', 'new', 'australia', 'cup', 'open']
Topic 4:  ['rural', 'country', 'says', 'wa', 'new', 'labor', 'hour']
