In [1]:
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

# Tokens to discard: NLTK's Indonesian stop words plus every character in string.punctuation.
sw_indo = [*stopwords.words('indonesian'), *punctuation]


# Import Data

In [2]:
# Read the kompas.csv corpus into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/data/kompas.csv')
df.head(n=5)

Unnamed: 0,teks
0,Ginandjar Tetap Ditahan. Jaksa Agung Dilaporka...
1,Jakarta Dikangkangi Para Preman\nKALAU tak pun...
2,Penyimpangan di Setpres Seolah Terjadi Sekaran...
3,"Dibayarkan, Rapel Kenaikan Gaji Pegawai Pos\nK..."
4,"Stop Kekerasan, Elite agar Duduk Bersama\nSeju..."


# Extract Bag-of-Words (BoW)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over unigrams and bigrams, tokenized with NLTK and filtered by
# the stop-word list; min_df=5 discards terms seen in fewer than 5 documents.
bow = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=sw_indo,
    ngram_range=(1, 2),
    min_df=5,
)
bow_matrix = bow.fit_transform(df['teks'])



# Topic Modeling

In [4]:
# Feature names of the fitted vectorizer; get_topic uses this to map column
# indices of a model's components back to terms.
vocab = bow.get_feature_names_out()

In [6]:
# Vocabulary size, i.e. the number of columns in the bag-of-words matrix.
len(vocab)

25134

In [11]:
def get_topic(model, n_words=6, feature_names=None):
    """Return the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Fitted decomposition model whose ``components_`` rows are indexed by
        vocabulary position (one row per topic).
    n_words : int, default 6
        How many top-weighted vocabulary entries to inspect per topic.
    feature_names : sequence of str, optional
        Terms aligned with the columns of ``model.components_``. When omitted,
        the notebook-level ``vocab`` is used, which keeps ``get_topic(model)``
        working exactly as before.

    Returns
    -------
    list of list of str
        For each topic, its top terms ordered from highest to lowest weight.
        Entries that are not purely alphanumeric (``str.isalnum`` is false,
        e.g. punctuation or space-separated n-grams) are dropped *after* the
        top ``n_words`` are selected, so a list may hold fewer than
        ``n_words`` items.

    Raises
    ------
    ValueError
        If ``n_words`` is negative.
    """
    if n_words < 0:
        raise ValueError("n_words must be non-negative")

    # Fall back to the global vocabulary only when no explicit one is given.
    terms = vocab if feature_names is None else feature_names

    topics = []
    for comp in model.components_:
        # Descending order first, then truncate: unlike a `[-n:]` slice this
        # also behaves correctly for n_words == 0 (yields no indices).
        top_indices = comp.argsort()[::-1][:n_words]
        topics.append([terms[idx] for idx in top_indices if terms[idx].isalnum()])
    return topics


## Latent semantic analysis (LSA)

In [7]:
from sklearn.decomposition import TruncatedSVD

In [8]:
# LSA model: 10 components, 10 solver iterations, seeded for reproducibility.
lsa = TruncatedSVD(
    n_components=10,
    n_iter=10,
    random_state=42,
)

In [None]:
# Fit LSA on the bag-of-words matrix and get each document's 10-component representation.
lsa_matrix = lsa.fit_transform(bow_matrix)

In [10]:
# Analogy: shapes of the three matrices involved in the decomposition.
print(
    bow_matrix.shape,       # hidden
    lsa_matrix.shape,       # weight / code
    lsa.components_.shape,  # feature / topic
    sep='\n',
)

(2008, 25134)
(2008, 10)
(10, 25134)


In [12]:
# Show the top-ranked terms of each LSA component.
get_topic(lsa)

[['presiden', 'indonesia', 'pemerintah', 'dpr'],
 ['presiden', 'dpr', 'ketua', 'partai', 'mpr', 'tandjung'],
 ['pemerintah', 'rp', 'indonesia', 'bank', 'persen', 'utang'],
 ['presiden', 'indonesia', 'as', 'abdurrahman', 'wahid'],
 ['presiden', 'air', 'banjir', 'harga', 'rp', 'dpr'],
 ['harga', 'beras', 'rp', 'bbm'],
 ['mpr', 'konstitusi', 'bppn', 'uud'],
 ['indonesia', 'mpr', 'konstitusi', 'uud', 'perubahan', '1945'],
 ['pemerintah', 'dpr', 'israel', 'bppn', 'kota', 'aceh'],
 ['indonesia', 'pemerintah', 'dpr', 'beras', 'utang', 'air']]

## Latent Dirichlet Allocation (LDA)

In [13]:
from sklearn.decomposition import LatentDirichletAllocation

In [14]:
# LDA model: 10 topics, at most 10 iterations, seeded for reproducibility;
# fitted on the bag-of-words matrix to obtain per-document topic weights.
lda = LatentDirichletAllocation(
    n_components=10,
    max_iter=10,
    random_state=42,
)
lda_matrix = lda.fit_transform(bow_matrix)

In [15]:
# Show the top-ranked terms of each LDA topic.
get_topic(lda)

[['tandjung', 'dpr', 'hukum', 'ketua'],
 ['bank', 'indonesia', 'pemerintah', 'ekonomi'],
 ['pesawat', 'orang', 'rumah', 'korban'],
 ['banjir', 'air', 'warga', 'jakarta', 'jalan'],
 ['tni', 'hukum', 'ginandjar', 'tim'],
 ['rp', 'pemerintah', 'harga', 'bppn'],
 ['indonesia', 'as', 'pemerintah', 'aceh'],
 ['presiden', 'dpr', 'politik', 'ketua'],
 ['daerah', 'indonesia', 'masyarakat', 'maluku'],
 ['polisi', 'kepala', 'jakarta', 'orang']]