# 1. Latent dirichlet allocation   

1.1. Data

In [2]:
import pandas as pd
npr = pd.read_csv('npr.csv')
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


1.2. Preprocessing

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = cv.fit_transform(npr['Article'])
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

1.3. LDA

In [4]:
from sklearn.decomposition import LatentDirichletAllocation
LDA = LatentDirichletAllocation(n_components=7,random_state=42)
LDA.fit(dtm)

1.4. Showing stored words

In [17]:
len(cv.vocabulary_.keys())

54777

In [10]:
import random

In [11]:
for i in range(10):
    random_word_id = random.randint(0,54777)
    print(cv.get_feature_names_out()[random_word_id])

conceivably
roads
twitter
reaffirm
informing
blowhard
appointment
mcmillan
silhouette
rerouting


1.4.1. Showing top words per topic

In [12]:
len(LDA.components_)

7

In [13]:
LDA.components_

array([[8.64332806e+00, 2.38014333e+03, 1.42900522e-01, ...,
        1.43006821e-01, 1.42902042e-01, 1.42861626e-01],
       [2.76191749e+01, 5.36394437e+02, 1.42857148e-01, ...,
        1.42861973e-01, 1.42857147e-01, 1.42906875e-01],
       [7.22783888e+00, 8.24033986e+02, 1.42857148e-01, ...,
        6.14236247e+00, 2.14061364e+00, 1.42923753e-01],
       ...,
       [3.11488651e+00, 3.50409655e+02, 1.42857147e-01, ...,
        1.42859912e-01, 1.42857146e-01, 1.42866614e-01],
       [4.61486388e+01, 5.14408600e+01, 3.14281373e+00, ...,
        1.43107628e-01, 1.43902481e-01, 2.14271779e+00],
       [4.93991422e-01, 4.18841042e+02, 1.42857151e-01, ...,
        1.42857146e-01, 1.43760101e-01, 1.42866201e-01]])

In [14]:
len(LDA.components_[0])

54777

In [15]:
single_topic = LDA.components_[0]

In [16]:
single_topic.argsort()

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993])

In [18]:
single_topic[2475]

0.1428571430851871

In [19]:
single_topic[42993]

6247.2455105210865

In [20]:
single_topic.argsort()[-10:]

array([33390, 36310, 21228, 10425, 31464,  8149, 36283, 22673, 42561,
       42993])

In [21]:
top_word_indices = single_topic.argsort()[-10:]

In [22]:
for index in top_word_indices:
    print(cv.get_feature_names_out()[index])

new
percent
government
company
million
care
people
health
said
says


In [23]:
for index, topic in enumerate(LDA.components_):
    print(f"The top 15 words for topic #{index}")
    print([cv.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')

The top 15 words for topic #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']


The top 15 words for topic #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']


The top 15 words for topic #2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']


The top 15 words for topic #3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']


The top 15 words for topic #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']


The top 15 words for topic #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'think',

1.4.2. Attaching discovered topic labels to original articles

In [24]:
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [25]:
dtm.shape

(11992, 54777)

In [26]:
len(npr)

11992

In [27]:
topic_results = LDA.transform(dtm)

In [28]:
topic_results.shape

(11992, 7)

In [29]:
topic_results[0]

array([1.61040465e-02, 6.83341493e-01, 2.25376318e-04, 2.25369288e-04,
       2.99652737e-01, 2.25479379e-04, 2.25497980e-04])

In [30]:
topic_results[0].round(2)

array([0.02, 0.68, 0.  , 0.  , 0.3 , 0.  , 0.  ])

In [31]:
topic_results[0].argmax()

1

1.4.3. Combining with original data

In [32]:
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [33]:
topic_results.argmax(axis=1)

array([1, 1, 1, ..., 3, 4, 0])

In [34]:
npr['Topic'] = topic_results.argmax(axis=1)

In [36]:
npr.head(10)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
5,I did not want to join yoga class. I hated tho...,3
6,With a who has publicly supported the debunk...,3
7,"I was standing by the airport exit, debating w...",2
8,"If movies were trying to be more realistic, pe...",3
9,"Eighteen years ago, on New Year’s Eve, David F...",2


# 2. Quora  

In [37]:
data = pd.read_csv('quora_questions.csv')
data.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [38]:
count_vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
corpus_matrix = count_vectorizer.fit_transform(data['Question'])

In [40]:
LDA2 = LatentDirichletAllocation(n_components=7,random_state=42)
LDA2.fit(corpus_matrix)

In [41]:
len(count_vectorizer.vocabulary_.keys())

38669

In [42]:
for i in range(10):
    random_word_id = random.randint(0,38669)
    print(count_vectorizer.get_feature_names_out()[random_word_id])

pays
rgb
grid
chatrapati
perfect
extensions
hydrosphere
allegedly
watched
300mbps


In [43]:
LDA2.components_

array([[ 0.14302751,  0.18117197,  0.14642241, ...,  0.14285724,
         2.14149267,  0.14285724],
       [ 0.14359789, 68.66894325,  0.14285793, ...,  0.14285728,
         0.14285797,  0.14285728],
       [ 5.30622659,  0.15008675,  0.14285791, ...,  2.14285648,
         0.14285796,  2.14285648],
       ...,
       [ 0.14325168,  0.14313396,  0.14285777, ...,  0.14285725,
         0.1428578 ,  0.14285725],
       [ 0.24759557,  0.14293736,  2.13928835, ...,  0.14285723,
         0.14421791,  0.14285723],
       [26.64869167,  0.14315453,  0.14285781, ...,  0.14285726,
         0.14285784,  0.14285726]])

In [44]:
len(LDA2.components_[0])

38669

In [45]:
for index, topic in enumerate(LDA2.components_):
    print(f"The top 15 words for topic #{index}")
    print([cv.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')

The top 15 words for topic #0
['digest', 'canes', 'persuasive', 'hated', 'monticello', 'dopey', 'angrier', 'ameera', 'crystalline', 'constellation', 'doorbell', 'personhood', 'ivs', 'excused', 'bakken']


The top 15 words for topic #1
['pooping', 'klezmer', 'naval', 'frightfully', 'evicted', '080', 'hyperactive', 'impressionist', '648', 'csi', 'gonzalo', 'pooled', 'furious', 'health', 'bakken']


The top 15 words for topic #2
['marisol', 'beeping', 'pommer', 'compatibility', 'cannabis', 'pacific', 'asl', 'crushed', 'excused', 'befits', 'bakken', 'doorbell', 'orville', 'constellation', 'garlic']


The top 15 words for topic #3
['gonzalo', 'henrik', 'optioned', 'constellation', 'laszewski', 'impersonation', 'henriette', 'procrastinated', 'options', 'lately', 'bakken', 'formula', 'houghton', 'inveterate', 'laundered']


The top 15 words for topic #4
['chamblee', 'crick', 'competitions', 'machu', 'intercession', 'bobotie', 'identification', 'eyeglasses', 'excerpts', 'excused', 'fiorina', '

In [47]:
corpus_matrix.shape

(404289, 38669)

In [48]:
len(data)

404289

In [49]:
topic_results2 = LDA2.transform(corpus_matrix)

In [50]:
topic_results2.shape

(404289, 7)

In [51]:
topic_results2[0]

array([0.01789911, 0.31645888, 0.59414638, 0.01786013, 0.0178879 ,
       0.01787348, 0.01787413])

In [52]:
topic_results2[0].round(2)

array([0.02, 0.32, 0.59, 0.02, 0.02, 0.02, 0.02])

In [53]:
topic_results2.argmax(axis=1)

array([2, 4, 0, ..., 0, 6, 5])

In [54]:
data['Topic'] = topic_results2.argmax(axis=1)

In [55]:
data.head(10)

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,2
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,4
2,How can I increase the speed of my internet co...,0
3,Why am I mentally very lonely? How can I solve...,5
4,"Which one dissolve in water quikly sugar, salt...",2
5,Astrology: I am a Capricorn Sun Cap moon and c...,2
6,Should I buy tiago?,4
7,How can I be a good geologist?,2
8,When do you use シ instead of し?,0
9,Motorola (company): Can I hack my Charter Moto...,0


# 3. Vietnamese Corpus

In [64]:
import os
import pandas as pd
import json

# Đường dẫn tới thư mục chứa các file JSON
folder_path = "/home/dotronghiep/Documents/Uni/Year3_Term2/NLP/ExtractedNews-2022"

# Tạo một danh sách để lưu trữ dữ liệu từ các file JSON
data = []

# Lặp qua tất cả các file trong thư mục
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            json_data = json.load(file)
            for item in json_data:
                if "content" in item:
                    data.append(item["content"])

# Tạo DataFrame từ danh sách dữ liệu
vdata = pd.DataFrame(data, columns=["content"])

# Hiển thị DataFrame
print(vdata)


                                                 content
0      Tweet\n(ĐTCK) Ngày 16/9 Công ty cổ phần kinh d...
1      VietinBank đột phá tăng vốn điều lệ, nâng cao ...
2      Ông Nguyễn Thành Phong nhận quyết định Phó Trư...
3      19 'quả đấm thép' lãi trước thuế 34.179 tỷ đồn...
4      Thứ ba, 11/1/2022, 21:25 (GMT+7)\nThủ tướng yê...
...                                                  ...
90428  Thứ bảy, 26/3/2022, 16:31 (GMT+7)\nĐộng lực ph...
90429  Sun Life Việt Nam ra mắt sản phẩm mới, Bảo hiể...
90430  Thứ sáu, 25/3/2022, 18:00 (GMT+7)\nLoạt smartp...
90431  Thứ hai, 28/3/2022, 09:00 (GMT+7)\nNhững phụ n...
90432  Tweet\n(ĐTCK) Ngày 29/3, Tổng công ty Bảo hiểm...

[90433 rows x 1 columns]


In [67]:
v_count_vectorizer = CountVectorizer(max_df=0.9, min_df=2)
vcorpus_matrix = v_count_vectorizer.fit_transform(vdata['content'])

In [68]:
vLDA = LatentDirichletAllocation(n_components=7,random_state=42)
vLDA.fit(vcorpus_matrix)

In [69]:
len(v_count_vectorizer.vocabulary_.keys())

151251

In [70]:
for i in range(10):
    random_word_id = random.randint(0,38669)
    print(v_count_vectorizer.get_feature_names_out()[random_word_id])

3333
analog
160ha
bř
aquatex
9ӈ
cheer
_qv
5d3
bb0


In [71]:
vLDA.components_

array([[2.40590653e+03, 2.50508562e+04, 3.44178321e+00, ...,
        1.42857143e-01, 1.42857143e-01, 1.42857143e-01],
       [3.44240070e+02, 8.71541128e+00, 1.42867371e-01, ...,
        2.14285714e+00, 3.14285714e+00, 2.14285714e+00],
       [5.93180781e+02, 3.61761463e+03, 2.69219632e+01, ...,
        1.42857143e-01, 1.42857143e-01, 1.42857143e-01],
       ...,
       [2.74846242e+02, 2.80388950e+04, 1.73763949e+00, ...,
        1.42857143e-01, 1.42857143e-01, 1.42857143e-01],
       [1.27536476e+03, 9.82034388e+03, 1.36629074e+01, ...,
        1.42857143e-01, 1.42857143e-01, 1.42857143e-01],
       [9.43274906e+03, 1.16152190e+04, 6.94978514e+00, ...,
        1.42857143e-01, 1.42857143e-01, 1.42857143e-01]])

In [72]:
len(vLDA.components_[0])

151251

In [73]:
for index, topic in enumerate(vLDA.components_):
    print(f"The top 15 words for topic #{index}")
    print([v_count_vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')

The top 15 words for topic #0
['là', 'tại', 'được', '19', 'covid', 'tăng', 'cho', 'triệu', 'ngày', 'đồng', 'số', 'dịch', 'bệnh', 'người', 'giá']


The top 15 words for topic #1
['ba', 'ol', 'gb', 'cx', 'pz', 'co', 'xd', 'ag', 'by', 'mo', 'ry', 'bi', 'be', 'gp', 'go']


The top 15 words for topic #2
['cục', 'lý', 'bộ', 'được', 'bị', 'dụng', 'vụ', 'số', 'thông', 'đồng', 'định', 'xe', 'quan', 'thuế', 'công']


The top 15 words for topic #3
['khu', 'thông', 'tại', 'được', 'cho', 'chính', 'hội', 'bộ', 'thành', 'đầu', 'tư', 'quốc', 'án', 'dự', 'công']


The top 15 words for topic #4
['cho', 'doanh', 'vốn', 'công', 'phiếu', 'là', 'giá', 'hàng', 'ngân', 'đầu', 'cổ', 'năm', 'tăng', 'tỷ', 'đồng']


The top 15 words for topic #5
['đầu', 'bảo', 'động', 'kinh', 'được', 'năm', 'sản', 'cho', 'nam', 'là', 'việt', 'công', 'hàng', 'nghiệp', 'doanh']


The top 15 words for topic #6
['đến', 'năm', 'tôi', 'học', 'thể', 'nhiều', 'để', 'khi', 'cho', 'những', 'được', 'người', 'không', 'một', 'là']




In [74]:
vcorpus_matrix.shape

(90433, 151251)

In [75]:
len(vdata)

90433

In [76]:
vtopic_results = vLDA.transform(vcorpus_matrix)

In [77]:
vtopic_results.shape

(90433, 7)

In [78]:
vdata['Topic'] = vtopic_results.argmax(axis=1)

In [79]:
vdata.head()

Unnamed: 0,content,Topic
0,Tweet\n(ĐTCK) Ngày 16/9 Công ty cổ phần kinh d...,4
1,"VietinBank đột phá tăng vốn điều lệ, nâng cao ...",5
2,Ông Nguyễn Thành Phong nhận quyết định Phó Trư...,3
3,19 'quả đấm thép' lãi trước thuế 34.179 tỷ đồn...,4
4,"Thứ ba, 11/1/2022, 21:25 (GMT+7)\nThủ tướng yê...",0
