In [1]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a global filter, so library warnings raised by later
# cells (pandas, scikit-learn, gensim, ...) are hidden as well.
warnings.filterwarnings(action='ignore')

### Import library and load data

In [99]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
from sklearn.preprocessing import LabelEncoder
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer

# Load the news CSV file and preview the first rows
data = pd.read_csv("./data/news.csv")
data.head()

Unnamed: 0,id,title,contents
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...


In [108]:
# Keep only the column needed for topic modeling (contents: article body).
# .copy() makes df an independent frame: later cells assign new columns to df,
# and doing that on a selection taken from `data` is what triggers pandas'
# SettingWithCopyWarning (currently hidden by the global warning filter).
df = data[['contents']].copy()
df.head()

Unnamed: 0,contents
0,MADRID (AFP) - Spanish national team coach Lui...
1,"In Bosnia, where one man #39;s hero is often a..."
2,Yasmine Hamdan performs 'Hal' which she also s...
3,Macromedia has announced a special version of ...
4,Over-the-air fixes for cell phones comes to Qu...


In [109]:
# Text-cleaning setup: English stop-word set (stop-word and short-word removal
# is applied in a later cell, after lemmatization)
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one raw article body into lowercase ASCII text.

    Steps, in order: drop the literal 'short_description' marker, URLs,
    hashtags and mentions; drop non-ASCII characters (emoji etc.); drop
    punctuation and digits; collapse any character repeated three or more
    times in a row to a single occurrence; lowercase; strip edge whitespace.

    Args:
        text: Raw article text. Anything that is not a ``str`` (for example
            the NaN / None pandas uses for a missing cell) is treated as an
            empty document.

    Returns:
        The cleaned string. It has no leading or trailing whitespace; runs of
        interior whitespace left behind by removed tokens are not merged
        unless they are three or more identical characters long.
    """
    # A missing cell has nothing to clean; returning '' keeps .apply() from
    # failing with AttributeError on a float NaN.
    if not isinstance(text, str):
        return ''

    # Many articles have 'short_description' appended after a URL; it carries
    # no information about the article, so remove it.
    text = text.replace('short_description', '')

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove hashtags (this also removes entity residue such as '#39')
    text = re.sub(r'#\w+', '', text)

    # Remove mentions
    text = re.sub(r'@\w+', '', text)

    # Remove emoji and any other non-ASCII character
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Remove punctuation / special characters (word characters and whitespace stay)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits
    text = re.sub(r'\d+', '', text)

    # Replace a character repeated 3+ times in a row with a single occurrence
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    # Lowercase, then strip last so that whitespace exposed by the digit
    # removal at the start or end of the text is trimmed too.
    # Stop-word and short-word removal is intentionally NOT done here; it runs
    # after lemmatization.
    return text.lower().strip()

# 명사와 동사만 추출하는 함수
# 명사와 동사만 추출해서 했을 경우 결과가 좋지 않았음

# import nltk
# from nltk import word_tokenize, pos_tag

# def extract_nouns_and_verbs(text):
#     # 문장을 토큰화하고 품사 태깅
#     words = word_tokenize(text)
#     tagged_words = pos_tag(words)
    
#     # 명사와 동사만 추출
#     nouns_and_verbs = [word for word, tag in tagged_words if tag.startswith('N') or tag.startswith('V')]
    
#     return nouns_and_verbs

In [110]:
# Apply the text-cleaning function to every article body (overwrites df['contents'])
df['contents'] = df['contents'].apply(clean_text)
df.head(5)

Unnamed: 0,contents
0,madrid afp spanish national team coach luis a...
1,in bosnia where one man s hero is often anothe...
2,yasmine hamdan performs hal which she also sin...
3,macromedia has announced a special version of ...
4,overtheair fixes for cell phones comes to qual...


In [111]:
from nltk.stem import WordNetLemmatizer
import nltk
import string

# Lemmatization helper built on a shared WordNetLemmatizer instance
lemmar = WordNetLemmatizer()
def LemTokens(tokens):
    """Return a list with the lemma of each token, in the original order."""
    return list(map(lemmar.lemmatize, tokens))

# Translation table for str.translate that deletes every ASCII punctuation
# character: {33: None, 34: None, ...} (keys are code points)
remove_punct_dict = str.maketrans('', '', string.punctuation)

# Punctuation removal + lemmatization
def LemNormalize(text):
    """Lowercase ``text``, delete punctuation, tokenize it and lemmatize each token.

    Returns the list of lemmatized tokens.
    """
    # Lowercase first, then drop punctuation via the translation table
    without_punct = text.lower().translate(remove_punct_dict)

    # Tokenize and lemmatize in one pass
    return LemTokens(nltk.word_tokenize(without_punct))

In [112]:
# Replace each cleaned article with its list of lemmatized tokens
df['contents'] = df['contents'].apply(LemNormalize)
df.head()

Unnamed: 0,contents
0,"[madrid, afp, spanish, national, team, coach, ..."
1,"[in, bosnia, where, one, man, s, hero, is, oft..."
2,"[yasmine, hamdan, performs, hal, which, she, a..."
3,"[macromedia, ha, announced, a, special, versio..."
4,"[overtheair, fix, for, cell, phone, come, to, ..."


In [113]:
# Drop stop words and tokens of two characters or fewer, then join the
# remaining tokens back into one space-separated string per article.
# NOTE(review): lemmatization has already run at this point, so a stop word
# whose lemma differs from it (the lemmatized preview shows 'has' -> 'ha') no
# longer matches stop_words and is only removed if it is short enough.
df['contents'] = df['contents'].apply(
    lambda tokens: ' '.join(
        tok for tok in tokens if len(tok) > 2 and tok not in stop_words
    )
)
df.head(5)

Unnamed: 0,contents
0,madrid afp spanish national team coach luis ar...
1,bosnia one man hero often another man villain ...
2,yasmine hamdan performs hal also sings film sc...
3,macromedia announced special version contribut...
4,overtheair fix cell phone come qualcomms cdma


In [618]:
# # 역토큰화 (토큰화 작업을 되돌림)
# detokenized_doc = []
# for i in range(len(df)):
#     t = ' '.join(df['contents'][i])
#     detokenized_doc.append(t)

In [619]:
# # 다시 text['headline_text']에 재저장
# df['contents'] = detokenized_doc
# df.head(5)

Unnamed: 0,contents
0,madrid afp spanish national team coach luis ar...
1,bosnia one man hero often another man villain ...
2,yasmine hamdan perform hal also sing film scen...
3,macromedia announce special version contribute...
4,overtheair fix cell phone come qualcomms cdma


### TF-IDF 벡터화

In [114]:
# TF-IDF vectorization of the cleaned article text
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 3000) # vocabulary is capped at 3000 terms
X = tfidf_vectorizer.fit_transform(df['contents'])
X.shape

(60000, 3000)

In [115]:
# Vocabulary terms of the fitted vectorizer, indexed by TF-IDF column position
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [116]:
# Display the vocabulary
tfidf_feature_names

array(['ability', 'able', 'aboard', ..., 'ziff', 'zimbabwe', 'zone'],
      dtype=object)

In [117]:
# Collect, for every document, the vocabulary terms with a positive TF-IDF weight.
# Each row is read straight from the sparse matrix's CSR arrays instead of being
# converted to a dense array, and the loop bound comes from X itself rather than
# from `data`, so there is exactly one token list per row of X.
X_csr = X.tocsr()
tfidf_tokens = []
for i in range(X_csr.shape[0]):
    row_start, row_end = X_csr.indptr[i], X_csr.indptr[i + 1]
    row_cols = X_csr.indices[row_start:row_end]
    row_vals = X_csr.data[row_start:row_end]
    # np.unique returns the column ids in ascending order, which is the order
    # np.where yields on a dense row.
    tokens = [tfidf_feature_names[j] for j in np.unique(row_cols[row_vals > 0])]
    tfidf_tokens.append(tokens)

In [118]:
# Add the per-document TF-IDF token lists to the DataFrame
df['tfidf_tokens'] = tfidf_tokens
df.head()

Unnamed: 0,contents,tfidf_tokens
0,madrid afp spanish national team coach luis ar...,"[afp, arsenal, coach, comment, decided, face, ..."
1,bosnia one man hero often another man villain ...,"[citizen, decided, great, hero, lee, look, man..."
2,yasmine hamdan performs hal also sings film sc...,"[begin, continue, creation, film, living, myst..."
3,macromedia announced special version contribut...,"[announced, application, creation, designed, e..."
4,overtheair fix cell phone come qualcomms cdma,"[cell, come, fix, phone]"


### LDA 모델링

In [119]:
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary

In [120]:
# Build the gensim Dictionary and the bag-of-words corpus from the token lists
dictionary = Dictionary(tfidf_tokens)
corpus = list(map(dictionary.doc2bow, tfidf_tokens))
# print(corpus[1]) # would print the second article's bag of words (the first document has index 0)

`passes`는 epoch과 같은 역할을 한다(말뭉치 전체를 반복 학습하는 횟수). 토픽(군집)이 잘 나뉘는 `passes` 값을 찾자.

In [121]:
import gensim
NUM_TOPICS = 6  # number of topics to learn
# Train LDA on the bag-of-words corpus; random_state is fixed for repeatable runs
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=30,
    random_state=42,
)
# Print the ten highest-weight words of every topic
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.016*"new" + 0.012*"service" + 0.010*"microsoft" + 0.010*"internet" + 0.010*"company" + 0.009*"computer" + 0.009*"software" + 0.008*"online" + 0.008*"technology" + 0.007*"phone"')
(1, '0.014*"game" + 0.011*"night" + 0.011*"year" + 0.010*"win" + 0.009*"time" + 0.008*"victory" + 0.008*"world" + 0.008*"team" + 0.008*"lead" + 0.008*"sunday"')
(2, '0.022*"said" + 0.017*"reuters" + 0.015*"company" + 0.014*"new" + 0.011*"price" + 0.010*"year" + 0.010*"corp" + 0.009*"oil" + 0.009*"percent" + 0.009*"market"')
(3, '0.025*"said" + 0.014*"reuters" + 0.011*"country" + 0.011*"minister" + 0.011*"official" + 0.011*"iraq" + 0.010*"people" + 0.008*"leader" + 0.008*"government" + 0.007*"prime"')
(4, '0.018*"president" + 0.016*"said" + 0.014*"say" + 0.010*"state" + 0.010*"trump" + 0.010*"court" + 0.008*"bush" + 0.008*"election" + 0.006*"house" + 0.006*"federal"')
(5, '0.029*"new" + 0.013*"york" + 0.010*"red" + 0.009*"network" + 0.009*"sport" + 0.008*"space" + 0.007*"team" + 0.007*"wednesday" + 0.007

In [122]:
import pyLDAvis.gensim_models

# Prepare the pyLDAvis view of the trained model and display it in the notebook
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

Target categories: Business, Entertainment, Politics, Sports, Tech, World. Category to LDA topic number:  
Politics - 4 <br/>
Tech - 0 <br/>
Sports - 1 <br/>
Business - 2 <br/>
World - 3 <br/>
Entertainment - 5 <br/>

In [123]:
# Print the topic distribution of the first five documents: first as returned
# by the model, then sorted by weight (highest first).
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)
    # Variant for a model with per_word_topics enabled, kept for reference:
    # doc = topic_list[0] if ldamodel.per_word_topics else topic_list
    doc = sorted(topic_list, key=lambda x: (x[1]), reverse=True)
    print(doc)

0 번째 문서의 topic 비율은 [(1, 0.30019727), (3, 0.3870032), (4, 0.16775833), (5, 0.1283482)]
[(3, 0.3870032), (1, 0.30019727), (4, 0.16775833), (5, 0.1283482)]
1 번째 문서의 topic 비율은 [(0, 0.01857243), (1, 0.77370006), (2, 0.018521767), (3, 0.15209904), (4, 0.018545438), (5, 0.01856129)]
[(1, 0.77370006), (3, 0.15209904), (0, 0.01857243), (5, 0.01856129), (4, 0.018545438), (2, 0.018521767)]
2 번째 문서의 topic 비율은 [(0, 0.5534282), (1, 0.26930615), (2, 0.015234957), (3, 0.015307034), (4, 0.015248857), (5, 0.13147484)]
[(0, 0.5534282), (1, 0.26930615), (5, 0.13147484), (3, 0.015307034), (4, 0.015248857), (2, 0.015234957)]
3 번째 문서의 topic 비율은 [(0, 0.7848649), (1, 0.016687931), (2, 0.1483837), (3, 0.016679322), (4, 0.016694361), (5, 0.016689822)]
[(0, 0.7848649), (2, 0.1483837), (4, 0.016694361), (5, 0.016689822), (1, 0.016687931), (3, 0.016679322)]
4 번째 문서의 topic 비율은 [(0, 0.83310395), (1, 0.033453334), (2, 0.033335418), (3, 0.033335418), (4, 0.033436485), (5, 0.03333542)]
[(0, 0.83310395), (1, 0.033453334)

In [124]:
def make_topictable_per_doc(ldamodel, corpus):
    """Build a table holding the dominant topic of every document.

    Args:
        ldamodel: Topic model. It is indexed with ``corpus`` to obtain one
            result per document, and its ``per_word_topics`` attribute decides
            whether the document's topic list is the first element of that
            result or the result itself.
        corpus: The documents to score, in the form ``ldamodel[...]`` accepts.

    Returns:
        pandas.DataFrame with one row per document, in corpus order, and three
        object-dtype columns labelled 0, 1 and 2:
            0 - id of the highest-weight topic (int)
            1 - weight of that topic, rounded to 4 decimals
            2 - the document's full list of (topic_id, weight) pairs, sorted
                by weight in descending order
        A document for which the model returns no topics gets the row
        (None, None, []) so that row i always describes document i.
        An empty corpus yields an empty frame that still has the 3 columns.
    """
    # Collect plain rows and build the frame once at the end; concatenating a
    # one-row DataFrame per document is quadratic in the number of documents.
    rows = []
    for topic_list in ldamodel[corpus]:
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list
        # Sort this document's topics from highest to lowest weight
        doc = sorted(doc, key=lambda x: x[1], reverse=True)

        if doc:
            # After sorting, the first pair is the dominant topic
            topic_num, prop_topic = doc[0]
            rows.append([int(topic_num), round(prop_topic, 4), doc])
        else:
            # Keep a placeholder so later rows do not shift up by one
            rows.append([None, None, doc])

    # dtype=object keeps every cell exactly as stored (ints, floats, lists)
    return pd.DataFrame(rows, columns=[0, 1, 2], dtype=object)

In [125]:
# Build the dominant-topic table for every document in the corpus
topictable = make_topictable_per_doc(ldamodel, corpus)

In [126]:
# NOTE: reset_index() adds one more column every time it runs, so re-running
# this cell without rebuilding topictable first makes the four-name column
# assignment below fail.
topictable = topictable.reset_index() # turn the row index into an extra column that serves as the document number
topictable.columns = ['num', 'best_topic', 'best_topic_rate', 'topic_rate']
topictable

Unnamed: 0,num,best_topic,best_topic_rate,topic_rate
0,0,3,0.3871,"[(3, 0.38706496), (1, 0.30021867), (4, 0.16776..."
1,1,1,0.7737,"[(1, 0.77370757), (3, 0.1520915), (0, 0.018572..."
2,2,0,0.5535,"[(0, 0.55345213), (1, 0.2692915), (5, 0.131465..."
3,3,0,0.7849,"[(0, 0.7848793), (2, 0.14836921), (4, 0.016694..."
4,4,0,0.8331,"[(0, 0.8331153), (1, 0.033446807), (4, 0.03343..."
...,...,...,...,...
59995,59995,1,0.5471,"[(1, 0.5470573), (4, 0.38566053), (3, 0.016893..."
59996,59996,2,0.5543,"[(2, 0.55431587), (3, 0.29467893), (0, 0.11966..."
59997,59997,1,0.4721,"[(1, 0.47208622), (0, 0.29982916), (5, 0.18620..."
59998,59998,2,0.8808,"[(2, 0.88078445), (4, 0.023868008), (0, 0.0238..."


In [127]:
# Attach each document's dominant topic id and that topic's weight to df
df['best_topic_rate'] = topictable['best_topic_rate']
df['topic'] = topictable['best_topic']
df.head()

Unnamed: 0,contents,tfidf_tokens,best_topic_rate,topic
0,madrid afp spanish national team coach luis ar...,"[afp, arsenal, coach, comment, decided, face, ...",0.3871,3
1,bosnia one man hero often another man villain ...,"[citizen, decided, great, hero, lee, look, man...",0.7737,1
2,yasmine hamdan performs hal also sings film sc...,"[begin, continue, creation, film, living, myst...",0.5535,0
3,macromedia announced special version contribut...,"[announced, application, creation, designed, e...",0.7849,0
4,overtheair fix cell phone come qualcomms cdma,"[cell, come, fix, phone]",0.8331,0


Target categories: Business, Entertainment, Politics, Sports, Tech, World. Category to LDA topic number:  
Politics - 4 <br/>
Tech - 0 <br/>
Sports - 1 <br/>
Business - 2 <br/>
World - 3 <br/>
Entertainment - 5 <br/>

In [135]:
# Translate each LDA topic id into its category name.
# The names are derived from topictable['best_topic'] (the untouched numeric
# topic ids) instead of from df['topic'] itself. df['topic'] is overwritten by
# this cell and again by the name -> category-id cell, so mapping it in place
# gives wrong labels whenever this cell is re-run after that one; reading the
# source column makes the cell give the same result on every run.
TOPIC_ID_TO_NAME = {5: 'Entertainment', 2: 'Business', 0:'Tech', 3:'World', 1:'Sports', 4:'Politics'}
df['topic'] = topictable['best_topic'].replace(TOPIC_ID_TO_NAME)
df.topic.value_counts()

topic
Politics         12983
World            12678
Tech             11105
Entertainment     9897
Business          8817
Sports            4520
Name: count, dtype: int64

In [130]:
# Convert category names to integer ids (0-5, assigned in alphabetical order of the names)
df['topic'] = df['topic'].replace({'Business':0,'Entertainment':1, 'Politics':2, 'Sports':3, 'Tech':4, 'World':5})
df.head()

Unnamed: 0,contents,tfidf_tokens,best_topic_rate,topic
0,madrid afp spanish national team coach luis ar...,"[afp, arsenal, coach, comment, decided, face, ...",0.3871,5
1,bosnia one man hero often another man villain ...,"[citizen, decided, great, hero, lee, look, man...",0.7737,3
2,yasmine hamdan performs hal also sings film sc...,"[begin, continue, creation, film, living, myst...",0.5535,4
3,macromedia announced special version contribut...,"[announced, application, creation, designed, e...",0.7849,4
4,overtheair fix cell phone come qualcomms cdma,"[cell, come, fix, phone]",0.8331,4


In [132]:
# Load the sample submission file (columns: id, category)
submission = pd.read_csv('./data/sample_submission.csv')
submission

Unnamed: 0,id,category
0,NEWS_00000,-1
1,NEWS_00001,-1
2,NEWS_00002,-1
3,NEWS_00003,-1
4,NEWS_00004,-1
...,...,...
59995,NEWS_59995,-1
59996,NEWS_59996,-1
59997,NEWS_59997,-1
59998,NEWS_59998,-1


In [133]:
# Fill in the predicted category ids; .values assigns by position, ignoring the index.
# NOTE(review): assumes submission rows are in the same order as df — confirm.
submission['category'] = df['topic'].values
submission.head()

Unnamed: 0,id,category
0,NEWS_00000,5
1,NEWS_00001,3
2,NEWS_00002,4
3,NEWS_00003,4
4,NEWS_00004,4


In [134]:
# Write the submission file without the row-index column
submission.to_csv('submission1.csv', index=False)