In [1]:
import warnings
warnings.filterwarnings(action='ignore')

In [631]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
from sklearn.preprocessing import LabelEncoder
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer

# CSV 파일 불러오기

data = pd.read_csv("./data/news.csv")
data

Unnamed: 0,id,title,contents
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...
...,...,...,...
59995,NEWS_59995,"Dolphins Break Through, Rip Rams For First Win",But that #39;s OK. Because after a 31-14 rout ...
59996,NEWS_59996,"After Steep Drop, Price of Oil Rises",The freefall in oil prices ended Monday on a s...
59997,NEWS_59997,Pro football: Culpepper puts on a show,To say Daunte Culpepper was a little frustrate...
59998,NEWS_59998,Albertsons on the Rebound,The No. 2 grocer reports double-digit gains in...


우선 contents 로만 해보자

In [632]:
# 필요한 열 선택 (contents: 기사 내용)
df = data[['contents']]
df.head()

Unnamed: 0,contents
0,MADRID (AFP) - Spanish national team coach Lui...
1,"In Bosnia, where one man #39;s hero is often a..."
2,Yasmine Hamdan performs 'Hal' which she also s...
3,Macromedia has announced a special version of ...
4,Over-the-air fixes for cell phones comes to Qu...


In [633]:
# 텍스트 정제 함수 (불용어 제거와 1글자짜리 단어 제거)
stop_words = set(stopwords.words('english'))

def clean_text(text):
    
    text = text.replace('short_description','')
    # URL 제거
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # 해시태그 제거
    text = re.sub(r'#\w+', '', text)

    # 멘션 제거
    text = re.sub(r'@\w+', '', text)

    # 이모지 제거
    text = text.encode('ascii', 'ignore').decode('ascii')

    # 공백 및 특수문자 제거
    text = re.sub(r'[^\w\s]', '', text).strip()
    
    # 숫자 제거
    text = re.sub(r'\d+', '', text)

    # 같은 글자가 3번 이상 연속으로 나타나는 패턴을 찾아서 1번으로 대체
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    # # 영문자와 공백 이외의 문자 제거
    # text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # 소문자 변환
    text = text.lower()
    # 불용어 제거 및 2글자 이하 단어 제거
    text = ' '.join(word for word in text.split() if word not in stop_words and len(word) > 2)

    return text

import nltk
from nltk import word_tokenize, pos_tag

# # 명사와 동사만 추출하는 함수
# def extract_nouns_and_verbs(text):
#     # 문장을 토큰화하고 품사 태깅
#     # words = word_tokenize(text)
#     tagged_words = pos_tag(text)
    
#     # 명사와 동사만 추출
#     nouns_and_verbs = [word for word, tag in tagged_words if tag.startswith('N') or tag.startswith('V')]
    
#     return nouns_and_verbs

In [634]:
# 텍스트 정제 적용
df['contents'] = df['contents'].apply(clean_text)
df.head(5)

Unnamed: 0,contents
0,madrid afp spanish national team coach luis ar...
1,bosnia one man hero often another man villain ...
2,yasmine hamdan performs hal also sings film sc...
3,macromedia announced special version contribut...
4,overtheair fixes cell phones comes qualcomms cdma


In [635]:
# 동사 시제에 따라 변한 것을 동사 원형으로 만드는 작업
df['contents'] = df.apply(lambda x: nltk.word_tokenize(x['contents']), axis=1)
df['contents'] = df['contents'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
df.head(5)

Unnamed: 0,contents
0,"[madrid, afp, spanish, national, team, coach, ..."
1,"[bosnia, one, man, hero, often, another, man, ..."
2,"[yasmine, hamdan, perform, hal, also, sing, fi..."
3,"[macromedia, announce, special, version, contr..."
4,"[overtheair, fix, cell, phone, come, qualcomms..."


In [636]:
def cleaning_text(text):
    text = [word for word in text if word not in stop_words and len(word) > 3]
    return text

In [637]:
df['contents'] = df['contents'].apply(lambda x: cleaning_text(x))
df.head()

Unnamed: 0,contents
0,"[madrid, afp, spanish, national, team, coach, ..."
1,"[bosnia, one, man, hero, often, another, man, ..."
2,"[yasmine, hamdan, perform, hal, also, sing, fi..."
3,"[macromedia, announce, special, version, contr..."
4,"[overtheair, fix, cell, phone, come, qualcomms..."


In [203]:
# # 명사와 동사만 추출
# df['contents'] = df['contents'].apply(lambda x: extract_nouns_and_verbs(x))

In [618]:
# 역토큰화 (토큰화 작업을 되돌림)
detokenized_doc = []
for i in range(len(df)):
    t = ' '.join(df['contents'][i])
    detokenized_doc.append(t)

In [619]:
# 다시 text['headline_text']에 재저장
df['contents'] = detokenized_doc
df.head(5)

Unnamed: 0,contents
0,madrid afp spanish national team coach luis ar...
1,bosnia one man hero often another man villain ...
2,yasmine hamdan perform hal also sing film scen...
3,macromedia announce special version contribute...
4,overtheair fix cell phone come qualcomms cdma


In [620]:
from nltk.stem import WordNetLemmatizer
import nltk
import string

# 단어 원형 추출 함수
lemmar = WordNetLemmatizer()
def LemTokens(tokens):
    return [lemmar.lemmatize(token) for token in tokens]

# 특수 문자 사전 생성: {33: None ...}
# ord(): 아스키 코드 생성
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

# 특수 문자 제거 및 단어 원형 추출
def LemNormalize(text):
    # 텍스트 소문자 변경 후 특수 문자 제거
    text_new = text.lower().translate(remove_punct_dict)
    
    # 단어 토큰화
    word_tokens = nltk.word_tokenize(text_new)
    
    # 단어 원형 추출
    return LemTokens(word_tokens)

In [621]:
# TF-IDF 벡터화
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 3000, tokenizer= LemNormalize) # max_features = 10000 설정
X = tfidf_vectorizer.fit_transform(df['contents'])
X.shape

(60000, 3000)

### LDA 모델링

In [622]:
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [623]:
tfidf_feature_names

array(['abandon', 'ability', 'able', ..., 'ziff', 'zimbabwe', 'zone'],
      dtype=object)

In [624]:
tfidf_tokens = []
for i in range(len(data)):
    doc = X[i, :].toarray()
    tokens = [tfidf_feature_names[j] for j in np.where(doc > 0)[1]]
    tfidf_tokens.append(tokens)

In [625]:
# 토큰화된 결과를 데이터프레임에 추가
df['tfidf_tokens'] = tfidf_tokens
df.head()

Unnamed: 0,contents,tfidf_tokens
0,madrid afp spanish national team coach luis ar...,"[afp, arsenal, coach, comment, decide, face, f..."
1,bosnia one man hero often another man villain ...,"[citizen, decide, great, hero, honour, lee, lo..."
2,yasmine hamdan perform hal also sing film scen...,"[begin, continue, creation, film, live, myster..."
3,macromedia announce special version contribute...,"[announce, application, contribute, creation, ..."
4,overtheair fix cell phone come qualcomms cdma,"[cell, come, fix, phone]"


LDA 모델링

In [626]:
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary

In [627]:
# Dictionary 및 corpus 생성
dictionary = Dictionary(tfidf_tokens)
corpus = [dictionary.doc2bow(text) for text in tfidf_tokens]
# print(corpus[1]) # 수행된 결과에서 두번째 뉴스 출력. 첫번째 문서의 인덱스는 0

pass 는 epoch 과 같은 역할로 군집이 잘 나눠지는 passes 를 찾자

In [628]:
import gensim
NUM_TOPICS = 6 # 6개의 토픽
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=30, random_state = 42)
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.016*"new" + 0.012*"company" + 0.011*"service" + 0.011*"software" + 0.010*"use" + 0.010*"microsoft" + 0.010*"internet" + 0.009*"computer" + 0.009*"announce" + 0.008*"technology"')
(1, '0.016*"say" + 0.013*"president" + 0.010*"make" + 0.009*"like" + 0.008*"want" + 0.008*"trump" + 0.008*"time" + 0.007*"bush" + 0.007*"year" + 0.007*"come"')
(2, '0.029*"say" + 0.014*"people" + 0.013*"kill" + 0.011*"iraq" + 0.011*"official" + 0.010*"court" + 0.010*"attack" + 0.009*"force" + 0.008*"city" + 0.007*"police"')
(3, '0.018*"say" + 0.016*"reuters" + 0.015*"minister" + 0.014*"oil" + 0.011*"state" + 0.010*"prime" + 0.010*"unite" + 0.009*"leader" + 0.009*"country" + 0.009*"president"')
(4, '0.023*"say" + 0.016*"new" + 0.015*"reuters" + 0.014*"company" + 0.011*"year" + 0.011*"million" + 0.010*"york" + 0.010*"price" + 0.010*"report" + 0.010*"percent"')
(5, '0.015*"game" + 0.012*"win" + 0.011*"team" + 0.010*"night" + 0.010*"new" + 0.010*"season" + 0.009*"world" + 0.009*"play" + 0.008*"lead" + 0.008

In [629]:
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

Business, Entertainment, Politics, Sports, Tech, World  
Politics - 0   <br/>
Tech - 2 <br/>
Sports -  1 <br/>
Business - 4   <br/>
World - 5  <br/>
Entertainment - 3 <br/>

In [593]:
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)
    # doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
    doc = sorted(topic_list, key=lambda x: (x[1]), reverse=True)
    print(doc)

0 번째 문서의 topic 비율은 [(0, 0.08747146), (1, 0.4461617), (3, 0.4411979)]
[(1, 0.4461617), (3, 0.4411979), (0, 0.08747146)]
1 번째 문서의 topic 비율은 [(0, 0.018624393), (1, 0.01865809), (2, 0.01857992), (3, 0.68317616), (4, 0.24243534), (5, 0.0185261)]
[(3, 0.68317616), (4, 0.24243534), (1, 0.01865809), (0, 0.018624393), (2, 0.01857992), (5, 0.0185261)]
2 번째 문서의 topic 비율은 [(0, 0.14873558), (1, 0.013919738), (2, 0.013987724), (3, 0.7954868), (4, 0.013942142), (5, 0.013927992)]
[(3, 0.7954868), (0, 0.14873558), (2, 0.013987724), (4, 0.013942142), (5, 0.013927992), (1, 0.013919738)]
3 번째 문서의 topic 비율은 [(0, 0.015186861), (1, 0.015190888), (2, 0.828408), (3, 0.11086553), (4, 0.015189747), (5, 0.015158966)]
[(2, 0.828408), (3, 0.11086553), (1, 0.015190888), (4, 0.015189747), (0, 0.015186861), (5, 0.015158966)]
4 번째 문서의 topic 비율은 [(0, 0.04177914), (1, 0.041886277), (2, 0.541658), (3, 0.29125613), (4, 0.041751448), (5, 0.041669015)]
[(2, 0.541658), (3, 0.29125613), (1, 0.041886277), (0, 0.04177914), (4, 0

In [594]:
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()

    # 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        # 모든 문서에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = pd.concat([topic_table, pd.DataFrame([int(topic_num), round(prop_topic,4), doc]).T], ignore_index= True)
                # pd.DataFrame({'가장 비중이 높은 토픽' : int(topic_num), '가장 높은 토픽의 비중' : round(prop_topic,4), '각 토픽의 비중':topic_list})
                # print(topic_table)
                # topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    return topic_table

In [595]:
topictable = make_topictable_per_doc(ldamodel, corpus)

In [596]:
topictable = topictable.reset_index() # 문서 번호을 의미하는 열(column)로 사용하기 위해서 인덱스 열을 하나 더 만든다.
topictable.columns = ['num', 'best_topic', 'best_topic_rate', 'topic_rate']
topictable

Unnamed: 0,num,best_topic,best_topic_rate,topic_rate
0,0,1,0.4462,"[(1, 0.44615817), (3, 0.44119608), (0, 0.08747..."
1,1,3,0.6832,"[(3, 0.6831718), (4, 0.24243565), (1, 0.018662..."
2,2,3,0.7955,"[(3, 0.7954998), (0, 0.14872257), (2, 0.013987..."
3,3,2,0.8284,"[(2, 0.82840747), (3, 0.11086603), (1, 0.01519..."
4,4,2,0.5417,"[(2, 0.541658), (3, 0.29126903), (1, 0.0418749..."
...,...,...,...,...
59995,59995,1,0.6549,"[(1, 0.65489787), (2, 0.24966879), (3, 0.02389..."
59996,59996,5,0.5335,"[(5, 0.53351486), (4, 0.30272928), (2, 0.12197..."
59997,59997,1,0.5512,"[(1, 0.5511734), (3, 0.31109333), (0, 0.099145..."
59998,59998,4,0.861,"[(4, 0.86095774), (3, 0.027859796), (5, 0.0278..."


In [597]:
df['best_topic_rate'] = topictable['best_topic_rate']
df['topic'] = topictable['best_topic']
df.head()

Unnamed: 0,contents,tfidf_tokens,best_topic_rate,topic
0,madrid spanish national team coach luis aragon...,"[arsenal, coach, comment, decide, face, federa...",0.4462,1
1,bosnia hero often another villain citizens dec...,"[bruce, citizens, decide, great, hero, honour,...",0.6832,3
2,yasmine hamdan perform also sing film scene wo...,"[begin, continue, creation, film, live, myster...",0.7955,3
3,macromedia announce special version contribute...,"[announce, application, contribute, creation, ...",0.8284,2
4,overtheair cell phone come qualcomms cdma,"[cell, come, phone]",0.5417,2


In [598]:
df.topic.value_counts()

topic
0    14254
3    12012
1    10206
2     9694
4     7230
5     6604
Name: count, dtype: int64

Business, Entertainment, Politics, Sports, Tech, World  
Politics - 0   <br/>
Tech - 2 <br/>
Sports -  1 <br/>
Business - 4   <br/>
World - 5  <br/>
Entertainment - 3 <br/>

In [599]:
df['topic'] = df['topic'].replace({3: 'Entertainment', 4: 'Business', 2:'Tech', 5:'World', 1:'Sports', 0:'Politics'})
df.topic.value_counts()

topic
Politics         14254
Entertainment    12012
Sports           10206
Tech              9694
Business          7230
World             6604
Name: count, dtype: int64

In [600]:
df['topic'] = df['topic'].replace({'Business':0,'Entertainment':1, 'Politics':2, 'Sports':3, 'Tech':4, 'World':5})
df.head()

Unnamed: 0,contents,tfidf_tokens,best_topic_rate,topic
0,madrid spanish national team coach luis aragon...,"[arsenal, coach, comment, decide, face, federa...",0.4462,3
1,bosnia hero often another villain citizens dec...,"[bruce, citizens, decide, great, hero, honour,...",0.6832,1
2,yasmine hamdan perform also sing film scene wo...,"[begin, continue, creation, film, live, myster...",0.7955,1
3,macromedia announce special version contribute...,"[announce, application, contribute, creation, ...",0.8284,4
4,overtheair cell phone come qualcomms cdma,"[cell, come, phone]",0.5417,4


In [601]:
df.topic.value_counts()

topic
2    14254
1    12012
3    10206
4     9694
0     7230
5     6604
Name: count, dtype: int64

In [602]:
submission = pd.read_csv('./data/sample_submission.csv')
submission

Unnamed: 0,id,category
0,NEWS_00000,-1
1,NEWS_00001,-1
2,NEWS_00002,-1
3,NEWS_00003,-1
4,NEWS_00004,-1
...,...,...
59995,NEWS_59995,-1
59996,NEWS_59996,-1
59997,NEWS_59997,-1
59998,NEWS_59998,-1


In [603]:
submission['category'] = df['topic'].values
submission.head()

Unnamed: 0,id,category
0,NEWS_00000,3
1,NEWS_00001,1
2,NEWS_00002,1
3,NEWS_00003,4
4,NEWS_00004,4


In [604]:
submission.to_csv('submission1.csv', index = 0)