In [25]:
!pip install gensim
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim import corpora,models
import nltk
import feedparser



In [30]:
class IdentifyingTopicExample:
    """Small end-to-end topic-identification demo.

    The pipeline has three stages, run in order by ``run()``:

    1. ``getDocuments``  - download an RSS feed and keep the entry summaries.
    2. ``cleanDocuments`` - lowercase, tokenize and drop English stop words.
    3. ``doLDA``          - fit an LDA model on the cleaned tokens and print
       the topics.

    The tunable values are class-level constants so they can be overridden on
    a subclass or on a single instance without changing any method signature.
    """

    # RSS feed whose entry summaries are used as the documents.
    FEED_URL = 'https://sports.yahoo.com/mlb/rss.xml'
    # Number of leading feed entries that are looked at (before filtering).
    MAX_DOCUMENTS = 5
    # A summary containing any of these substrings is skipped.
    # NOTE(review): this is a plain substring test, so 'ex' also rejects text
    # containing e.g. "next" or "Texas" - confirm that this is intended.
    BLOCKED_SUBSTRINGS = ('ex',)
    # Number of topics the LDA model is asked to find.
    NUM_TOPICS = 2
    # Number of words printed for each topic.
    NUM_WORDS = 4
    # Seed handed to LdaModel so that re-running the notebook is repeatable;
    # set to None to let gensim pick its own random state.
    RANDOM_STATE = 42

    def _isBlocked(self, text):
        """Return True when ``text`` contains any of ``BLOCKED_SUBSTRINGS``."""
        return any(fragment in text for fragment in self.BLOCKED_SUBSTRINGS)

    def getDocuments(self):
        """Download the feed and store the accepted summaries in ``self.documents``.

        Entries without a summary and entries rejected by ``_isBlocked`` are
        skipped. Each accepted summary is printed as it is stored.
        """
        url = self.FEED_URL
        feed = feedparser.parse(url)
        self.documents = []
        # Only the first MAX_DOCUMENTS entries are considered, so fewer than
        # MAX_DOCUMENTS documents are stored when some of them are skipped.
        for entry in feed['entries'][:self.MAX_DOCUMENTS]:
            # .get() instead of ['summary']: an entry without a summary is
            # skipped rather than aborting the whole run with a KeyError.
            text = entry.get('summary')
            if text is None or self._isBlocked(text):
                continue
            self.documents.append(text)
            print("-- {}".format(text))
        print("INFO: Fetching documents from {} completed".format(url))

    def cleanDocuments(self):
        """Tokenize ``self.documents`` into ``self.cleaned`` (one token list per document).

        Each document is lowercased, split into runs of ASCII letters, and
        stripped of English stop words.
        """
        # Keeping only alphabetic runs discards digits and punctuation.
        tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        try:
            en_stop = set(stopwords.words('english'))
        except LookupError:
            # The stop-word corpus is not installed yet (fresh environment):
            # fetch it once and retry instead of failing.
            nltk.download('stopwords', quiet=True)
            en_stop = set(stopwords.words('english'))
        self.cleaned = [
            [word for word in tokenizer.tokenize(doc.lower()) if word not in en_stop]
            for doc in self.documents
        ]
        print("INFO: Cleaning {} documents completed".format(len(self.documents)))

    def doLDA(self):
        """Fit an LDA model on ``self.cleaned`` and print its topics.

        When there is not a single token to model (no documents were fetched,
        or all of them were filtered away) a notice is printed and the method
        returns without building a model.
        """
        # LdaModel raises ValueError on a collection without terms, so bail
        # out early with a readable message instead.
        if not any(self.cleaned):
            print("INFO: No tokens available; skipping LDA")
            return
        # Map every distinct token to an integer id.
        dictionary = corpora.Dictionary(self.cleaned)
        # Bag-of-words representation of each cleaned document.
        corpus = [dictionary.doc2bow(cleandoc) for cleandoc in self.cleaned]
        # id2word lets the model report topics as words rather than ids.
        ldamodel = models.ldamodel.LdaModel(
            corpus,
            num_topics=self.NUM_TOPICS,
            id2word=dictionary,
            random_state=self.RANDOM_STATE,
        )
        print(ldamodel.print_topics(num_topics=self.NUM_TOPICS, num_words=self.NUM_WORDS))

    def run(self):
        """Execute the three pipeline stages in order."""
        self.getDocuments()
        self.cleanDocuments()
        self.doLDA()
    
    
        
    
        

In [31]:
# Driver cell: build the example object and run the whole
# fetch -> clean -> LDA pipeline. The guard keeps this from executing if the
# code is ever imported as a module instead of being run directly.
if __name__ == '__main__':
    topicExample = IdentifyingTopicExample()
    topicExample.run()

-- Houston shrugged off Atlanta's first-inning grand slam in a World Series elimination game, casually fighting back to keep its season alive.
-- The Astros sent World Series back to Houston for Game 6, trailing 3-2 but brimming with confidence after coming back against Atlanta down four runs.
-- Just in time, Carlos Correa and the Houston Astros broke out the bats.
-- Astros come from behind for 9-5 win, force Game 6
INFO: Fetching documents from https://sports.yahoo.com/mlb/rss.xml completed
INFO: Cleaning 4 documents completed
[(0, '0.054*"astros" + 0.049*"game" + 0.044*"back" + 0.042*"world"'), (1, '0.057*"houston" + 0.056*"back" + 0.052*"game" + 0.048*"astros"')]
