# news-title-extract 테스트 진행하기

### BoW

### N-gram

### TF-IDF

In [None]:
# Make sure Sastrawi is importable, installing it on first use.
try:
    import Sastrawi
except ModuleNotFoundError:
    # The package is installed as "PySastrawi" but imported as "Sastrawi".
    ! pip install PySastrawi
finally:
    # Runs on both paths, so the two factories are bound whether or not
    # the install step above was needed.
    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Remaining third-party packages imported by the next cell.
! pip install feedparser
! pip install goose3
! pip install gensim

In [None]:
import requests
import datetime
import feedparser
import pandas as pd
import os
import logging
from goose3 import Goose
from goose3.text import StopWordsKorean
from gensim.summarization.summarizer import summarize
import warnings

# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')


# Class :
class newsScrap():
    """Base scraper that collects article titles from an RSS feed.

    Titles accumulate in ``self._title`` across calls to ``eccess`` and are
    written to disk by ``setDataFrame``. Subclasses may override both methods.
    """

    def __init__(self):
        print("Constructor")
        self._title = []

    def __del__(self):
        print("Garbage Collection")

    # NOTE(review): "eccess" looks like a typo of "access"; the name is kept
    # because it is part of the public interface.
    def eccess(self, news_room, timeout=10):
        """Download an RSS feed and append every entry title to ``self._title``.

        @param news_room : URL of the RSS feed to fetch
        @param timeout : seconds to wait for the server; requests raises a
                         Timeout error once it is exceeded
        """
        print("Crawl start")
        URL = news_room  # subclasses override this method to build their own URL

        # Without an explicit timeout requests can block forever on a stalled server.
        res = requests.get(URL, timeout=timeout)
        if res.status_code == 200:
            # ``entries`` is the list of feed items produced by feedparser.
            datas = feedparser.parse(res.text).entries
            self._title.extend(data.title for data in datas)
        else:
            print("No response")

    def setDataFrame(self):
        """Write the collected titles to ./result.csv, replacing an existing file (can override)."""
        raw_data = {'title': self._title}
        res = pd.DataFrame(raw_data)
        file_name = "./result.csv"
        if os.path.isfile(file_name):
            os.remove(file_name)
        res.to_csv(file_name)



class googleScrap(newsScrap):
    """Scraper for the Google News RSS search feed.

    Extends newsScrap with per-article metadata. ``eccess`` appends one element
    to each of ``_title``, ``_time``, ``_link``, ``_source`` and ``_keyword``
    per feed entry, so equal positions describe the same article.

    @param _time : publication time reported by the feed
    @param _link : news link
    @param _summary : news context (currently never filled in)
    @param _source : news source (ex. chosun, jungang...)
    @param _keyword : search keyword that returned the article
    @param _dataFrame : dataframe used for export (.html, .csv...)
    @param _footNote : mapping of keyword -> Google News search link
    """

    # Locale query strings appended to Google News URLs, keyed by country code.
    # "kr" is accepted as an alias of "ko" because addFootNote defaults to it.
    # Unknown codes add nothing to the URL.
    _LOCALE_PARAMS = {
        'en': '&hl=en-NG&gl=NG&ceid=NG:en',
        'ko': '&hl=ko&gl=KR&ceid=KR:ko',
        'kr': '&hl=ko&gl=KR&ceid=KR:ko',
    }

    def __init__(self):
        super().__init__()
        self._time = []
        self._link = []
        self._summary = []
        self._source = []
        self._keyword = []
        self._dataFrame = None
        self._footNote = {}

    def eccess(self, keyword, day, country='ko', timeout=10):
        """Query the Google News RSS search feed and record every entry.

        @param keyword : search term; it is interpolated into the URL as-is
        @param day : only news from the last ``day`` days is requested
        @param country : "ko"/"kr" or "en"; any other value uses no locale
        @param timeout : seconds to wait for the server; requests raises a
                         Timeout error once it is exceeded
        """
        print('Google News Cron Start: ' + datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
        URL = 'https://news.google.com/rss/search?q={}+when:{}d'.format(keyword, day)
        URL += self._LOCALE_PARAMS.get(country, '')

        # Without an explicit timeout requests can block forever on a stalled server.
        res = requests.get(URL, timeout=timeout)
        if res.status_code != 200:
            print('Google Search Error!!')
            return

        for data in feedparser.parse(res.text).entries:
            # Read every field before appending anything: if an entry lacks an
            # attribute, the exception fires here and the column lists keep
            # equal lengths, which pd.DataFrame requires.
            title = data.title
            published = data.published
            source = data.source.title
            link = data.link

            self._title.append(title)
            self._time.append(published)
            self._source.append(source)
            self._keyword.append(keyword)
            self._link.append(link)
            # TODO: article summaries (self._summary) are not collected yet.

    def addFootNote(self, keywords_li, country="kr"):
        """Register a Google News search link for every keyword in ``keywords_li``.

        @param keywords_li : iterable of search keywords
        @param country : "ko"/"kr" or "en"; any other value uses no locale
        """
        locale = self._LOCALE_PARAMS.get(country, '')
        for keywords in keywords_li:
            foot_link = "https://news.google.com/search?q={}".format(keywords) + locale
            self._footNote[keywords] = foot_link

    def setDataFrame(self):
        """Build ``self._dataFrame`` from the lists filled by ``eccess``."""
        raw_data = {
            'time': self._time,
            'title': self._title,
            'link': self._link,
            'source': self._source,
            'keyword': self._keyword,
        }
        self._dataFrame = pd.DataFrame(raw_data)

    def _readyDataFrame(self):
        """Return the dataframe, building it first if setDataFrame was never called."""
        if self._dataFrame is None:
            self.setDataFrame()
        return self._dataFrame

    def _freshPath(self, file_name, extension):
        """Return './<file_name><extension>' after deleting any existing file there."""
        file = './' + file_name + extension
        if os.path.isfile(file):
            os.remove(file)
        return file

    def createCSV(self, file_name):
        """Export the dataframe to ./<file_name>.csv (UTF-8 with BOM)."""
        frame = self._readyDataFrame()
        frame.to_csv(self._freshPath(file_name, '.csv'), encoding='utf-8-sig')

    def createHTML(self, file_name):
        """Export the dataframe to ./<file_name>.html (UTF-8 with BOM)."""
        frame = self._readyDataFrame()
        # use (escape=False) if you want to make URL tag in html
        frame.to_html(self._freshPath(file_name, '.html'), encoding='utf-8-sig')

    def appendFootNode(self, file_name):
        """Append the links registered by addFootNote to ./<file_name>.html.

        Nothing is written when no footnote has been registered.
        """
        from html import escape  # stdlib; only needed by this method

        # Build one link per registered keyword.
        markup = "".join(
            '<p><a href="{}">{}</a></p>\n'.format(escape(link, quote=True), escape(str(keyword)))
            for keyword, link in self._footNote.items()
        )
        # Append to ./{file_name}.html; the with-block closes the file.
        with open("./{}.html".format(file_name), "a", encoding="utf-8") as file:
            file.write(markup)
        
# if __name__ == "__main__":
#     today = googleScrap()
#     today.eccess('smartfactory', 1)
#     today.setDataFrame()
#     today.createHTML('result')
#     del today

In [None]:
# Search terms to crawl; each one is sent as a separate Google News query.
# A missing comma after 'netflix' used to merge it with 'disney' into the
# single keyword 'netflixdisney'.
keywords = [
    'covid', 'smartfactory', 'politics', 'korea', 'samsung', 'hyundai', 'LG',
    'Apple', 'seoul', 'bts', 'stackoverflow', 'microsoft', 'netflix',
    'disney', 'naver', 'stock', 'heungmin', 'manchester', 'kaist', 'mcu',
    'doctor strange', 'kakao', 'tesla', 'musk', 'nvidia',
]


# Collect the last 100 days of English-locale results for every keyword
# into one scraper, then export everything to ./news_feed.csv.
today = googleScrap()
for keyword in keywords:
    today.eccess(keyword, 100, 'en')
today.setDataFrame()
today.createCSV('news_feed')
del today

In [None]:
# Reload the crawl results from disk and show a quick preview.
df = pd.read_csv('news_feed.csv')
print(df.head())
print(f'dataframe length : {len(df)}')

In [None]:
# Number of space-separated tokens in each title.
# NOTE: Korean text would need morpheme splitting (e.g. Mecab) instead.
df['word_count'] = [len(str(title).split(' ')) for title in df['title']]
df.loc[:, ['title', 'word_count']].head()  # check the statistics

In [None]:
# Distribution of title lengths: 20-bin histogram, then summary statistics.
df.hist(bins=20, column='word_count')
df['word_count'].describe()

In [None]:
# Most Common and Uncommon Words per Category

def get_top_words(df, n=10, column='title', category_column='category'):
    """Print the ``n`` most frequent whitespace-separated words of ``df[column]``.

    The output line is labelled with the first value of ``df[category_column]``;
    when the frame has no such column the label "all" is used instead.
    Missing values in ``column`` are skipped, and nothing is printed for an
    empty frame.

    @param df : DataFrame holding the text column
    @param n : number of words to report
    @param column : name of the text column
    @param category_column : name of the column that labels the frame
    """
    if df.empty:
        return

    # Drop NaN first: str.join raises TypeError on non-string items.
    words = ' '.join(df[column].dropna().astype(str)).split()
    top_words = pd.Series(words, dtype=object).value_counts().head(n)

    label = df[category_column].unique()[0] if category_column in df.columns else 'all'
    print(f'Top words {label} category: ', end='')
    for index, value in top_words.items():
        print(f'{index} ({value}), ', end='')
    print('\n')

# The crawled frame has no 'category' column; the search keyword stored by the
# scraper is the closest grouping, so fall back to it when needed.
group_column = 'category' if 'category' in df.columns else 'keyword'
for category in df[group_column].dropna().unique():
    subset = df[df[group_column] == category]
    # get_top_words labels its output from a 'category' column, so provide one.
    get_top_words(subset.assign(category=category))

In [None]:
# Using a regex

import re
# Sastrawi helpers shared by every preprocess() call; filled on first use so
# they are built once instead of once per title.
_PREPROCESS_TOOLS = {}


def _get_preprocess_tools():
    """Return the shared (stop word remover, stemmer) pair, creating it on first use."""
    if not _PREPROCESS_TOOLS:
        _PREPROCESS_TOOLS['stopword'] = StopWordRemoverFactory().create_stop_word_remover()
        _PREPROCESS_TOOLS['stemmer'] = StemmerFactory().create_stemmer()
    return _PREPROCESS_TOOLS['stopword'], _PREPROCESS_TOOLS['stemmer']


def preprocess(text):
    """Normalise a title into a lower-case, stop-word-free, stemmed string.

    @param text : raw title; a non-string value (e.g. NaN for a missing
                  title) yields an empty string
    @return : the cleaned text
    """
    if not isinstance(text, str):
        return ''

    # NOTE(review): Sastrawi appears to target Indonesian text - confirm it is
    # the intended stop word list / stemmer for these titles.
    stopword, stemmer = _get_preprocess_tools()

    # replace everything that is not an ASCII letter with a space
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # convert to lower case
    text = text.lower()
    # collapse each run of digits / non-word characters into one space
    text = re.sub(r"(\d|\W)+", " ", text)
    # remove stop words
    text = stopword.remove(text)
    # stemming
    return stemmer.stem(text)




In [None]:
# check time 

import time

# Preprocess every title, reporting progress every 500 rows.
start_time = time.time()
processed_titles = []
for i, title in enumerate(df['title']):
    if (i % 500) == 0:
        mid_time = time.time()
        print(f'Processing #{i} data, {(mid_time - start_time):.2f} seconds elapsed.')

    processed_titles.append(preprocess(title))

# Assign the whole column at once: this works for any index and avoids one
# scalar write per row.
df['corpus'] = processed_titles

end_time = time.time()
print(f'End processing {len(df)} data for {(end_time-start_time):.2f} seconds.')
df.head(15)

In [None]:
# Extracting Topic using tf-idf

## example code

from sklearn.feature_extraction.text import CountVectorizer
# Count unigrams and bigrams (ngram_range=(1,2)), capped at 10000 features;
# max_df=0.8 drops terms that occur in more than 80% of the titles.
cv=CountVectorizer(max_df=0.8, max_features=10000, ngram_range=(1,2))
# X is the term-count matrix of the preprocessed titles; it is fitted on by
# the TfidfTransformer cell further down.
X=cv.fit_transform(df['corpus'])

In [None]:
# Visuallize top word

from sklearn.feature_extraction.text import CountVectorizer

#Most frequently occuring words
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent single words of ``corpus``.

    @param corpus : iterable of documents (strings)
    @param n : number of (word, count) pairs to return; None returns all
    @return : list of (word, total count) tuples, highest count first
    """
    vectorizer = CountVectorizer().fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Convert the ten most frequent words into a two-column dataframe for plotting
top_words = get_top_n_words(df['corpus'], n=10)
top_df = pd.DataFrame(top_words)
top_df.columns=["Word", "Freq"]
# Horizontal bar plot of the most frequent words (15x8 figure)
import seaborn as sns
sns.set(rc={'figure.figsize':(15,8)})
g = sns.barplot(x="Freq", y="Word", data=top_df)

In [None]:
#Most frequently occuring Bi-grams

def get_top_n2_words(corpus, n=None, ngram_range=(2, 2)):
    """Return the ``n`` most frequent n-grams of ``corpus`` (bi-grams by default).

    The n-gram size used to be hard-coded to (5, 5) although the function is
    named, commented and plotted as a bi-gram counter; pass
    ``ngram_range=(5, 5)`` to get the previous behaviour.

    @param corpus : iterable of documents (strings)
    @param n : number of (n-gram, count) pairs to return; None returns all
    @param ngram_range : (min_n, max_n) forwarded to CountVectorizer
    @return : list of (n-gram, total count) tuples, highest count first
    """
    vec1 = CountVectorizer(ngram_range=ngram_range,
                           max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vec1.vocabulary_.items()]
    words_freq.sort(key=lambda x: x[1], reverse=True)
    return words_freq[:n]
# Convert the ten most frequent entries returned by get_top_n2_words into a
# two-column dataframe for plotting
top2_words = get_top_n2_words(df['corpus'], n=10)
top2_df = pd.DataFrame(top2_words)
top2_df.columns=["Bi-gram", "Freq"]
# Horizontal bar plot of the most frequent entries (13x8 figure)
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
h=sns.barplot(x="Freq", y="Bi-gram", data=top2_df)

In [None]:
# Create TF-IDF Vector using vector of wordcount

from sklearn.feature_extraction.text import TfidfTransformer
# IDF weighting enabled, with smoothing (smooth_idf=True).
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
# Fit on X, the count matrix produced by the CountVectorizer cell.
tfidf_transformer.fit(X)

In [None]:
# Extracting Topic from Random Title

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
#Function for sorting tf_idf in descending order
from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    """Return (column, value) pairs of a COO matrix, highest value first.

    Pairs with equal values are ordered by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` entries of ``sorted_items`` to their feature names.

    @param feature_names : sequence indexed by feature (column) number
    @param sorted_items : sequence of (feature index, score) pairs
    @param topn : how many leading pairs to keep
    @return : dict of feature name -> score rounded to 3 decimals, in the
              order the pairs were given
    """
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }
# Pick one preprocessed title at random and score its terms with the fitted
# vectorizer and TF-IDF transformer (neither `doc` nor `tf_idf_vector` was
# defined anywhere before this point).
doc = df['corpus'].sample(n=1).iloc[0]
tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

# sort the tf-idf vector by descending order of scores
sorted_items = sort_coo(tf_idf_vector.tocoo())
# extract only the top n; n here is 5
keywords = extract_topn_from_vector(feature_names, sorted_items, 5)

# now print the results
print("\nAbstract:")
print(doc)
print("\nKeywords:")
for k, score in keywords.items():
    print(k, score)