In [267]:
import os
import re
from datetime import datetime, timezone, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup
from googletrans import Translator

# Endpoint of the hosted DistilBART summarization model on the Hugging Face Inference API.
huggingface_API_URL = "https://api-inference.huggingface.co/models/sshleifer/distilbart-cnn-12-6"
# The bearer token is read from the HUGGINGFACE_API_TOKEN environment variable instead of
# being committed with the notebook. The token that used to be hard-coded here has been
# exposed and should be revoked.
huggingface_headers = {"Authorization": f"Bearer {os.environ.get('HUGGINGFACE_API_TOKEN', '')}"}

def clean_text(text):
    """Normalize scraped article text into a single line.

    Square-bracketed clauses (for example a reporter byline such as
    "[헤럴드경제=신현주 기자]") are removed, newlines become spaces,
    backslashes are dropped, every run of whitespace is collapsed into one
    space, and surrounding whitespace is trimmed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string.
    """
    # Strip non-empty "[...]" clauses first so their content never reaches the output.
    without_brackets = re.sub(r'\[[^\]]+\]', '', text)
    # Newlines turn into spaces and backslashes disappear before whitespace is collapsed.
    flattened = without_brackets.replace('\n', ' ').replace('\\', '')
    return re.sub(r'\s+', ' ', flattened).strip()

def get_body_content(URL, timeout=10):
    """Download a news article page and return its cleaned body text.

    Args:
        URL: Address of the article page. An empty value is treated as
            "no article" instead of being sent to requests.
        timeout: Seconds to wait for the server before requests raises a
            timeout error, so a stalled connection cannot hang the notebook.

    Returns:
        The article text passed through clean_text(), or the string
        'News content not found.' when URL is empty or the page has no
        <article class="go_trans _article_content"> element.
    """
    # requests raises MissingSchema on an empty URL; report it as missing content instead.
    if not URL:
        return 'News content not found.'

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
    news = requests.get(URL, headers=headers, timeout=timeout)
    soup = BeautifulSoup(news.content, 'html.parser')

    news_element = soup.find('article', class_='go_trans _article_content')
    if not news_element:
        return 'News content not found.'

    # Remove these elements from the tree so their text is excluded from the body.
    for elem in news_element.find_all(['strong', 'em', 'b', 'td']):
        elem.extract()

    body = news_element.get_text(separator='\n')
    return clean_text(body)
    
def google_translate(text):
    """Translate `text` into English with googletrans and return the translated string."""
    # A fresh Translator is created for every call, exactly as before.
    translation = Translator().translate(text, dest='en')
    return translation.text

def distilbart_summarize(text, translate='google', timeout=60):
    """Summarize `text` with the hosted DistilBART model.

    Args:
        text: Text to summarize.
        translate: When equal to 'google', the text is first translated to
            English with google_translate(); any other value sends the text
            unchanged.
        timeout: Seconds to wait for the inference API before requests
            raises a timeout error.

    Returns:
        The 'summary_text' field of the first element of the API response.

    Raises:
        RuntimeError: If the API response is not a non-empty list whose first
            element contains 'summary_text' (for example an error payload).
    """
    if translate == 'google':
        text = google_translate(text)

    response = requests.post(
        huggingface_API_URL,
        headers=huggingface_headers,
        json={"inputs": text},
        timeout=timeout,
    )
    payload = response.json()

    # Anything other than a list of {'summary_text': ...} (presumably an error object from
    # the API) would otherwise surface as an opaque KeyError/IndexError on the lookup below.
    is_summary = (
        isinstance(payload, list)
        and len(payload) > 0
        and isinstance(payload[0], dict)
        and 'summary_text' in payload[0]
    )
    if not is_summary:
        raise RuntimeError(
            f'Unexpected response from the summarization API '
            f'(HTTP {response.status_code}): {payload!r}'
        )
    return payload[0]['summary_text']

def crawl_and_translate_news():
    """Collect the top clustered headline of each Naver news section.

    For every section id in `category` the section page is fetched, the
    'sh_item _cluster_content' list item with the largest related-article
    count is selected, and its body is downloaded with get_body_content().
    The title is then translated with google_translate() and the body is
    summarized with distilbart_summarize().

    Returns:
        A DataFrame with the columns 'index' (N0, N1, ...), 'date'
        (current date at UTC+9, YYYY-MM-DD), 'category', 'url', 'title',
        'body', 'title_google_translated' and
        'body_google_translated_distilbart_summarized'. Sections where no
        usable headline is found are left out.
    """
    category = {100: 'Politics', 101: 'Economics', 102: 'Social', 103: 'Life/Cultures', 104: 'World', 105: 'IT/Science'}
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}

    # Computed once so every row of a run carries the same date, even across midnight.
    korean_time = datetime.now(timezone(timedelta(hours=9))).strftime('%Y-%m-%d')

    news_data = []
    for field, category_name in category.items():
        section_url = 'https://news.naver.com/main/main.naver?mode=LSD&mid=shm&sid1=' + str(field)
        news = requests.get(section_url, headers=headers, timeout=10)
        soup = BeautifulSoup(news.content, 'html.parser')

        max_articles, max_title, max_url = 0, '', ''
        # Keep the list item with the highest related-article count.
        for li in soup.find_all('li', class_='sh_item _cluster_content'):
            link = li.find('a')
            headline = li.find('a', class_='sh_text_headline')
            count = li.find('span', class_='sh_head_more_icon_num')
            # Skip items that lack any of the pieces read below instead of crashing on None.
            if link is None or headline is None or count is None or not link.get('href'):
                continue

            number = int(count.text)
            if number > max_articles:
                max_articles, max_title, max_url = number, headline.text, link['href']

        # No headline found for this section: requesting an empty URL would raise.
        if not max_url:
            continue

        news_data.append({
            'date': korean_time,
            'category': category_name,
            'url': max_url,
            'title': max_title,
            'body': get_body_content(max_url),
        })

    # Explicit columns keep the frame well-formed even when nothing was collected.
    df = pd.DataFrame(news_data, columns=['date', 'category', 'url', 'title', 'body'])

    # TODO: later, continue numbering from the last stored index instead of restarting at 0.
    df.insert(0, 'index', [f'N{i}' for i in range(len(df))])
    df['title_google_translated'] = df['title'].apply(google_translate)
    df['body_google_translated_distilbart_summarized'] = df['body'].apply(distilbart_summarize)
    return df

In [268]:
# Run the crawl -> translate -> summarize pipeline and keep the resulting DataFrame.
news_df = crawl_and_translate_news()

In [269]:
# Display the collected articles (one row per news category).
news_df

Unnamed: 0,index,date,category,url,title,body,title_google_translated,body_google_translated_distilbart_summarized
0,N0,2023-08-29,Politics,https://n.news.naver.com/mnews/article/009/000...,국방부 '홍범도 흉상' 이전 해군함정 명칭도 바뀌나,광주 출신 중국 혁명음악가 정율성과 일제강점기 항일운동가 홍범도를 둘러싼 논란이 진...,Will the name of the naval vessel before the M...,The controversy surrounding Chinese revolutio...
1,N1,2023-08-29,Economics,https://n.news.naver.com/mnews/article/029/000...,"중대재해 발생한 건설사 영업정지 처분, `무용지물` 이유가…",건설현장 등에서 중대재해가 발생해 건설사들이 영업정지 처분을 받더라도 행정소송 등으...,The disposition of business suspension of cons...,Ssangyong E&C was suspended for two months on...
2,N2,2023-08-29,Social,https://n.news.naver.com/mnews/article/082/000...,"'은평 흉기 자해소동' 요리사 ""엄마가 날 못 믿어 속상해 그랬다""",지난 주말 저녁 서울 지하철 6호선 구산역 인근 주택가 한복판에서 양손에 흉기를 들...,"'Eunpyeong weapon self-harm' chef ""I was upset...",An arrest warrant was dismissed for a former ...
3,N3,2023-08-29,Life/Cultures,https://n.news.naver.com/mnews/article/055/000...,가을은 태풍과 함께? 세 개 한꺼번에 온다[자막뉴스],"전국 대부분 지역에 비가 내리는 오늘(28일), 기상청이 예보한 태풍의 반경입니다....",Autumn with typhoons? Three come at once [subt...,"Typhoon No. 11 'Haikui', which occurred in th..."
4,N4,2023-08-29,World,https://n.news.naver.com/mnews/article/029/000...,"러, 프리고진 사망 공식 확인 ""유전자 분석 결과 일치""",러시아 당국이 바그너그룹 수장 예브게니 프리고진의 사망을 공식 확인했다. 27일(현...,"Russia officially confirms Prigogine's death ""...","Yevgeny Prigozhin, head of the Wagner Group, ..."
5,N5,2023-08-29,IT/Science,https://n.news.naver.com/mnews/article/030/000...,"“최신 스마트폰도 익일 배송” e커머스, 모토로라 '레이저40울트라' 판매 개시",e커머스 업체들이 모토로라 플립형 스마트폰 '레이저40울트라' 판매에 나선다. 익일...,“The latest smartphones are also delivered the...,E-commerce companies are starting to sell Mot...


In [271]:
# Save the result as a tab-separated file; the pandas row index is not written.
news_df.to_csv('../data/NewsNudge/NN_news.tsv', sep='\t', index=False)

4. 해당 데이터셋, NAML의 preprocessing에 적용해보기 - parsing for the test.

In [2]:
import pandas as pd

In [3]:
# Show the installed pandas version.
print(pd.__version__)

2.0.3


In [6]:
import os
import tempfile 
import pandas as pd
from google.cloud import storage
from datetime import datetime, timezone, timedelta
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from googletrans import Translator

# Endpoint of the hosted DistilBART summarization model on the Hugging Face Inference API.
huggingface_API_URL = "https://api-inference.huggingface.co/models/sshleifer/distilbart-cnn-12-6"
# The bearer token is read from the HUGGINGFACE_API_TOKEN environment variable instead of
# being committed with the notebook. The token that used to be hard-coded here has been
# exposed and should be revoked.
huggingface_headers = {"Authorization": f"Bearer {os.environ.get('HUGGINGFACE_API_TOKEN', '')}"}

def clean_text(text):
    """Normalize scraped article text into a single line.

    Square-bracketed clauses (for example a reporter byline such as
    "[헤럴드경제=신현주 기자]") are removed, newlines become spaces,
    backslashes are dropped, every run of whitespace is collapsed into one
    space, and surrounding whitespace is trimmed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string.
    """
    # Strip non-empty "[...]" clauses first so their content never reaches the output.
    without_brackets = re.sub(r'\[[^\]]+\]', '', text)
    # Newlines turn into spaces and backslashes disappear before whitespace is collapsed.
    flattened = without_brackets.replace('\n', ' ').replace('\\', '')
    return re.sub(r'\s+', ' ', flattened).strip()

def get_body_content(URL, timeout=10):
    """Download a news article page and return its cleaned body text.

    Args:
        URL: Address of the article page. An empty value is treated as
            "no article" instead of being sent to requests.
        timeout: Seconds to wait for the server before requests raises a
            timeout error, so a stalled connection cannot hang the notebook.

    Returns:
        The article text passed through clean_text(), or the string
        'News content not found.' when URL is empty or the page has no
        <article class="go_trans _article_content"> element.
    """
    # requests raises MissingSchema on an empty URL; report it as missing content instead.
    if not URL:
        return 'News content not found.'

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
    news = requests.get(URL, headers=headers, timeout=timeout)
    soup = BeautifulSoup(news.content, 'html.parser')

    news_element = soup.find('article', class_='go_trans _article_content')
    if not news_element:
        return 'News content not found.'

    # Remove these elements from the tree so their text is excluded from the body.
    for elem in news_element.find_all(['strong', 'em', 'b', 'td']):
        elem.extract()

    body = news_element.get_text(separator='\n')
    return clean_text(body)
    
def google_translate(text):
    """Translate `text` into English with googletrans and return the translated string."""
    # A fresh Translator is created for every call, exactly as before.
    translation = Translator().translate(text, dest='en')
    return translation.text

def distilbart_summarize(text, translate='google', timeout=60):
    """Summarize `text` with the hosted DistilBART model.

    Args:
        text: Text to summarize.
        translate: When equal to 'google', the text is first translated to
            English with google_translate(); any other value sends the text
            unchanged.
        timeout: Seconds to wait for the inference API before requests
            raises a timeout error.

    Returns:
        The 'summary_text' field of the first element of the API response.

    Raises:
        RuntimeError: If the API response is not a non-empty list whose first
            element contains 'summary_text' (for example an error payload).
    """
    if translate == 'google':
        text = google_translate(text)

    response = requests.post(
        huggingface_API_URL,
        headers=huggingface_headers,
        json={"inputs": text},
        timeout=timeout,
    )
    payload = response.json()

    # Anything other than a list of {'summary_text': ...} (presumably an error object from
    # the API) would otherwise surface as an opaque KeyError/IndexError on the lookup below.
    is_summary = (
        isinstance(payload, list)
        and len(payload) > 0
        and isinstance(payload[0], dict)
        and 'summary_text' in payload[0]
    )
    if not is_summary:
        raise RuntimeError(
            f'Unexpected response from the summarization API '
            f'(HTTP {response.status_code}): {payload!r}'
        )
    return payload[0]['summary_text']

def crawl_and_translate_news():
    """Collect the top clustered headline of each Naver news section.

    For every section id in `category` the section page is fetched, the
    'sh_item _cluster_content' list item with the largest related-article
    count is selected, and its body is downloaded with get_body_content().
    The title is then translated with google_translate() and the body is
    summarized with distilbart_summarize(). Progress messages are printed
    when crawling and translation start.

    Returns:
        A DataFrame with the columns 'index' (N0, N1, ...), 'date'
        (current date at UTC+9, YYYY-MM-DD), 'category', 'url', 'title',
        'body', 'title_google_translated' and
        'body_google_translated_distilbart_summarized'. Sections where no
        usable headline is found are left out.
    """
    category = {100: 'Politics', 101: 'Economics', 102: 'Social', 103: 'Life/Cultures', 104: 'World', 105: 'IT/Science'}
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}

    print('Crawling started.')
    # Computed once so every row of a run carries the same date, even across midnight.
    korean_time = datetime.now(timezone(timedelta(hours=9))).strftime('%Y-%m-%d')

    news_data = []
    for field, category_name in category.items():
        section_url = 'https://news.naver.com/main/main.naver?mode=LSD&mid=shm&sid1=' + str(field)
        news = requests.get(section_url, headers=headers, timeout=10)
        soup = BeautifulSoup(news.content, 'html.parser')

        max_articles, max_title, max_url = 0, '', ''
        # Keep the list item with the highest related-article count.
        for li in soup.find_all('li', class_='sh_item _cluster_content'):
            link = li.find('a')
            headline = li.find('a', class_='sh_text_headline')
            count = li.find('span', class_='sh_head_more_icon_num')
            # Skip items that lack any of the pieces read below instead of crashing on None.
            if link is None or headline is None or count is None or not link.get('href'):
                continue

            number = int(count.text)
            if number > max_articles:
                max_articles, max_title, max_url = number, headline.text, link['href']

        # No headline found for this section: requesting an empty URL would raise.
        if not max_url:
            continue

        news_data.append({
            'date': korean_time,
            'category': category_name,
            'url': max_url,
            'title': max_title,
            'body': get_body_content(max_url),
        })

    # Explicit columns keep the frame well-formed even when nothing was collected.
    df = pd.DataFrame(news_data, columns=['date', 'category', 'url', 'title', 'body'])
    df.insert(0, 'index', [f'N{i}' for i in range(len(df))])

    print('Translation started.')
    df['title_google_translated'] = df['title'].apply(google_translate)
    df['body_google_translated_distilbart_summarized'] = df['body'].apply(distilbart_summarize)
    return df

def generate_tsv():
    """Build the news dataset and upload it as 'news.tsv' to Cloud Storage.

    The DataFrame from crawl_and_translate_news() is serialized as
    tab-separated text, written to a temporary file, and uploaded to the
    'newsnudge' bucket. The temporary file is removed even when the upload
    fails (for example when no Google Cloud credentials are configured).

    Returns:
        A confirmation message string.
    """
    print('Main function requested.')
    df = crawl_and_translate_news()

    print('Dataset ready to go.')
    tsv_content = df.to_csv(sep='\t', index=False)

    bucket_name = 'newsnudge'
    blob_name = 'news.tsv'

    # delete=False so the file survives close() and can be uploaded by name.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        temp_file.write(tsv_content.encode())
        temp_file.close()

        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(blob_name)

        # NOTE: each run replaces the existing object; appending to previously uploaded
        # content is not implemented.
        blob.upload_from_filename(temp_file.name)
    finally:
        # Always remove the temporary file, including when client creation or upload raises.
        temp_file.close()
        os.unlink(temp_file.name)

    return 'TSV file generated and added to "news.tsv" in Cloud Storage.'

In [7]:
# Needs Google Cloud Application Default Credentials; without them this call fails with
# DefaultCredentialsError, as the output below shows.
generate_tsv()

Main function requested.
Crawling started.
Translation started.
Dataset ready to go.


DefaultCredentialsError: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.