In [38]:
import os
import tempfile 
import pandas as pd
from google.cloud import storage
from datetime import datetime, timezone, timedelta
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from googletrans import Translator

# Hugging Face Inference API endpoint for the DistilBART CNN summarisation model.
huggingface_API_URL = "https://api-inference.huggingface.co/models/sshleifer/distilbart-cnn-12-6"

# SECURITY: API tokens must never be committed with the notebook. The token is
# read from the HUGGINGFACE_API_TOKEN environment variable instead; the token
# that used to be hard-coded here is exposed and should be revoked.
# When the variable is unset no Authorization header is sent at all.
_huggingface_token = os.environ.get("HUGGINGFACE_API_TOKEN")
huggingface_headers = (
    {"Authorization": f"Bearer {_huggingface_token}"} if _huggingface_token else {}
)

def clean_text(text):
    """Normalise scraped article text into a single clean line.

    Steps, in order:
      1. drop every non-empty square-bracketed clause (e.g. a reporter byline
         such as ``[헤럴드경제=신현주 기자]``);
      2. turn newlines into spaces and delete backslashes;
      3. collapse any run of whitespace into one space;
      4. trim leading/trailing whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string.
    """
    without_brackets = re.sub(r'\[[^\]]+\]', '', text)
    flattened = without_brackets.replace('\n', ' ').replace('\\', '')
    return re.sub(r'\s+', ' ', flattened).strip()

def get_body_content(URL, timeout=10):
    """Download a news article page and return its cleaned body text.

    Args:
        URL: Address of the article page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without a
            timeout a stalled connection would block the whole crawl
            indefinitely; with it, ``requests`` raises a timeout exception.

    Returns:
        The article text passed through ``clean_text``, or the literal string
        ``'News content not found.'`` when the page contains no
        ``<article>`` element with class ``go_trans _article_content``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
    news = requests.get(URL, headers=headers, timeout=timeout)
    soup = BeautifulSoup(news.content, 'html.parser')

    news_element = soup.find('article', class_='go_trans _article_content')
    if not news_element:
        return 'News content not found.'

    # Remove these tags together with their contents so their text does not
    # end up in the body (presumably captions, emphasis and table cells).
    tags_to_extract = ['strong', 'em', 'b', 'td']
    for tag in tags_to_extract:
        for elem in news_element.find_all(tag):
            elem.extract()

    body = news_element.get_text(separator='\n')
    return clean_text(body)
    
def google_translate(text):
    """Translate ``text`` into English using a fresh googletrans ``Translator``.

    Args:
        text: The string to translate.

    Returns:
        The ``.text`` attribute of the translation result.
    """
    return Translator().translate(text, dest='en').text

def distilbart_summarize(text, translate='google'):
    """Summarise ``text`` with the hosted DistilBART model.

    Args:
        text: Text to summarise.
        translate: When equal to ``'google'`` (the default) the text is first
            translated to English via ``google_translate``; any other value
            sends the text unchanged.

    Returns:
        The ``summary_text`` field of the first element of the API response.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        RuntimeError: If the response body is not a non-empty list whose
            first item carries ``summary_text`` (e.g. an error object).
    """
    if translate == 'google':
        text = google_translate(text)

    response = requests.post(
        huggingface_API_URL,
        headers=huggingface_headers,
        json={"inputs": text},
        timeout=60,  # never hang forever on a stalled inference request
    )
    response.raise_for_status()

    payload = response.json()
    # Validate the shape explicitly so a malformed/err payload produces a
    # readable error instead of an opaque KeyError/TypeError on indexing.
    if not (
        isinstance(payload, list)
        and payload
        and isinstance(payload[0], dict)
        and 'summary_text' in payload[0]
    ):
        raise RuntimeError(f"Unexpected summarization response: {payload!r}")
    return payload[0]['summary_text']

def crawl_and_translate_news():
    """Crawl the Naver News section pages and build a translated news table.

    For every section id in ``category`` the section page is fetched, each
    ``<li class="sh_item _cluster_content">`` entry is read for its link and
    headline, and the linked article body is downloaded with
    ``get_body_content``. Titles are then translated with
    ``google_translate`` and bodies are translated + summarised with
    ``distilbart_summarize``.

    Returns:
        pandas.DataFrame with these columns, in this order: ``index``
        (``N0``, ``N1``, ...), ``date``, ``category``, ``url``, ``title``,
        ``body``, ``title_google_translated``,
        ``body_google_translated_distilbart_summarized``. The order matters:
        the TSV written from this frame is later read by column position.
    """
    category = {100: 'Politics', 101: 'Economics', 102: 'Social', 103: 'Life/Cultures', 104: 'World', 105: 'IT/Science'}
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}

    # Crawl date at UTC+9, computed once so every row of a run shares it.
    korean_time = datetime.now(timezone(timedelta(hours=9))).strftime('%Y-%m-%d')

    news_data = []
    for field, category_name in category.items():
        section_url = 'https://news.naver.com/main/main.naver?mode=LSD&mid=shm&sid1=' + str(field)
        news = requests.get(section_url, headers=headers, timeout=10)
        soup = BeautifulSoup(news.content, 'html.parser')

        # Iterate over the <li> elements and extract the URLs and titles
        for li in soup.find_all('li', class_='sh_item _cluster_content'):
            link = li.find('a')
            headline = li.find('a', class_='sh_text_headline')
            # Skip entries lacking a link or headline rather than aborting
            # the whole crawl on one malformed list item.
            if link is None or headline is None or not link.get('href'):
                continue

            article_url = link['href']
            news_data.append({
                'date': korean_time,
                'category': category_name,
                'url': article_url,
                'title': headline.text,
                'body': get_body_content(article_url),
            })

    # Explicit column names keep the frame well-formed even if nothing was crawled.
    df = pd.DataFrame(news_data, columns=['date', 'category', 'url', 'title', 'body'])

    # TODO: later, continue numbering from the last existing index (+1)
    # instead of always restarting at N0.
    df.index = [f'N{i}' for i in range(len(df))]
    df = df.reset_index()  # moves the N-ids into a leading 'index' column

    df['title_google_translated'] = df['title'].apply(google_translate)
    df['body_google_translated_distilbart_summarized'] = df['body'].apply(distilbart_summarize)
    return df

def generate_tsv():
    """Return the DataFrame produced by ``crawl_and_translate_news``.

    Despite the name, this function does not write any file itself; it only
    forwards the crawled frame to the caller.
    """
    return crawl_and_translate_news()

In [39]:
# Build the crawled/translated news table, then save it as a tab-separated
# file without the pandas row index.
df = generate_tsv()
df.to_csv(path_or_buf='news_predict.tsv', index=False, sep='\t')

KeyboardInterrupt: 

In [None]:
# Run the data_preprocess.py script as a shell command; its output appears below.
!python3 data_preprocess.py

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>

Process data for prediction
Parse news
Parse ../data/NewsNudge/news_predict.tsv


In [1]:
# Run the predict.py script as a shell command; its output appears below.
!python3 predict.py

Using device: cpu
Evaluating model NAML
Calculating vectors for news: 100%|███████████████| 4/4 [00:02<00:00,  1.92it/s]
Calculating vectors for users: 100%|██████████████| 1/1 [00:06<00:00,  6.63s/it]
Calculating probabilities: 100%|██████████████████| 1/1 [00:06<00:00,  6.65s/it]
{'Politics': 'N3', 'Economics': 'N10', 'Social': 'N29', 'Life/Cultures': 'N34', 'World': 'N40', 'IT/Science': 'N48'}


In [1]:
from predict import *

# NOTE(review): this cell builds a fresh Model(config) and neither loads a
# checkpoint nor calls model.eval() — confirm whether the star-imported
# `predict` module is expected to take care of that.
model = Model(config).to(device)

# --- News vectors -----------------------------------------------------------
news_dataset = NewsDataset('news_predict_parsed.tsv')
news_dataloader = DataLoader(news_dataset,
                             batch_size=config.batch_size,  # 256
                             shuffle=False,
                             num_workers=config.num_workers,  # 4
                             drop_last=False,
                             pin_memory=True)

# Maps news id -> encoded news vector; each id is encoded at most once.
news2vector = {}
# Inference only: no_grad keeps the cached vectors from holding autograd graphs.
with torch.no_grad():
    for minibatch in tqdm(news_dataloader,
                          desc="Calculating vectors for news"):
        news_ids = minibatch['id']
        # `news_id` (not `id`) so the builtin id() is not clobbered in the kernel.
        if any(news_id not in news2vector for news_id in news_ids):
            news_vector = model.get_news_vector(minibatch)
            for news_id, vector in zip(news_ids, news_vector):
                if news_id not in news2vector:
                    news2vector[news_id] = vector

# All-zero vector, same size as a real news vector, stored under 'PADDED_NEWS'.
news2vector['PADDED_NEWS'] = torch.zeros(
    list(news2vector.values())[0].size())

# --- User vectors -----------------------------------------------------------
user_dataset = UserDataset('behaviors.tsv', 'user2int.tsv')

user_dataloader = DataLoader(user_dataset,
                             batch_size=config.batch_size * 16,
                             shuffle=False,
                             num_workers=config.num_workers,
                             drop_last=False,
                             pin_memory=True)

# Maps a user's clicked-news string -> encoded user vector.
user2vector = {}
with torch.no_grad():
    for minibatch in tqdm(user_dataloader,
                          desc="Calculating vectors for users"):
        user_strings = minibatch["clicked_news_string"]
        if any(user_string not in user2vector for user_string in user_strings):
            # Stack the per-position news vectors, then swap the first two
            # axes before handing the tensor to the user encoder.
            clicked_news_vector = torch.stack([
                torch.stack([news2vector[x].to(device) for x in news_list],
                            dim=0)
                for news_list in minibatch["clicked_news"]
            ], dim=0).transpose(0, 1)
            user_vector = model.get_user_vector(clicked_news_vector)
            for user, vector in zip(user_strings, user_vector):
                if user not in user2vector:
                    user2vector[user] = vector

# --- Behaviors --------------------------------------------------------------
behaviors_dataset = BehaviorsDataset('behaviors.tsv')
behaviors_dataloader = DataLoader(behaviors_dataset,
                                  batch_size=1,
                                  shuffle=False,
                                  num_workers=config.num_workers)

Calculating vectors for news: 100%|██████████| 4/4 [00:02<00:00,  1.71it/s]
Calculating vectors for users: 100%|██████████| 1/1 [00:06<00:00,  6.96s/it]


In [2]:
# Score every candidate news item of each behaviors row against that row's
# user vector.
# NOTE(review): `prediction` is rebuilt on every iteration, so only the last
# row's scores survive the loop — fine for a single-row file, confirm otherwise.
for minibatch in tqdm(behaviors_dataloader, desc="Calculating probabilities"):
    # Element 0 of each impression entry is used as the candidate news id.
    news_index = [news[0] for news in minibatch['impressions']]
    candidate_news_vector = torch.stack(
        [news2vector[candidate_id] for candidate_id in news_index],
        dim=0,
    )
    user_vector = user2vector[minibatch['clicked_news_string'][0]]
    click_probability = model.get_prediction(candidate_news_vector, user_vector)

    y_pred = click_probability.tolist()
    prediction = {
        candidate_id: y_pred[position]
        for position, candidate_id in enumerate(news_index)
    }

Calculating probabilities: 100%|██████████| 1/1 [00:06<00:00,  6.80s/it]


In [3]:
# Load columns 0, 2, 6 and 7 of news_predict.tsv, replacing the file's own
# header row with the names below, and fill missing values with one space.
news = pd.read_table(
    'news_predict.tsv',
    header=0,
    usecols=[0, 2, 6, 7],
    quoting=csv.QUOTE_NONE,
    names=['id', 'category', 'title', 'abstract'],
).fillna(' ')

In [5]:
# For each category keep the id of the news item with the highest predicted
# click probability (the first one seen wins ties).
category_to_news = {}

for news_id, prediction_value in prediction.items():
    category = news.loc[news['id'] == news_id, 'category'].iloc[0]
    current_best = category_to_news.get(category)
    if current_best is None or prediction_value > prediction[current_best]:
        category_to_news[category] = news_id

In [6]:
# Display the selected news id for each category.
category_to_news

{'Politics': 'N3',
 'Economics': 'N10',
 'Social': 'N29',
 'Life/Cultures': 'N34',
 'World': 'N40',
 'IT/Science': 'N48'}