# Proof-of-Concept notepad

In [1]:
import requests
import json
import csv

from datetime import datetime, timezone, timedelta
from bs4 import BeautifulSoup
from time import sleep
from random import randint
from collections import namedtuple

# Record types used by the notebook: a scraped article, and the same article
# extended with the fields produced by the LLM summarization step.
Article = namedtuple("Article", "title content url pub_time section press")
SummerizedArticle = namedtuple(
    "SummerizedArticle",
    Article._fields + ("problem", "issue", "keyword", "tag"),
)

# Handles for the language model and its tokenizer; both start out unset.
tokenizer = None
model = None


## Wrapper

In [None]:
from llama_cpp import Llama
from transformers import AutoTokenizer

class LLM:
    """Wrapper around a llama.cpp model and its chat-template tokenizer.

    The model and tokenizer are stored on the class, so every instance shares
    one loaded copy. Each instance carries only its own sampling settings and
    system prompt. The shared objects are released when the last fully
    constructed instance is destroyed.
    """

    MODEL_ID = "MLP-KTLim/llama-3-Korean-Bllossom-8B-gguf-Q4_K_M"
    MODEL_PATH = "/home/gpp/src/model/llama3-korean-bllossom-8b/llama-3-Korean-Bllossom-8B-Q4_K_M.gguf"

    # Shared by all instances; loaded by the first __init__ that finds them unset.
    model: "Llama | None" = None
    tokenizer: "AutoTokenizer | None" = None

    # Number of fully constructed instances that are still alive.
    _live_instances: int = 0

    prompt: str = ""

    def __init__(self, verbose: bool=False, temperature: float=0.3, top_p: float=0.9, max_tokens: int=2048):
        """Load the shared model/tokenizer if needed and store sampling settings.

        Args:
            verbose: Forwarded to ``Llama(verbose=...)`` when the model is loaded.
            temperature: Sampling temperature passed to every generation call.
            top_p: ``top_p`` value passed to every generation call.
            max_tokens: ``max_tokens`` value passed to every generation call.
        """
        if LLM.model is None:
            LLM.model = Llama(
                model_path=LLM.MODEL_PATH,
                n_ctx=8192,
                n_gpu_layers=-1,
                verbose=verbose
            )

        if LLM.tokenizer is None:
            LLM.tokenizer = AutoTokenizer.from_pretrained(LLM.MODEL_ID)

        self.temperature = temperature
        self.top_p = top_p
        self.max_tokens = max_tokens

        self.prompt: str = ""

        # Count this instance only once loading succeeded, so __del__ on a
        # half-constructed object never releases state it does not own.
        LLM._live_instances += 1
        self._holds_reference = True

    def __del__(self):
        """Release the shared model/tokenizer when the last instance goes away.

        Clearing the class attributes unconditionally would break every other
        live instance (for example after ``llm = LLM()`` is re-run and the old
        object is collected), so the release is tied to the instance count.
        """
        if not getattr(self, "_holds_reference", False):
            return
        self._holds_reference = False

        LLM._live_instances -= 1
        if LLM._live_instances <= 0:
            LLM._live_instances = 0
            LLM.model = None
            LLM.tokenizer = None

    def set_prompt(self, prompt: str):
        """Set the system prompt used by subsequent ``generate`` calls."""
        self.prompt = prompt
        return

    def generate(self, instruction: str):
        """Generate a completion for ``instruction`` under the current system prompt.

        Args:
            instruction: The user message.

        Returns:
            The ``text`` of the first choice in the model response.

        Raises:
            ValueError: If the system prompt or the instruction is empty.
            RuntimeError: If the shared model or tokenizer is not loaded.
        """
        if not self.prompt:
            raise ValueError("prompt is not set.")

        if not instruction:
            raise ValueError("instruction is not set.")

        if LLM.model is None or LLM.tokenizer is None:
            raise RuntimeError("model is not loaded.")

        generation_kwargs = {
            "max_tokens": self.max_tokens,
            "stop": ["<|eot_id|>"],
            "top_p": self.top_p,
            "temperature": self.temperature,
        }

        messages = [
            {"role": "system", "content": f"{self.prompt}"},
            {"role": "user", "content": f"{instruction}"}
        ]

        # Render the chat messages to plain text; the model call takes a string.
        p = LLM.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            truncation=True
        )

        response = LLM.model(p, **generation_kwargs)
        return response["choices"][0]["text"]

# Create the wrapper with default settings; LLM.__init__ loads the shared
# model and tokenizer when they are not loaded yet.
llm: LLM = LLM()

In [None]:
"""
    We want to use the following press name as the ID for the database.
    It must be in English and contain no special characters and spaces.
    Please give me the converted journal IDs only.
"""

In [None]:
import requests

def ss():
    """Do nothing; an empty placeholder."""
    return None

## Register

In [None]:
# Fetch the EULA entries from the API and keep only their identifiers.
response = requests.get("https://alimexpress-api.rrkim.com/auth/eula")

eula_ids: list[str] = [entry["eulaId"] for entry in response.json()["data"]]

eula_ids

In [None]:
# Register the test account, agreeing to every EULA id collected in `eula_ids`.
signup_payload = {
    "userId": "LLM_TEST@test.com",
    "password": "LLM_TEST",
    "userNm": "LLM_TEST",
    "gender": "MALE",
    "birthDate": "2001-08-16",
    "interestTagIds": ["test_tag"],
    "agreedEulaIds": eula_ids,
}

response = requests.post(
    "https://alimexpress-api.rrkim.com/auth/sign-up",
    headers={"Content-Type": "application/json"},
    json=signup_payload,
)

# Show the raw response body so the outcome of the sign-up is visible.
print(response.text)

## get token

In [None]:
# Request an access token from the API's /auth/token endpoint.
# NOTE(review): URL and AUTH are not assigned in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel. Presumably URL is
# "https://alimexpress-api.rrkim.com" (the host used by the register cells)
# and AUTH is the login payload — confirm and define both before this cell.
response = requests.post(
    URL + "/auth/token",

    headers={
        "Content-Type": "application/json"
    },

    json=AUTH
)

# The token is read from the first element of the response's "data" list.
TOKEN: str = response.json()["data"][0]["token"]
TOKEN

## fetch news

In [None]:
from datetime import datetime, timedelta, timezone
import re

# Korea Standard Time, used for the listing cursor and for normalized timestamps.
KST = timezone(timedelta(hours=9))

NOW: str = datetime.now(tz=KST).strftime("%Y%m%d%H%M")
header: dict = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"}

sections = {
    # "politics" : 100,
    # "economy": 101,
    "social": 102,
    "life": 103,
    # "world": 104,
    # "science": 105 
}


def _normalize_pub_time(raw: str) -> str:
    """Convert a relative "N분전" (N minutes ago) label to a KST ISO-8601 string.

    Labels that do not end with "분전", or that contain no digits, are
    returned unchanged.
    """
    if not raw.endswith("분전"):
        return raw

    digits = "".join(re.findall(r"\d+", raw))
    if not digits:
        return raw

    published = datetime.now(timezone.utc) - timedelta(minutes=int(digits))
    return published.astimezone(KST).isoformat()


news = {section: [] for section in sections}

for section, section_id in sections.items():
    articles: list[Article] = []

    for page in range(2):
        # The listing endpoint answers with JSON that wraps a rendered HTML fragment.
        list_url = f"https://news.naver.com/section/template/SECTION_ARTICLE_LIST?sid={section_id}&sid2=&cluid=&pageNo={page}&date=&next={NOW}"
        list_response = requests.get(list_url, headers=header)
        listing = BeautifulSoup(json.loads(list_response.text)["renderedComponent"]["SECTION_ARTICLE_LIST"], "html")
        sleep(0.5 + randint(0, 100) * 0.1)

        for element in listing.find_all("li"):
            # Each list item links to the article page, which holds the full body text.
            url: str = element.select("a")[0]["href"].strip()
            response = requests.get(url, headers=header)

            pub_time: str = _normalize_pub_time(
                element.select(".sa_text_datetime")[0].text.strip()
            )

            articles.append(
                Article(
                    title=element.select("strong")[0].text.strip(),
                    content=BeautifulSoup(response.text, "html").select("#newsct_article")[0].text.strip(),
                    url=url,
                    # Store the normalized value, not the raw label re-read from the page.
                    pub_time=pub_time,
                    section=section,
                    press=element.select(".sa_text_press")[0].text.strip()
                )
            )

        # Random pause between listing pages.
        sleep(0.5 + randint(0, 30))

    news[section] = articles[:]
    print(section)


## save news data to csv file

In [21]:
# Write one CSV per section: a header row with the Article field names, then
# one row per article. The header is written directly instead of being
# inserted into news[section], so the in-memory lists stay header-free and
# re-running this cell does not duplicate it.
for section in sections:
    # newline="" lets the csv module control line endings itself.
    with open(f"./articles-{section}.csv", 'w', encoding="utf8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(Article._fields)
        writer.writerows(news[section])


## load news data from csv file

In [None]:
# Load the "life" section back from its CSV file into Article records.
# newline="" hands line-ending handling to the csv module, so line breaks
# inside quoted fields are read back as written.
with open("./articles-life.csv", 'r', encoding="utf8", newline="") as f:
    reader = csv.reader(f)
    # Skip the header row instead of parsing it into an Article and popping it.
    next(reader, None)
    # Only the first six columns are used; empty rows are ignored.
    news_life: list[Article] = [
        Article._make(row[:len(Article._fields)])
        for row in reader
        if row
    ]

news_life[:5]

## upload news

In [None]:
# Use the first loaded article as the sample; the /news upload cell reads it.
sample_news = news_life[0]
sample_news

In [12]:
# Unique press names found in the loaded articles.
press = {article.press for article in news_life}

# Sequential integer id -> press name, one entry per unique press. Enumerating
# the set itself is required: indexing news_life by position would only look
# at the first len(press) articles and could repeat or miss presses. Sorting
# keeps the numbering stable between runs.
presses: dict[int, str] = dict(enumerate(sorted(press)))

In [None]:
# List every press id next to its press name.
for press_id, press_name in presses.items():
    print(press_id, press_name)

In [44]:
# Trial request against the /journal endpoint with dummy values.
# The body is sent through `json=` so it is real JSON, matching the declared
# Content-Type; the previous hand-written string was not valid JSON.
response = requests.post(
    URL + "/journal",

    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {TOKEN}"
    },

    json={
        "journalNm": "tes",
        "journalId": "tet"
    }
)

In [None]:
# Show the body of the most recent response.
response.text

In [None]:
# Post one /journal request per entry in `presses` and print each outcome.
# NOTE(review): "journalNm" receives the dict key and "journalId" the press
# name — the two look swapped; confirm against the API contract.
journal_headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {TOKEN}",
}

for press_id, press_name in presses.items():
    response = requests.post(
        URL + "/journal",
        headers=journal_headers,
        json={"journalNm": press_id, "journalId": press_name},
    )

    print(response.status_code, response.text)

In [None]:
# Upload the sample article to the /news endpoint.
# The journal id and the single tag id are sent as empty strings.
news_payload = {
    "title": sample_news.title,
    "link": sample_news.url,
    "journalId": "",
    "publicationDate": sample_news.pub_time,
    "tagIds": [""],
}

response = requests.post(
    URL + "/news",
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {TOKEN}",
    },
    json=news_payload,
)

print(response.status_code, response.text)

## test LLM

In [None]:
from llama_cpp import Llama
from transformers import AutoTokenizer

# Drop the previous model/tokenizer bindings, if any, before loading again.
if model is not None:
    del model
if tokenizer is not None:
    del tokenizer

# The tokenizer is fetched by model id; the model weights come from a local GGUF file.
model_id = 'MLP-KTLim/llama-3-Korean-Bllossom-8B-gguf-Q4_K_M'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = Llama(
    model_path='/home/gpp/src/model/llama3-korean-bllossom-8b/llama-3-Korean-Bllossom-8B-Q4_K_M.gguf',
    n_ctx=8192,
    n_gpu_layers=-1,
    verbose=True
)

def apply_prompt(instruction: str) -> str:
    """Run one chat completion using the fixed news-summary system prompt.

    The system prompt and ``instruction`` are rendered into a single text
    prompt with the module-level ``tokenizer``'s chat template, passed to the
    module-level ``model``, and the text of the first choice is returned.
    """
    PROMPT = \
    """
    다음은 [뉴스 제목]과 [뉴스 본문]입니다.
    요약에는 "주제"와 "주요 내용"이 포함되어야 합니다.
    - "problem": 뉴스의 주요 문제를 설명하는 핵심 문장
    - "summerized": 뉴스의 요약, 50자 이상의 세 문장
    - "keyword": 뉴스의 핵심 키워드
    다음은 예시입니다. keyword는 반드시 명사형으로 된 한 문장으로 작성해야 합니다.

    {
        "problem": "기후 변화로 부산 지역 해수면 상승",
        "summerized": "최근 부산 지역 해수면이 0.5cm 상승했다. 부산대학교 해양과학과 조교수는 이를 기후 변화로 인한 결과로 보았다. 환경부와 국토교통부는 내달 합동 조사단을 꾸려 해수면 상승 원인을 찾기로 했다.",
        "keyword": "부산 지역 해수면 상승"
    }
    """

    # System message carries the fixed prompt; user message carries the input.
    chat = [
        dict(role="system", content=PROMPT),
        dict(role="user", content=f"{instruction}"),
    ]

    # Render to text rather than token ids; the model is called with a string.
    rendered = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
        truncation=True,
    )

    completion = model(
        rendered,
        max_tokens=4096,
        stop=["<|eot_id|>"],
        top_p=0.9,
        temperature=0.3,
    )
    return completion["choices"][0]["text"]

# Smoke-run the helper with an empty user message.
apply_prompt("")

In [None]:
from tqdm import tqdm

news_summarized: list[SummerizedArticle] = []

section = "생활 분야"

# System prompt requesting a JSON object with the keys read in the loop below.
# It is defined in this cell so the loop does not depend on whatever PROMPT
# another cell last left in the kernel.
PROMPT = \
f"""
다음은 {section} 뉴스의 [뉴스 제목]과 [뉴스 본문]입니다.
요약에는 "주제"와 "주요 내용"이 포함되어야 합니다.
출력은 JSON 형식으로 해주세요. 각 키는 "problem", "issue", "summerized", "keyword"입니다.
- "problem": 뉴스의 주요 문제를 설명하는 문장
- "summerized": 뉴스의 요약, 50자 이상의 세 문장
- "issue": 뉴스의 핵심 문장을 포함하는 완결된 한 문장
- "keyword": 뉴스의 핵심을 집약하는 한 단어, 7어절 이하 명사형 문장

출력은 JSON 형식으로 요청 드린 내용만 담아 주세요.
"""

generation_kwargs = {
    "max_tokens": 4096,
    "stop": ["<|eot_id|>"],
    "top_p": 0.9,
    "temperature": 0.3,
}

# Upper bound on generations per article, so malformed output cannot loop forever.
MAX_ATTEMPTS = 3

for article in tqdm(news_life):
    # Square brackets mark the sections of the instruction, so replace them in the data.
    title = article.title.replace("[", "<").replace("]", ">")
    content = article.content.replace("[", "<").replace("]", ">")

    instruction = f'''
    [뉴스 제목]\n
    {title}\n
    [뉴스 본문]\n
    {content}
    '''

    messages = [
        {"role": "system", "content": f"{PROMPT}"},
        {"role": "user", "content": f"{instruction}"}
    ]

    for _attempt in range(MAX_ATTEMPTS):
        # Render inside the loop so the retry hint added to `messages` after a
        # failed attempt actually reaches the model.
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            truncation=True
        )

        response = model(prompt, **generation_kwargs)
        generated = response["choices"][0]["text"]
        print(generated)

        try:
            response_json = json.loads(generated)

            summarized = SummerizedArticle(
                title=title,
                # The summary text; "problem" has its own field below.
                content=response_json['summerized'],
                url=article.url,
                pub_time=article.pub_time,
                section=article.section,
                press=article.press,
                problem=response_json['problem'],
                issue=response_json['issue'],
                keyword=response_json['keyword'],
                tag=article.section
            )
        except (json.JSONDecodeError, KeyError, TypeError):
            # Only malformed or incomplete model output is retried; anything
            # else (including KeyboardInterrupt) propagates.
            print("error!\r", end="")
            messages[1]["content"] = instruction + "\n출력은 JSON 형식이어야 합니다. 다시 시도하세요."
            continue

        news_summarized.append(summarized)
        break
    else:
        print(f"skipped after {MAX_ATTEMPTS} failed attempts: {article.url}")

    # Summarized news is the "article"; the original news text is the "news".
    # Keyword: the issue as classified by the LLM; stored on the summarized news.
    # Tag: the category the original news belongs to.

In [None]:
# Display the article at index 17.
news_life[17]

In [None]:
from tqdm import tqdm

section = "생활 분야"

# System prompt asking which age group and gender would be most interested.
PROMPT = f"""
다음은 {section} 뉴스의 [뉴스 제목]과 [뉴스 본문]입니다.
이 뉴스는 어떤 연령대, 어떤 성별의 사람들이 가장 관심을 가질까요?
"""

generation_kwargs = dict(
    max_tokens=4096,
    stop=["<|eot_id|>"],
    top_p=0.9,
    temperature=0.3,
)

# Runs on a single article (index 17) and prints the model's answer.
for article in tqdm([news_life[17]]):
    # Swap square brackets for angle brackets in the article text.
    bracket_map = str.maketrans("[]", "<>")
    title = article.title.translate(bracket_map)
    content = article.content.translate(bracket_map)

    instruction = \
    f'''
    [뉴스 제목]\n
    {title}\n
    [뉴스 본문]\n
    {content}
    '''

    messages = [
        dict(role="system", content=PROMPT),
        dict(role="user", content=instruction),
    ]

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        truncation=True,
    )

    response = model(prompt, **generation_kwargs)
    print(response["choices"][0]["text"])


In [None]:
# Display the summarized articles collected so far.
news_summarized

In [None]:
# Display the article at index 25.
news_life[25]

In [None]:
# Print the text of the first choice in the most recent `response`.
print(response["choices"][0]["text"])

In [None]:
# Display the most recent `response` object in full.
response