In [1]:
import sys

sys.path.append('../src')

In [3]:
import torch
import transformers
from transformers import AutoModel, AutoTokenizer, pipeline, AutoModelForSequenceClassification
from dotenv import load_dotenv
import pandas as pd
import os
from dotenv import load_dotenv
import time
import re
import random
from tqdm import tqdm

from utils.util_fnc import *
from models.llm import LLM

In [4]:
load_dotenv()
huggingface_token = os.getenv("huggingface_token")
openai_key = os.getenv("open_ai_key")

In [5]:
config = load_yaml('../src/config/text_classification_config.yaml')
config.huggingface_token = huggingface_token
config.openai_key = openai_key
prompt_dict = load_yaml(os.path.join(config.root_dir, 'prompt', 'first_step_prompt.yaml'))

In [6]:
# dummy_news = pd.read_csv(os.path.join(config.root_dir, 'data', 'dummy_articles', 'articles.csv'), index_col=0)
ag_news = pd.read_csv(os.path.join(config.root_dir, 'data', 'ag_news.csv'), index_col=0)
automotive_news = pd.read_csv(os.path.join(config.root_dir, 'data', 'automotive_news.csv'), index_col=0)

In [7]:
input_text = dummy_text = pd.concat([ag_news, automotive_news])
input_text = input_text.reset_index(drop=True)

In [8]:
llm = LLM(config)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
result_dict = {}
cleaned_output_dict = {}

for index, row in input_text.iterrows():
    article = re.sub(r'[^\w\s]', '', row.text).strip()

    output, cleaned_output = llm.evaluate_text(article, prompt_dict)

    result_dict[index] = output
    cleaned_output_dict[index] = cleaned_output



In [12]:
true_indices = [index for index, value in result_dict.items() if value == True]

for index in true_indices:
    print(f"Index: {index},\nInput: {input_text.iloc[index].text}\nCleaned Output: {cleaned_output_dict[index]}\n")

Index: 23,
Cleaned Output: True This article relates to economic conditions which may affect the automative industry since they rely heavily on consumer spending power

Index: 32,
Input: Saudi Arabia to open up oil taps Saudi Arabia says it is ready to push an extra 1.3 million barrels a day of oil into the market, to help reverse surging prices.
Cleaned Output: Saudi Aramco has announced plans to increase its crude oil production capacity to  million barrelday by  which will have direct impact on global demand for gasoline and diesel fuel used in cars Therefore i judge this article as Related to Automotive Domain so my answer would be True

Index: 49,
Input: Quality Gets Swept Away Quality Distribution is hammered after reporting a large loss for the second quarter.
Cleaned Output: This article discusses financial performance and losses which are relevant to the auto industry since car companies need to manage their finances effectively to remain competitive True

Index: 68,
Input: BE

In [52]:
new_df = pd.DataFrame(columns=['Text', 'Automotive_domain', 'model_description_1', 'Hyundai_group', 'model_description_2', 'Mobis', 'model_description_3'])

for index, row in input_text.iterrows():
    article = re.sub(r'[^\w\s]', '', row.text).strip()
    output, cleaned_output = llm.evaluate_text(article, prompt_dict)
    
    # 첫 번째 모델에 의한 판별
    new_row = pd.DataFrame({'Text': [article], 'Automotive_domain': [output], 'model_description_1': [cleaned_output]})
    new_df = pd.concat([new_df, new_row])



In [60]:
# Automotive_domain이 True인 경우만 필터링
automotive_text = new_df[new_df['Automotive_domain'] == True]

for index, row in automotive_text.iterrows():
    article = row['Text']
    output, cleaned_output = llm.evaluate_text(article, second_prompt)
    
    # 두 번째 모델에 의한 판별 결과로 기존 DataFrame 업데이트
    new_df.at[index, 'Hyundai_group'] = output
    new_df.at[index, 'model_description_2'] = cleaned_output



In [61]:
new_df

Unnamed: 0,Text,Automotive_domain,model_description_1,Hyundai_group,model_description_2,Mobis,model_description_3
0,Wall St Bears Claw Back Into the Black Reuters...,False,This article does not mention anything specifi...,True,True This text relates to Kia which is part of...,,
0,Carlyle Looks Toward Commercial Aerospace Reut...,False,This article does not seem to be directly rela...,True,True This text relates to Kia which is part of...,,
0,Oil and Economy Cloud Stocks Outlook Reuters R...,False,Unrelated because there is no mention of cars ...,True,True This text relates to Kia which is part of...,,
0,Iraq Halts Oil Exports from Main Southern Pipe...,False,The article does not mention anything specific...,True,True This text relates to Kia which is part of...,,
0,Oil prices soar to alltime record posing new m...,False,This text does not relate to the auto mobile s...,True,True This text relates to Kia which is part of...,,
...,...,...,...,...,...,...,...
0,UK Scientists Allowed to Clone Human Embryos R...,False,This article does not mention anything about t...,True,True This text relates to Kia which is part of...,,
0,Russian Alien Spaceship Claims Raise Eyebrows ...,False,This article does not contain any information ...,True,True This text relates to Kia which is part of...,,
0,Hyundai Kia expect vehicle sales to jump 10 in...,False,This text is not directly related tp the autom...,True,True This text relates to Kia which is part of...,,
0,Hyundai Mobis sees billions in US revenue grow...,True,Based on my understanding this text discusses ...,True,True This text relates to Kia which is part of...,,
