In [2]:
from dotenv import load_dotenv
import os
from pathlib import Path
from datetime import datetime, timedelta

import torch
from googleapiclient.discovery import build
import yt_dlp
import whisper
import re
from tqdm import tqdm

import google.generativeai as genai
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage

import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def _parse_prediction(text):
    """Extract (prediction, reason, score) from the text of a saved answer file.

    Each of the first three lines must contain a ``:``; the part after the
    first colon (stripped) is taken as the value. The third value is converted
    to ``int``.

    Raises:
        IndexError: a line is missing or has no colon.
        ValueError: the score value is not an integer literal.
    """
    lines = text.split('\n')
    # maxsplit=1 keeps any further colons that belong to the value itself.
    predict, reason, score = (lines[i].split(':', 1)[1].strip() for i in range(3))
    return predict, reason, int(score)


# Collect the per-week prediction files of every stock code into one CSV per code.
df = pd.read_csv('./data_kr/video/뉴스 영상 수집본.csv', encoding='utf-8')
for raw_code in df["code"].unique():
    df_ = df[df["code"] == raw_code].reset_index(drop=True)
    if df_.empty:
        # Nothing matched (e.g. a NaN code never equals itself); no CSV to write.
        continue

    code = str(raw_code).zfill(6)
    # Set the output directory up front so the CSV written below never lands in
    # the previous code's directory when every row of this group is skipped.
    predict_dir = f'preprocessed_data/llm/predict_video/{df_["sector"].iloc[0]}/{code}/'
    os.makedirs(predict_dir, exist_ok=True)

    predict_list = []
    reason_list = []
    score_list = []

    for row in tqdm(df_.itertuples(), total=len(df_), desc=f"{raw_code}LLM predicting"):
        if pd.isna(row.url) or row.url == '':
            # Rows without a video URL carry no prediction at all.
            predict_list.append(None)
            reason_list.append(None)
            score_list.append(None)
            continue

        predict_dir = f'preprocessed_data/llm/predict_video/{row.sector}/{code}/'
        os.makedirs(predict_dir, exist_ok=True)
        filename = f'{row.year}-{row.quarter}-{str(row.month).zfill(2)}-{row.week}.txt'

        try:
            with open(f'{predict_dir}{filename}', "r", encoding="utf-8") as file:
                predict, reason, score = _parse_prediction(file.read())
        except (OSError, IndexError, ValueError):
            # Missing, unreadable or malformed file: fall back to the
            # "no prediction" placeholder row.
            predict, reason, score = "불가능", "관련 없음", 0

        # Append only after parsing fully succeeded or fully failed, so the
        # three lists always stay the same length.
        predict_list.append(predict)
        reason_list.append(reason)
        score_list.append(score)

    df_predict = df_.copy()
    df_predict["prediction"] = predict_list
    df_predict["reason"] = reason_list
    df_predict["score"] = score_list
    df_predict = df_predict[["year", "quarter", "month", "week", "code", "name", "sector", "upload_dt", "prediction", "reason", "score"]]
    df_predict.to_csv(f"{predict_dir}{code}.csv", index=False, encoding="utf-8")

120LLM predicting: 100%|████████████████████████████████████████████████████████████| 204/204 [00:00<00:00, 372.40it/s]
150LLM predicting: 100%|████████████████████████████████████████████████████████████| 204/204 [00:00<00:00, 317.35it/s]
660LLM predicting: 100%|████████████████████████████████████████████████████████████| 204/204 [00:00<00:00, 301.11it/s]
1120LLM predicting: 100%|███████████████████████████████████████████████████████████| 204/204 [00:00<00:00, 471.00it/s]
3490LLM predicting: 100%|███████████████████████████████████████████████████████████| 204/204 [00:00<00:00, 308.15it/s]
3550LLM predicting: 100%|███████████████████████████████████████████████████████████| 204/204 [00:00<00:00, 308.16it/s]
3570LLM predicting: 100%|██████████████████████████████████████████████████████████| 204/204 [00:00<00:00, 1372.47it/s]
4710LLM predicting: 100%|██████████████████████████████████████████████████████████| 204/204 [00:00<00:00, 2101.15it/s]
5930LLM predicting: 100%|███████████████