# 1. One video sample test 
> *setting the environment and making helper functions*

## 1.1 Import packages and prepare a sample json file

In [1]:
import json
import os
from tqdm import tqdm
from datetime import datetime

import boto3
from langchain_aws import ChatBedrock
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_tagging_chain
# from langchain.chains import LLMChain, SequentialChain
from langchain_core.runnables import RunnableSequence
from langchain_core.output_parsers import JsonOutputParser, CommaSeparatedListOutputParser, StrOutputParser

In [2]:
### Set a specific json file to analyze and set keywords between 'LITHIUM INVESTMENT' and 'LITHIUM SUPPLY CHAIN'
today = datetime.today().strftime('%Y-%m-%d')
# type = ['normal', 'comma', 'and', 'or']
# query = ['gm EV', 'CATL Battery', 'LITHIUM SUPPLY CHAIN', 'EV Market News']
query = ['"SK On"', 'SK On', 'SK Battery', 'Battery SK']
query2 = ['"SK On" (Investment | America | Plant)', '"SK On" (Investment | America)',
            '"SK On" (Investment)', '"SK On" (America)', '"SK On" (Plant)', '"SK On" Battery', 'Battery "SK On"']
subject = 'electric vehicle or battery market'

## 1.2 Set helper functions

### 1.2.1 Get a full transcripts from one video

In [3]:
def get_full_caption(video):
    video_id = video['id']['videoId']

    text = " ".join([item['text'] for item in video['caption']])
    text = text.replace("[Music]","")
    text = text.replace("[음악]","")
    text = text[:760000]

    if len(text) < 10:
        text = "No caption available"
    return text

In [4]:
def get_stats(video):
    stats = video['statistics']
    return stats

### 1.2.2 Make a Bedrock object

In [5]:
def call_bedrock(**kwargs):
    bedrock_runtime = boto3.client(
        service_name="bedrock-runtime",
        region_name="us-east-1",
    )
    # model_id = "anthropic.claude-3-sonnet-20240229-v1:0" # Claude 3 Sonnet
    model_id = "anthropic.claude-3-haiku-20240307-v1:0" # Claude 3 Haiku
    # model_id = "meta.llama3-8b-instruct-v1:0" # Llama3-8b Instruct


    model_kwargs =  { 
        # Reference: https://docs.aws.amazon.com/bedrock/latest/userguide/inference-parameters.html
        "max_tokens": 10000,
        "temperature": 0.01,
        "top_p": 0.9
    }
    
    llm = ChatBedrock(
        client=bedrock_runtime,
        model_id=model_id,
        model_kwargs=model_kwargs,
    )

    return llm

### 1.2.3 Check whether the input text has a relevance to the subject or not

In [6]:
def check_relevance(llm, text:str, subject:str='LITHIUM MARKET', verbose=False) -> str:
    template = """

    <paper>
    {text}
    </paper>
    
    [Task instructions]
    here is a paper above. 
    read the paper and determine whether the content is related to the {subject}. \
    If the content is related to the {subject}, please **just** say ‘yes’,
    IF the content is not related to the {subject}, please **just** say ‘no’. \
    You must follow this format. \
    
    result: ‘yes’ \
    result: ‘no’ \

    result: 
    """

    prompt = PromptTemplate(template=template, input_variables=['subject', 'text'])
    chain = prompt | llm
    intend = chain.invoke({'subject':subject, 'text':text}).content
    YesOrNo = intend.lower().replace(' ','').replace('result:','')
    
    if verbose==False:
        return YesOrNo
    else:
        template2 = """
        <paper>
        {text}
        </paper>
        
        [Task instructions]
        here is a paper above. 
        read the paper and determine whether the content is related to the {subject}. \
        If the content is related to the {subject}, please provide the reason,
        IF the content is not related to the {subject}, please **just** say ‘No reason’. \
        You must follow this format. \
        
        result: ‘It's related to {subject} since ...’ \
        result: ‘no’ \

        result: 
        """

        prompt = PromptTemplate(template=template2, input_variables=['subject', 'text'])
        chain = prompt | llm
        intend = chain.invoke({'subject':subject, 'text':text}).content
        reason = intend.lower()
        return "Answer: " + YesOrNo + "\n" + "Reason: " + reason

### 1.2.4 Get a summary of text

In [7]:
def get_summary(llm, text: str, subject: str = 'LITHIUM MARKET') -> str:
    template = '''
    Here is an article. 
    Read the article carefully and summarise the information related to {subject} using the [Task instructions]:

    [Task instructions]
    1. if you have specific numerical information, please use it in your summary.
    2. Use proper nouns in their original language.
    3. Summarize the information related to {subject}, focusing on the main points.
    4. Please give me the only summary without your comment or unnecessary information.
    
    <paper>
    {text}
    </paper>
    '''
    prompt = PromptTemplate(template=template, input_variables=['subject', 'text'])
    chain = prompt | llm
    response = chain.invoke({'subject':subject, 'text':text}).content
    
    return response


### 1.2.5 Extract some related sentences from the text given that the keyword is provided

In [8]:
def extract_sentences_with_keyword(llm, text: str, keyword: str) -> str:
    """
    Extracts sentences containing a specific keyword from a given text and returns them as bullet points.
    """
    output_parser = JsonOutputParser()
    format_instructions = output_parser.get_format_instructions()
    
    template = '''
    Here is an article.
    Read the article carefully and extract sentences that contain the keyword "{keyword}".
    Present each sentence as a bullet point in the following format:

    [Task instructions]
    1. Extract only the sentences that contain the keyword "{keyword}".
    2. Ensure each bullet point contains only one sentence.
    3. Do not modify the original sentences.
    4. If there are no related sentences, write "No related sentences".
    5. Please provide your own reason for your answer and follow the format below.
    6. ** Most Important ** : Please provide only answer in json format without any additional comment of you.

    <paper>
    {text}
    </paper>

    {format_instructions}
    Make a dictionary with the key 'sentence' and value as a list of sentence.
    Also add key 'reason' and value as a list of reasons.
    For example {{'sentence': ['sentence1.', sentence2', 'sentence3', ...], 'reason': ['reason1', 'reason2', 'reason3', ...]}}
    If there's no related sentences, provide me {{'sentence': ["No related sentences"]}}
    '''
    
    prompt = PromptTemplate(
        template=template,
        input_variables=['keyword', 'text'],
        partial_variables={"format_instructions": format_instructions}
    )
    
    chain = prompt | llm | output_parser

    try:
        result = chain.invoke({"text": text, "keyword": keyword})
        sentences = result.get('sentence', [])
        reasons = result.get('reason', [])
        if sentences==[]:
            return "- No related sentences"
        answer_s = '\n'.join(f'- {sentence}' for sentence in sentences)
        answer_r = '\n'.join(f'- {reason}' for reason in reasons)
        return '* Sentences from the original transcripts: \n' + answer_s + '\n' + '* Reasons by LLM: \n' + answer_r
    except Exception as e:
        return str(e)


In [22]:
### Set a specific json file to analyze and set keywords between 'LITHIUM INVESTMENT' and 'LITHIUM SUPPLY CHAIN'
today = datetime.today().strftime('%Y-%m-%d')
# type = ['normal', 'comma', 'and', 'or']
# query = ['gm EV', 'CATL Battery', 'LITHIUM SUPPLY CHAIN', 'EV Market News']
# query = ['"SK On"', 'SK On', 'SK Battery', 'Battery SK']
# query = ['"SK On" (Investment | America | Plant)', '"SK On" (Investment | America)',
#             '"SK On" (Investment)', '"SK On" (America)', '"SK On" (Plant)', '"SK On" Battery', 'Battery "SK On"']
query = ['"SK On" (투자 | 미국 | 공장)', '"SK On" (투자 | 미국)',
            '"SK On" (투자)', '"SK On" (미국)', '"SK On" (공장)', '"SK On" 배터리', '배터리 "SK On"']
subject = 'electric vehicle or battery market'

In [23]:
data_list = [] 
relevance_list = []

for i in range(len(query)):

    file_path = 'query_test_2.3/' + today + '_trial3' +'.json'
    with open(file_path, 'r') as f:
        data = json.load(f)
        # print(data)

        num_relevance = 0
        keyword = query[i]
        
        videos_count = len(data.get(keyword, []))
        
        for video in tqdm(data[keyword]):
            text = get_full_caption(video)
            # stats = get_stats(video)
            
            llm = call_bedrock()
            relevance = check_relevance(llm, text, subject, verbose=False)

            if relevance == 'yes':
                num_relevance += 1
                relevance_list.append({
                    'idx': num_relevance,
                    'Keyword': keyword,
                    # 'Type': type[j],
                    'Video ID': video['id']['videoId'],
                    'View Count': video['statistics']['viewCount'],
                    'Like Count': video['statistics']['likeCount'],
                    'Comment Count': video['statistics']['commentCount']
                })


        if len(data[keyword]) == 0:
            # print(f"Keyword: {keyword}")
            # print(f"Number of videos searched: 0")
            # print(f"Number of relevant videos: 0/0")
            data_list.append({
                'Keyword': keyword,
                # 'Type': type[j],
                'Videos Searched': 0,
                'Relevant Videos': "0/0"
            })
            continue

        # print(f"Keyword(Type): {keyword}({type[j]})")
        # print(f"Number of videos searched: {len(data[keyword])}")
        # print(f"Number of relevant videos: {num_relevance}/{len(data[keyword])}")
        data_list.append({
            'Keyword': keyword,
            # 'Type': type[j],
            'Videos Searched': videos_count,
            'Relevant Videos': f"{num_relevance}/{videos_count}"
        })

100%|██████████| 10/10 [00:18<00:00,  1.90s/it]
100%|██████████| 10/10 [00:19<00:00,  1.91s/it]
100%|██████████| 10/10 [00:19<00:00,  1.92s/it]
100%|██████████| 10/10 [00:19<00:00,  1.98s/it]
100%|██████████| 10/10 [00:15<00:00,  1.51s/it]
100%|██████████| 10/10 [00:18<00:00,  1.87s/it]
100%|██████████| 10/10 [00:16<00:00,  1.64s/it]


In [25]:
import pandas as pd 
from tabulate import tabulate

# 리스트를 DataFrame으로 변환
df = pd.DataFrame(data_list)

# Relevant Videos 컬럼을 float로 변경
df['Relevant Videos Ratio'] = df['Relevant Videos'].str.split('/').apply(lambda x: float(x[0])/float(x[1]) if x[1] != '0' else 0)
# 소수점 4자리까지 표시
df['Relevant Videos Ratio'] = df['Relevant Videos Ratio'].apply(lambda x: round(x, 4))
col_to_rename = {
    'Relevant Videos': 'Relevant Videos/Total Videos',
    'Relevant Videos Ratio': 'Relevant Videos Ratio(%)'
}
df.rename(columns=col_to_rename, inplace=True)

# DataFrame 출력
print(tabulate(df, headers='keys', tablefmt='psql', showindex=True, stralign='center', numalign='center'))

+----+------------------------------+-------------------+--------------------------------+----------------------------+
|    |           Keyword            |  Videos Searched  |  Relevant Videos/Total Videos  |  Relevant Videos Ratio(%)  |
|----+------------------------------+-------------------+--------------------------------+----------------------------|
| 0  | "SK On" (투자 | 미국 | 공장) |        10         |              5/10              |            0.5             |
| 1  |    "SK On" (투자 | 미국)     |        10         |              5/10              |            0.5             |
| 2  |        "SK On" (투자)        |        10         |              5/10              |            0.5             |
| 3  |        "SK On" (미국)        |        10         |              4/10              |            0.4             |
| 4  |        "SK On" (공장)        |        10         |              1/10              |            0.1             |
| 5  |        "SK On" 배터리        |        10         |  

In [50]:
df2 = pd.DataFrame(relevance_list).set_index('idx')
print(tabulate(df2, headers='keys', tablefmt='fancy_grid', showindex=True, stralign='center', numalign='center'))

╒═══════╤════════════════════════╤════════╤═════════════╤══════════════╤══════════════╤═════════════════╕
│  idx  │        Keyword         │  Type  │  Video ID   │  View Count  │  Like Count  │  Comment Count  │
╞═══════╪════════════════════════╪════════╪═════════════╪══════════════╪══════════════╪═════════════════╡
│   1   │         gm EV          │ normal │ 1y5giRY31n4 │     6929     │      95      │       70        │
├───────┼────────────────────────┼────────┼─────────────┼──────────────┼──────────────┼─────────────────┤
│   2   │         gm EV          │ normal │ QzJYe_uKgfc │     7617     │     646      │       77        │
├───────┼────────────────────────┼────────┼─────────────┼──────────────┼──────────────┼─────────────────┤
│   3   │         gm EV          │ normal │ D-8xPXvpqac │     1689     │      27      │       43        │
├───────┼────────────────────────┼────────┼─────────────┼──────────────┼──────────────┼─────────────────┤
│   4   │         gm EV          │ normal │ EM

### 1.2.6 Test helper functions with one video sample

>check 10 videos in one json file whether each video has a relevance with the lithium investment or not

In [31]:
## temp
### Set a specific json file to analyze and set keywords between 'LITHIUM INVESTMENT' and 'LITHIUM SUPPLY CHAIN'
file_path = 'jsons/lithium supply chain/2018-01-15.json'
keyword = 'LITHIUM SUPPLY CHAIN'
subject = 'LITHIUM OR BATTERY MARKET'

In [32]:
sample_data = json.load(open(file_path))

In [108]:
for i in range(len(sample_data[keyword])):
    llm = call_bedrock()

    print(f'\n[Sample Video {i+1}: ]\n')
    sample_caption = get_full_caption(sample_data[keyword][i])
    summary = get_summary(llm, sample_caption, subject=subject)

    print('\t', summary)
    print('\n[RELATED to keyword?: ]', check_relevance(llm, sample_caption, subject=subject, verbose=True), '\n')
    print('==========================================================================')


[Sample Video 1: ]

	 The article discusses a 30 MW, 120 MWh lithium-ion battery storage project in Escondido, California, operated by San Diego Gas & Electric (SDG&E). The key points related to the lithium or battery market are:

1. The lithium-ion batteries were selected for this project as the most cost-effective solution based on the use case.
2. The battery storage project helps integrate renewable energy sources, such as rooftop solar, into SDG&E's energy portfolio, which currently stands at 43% renewable energy and is expected to reach above 49% by 2020.
3. The batteries can provide grid regulation and stabilization services, such as spin and non-spin ancillary services, due to their ability to respond quickly.
4. SDG&E has an additional 70 MW of battery storage projects awaiting approval from the California Public Utilities Commission, and there is a possibility of adding up to 160 MW more as part of a separate initiative.

[RELATED to keyword?: ] ['yes', "reason: it's related

>now check all the functions with only one video samples

In [125]:
sample_caption = get_full_caption(sample_data[keyword][2])

In [126]:
print('[Original transcript]', '\n', sample_caption)

[Original transcript] 
 hi I'm at CES 2018 showing you how Panasonic is reimagining the omni-channel customer experience through personalization and deep learning at the RFID check out Panasonic is using facial recognition to make purchasing super easy for you let's take a look we step up smile for the camera and all of your information comes up Purchase History and if this is your first purchase that's okay Panasonic is using artificial intelligence to make smart recommendations personalized for you so according to my profile this is a wine that's recommended simply place the goods on the counter all of your information appears on the screen price year vintage complete description of the product and this is what's really cool guys Panasonic is displacing time and space I can have this item delivered to the restaurant delivered to my house or I can simply grab and go I'm gonna take this with me here we have the parallel link robot and what's unique about this is Panasonic is actually a

In [128]:
llm = call_bedrock()
keyword = "LITHIUM SUPPLY CHAIN"

print(f'[Is this video related to {subject}?]')
print(check_relevance(llm, sample_caption, subject=subject, verbose=True), '\n')

print('=='*10)

print('[Summary of video]')
print(get_summary(llm, sample_caption, subject=subject), '\n')

print('=='*10)

print('[Supply related sentences]')
print(extract_sentences_with_keyword(llm, sample_caption, 'LITHIUM SUPPLY CHAIN'), '\n')

print('=='*10)

print('[Demand related sentences]')
print(extract_sentences_with_keyword(llm, sample_caption, 'LITHIUM DEMAND ON MARKET'), '\n')

print('=='*10)

print('[Investment related sentences]')
print(extract_sentences_with_keyword(llm, sample_caption, 'INVESTMENT ON LITHIUM'), '\n')

[Is this video related to LITHIUM OR BATTERY MARKET?]
Answer: no
Reason: no reason.

the content of the paper is not related to the lithium or battery market. the paper discusses panasonic's use of facial recognition, artificial intelligence, and deep learning to enhance the customer experience at a retail checkout, as well as their use of a parallel link robot to automate and personalize product manufacturing. there is no mention of lithium or batteries in the text. 

[Summary of video]
The article discusses Panasonic's use of technology to enhance the customer experience at CES 2018. Key points related to the lithium or battery market:

1. Panasonic is using facial recognition technology to make the purchasing process easier for customers, allowing them to access their purchase history and receive personalized product recommendations.

2. Panasonic is also using artificial intelligence and deep learning to automate the manufacturing process, specifically with the Parallel Link Robot,

# 2. Extract impact scores (under the construction)

## 2.1 Set a output schema (json)

In [21]:
schema = {
    "properties": {
        "target": {
            "type": "string",
            "enum": ["demand quantity", "supply quantity", "price"],
            "description": "Specifies whether the information relates to the increase or decrease of the demand quantity, supply quantity, or price of lithium for battery production."
        },
        "target_condition": {
            "type": "string",
            "enum": ["increase", "decrease", "neutral"],
            "description": "Describes whether the text indicates an increase, decrease, or neutral outlook for the specified target in the lithium market."
        },
        "target_reason": {
            "type": "string",
            "description": "Describes the reason behind the impact on the target. If there is no significant impact, state 'There is no significant/direct impact'."
        },
        "influential_keywords": {
            "type": "array",
            "items": {
                "type": "string"
            },
            # "description": "List the top keywords that have the greatest influence on the target's condition. Include up to three keywords. If there is no significant impact, state 'None'."
            "description": '''Finds and ranks up to the top five key-word chunks or phrasal verbs that have a strong impact on the target's situation. 
                            For example, if there's relationship such as A -> B, and B is target, A is influential keyword.
                            Or if there's relationship such as A -> B -> C, and C is target, A and B are influential keywords.
                            Please exclude target itself or equivalent words from the list.
                            This selection should be based on a qualitative assessment to determine if there is a direct influence or a causal relationship between the keyword and the market changes. 
                            Keywords should be chosen only if they are inferred to have a direct causal impact on the target’s increase, decrease, or stability. 
                            This process involves analyzing the context in which these keywords appear, assessing the strength of association, and considering the immediacy of their impact. 
                            If no keywords meet these criteria, state 'None'.
                            '''
        },


        "price_impact": {
            "type": "integer",
            "enum": [0, 1, 2, 3, 4, 5],
            "description": "Describes the short-term impact on lithium prices. A higher score indicates a greater impact. If there is no significant/direct impact, the score should be close to 0."
        },
        "price_condition": {
            "type": "string",
            "enum": ["increase", "decrease", "neutral"],
            "description": "Describes whether the text indicates an increase, decrease, or neutral outlook for the price in the short-term lithium market."
        },
        "price_reason": {
            "type": "string",
            "description": "Describes the reason behind the price impact. If there is no significant impact, state 'There is no significant/direct impact'."
        },

        "demand_impact": {
            "type": "integer",
            "enum": [0, 1, 2, 3, 4, 5],
            "description": "Describes the short-term impact on the demand quantity for lithium. A higher score indicates a greater impact. If there is no significant/direct impact, the score should be close to 0."
        },
        "demand_condition": {
            "type": "string",
            "enum": ["increase", "decrease", "neutral"],
            "description": "Describes whether the text indicates an increase, decrease, or neutral outlook for the market demand quantity in the short-term lithium market."
        },
        "demand_reason": {
            "type": "string",
            "description": "Describes the reason behind the demand impact. If there is no significant impact, state 'There is no significant/direct impact'."
        },

        "supply_impact": {
            "type": "integer",
            "enum": [0, 1, 2, 3, 4, 5],
            "description": "Describes the short-term impact on the supply quantity of lithium. A higher score indicates a greater impact. If there is no significant/direct impact, the score should be close to 0."
        },
        "supply_condition": {
            "type": "string",
            "enum": ["increase", "decrease", "neutral"],
            "description": "Describes whether the text indicates an increase, decrease, or neutral outlook for the supply quantity in the short-term lithium market."
        },
        "supply_reason": {
            "type": "string",
            "description": "Describes the reason behind the supply impact. If there is no significant impact, state 'There is no significant/direct impact'."
        }
    },
    "required": ["target", "target_condition", "target_reason", "influential_keywords", "price_impact", "price_condition", "price_reason", "demand_impact", "demand_condition", "demand_reason", "supply_impact", "supply_condition", "supply_reason"]
}


## 2.2 Set a prompt to LLM and chain

In [22]:
# Define the prompt template
### This part should be revised by DG ###

prompt_template = """
Please remember that you must give me the output in JSON format according to the schema provided.
Never include any additional text in the output.

You are an analyst researching the electric vehicle market and battery-related industries.
The following information pertains to the lithium market for battery production and its short-term outlook (within the next three months).
Again, **Very important**: Please return only the JSON output without any additional text such as "Here is the output in JSON format according to the provided schema:"

You are required to analyze the text and provide a structured summary based on the following schema:
Here is the schema (guideline for the output):
{schema}

Please provide the output strictly in JSON format according to the schema provided.
**Very important**: Please return only the JSON output without any additional text such as "Here is the output in JSON format according to the provided schema:"

Input Text: 
{input_text}
"""

prompt = PromptTemplate(template=prompt_template, input_variables=['schema', 'input_text'])


In [23]:
# Use JsonOutputParser for the final chain
json_output_parser = JsonOutputParser()
output_parser = StrOutputParser()

# Create the runnable sequence
# chain = prompt | llm | json_output_parser
chain = prompt | llm | output_parser

## 2.3 Run a test

In [53]:
# # Example function to run the chain
# def run_chain(input_text, schema):
#     # Run the final chain
#     final_result = chain.invoke({"schema": schema, "input_text": input_text})
#     return final_result


In [25]:
# import json

# # Example function to run the chain with additional exception handling
# def run_chain(input_text, schema):
#     try:
#         # Run the final chain
#         final_result = chain.invoke({"schema": schema, "input_text": input_text})
        
#         # Attempt to parse the text to extract only the JSON part
#         # Find the first '{' and last '}' to isolate JSON
#         # print(final_result)
#         start_idx = final_result.find('{')
#         end_idx = final_result.rfind('}') + 1
#         json_text = final_result[start_idx:end_idx]
#         parsed_json = json.loads(json_text)
        
#         return parsed_json
#     except (ValueError, json.JSONDecodeError) as e:
#         raise ValueError(f"Error parsing the JSON output: {e}")

In [27]:
# Define a default JSON object to use in case of errors
default_json = {
    "target": "None",
    "target_condition": "neutral",
    "target_reason": "There is no significant/direct impact",
    "influential_keywords": ["None"],
    "price_impact": 0,
    "price_condition": "neutral",
    "price_reason": "There is no significant/direct impact",
    "demand_impact": 0,
    "demand_condition": "neutral",
    "demand_reason": "There is no significant/direct impact",
    "supply_impact": 0,
    "supply_condition": "neutral",
    "supply_reason": "There is no significant/direct impact"
}

# Example function to run the chain with additional exception handling
def run_chain(input_text, schema):
    try:
        # Run the final chain
        final_result = chain.invoke({"schema": schema, "input_text": input_text})
        
        # Attempt to parse the text to extract only the JSON part
        start_idx = final_result.find('{')
        end_idx = final_result.rfind('}') + 1
        json_text = final_result[start_idx:end_idx]
        parsed_json = json.loads(json_text)
        
        return parsed_json
    except (ValueError, json.JSONDecodeError) as e:
        # Log the error
        print(f"Error parsing the JSON output: {e}")
        # Return the default JSON object
        return default_json

In [28]:
run_chain(sample_caption, schema)

{'target': 'demand quantity',
 'target_condition': 'increase',
 'target_reason': 'Increased demand for lithium-ion batteries for industrial applications, particularly for powering large class one forklifts.',
 'influential_keywords': ['lithium-ion battery solution',
  'power large class one forklifts',
  'developer of advanced lithium batteries',
  'industrial applications',
  'class 1 lithium-ion battery pack'],
 'price_impact': 3,
 'price_condition': 'increase',
 'price_reason': 'Increased demand for lithium-ion batteries for industrial applications may lead to higher prices for lithium.',
 'demand_impact': 4,
 'demand_condition': 'increase',
 'demand_reason': 'Increased demand for lithium-ion batteries to power large class one forklifts and other industrial applications.',
 'supply_impact': 3,
 'supply_condition': 'increase',
 'supply_reason': 'Flex Power Holdings is developing and expanding its production of lithium-ion batteries for industrial applications, which may increase the 

# 2. Collecting all impact scores

In [31]:

keyword = "LITHIUM INVESTMENT"
folder_name = "lithium investment"
dict_condition = {"increase": 1, "decrease": -1, "neutral": 0}

# Collect all the file names in the directory
file_names = os.listdir(f'jsons/{folder_name}/')

# 
dict_result = {}
dict_mean_by_date = {}

### Processing each json file
for file in tqdm(file_names):
    json_file = json.load(open(f'jsons/{folder_name}/{file}'))
    date = file.split('.')[0]
    # print(date)

    
    lst_target= []
    lst_target_condition = []
    lst_target_reason = []
    lst_influential_keywords = []
    
    lst_price_impact = []
    lst_demand_impact = []
    lst_supply_impact = []

    ### Processing each video in the json file
    for idx, video in enumerate(json_file[keyword]):
        video_caption = get_full_caption(video)
        llm = call_bedrock()

        # Check relevance
        relevance = check_relevance(llm, video_caption, keyword)
        if relevance != 'yes':
            continue
        else:
            result = run_chain(video_caption, schema)
            # print(result)
            lst_price_impact.append(result['price_impact'] * dict_condition[result['price_condition']])
            lst_demand_impact.append(result['demand_impact'] * dict_condition[result['demand_condition']])
            lst_supply_impact.append(result['supply_impact'] * dict_condition[result['supply_condition']])
            # print(lst_price_impact)

            lst_target.append(result['target'])
            lst_target_condition.append(result['target_condition'])
            lst_target_reason.append(result['target_reason'])
            lst_influential_keywords.append(result['influential_keywords'])
        
    dict_result[date] = {
        "target": lst_target,
        "target_condition": lst_target_condition,
        "target_reason": lst_target_reason,
        "influential_keywords": lst_influential_keywords,
        "price_impact": lst_price_impact,
        "demand_impact": lst_demand_impact,
        "supply_impact": lst_supply_impact
    }
    # print(dict_result)

    try:
        dict_mean_by_date[date] = {
            "price_impact": sum(lst_price_impact) / len(lst_price_impact),
            "demand_impact": sum(lst_demand_impact) / len(lst_demand_impact),
            "supply_impact": sum(lst_supply_impact) / len(lst_supply_impact)
        }
    except:
        dict_mean_by_date[date] = {
            "price_impact": 0,
            "demand_impact": 0,
            "supply_impact": 0
        }

100%|██████████| 335/335 [3:20:53<00:00, 35.98s/it]  


In [32]:
with open('investment_whole_output.json', 'w') as f:
    sorted_data = dict(sorted(dict_result.items(), key=lambda x: datetime.strptime(x[0], '%Y-%m-%d')))
    json.dump(sorted_data, f, indent=4)

with open('investment_mean_output.json', 'w') as f:
    sorted_data = dict(sorted(dict_mean_by_date.items(), key=lambda x: datetime.strptime(x[0], '%Y-%m-%d')))
    json.dump(sorted_data, f, indent=4)

In [None]:
# Feature Extarction
# entity_dict = {}
# for k in tqdm(file_list):
#     print(k)
#     score_list = []
#     reason_list = []
#     data = s3_read_json_file(bucket, k)
#     for video in tqdm(data['Ford EV']):
#         text = get_caption(video)
#         output = entity_chain.run(text[:75000])
#         try:
#             score_list.append(output['impact'] * condition_dict[output['condition']])
#         except:
#             score_list.append(0)
#         reason_list.append(output['impact reason'])
#     entity_dict[k] = {'score':score_list, 'reason':reason_list}
    
# entity_df = pd.DataFrame(entity_dict).T.reset_index()
# entity_df['date'] = entity_df['index'].apply(lambda x:x.split(".")[0].split("/")[-1])
# entity_df['year'] = entity_df['date'].apply(lambda x:int(x.split("-")[0]))
# entity_df['month'] = entity_df['date'].apply(lambda x:int(x.split("-")[1]))
# entity_df['day'] = entity_df['date'].apply(lambda x:int(x.split("-")[2]))

In [None]:
feature_dict = {}
for year in [2022, 2023]:
    for month in range(1, 13):
        month_df = entity_df[(entity_df['month']==month) & (entity_df['year']==year)].reset_index(drop=True)
        month_df['week'] = np.arange(month_df.shape[0])+1
        score_ = month_df['score'].values.sum()
        reason_ = month_df['reason'].values.sum()
        score = np.mean(score_)
        context = " ".join(reason_)
        reason = chain.invoke({'target':"FORD", 'context':context, 'score':score})['Impact Score Reason']
        feature_dict["{}_{}".format(year, month)] = [score, reason]

        plt.figure(figsize=(10, 3))
        plt.title("Yeaa={}, Month={}, Mean Impact Score={}, Sample={}".format(year, month, np.mean(score_), len(score_)))
        sns.distplot(score_)
        plt.show()

        plt.figure(figsize=(10, 3))
        plt.title("Change in distribution by Weekly")
        for idx, i in enumerate(month_df['score']):
            sns.distplot(i, hist=None, label="{} Week".format(idx+1))
        plt.legend()
        plt.show()

        plt.figure(figsize=(10, 3))
        plt.title("Distribution by Weekly")
        for idx, data_group in enumerate(month_df['score']):
            plt.subplot(1, len(month_df), idx + 1)
            sns.distplot(data_group)
            plt.xlim(-10, 10)
            plt.ylim(0, 0.2)
        plt.tight_layout()
        plt.show()

        print("=="*60)
    print("=="*60)
    print("=="*60)
    print("=="*60)

In [None]:
# def process_video_summary(self, shared_list, idx, video, key):
    
#     video_id = video['id']['videoId']
    
#     link = f'링크 : https://www.youtube.com/watch?v={video_id}\n'

#     text = " ".join([item['text'] for item in video['caption']])
#     text = text.replace("[Music]","")
#     text = text.replace("[음악]","")
#     text = text[:760000]

#     if len(text) < 10:
#         item = shared_list[idx]
#         item['summary'] = "subtitle is not available"
#         item['cod'] = "subtitle is not available"
#         item['event'] = []
#         item['news'] = []
#         shared_list[idx] = item
#         return video

#     intend = self._get_intend(text, key)

#     if 'no' in intend :
#         item = shared_list[idx]
#         item['summary'] = "Not Related to Electric Car"
#         item['cod'] = "Not Related to Electric Car"
#         item['event'] = []
#         item['news'] = []
#         shared_list[idx] = item
#         return video

#     # 본문을 Chunk 단위로 쪼갬
#     text_splitter = CharacterTextSplitter(
#         chunk_size=300000,     # 쪼개는 글자수
#         chunk_overlap=30000,   # 오버랩 글자수
#         length_function=len,
#     )

#     text = link + text
#     text = text_splitter.split_text(text)
#     docs = [Document(page_content=t) for t in text]
#     summary = self._get_summary(docs, key)
#     cod = self._get_cod(text, key)
    
#     item = shared_list[idx]
#     item['summary'] = summary
#     item['cod'] = cod
#     item['event'] = self._get_event(item)
#     item['news'] = self._get_news(item['event'], cod[-1]['Denser_Summary'], key)
#     # item['enhanced_summary'] = self._get_enhance(cod[-1]['Denser_Summary'], item['news'])
#     shared_list[idx] = item  

#     return video