# 1. One video sample test 
> *setting the environment and making helper functions*

## 1.1 Import packages and prepare a sample json file

In [1]:
import json
import os
from tqdm import tqdm
from datetime import datetime

import boto3
from langchain_aws import ChatBedrock
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_tagging_chain
# from langchain.chains import LLMChain, SequentialChain
from langchain_core.runnables import RunnableSequence
from langchain_core.output_parsers import JsonOutputParser, CommaSeparatedListOutputParser, StrOutputParser

In [2]:
### Set a specific json file to analyze and set keywords between 'LITHIUM INVESTMENT' and 'LITHIUM SUPPLY CHAIN'
file_path = 'jsons/lithium supply chain/2018-01-08.json'
keyword = 'LITHIUM SUPPLY CHAIN'

## 1.2 Set helper functions

### 1.2.1 Get a full transcripts from one video

In [3]:
def get_full_caption(video):
    video_id = video['id']['videoId']

    text = " ".join([item['text'] for item in video['caption']])
    text = text.replace("[Music]","")
    text = text.replace("[음악]","")
    text = text[:760000]

    if len(text) < 10:
        text = "No caption available"
    return text

### 1.2.2 Make a Bedrock object

In [4]:
def call_bedrock(**kwargs):
    bedrock_runtime = boto3.client(
        service_name="bedrock-runtime",
        region_name="us-east-1",
    )
    # model_id = "anthropic.claude-3-sonnet-20240229-v1:0" # Claude 3 Sonnet
    # model_id = "anthropic.claude-3-haiku-20240307-v1:0" # Claude 3 Haiku
    # model_id = "meta.llama3-8b-instruct-v1:0" # Llama3-8b Instruct
    model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0"


    model_kwargs =  { 
        # Reference: https://docs.aws.amazon.com/bedrock/latest/userguide/inference-parameters.html
        "max_tokens": 10000,
        "temperature": 0.01,
        "top_p": 0.9
    }
    
    llm = ChatBedrock(
        client=bedrock_runtime,
        model_id=model_id,
        model_kwargs=model_kwargs,
    )

    return llm

### 1.2.3 Check whether the input text has a relevance to the subject or not

In [5]:
def check_relevance(llm, text:str, subject:str='LITHIUM MARKET', verbose=False) -> str:
    """get_intend _summary_

    Args:
        text (str): _description_
        args (_type_): _description_

    Returns:
        str: _description_
    """

    template = """

    <paper>
    {text}
    </paper>
    
    [Task instructions]
    here is a paper above. 
    read the paper and determine whether the content is related to the {subject}. \
    If the content is related to the {subject}, please **just** say ‘yes’,
    IF the content is not related to the {subject}, please **just** say ‘no’. \
    You must follow this format. \
    
    result: ‘yes’ \
    result: ‘no’ \

    result: 
    """

    prompt = PromptTemplate(template=template, input_variables=['subject', 'text'])
    chain = prompt | llm
    intend = chain.invoke({'subject':subject, 'text':text}).content
    YesOrNo = intend.lower().replace(' ','').replace('result:','')
    
    if verbose==False:
        return YesOrNo
    else:
        template2 = """
        <paper>
        {text}
        </paper>
        
        [Task instructions]
        here is a paper above. 
        read the paper and determine whether the content is related to the {subject}. \
        If the content is related to the {subject}, please provide the reason,
        IF the content is not related to the {subject}, please **just** say ‘No reason’. \
        You must follow this format. \
        
        result: ‘It's related to {subject} since ...’ \
        result: ‘no’ \

        result: 
        """

        prompt = PromptTemplate(template=template2, input_variables=['subject', 'text'])
        chain = prompt | llm
        intend = chain.invoke({'subject':subject, 'text':text}).content
        reason = intend.lower()
        return "Answer: " + YesOrNo + "\n" + "Reason: " + reason

### 1.2.4 Get a summary of text

In [6]:
def get_summary(llm, text: str, subject: str = 'LITHIUM MARKET') -> str:
    template = '''
    Here is an article. 
    Read the article carefully and summarise the information related to {subject} using the [Task instructions]:

    [Task instructions]
    1. if you have specific numerical information, please use it in your summary.
    2. Use proper nouns in their original language.
    3. Summarize the information related to {subject}, focusing on the main points.
    4. Please give me the only summary without your comment or unnecessary information.
    
    <paper>
    {text}
    </paper>
    '''
    prompt = PromptTemplate(template=template, input_variables=['subject', 'text'])
    chain = prompt | llm
    response = chain.invoke({'subject':subject, 'text':text}).content
    
    return response


### 1.2.5 Extract some related sentences from the text given that the keyword is provided

In [7]:
def extract_sentences_with_keyword(llm, text: str, keyword: str) -> str:
    """
    Extracts sentences containing a specific keyword from a given text and returns them as bullet points.
    """
    output_parser = JsonOutputParser()
    format_instructions = output_parser.get_format_instructions()
    
    template = '''
    Here is an article.
    Read the article carefully and extract sentences that contain the keyword "{keyword}".
    Present each sentence as a bullet point in the following format:

    [Task instructions]
    1. Extract only the sentences that contain the keyword "{keyword}".
    2. Ensure each bullet point contains only one sentence.
    3. Do not modify the original sentences.
    4. If there are no related sentences, write "No related sentences".
    5. Please provide your own reason for your answer and follow the format below.
    6. ** Most Important ** : Please provide only answer in json format without any additional comment of you.

    <paper>
    {text}
    </paper>

    {format_instructions}
    Make a dictionary with the key 'sentence' and value as a list of sentence.
    Also add key 'reason' and value as a list of reasons.
    For example {{'sentence': ['sentence1.', sentence2', 'sentence3', ...], 'reason': ['reason1', 'reason2', 'reason3', ...]}}
    If there's no related sentences, provide me {{'sentence': ["No related sentences"]}}
    '''
    
    prompt = PromptTemplate(
        template=template,
        input_variables=['keyword', 'text'],
        partial_variables={"format_instructions": format_instructions}
    )
    
    chain = prompt | llm | output_parser

    try:
        result = chain.invoke({"text": text, "keyword": keyword})
        sentences = result.get('sentence', [])
        reasons = result.get('reason', [])
        if sentences==[]:
            return "- No related sentences"
        answer_s = '\n'.join(f'- {sentence}' for sentence in sentences)
        answer_r = '\n'.join(f'- {reason}' for reason in reasons)
        return '* Sentences from the original transcripts: \n' + answer_s + '\n' + '* Reasons by LLM: \n' + answer_r
    except Exception as e:
        return str(e)


### 1.2.6 Test helper functions with one video sample

>check 10 videos in one json file whether each video has a relevance with the lithium investment or not

In [8]:
## temp
### Set a specific json file to analyze and set keywords between 'LITHIUM INVESTMENT' and 'LITHIUM SUPPLY CHAIN'
file_path = 'jsons/lithium supply chain/2023-06-16.json'
keyword = 'LITHIUM SUPPLY CHAIN'
subject = 'LITHIUM OR BATTERY MARKET'

In [9]:
sample_data = json.load(open(file_path))

In [10]:
for i in range(len(sample_data[keyword])):
    llm = call_bedrock()

    print(f'\n[Sample Video {i+1}: ]\n')
    sample_caption = get_full_caption(sample_data[keyword][i])
    summary = get_summary(llm, sample_caption, subject=subject)

    print('\t', summary)
    print('\n[RELATED to keyword?: ]', check_relevance(llm, sample_caption, subject=subject, verbose=True), '\n')
    print('==========================================================================')


[Sample Video 1: ]

	 Summary of information related to the lithium or battery market:

Nickel is a crucial component in electric vehicle (EV) batteries, prized for increasing the range of EVs. Indonesia has become the world's largest nickel producer, with about half of the global nickel currently mined there. By 2027, analysts predict that approximately 80% of the global nickel supply used in EV batteries will come from Indonesia. The country's nickel is attractive to car makers because it's found close to the surface, making extraction relatively cheap compared to deep underground mining in countries like Canada and Russia. However, the extraction process in Indonesia, often using high-pressure acid leaching, is about twice as carbon-intensive as methods used elsewhere. This process leaves behind large amounts of waste and poses environmental risks, particularly to pristine ecosystems and coral reefs surrounding the mining areas. Despite these concerns, Indonesia's government sees n

>now check all the functions with only one video samples

In [11]:
sample_caption = get_full_caption(sample_data[keyword][1])

In [12]:
print('[Original transcript]', '\n', sample_caption)

[Original transcript] 
 our commodity race continues I hope you're entertained guys I made eight videos about uranium I brought the biggest uranium buyer in the world here to the office and you heard what he said now uranium broke out is up by a lot and it followed this trend line exactly and to those who say that Commodities are boring because price movements are small remember that unlike altcoins Commodities you can definitely trade successfully with leverage there are wheat Futures with 500x leverage and there are commodity companies that can go 100x thousand X so boring it's just not the correct term congrats to everyone who paid attention and maybe took our course with the commodity scope now with cryptos dwelling and chopping in the wake of regulatory attacks from the SEC in the US I hope you see the power of true diversification and today we continue a few weeks ago I covered oil and gas today it's an episode I'm really looking forward to myself battery Metals given the volume 

In [13]:
llm = call_bedrock()
keyword = "LITHIUM SUPPLY CHAIN"

print(f'[Is this video related to {subject}?]')
print(check_relevance(llm, sample_caption, subject=subject, verbose=True), '\n')

print('=='*10)

print('[Summary of video]')
print(get_summary(llm, sample_caption, subject=subject), '\n')

print('=='*10)

print('[Supply related sentences]')
print(extract_sentences_with_keyword(llm, sample_caption, 'LITHIUM SUPPLY CHAIN'), '\n')

print('=='*10)

print('[Demand related sentences]')
print(extract_sentences_with_keyword(llm, sample_caption, 'LITHIUM DEMAND ON MARKET'), '\n')

print('=='*10)

print('[Investment related sentences]')
print(extract_sentences_with_keyword(llm, sample_caption, 'INVESTMENT ON LITHIUM'), '\n')

[Is this video related to LITHIUM OR BATTERY MARKET?]
Answer: 'yes'
Reason: it's related to lithium or battery market since the paper extensively discusses lithium and battery metals as part of the energy transition and electric vehicle (ev) industry. it covers topics such as:

1. the composition of ev batteries, including lithium, nickel, manganese, and cobalt.
2. lithium production, processing, and supply chain details.
3. the growing demand for lithium in batteries (80% of lithium production goes to batteries).
4. supply and demand projections for lithium, indicating potential shortages in the future.
5. discussion of other battery metals like copper and their roles in the battery and ev markets.

the paper provides a comprehensive overview of the lithium and battery metal markets, making it highly relevant to the topic. 

[Summary of video]
Here's a summary of the information related to the lithium and battery market from the article:

1. Lithium is a critical component in electric

# 2. Extract impact scores (under the construction)

## 2.1 Set a output schema (json)

In [14]:
schema = {
    "properties": {
        "target": {
            "type": "string",
            "enum": ["demand quantity", "supply quantity", "price"],
            "description": "Specifies whether the information relates to the increase or decrease of the demand quantity, supply quantity, or price of lithium for battery production."
        },
        "target_condition": {
            "type": "string",
            "enum": ["increase", "decrease", "neutral"],
            "description": "Describes whether the text indicates an increase, decrease, or neutral outlook for the specified target in the lithium market."
        },
        "target_reason": {
            "type": "string",
            "description": "Describes the reason behind the impact on the target. If there is no significant impact, state 'There is no significant/direct impact'."
        },
        "influential_keywords": {
            "type": "array",
            "items": {
                "type": "string"
            },
            # "description": "List the top keywords that have the greatest influence on the target's condition. Include up to three keywords. If there is no significant impact, state 'None'."
            "description": '''Finds and ranks up to the top five key-word chunks or phrasal verbs that have a strong impact on the target's situation. 
                            For example, if there's relationship such as A -> B, and B is target, A is influential keyword.
                            Or if there's relationship such as A -> B -> C, and C is target, A and B are influential keywords.
                            Please exclude target itself or equivalent words from the list.
                            This selection should be based on a qualitative assessment to determine if there is a direct influence or a causal relationship between the keyword and the market changes. 
                            Keywords should be chosen only if they are inferred to have a direct causal impact on the target’s increase, decrease, or stability. 
                            This process involves analyzing the context in which these keywords appear, assessing the strength of association, and considering the immediacy of their impact. 
                            If no keywords meet these criteria, state 'None'.
                            '''
        },


        "price_impact": {
            "type": "integer",
            "enum": [0, 1, 2],
            "description": "Describes the short-term impact on lithium prices. A higher score indicates a greater impact. If there is no significant/direct impact, the score should be close to 0."
        },
        "price_condition": {
            "type": "string",
            "enum": ["increase", "decrease", "neutral"],
            "description": "Describes whether the text indicates an increase, decrease, or neutral outlook for the price in the short-term lithium market."
        },
        "price_reason": {
            "type": "string",
            "description": "Describes the reason behind the price impact. If there is no significant impact, state 'There is no significant/direct impact'."
        },

        "demand_impact": {
            "type": "integer",
            "enum": [0, 1, 2],
            "description": "Describes the short-term impact on the demand quantity for lithium. A higher score indicates a greater impact. If there is no significant/direct impact, the score should be close to 0."
        },
        "demand_condition": {
            "type": "string",
            "enum": ["increase", "decrease", "neutral"],
            "description": "Describes whether the text indicates an increase, decrease, or neutral outlook for the market demand quantity in the short-term lithium market."
        },
        "demand_reason": {
            "type": "string",
            "description": "Describes the reason behind the demand impact. If there is no significant impact, state 'There is no significant/direct impact'."
        },

        "supply_impact": {
            "type": "integer",
            "enum": [0, 1, 2],
            "description": "Describes the short-term impact on the supply quantity of lithium. A higher score indicates a greater impact. If there is no significant/direct impact, the score should be close to 0."
        },
        "supply_condition": {
            "type": "string",
            "enum": ["increase", "decrease", "neutral"],
            "description": "Describes whether the text indicates an increase, decrease, or neutral outlook for the supply quantity in the short-term lithium market."
        },
        "supply_reason": {
            "type": "string",
            "description": "Describes the reason behind the supply impact. If there is no significant impact, state 'There is no significant/direct impact'."
        }
    },
    "required": ["target", "target_condition", "target_reason", "influential_keywords", "price_impact", "price_condition", "price_reason", "demand_impact", "demand_condition", "demand_reason", "supply_impact", "supply_condition", "supply_reason"]
}

In [15]:
# schema = {
#     "properties": {
#         "target": {
#             "type": "string",
#             "enum": ["demand quantity", "supply quantity", "price"],
#             "description": "Specifies whether the information relates to the increase or decrease of the demand quantity, supply quantity, or price of lithium for battery production."
#         },
#         "target_condition": {
#             "type": "string",
#             "enum": ["increase", "decrease"],
#             "description": "Describes whether the text indicates an increase, decrease outlook for the specified target in the lithium market."
#         },
#         "influential_keywords": {
#             "type": "array",
#             "items": {
#                 "type": "string"
#             },
#             # "description": "List the top keywords that have the greatest influence on the target's condition. Include up to three keywords. If there is no significant impact, state 'None'."
#             "description": '''Finds and ranks up to the top five key-word chunks or phrasal verbs that have a strong impact on the target's situation. 
#                             For example, if there's relationship such as A -> B, and B is target, A is influential keyword.
#                             Or if there's relationship such as A -> B -> C, and C is target, A and B are influential keywords.
#                             Please exclude target itself or equivalent words from the list.
#                             This selection should be based on a qualitative assessment to determine if there is a direct influence or a causal relationship between the keyword and the market changes. 
#                             Keywords should be chosen only if they are inferred to have a direct causal impact on the target’s increase, decrease, or stability. 
#                             This process involves analyzing the context in which these keywords appear, assessing the strength of association, and considering the immediacy of their impact. 
#                             If no keywords meet these criteria, state 'None'.
#                             '''
#         },


#         "price_impact": {
#             "type": "integer",
#             "enum": [1],
#             "description": "Describes the short-term impact on lithium prices. A higher score indicates a greater impact. If there is no significant/direct impact, the score should be close to 0."
#         },
#         "price_condition": {
#             "type": "string",
#             "enum": ["increase", "decrease"],
#             "description": "Describes whether the text indicates an increase, decrease outlook for the price in the short-term lithium market."
#         },
#         "demand_impact": {
#             "type": "integer",
#             "enum": [1],
#             "description": "Describes the short-term impact on the demand quantity for lithium. A higher score indicates a greater impact. If there is no significant/direct impact, the score should be close to 0."
#         },
#         "demand_condition": {
#             "type": "string",
#             "enum": ["increase", "decrease"],
#             "description": "Describes whether the text indicates an increase, decrease outlook for the market demand quantity in the short-term lithium market."
#         },
#         "supply_impact": {
#             "type": "integer",
#             "enum": [1],
#             "description": "Describes the short-term impact on the supply quantity of lithium. A higher score indicates a greater impact. If there is no significant/direct impact, the score should be close to 0."
#         },
#         "supply_condition": {
#             "type": "string",
#             "enum": ["increase", "decrease"],
#             "description": "Describes whether the text indicates an increase, decrease outlook for the supply quantity in the short-term lithium market."
#         },

#     },
#     "required": ["target", "target_condition", "influential_keywords", "price_impact", "price_condition", "demand_impact", "demand_condition", "supply_impact", "supply_condition"]
# }


## 2.2 Set a prompt to LLM and chain

In [16]:
# Define the prompt template
### This part should be revised by DG ###

prompt_template = """
Please remember that you must give me the output in JSON format according to the schema provided.
Never include any additional text in the output.

You are an analyst researching the electric vehicle market and battery-related industries.
The following information pertains to the lithium market for battery production and its short-term outlook (within the next three months).
Again, **Very important**: Please return only the JSON output without any additional text such as "Here is the output in JSON format according to the provided schema:"

You are required to analyze the text and provide a structured summary based on the following schema:
Here is the schema (guideline for the output):
{schema}

Please provide the output strictly in JSON format according to the schema provided.
**Very important**: Please return only the JSON output without any additional text such as "Here is the output in JSON format according to the provided schema:"

Input Text: 
{input_text}
"""

prompt = PromptTemplate(template=prompt_template, input_variables=['schema', 'input_text'])


In [17]:
# Use JsonOutputParser for the final chain
json_output_parser = JsonOutputParser()
output_parser = StrOutputParser()

# Create the runnable sequence
# chain = prompt | llm | json_output_parser
chain = prompt | llm | output_parser

## 2.3 Run a test

In [18]:
# # Example function to run the chain
# def run_chain(input_text, schema):
#     # Run the final chain
#     final_result = chain.invoke({"schema": schema, "input_text": input_text})
#     return final_result


In [19]:
# import json

# # Example function to run the chain with additional exception handling
# def run_chain(input_text, schema):
#     try:
#         # Run the final chain
#         final_result = chain.invoke({"schema": schema, "input_text": input_text})
        
#         # Attempt to parse the text to extract only the JSON part
#         # Find the first '{' and last '}' to isolate JSON
#         # print(final_result)
#         start_idx = final_result.find('{')
#         end_idx = final_result.rfind('}') + 1
#         json_text = final_result[start_idx:end_idx]
#         parsed_json = json.loads(json_text)
        
#         return parsed_json
#     except (ValueError, json.JSONDecodeError) as e:
#         raise ValueError(f"Error parsing the JSON output: {e}")

In [20]:
# # Define a default JSON object to use in case of errors
# default_json = {
#     "target": "None",
#     "target_condition": "neutral",
#     "target_reason": "There is no significant/direct impact",
#     "influential_keywords": ["None"],
#     "price_impact": 0,
#     "price_condition": "neutral",
#     "price_reason": "There is no significant/direct impact",
#     "demand_impact": 0,
#     "demand_condition": "neutral",
#     "demand_reason": "There is no significant/direct impact",
#     "supply_impact": 0,
#     "supply_condition": "neutral",
#     "supply_reason": "There is no significant/direct impact"
# }

# Define a default JSON object to use in case of errors
default_json = {
    "target": "None",
    "target_condition": "increase",
    "influential_keywords": ["None"],
    "price_impact": 0,
    "price_condition": "increase",
    "demand_impact": 0,
    "demand_condition": "increase",
    "supply_impact": 0,
    "supply_condition": "increase"
}

# Example function to run the chain with additional exception handling
def run_chain(input_text, schema):
    try:
        # Run the final chain
        final_result = chain.invoke({"schema": schema, "input_text": input_text})
        
        # Attempt to parse the text to extract only the JSON part
        start_idx = final_result.find('{')
        end_idx = final_result.rfind('}') + 1
        json_text = final_result[start_idx:end_idx]
        parsed_json = json.loads(json_text)
        
        return parsed_json
    except (ValueError, json.JSONDecodeError) as e:
        # Log the error
        print(f"Error parsing the JSON output: {e}")
        # Return the default JSON object
        return default_json

In [21]:
run_chain(sample_caption, schema)

{'target': 'supply quantity',
 'target_condition': 'increase',
 'target_reason': 'There is a growing demand for battery metals, particularly lithium and copper, due to the energy transition and increasing EV production. However, processing capacity is limited, especially in the West.',
 'influential_keywords': ['energy transition',
  'EV battery',
  'processing capacity',
  'China',
  'recycling'],
 'price_impact': 1,
 'price_condition': 'neutral',
 'price_reason': "While there's increasing demand, current supply and processing capacity are balanced in the short term. Lithium prices have recently decreased, and copper prices are range-bound.",
 'demand_impact': 2,
 'demand_condition': 'increase',
 'demand_reason': 'Growing demand for EVs and renewable energy technologies is driving up the need for battery metals, particularly lithium and copper.',
 'supply_impact': 1,
 'supply_condition': 'increase',
 'supply_reason': "While raw material supply is sufficient, there's a need to increase

# 2. Collecting all impact scores

In [29]:

keyword = "LITHIUM INVESTMENT"
folder_name = "lithium investment"
dict_condition = {"increase": 1, "decrease": -1, "stable": 0, 'neutral': 0}

# Collect all the file names in the directory
file_names = os.listdir(f'jsons/{folder_name}/')

# 
dict_result = {}
dict_mean_by_date = {}

### Processing each json file
for file in tqdm(file_names):
    json_file = json.load(open(f'jsons/{folder_name}/{file}'))
    date = file.split('.')[0]
    # print(date)

    
    lst_target= []
    lst_target_condition = []
    lst_target_reason = []
    lst_influential_keywords = []
    
    lst_price_impact = []
    lst_demand_impact = []
    lst_supply_impact = []

    ### Processing each video in the json file
    for idx, video in enumerate(json_file[keyword]):
        video_caption = get_full_caption(video)
        llm = call_bedrock()

        # Check relevance
        relevance = check_relevance(llm, video_caption, keyword)
        relevance = relevance.strip("'")
        # print('actual: ', relevance)

        if relevance != 'yes':
            # print('not related')
            continue
        else:
            result = run_chain(video_caption, schema)
            # print(result)
            lst_price_impact.append(result['price_impact'] * dict_condition[result['price_condition']])
            lst_demand_impact.append(result['demand_impact'] * dict_condition[result['demand_condition']])
            lst_supply_impact.append(result['supply_impact'] * dict_condition[result['supply_condition']])
            # print(lst_price_impact)

            lst_target.append(result['target'])
            lst_target_condition.append(result['target_condition'])
            lst_influential_keywords.append(result['influential_keywords'])
        
    dict_result[date] = {
        "target": lst_target,
        "target_condition": lst_target_condition,
        "influential_keywords": lst_influential_keywords,
        "price_impact": lst_price_impact,
        "demand_impact": lst_demand_impact,
        "supply_impact": lst_supply_impact
    }
    # print(dict_result)

    try:
        dict_mean_by_date[date] = {
            "price_impact": sum(lst_price_impact) / len(lst_price_impact),
            "demand_impact": sum(lst_demand_impact) / len(lst_demand_impact),
            "supply_impact": sum(lst_supply_impact) / len(lst_supply_impact)
        }
    except:
        dict_mean_by_date[date] = {
            "price_impact": 0,
            "demand_impact": 0,
            "supply_impact": 0
        }

 16%|█▋        | 55/335 [1:01:44<5:08:55, 66.20s/it]

Error parsing the JSON output: Expecting value: line 1 column 1 (char 0)


 18%|█▊        | 61/335 [1:08:34<4:54:38, 64.52s/it]

Error parsing the JSON output: Expecting value: line 1 column 1 (char 0)


 19%|█▉        | 64/335 [1:13:13<5:55:57, 78.81s/it]

Error parsing the JSON output: Expecting value: line 1 column 1 (char 0)


 24%|██▍       | 81/335 [1:30:17<4:12:19, 59.60s/it]

Error parsing the JSON output: Expecting value: line 1 column 1 (char 0)


 44%|████▍     | 149/335 [2:46:32<3:53:54, 75.45s/it]

Error parsing the JSON output: Expecting value: line 1 column 1 (char 0)
Error parsing the JSON output: Expecting value: line 1 column 1 (char 0)


 52%|█████▏    | 173/335 [3:13:02<3:11:35, 70.96s/it]

Error parsing the JSON output: Expecting value: line 1 column 1 (char 0)


 70%|███████   | 235/335 [4:20:35<1:43:02, 61.83s/it]

Error parsing the JSON output: Expecting value: line 1 column 1 (char 0)


 72%|███████▏  | 241/335 [4:27:20<1:52:58, 72.11s/it]

Error parsing the JSON output: Expecting value: line 1 column 1 (char 0)


100%|██████████| 335/335 [6:14:05<00:00, 67.00s/it]  


In [30]:
with open('investment_whole_output_cl3.5.json', 'w') as f:
    sorted_data = dict(sorted(dict_result.items(), key=lambda x: datetime.strptime(x[0], '%Y-%m-%d')))
    json.dump(sorted_data, f, indent=4)

with open('investment_mean_output_cl3.5.json', 'w') as f:
    sorted_data = dict(sorted(dict_mean_by_date.items(), key=lambda x: datetime.strptime(x[0], '%Y-%m-%d')))
    json.dump(sorted_data, f, indent=4)

In [None]:
# Feature Extarction
# entity_dict = {}
# for k in tqdm(file_list):
#     print(k)
#     score_list = []
#     reason_list = []
#     data = s3_read_json_file(bucket, k)
#     for video in tqdm(data['Ford EV']):
#         text = get_caption(video)
#         output = entity_chain.run(text[:75000])
#         try:
#             score_list.append(output['impact'] * condition_dict[output['condition']])
#         except:
#             score_list.append(0)
#         reason_list.append(output['impact reason'])
#     entity_dict[k] = {'score':score_list, 'reason':reason_list}
    
# entity_df = pd.DataFrame(entity_dict).T.reset_index()
# entity_df['date'] = entity_df['index'].apply(lambda x:x.split(".")[0].split("/")[-1])
# entity_df['year'] = entity_df['date'].apply(lambda x:int(x.split("-")[0]))
# entity_df['month'] = entity_df['date'].apply(lambda x:int(x.split("-")[1]))
# entity_df['day'] = entity_df['date'].apply(lambda x:int(x.split("-")[2]))

In [None]:
feature_dict = {}
for year in [2022, 2023]:
    for month in range(1, 13):
        month_df = entity_df[(entity_df['month']==month) & (entity_df['year']==year)].reset_index(drop=True)
        month_df['week'] = np.arange(month_df.shape[0])+1
        score_ = month_df['score'].values.sum()
        reason_ = month_df['reason'].values.sum()
        score = np.mean(score_)
        context = " ".join(reason_)
        reason = chain.invoke({'target':"FORD", 'context':context, 'score':score})['Impact Score Reason']
        feature_dict["{}_{}".format(year, month)] = [score, reason]

        plt.figure(figsize=(10, 3))
        plt.title("Yeaa={}, Month={}, Mean Impact Score={}, Sample={}".format(year, month, np.mean(score_), len(score_)))
        sns.distplot(score_)
        plt.show()

        plt.figure(figsize=(10, 3))
        plt.title("Change in distribution by Weekly")
        for idx, i in enumerate(month_df['score']):
            sns.distplot(i, hist=None, label="{} Week".format(idx+1))
        plt.legend()
        plt.show()

        plt.figure(figsize=(10, 3))
        plt.title("Distribution by Weekly")
        for idx, data_group in enumerate(month_df['score']):
            plt.subplot(1, len(month_df), idx + 1)
            sns.distplot(data_group)
            plt.xlim(-10, 10)
            plt.ylim(0, 0.2)
        plt.tight_layout()
        plt.show()

        print("=="*60)
    print("=="*60)
    print("=="*60)
    print("=="*60)

In [None]:
# def process_video_summary(self, shared_list, idx, video, key):
    
#     video_id = video['id']['videoId']
    
#     link = f'링크 : https://www.youtube.com/watch?v={video_id}\n'

#     text = " ".join([item['text'] for item in video['caption']])
#     text = text.replace("[Music]","")
#     text = text.replace("[음악]","")
#     text = text[:760000]

#     if len(text) < 10:
#         item = shared_list[idx]
#         item['summary'] = "subtitle is not available"
#         item['cod'] = "subtitle is not available"
#         item['event'] = []
#         item['news'] = []
#         shared_list[idx] = item
#         return video

#     intend = self._get_intend(text, key)

#     if 'no' in intend :
#         item = shared_list[idx]
#         item['summary'] = "Not Related to Electric Car"
#         item['cod'] = "Not Related to Electric Car"
#         item['event'] = []
#         item['news'] = []
#         shared_list[idx] = item
#         return video

#     # 본문을 Chunk 단위로 쪼갬
#     text_splitter = CharacterTextSplitter(
#         chunk_size=300000,     # 쪼개는 글자수
#         chunk_overlap=30000,   # 오버랩 글자수
#         length_function=len,
#     )

#     text = link + text
#     text = text_splitter.split_text(text)
#     docs = [Document(page_content=t) for t in text]
#     summary = self._get_summary(docs, key)
#     cod = self._get_cod(text, key)
    
#     item = shared_list[idx]
#     item['summary'] = summary
#     item['cod'] = cod
#     item['event'] = self._get_event(item)
#     item['news'] = self._get_news(item['event'], cod[-1]['Denser_Summary'], key)
#     # item['enhanced_summary'] = self._get_enhance(cod[-1]['Denser_Summary'], item['news'])
#     shared_list[idx] = item  

#     return video