## Install selenium, webdriver

In [90]:
!pip install selenium
!pip install webdriver-manager
# !pip install WebDriverWait



## Import &  Check Data

In [4]:
import pandas as pd
import numpy as np
import time
import itertools # Use to extract nested lists
import requests
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from io import StringIO
from selenium.webdriver.support.ui import WebDriverWait

data_stocks_name = pd.read_csv('Stocks_Name.csv')
data_stocks_name.tail()

Unnamed: 0,Stocks_Name
918,YONG
919,YUASA
920,ZAA
921,ZEN
922,ZIGA


### ! Function check stock name  

In [6]:
# Check stock name in the data_stocks_name
def check_stock_exists_insensitive(stock_names):
    lowercase_stock_names = set(data_stocks_name['Stocks_Name'].str.lower())
    if isinstance(stock_names, str):
        stock_names = [stock_names]
    for stock_name in stock_names:
        found = stock_name.lower() in lowercase_stock_names
        print(f"Find stock in df    : {stock_name} ✅" if found else f"No Find stock in df : {stock_name} ❌")

check_stock_exists_insensitive(['aav', 'bbL', 'PTT','peeat'])

Find stock in df    : aav ✅
Find stock in df    : bbL ✅
Find stock in df    : PTT ✅
No Find stock in df : peeat ❌


### ! Function get financial

#### ---- Use requests ----

In [16]:
# https://stockanalysis.com/

def requests_financial_data(ticker):
    ticker = ticker.upper()
    start_time = time.time()
    financial_data_list = []
    report_types = {
        "Income Statement": ("financials", "Income"),
        "Balance Sheet": ("financials/balance-sheet", "BalanceSheet"),
        "Cash Flow Statement": ("financials/cash-flow-statement", "CashFlow") }

    for folder_name, (report_path, report_name_short) in report_types.items():
        time.sleep(np.random.randint(1, 6))
        url = f'https://stockanalysis.com/quote/bkk/{ticker}/{report_path}/'
        file_name = f"{ticker}_{report_name_short}.csv"
        
        # Create a folder if it doesn't exist yet
        folder_path = os.path.join(os.getcwd(), folder_name)  # Create a path for the folde
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        file_path = os.path.join(folder_path, file_name)

        try:
            response = requests.get(url)
            response.raise_for_status()  # Raise an exception for HTTP errors
            soup = BeautifulSoup(response.text, 'html.parser')
            table = soup.find('table', id='main-table')

            if table:
                df = pd.read_html(StringIO(str(table)))[0].iloc[:, :-1]
                if isinstance(df.columns, pd.MultiIndex):
                    df.columns = df.columns.get_level_values(0)
                financial_data_list.append(df)
                df.to_csv(file_path, index=False, encoding='utf-8')
                print(f" Time taken for {ticker} - {folder_name}: {time.time() - start_time:.2f} sec (Saved to {file_path})")
            else:
                print(f" Warning : Data table not found for {ticker} - {folder_name}")

        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL for {ticker} - {folder_name} : {e}")
        except ValueError as e:
            print(f"Error parsing table for {ticker} - {folder_name} : {e}")
    return

In [18]:
requests_financial_data('yyyyyy')



In [2]:
# import สั้นลง

import pandas as pd
import numpy as np
import time
import requests
import os
from bs4 import BeautifulSoup

def requests_financial_data(ticker):
    ticker = ticker.upper()
    start_time = time.time()

    report_types = {
        "Income Statement": ("financials", "Income"),
        "Balance Sheet": ("financials/balance-sheet", "BalanceSheet"),
        "Cash Flow Statement": ("financials/cash-flow-statement", "CashFlow")
    }

    for folder_name, (report_path, _) in report_types.items():
        time.sleep(np.random.randint(1, 6))  # Random delay to avoid rate-limiting
        url = f'https://stockanalysis.com/quote/bkk/{ticker}/{report_path}/'
        file_name = f"{ticker}_{folder_name.replace(' ', '')}.csv"
        
        # Create folder if it doesn't exist
        folder_path = os.path.join(os.getcwd(), folder_name)
        os.makedirs(folder_path, exist_ok=True)  # Simplified folder creation
        file_path = os.path.join(folder_path, file_name)

        try:
            response = requests.get(url)
            response.raise_for_status()  # Raise an exception for HTTP errors
            soup = BeautifulSoup(response.text, 'html.parser')
            table = soup.find('table', id='main-table')

            if table:
                df = pd.read_html(str(table))[0].iloc[:, :-1] 
                if isinstance(df.columns, pd.MultiIndex):
                    df.columns = df.columns.get_level_values(0)  
                df.to_csv(file_path, index=False, encoding='utf-8')  # Save to CSV
                print(f"Time taken for {ticker} - {folder_name}: {time.time() - start_time:.2f} sec (Saved to {file_path})")
            else:
                print(f"Warning: Data table not found for {ticker} - {folder_name}")

        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL for {ticker} - {folder_name}: {e}")
        except ValueError as e:
            print(f"Error parsing table for {ticker} - {folder_name}: {e}")

In [4]:
requests_financial_data('ttb')

  df = pd.read_html(str(table))[0].iloc[:, :-1]


Time taken for TTB - Income Statement: 5.89 sec (Saved to D:\Anaconda_Jupyter\DS Intern at Pi\Sprint01_ScrapFinStatement\Income Statement\TTB_IncomeStatement.csv)


  df = pd.read_html(str(table))[0].iloc[:, :-1]


Time taken for TTB - Balance Sheet: 9.73 sec (Saved to D:\Anaconda_Jupyter\DS Intern at Pi\Sprint01_ScrapFinStatement\Balance Sheet\TTB_BalanceSheet.csv)
Time taken for TTB - Cash Flow Statement: 11.51 sec (Saved to D:\Anaconda_Jupyter\DS Intern at Pi\Sprint01_ScrapFinStatement\Cash Flow Statement\TTB_CashFlowStatement.csv)


  df = pd.read_html(str(table))[0].iloc[:, :-1]


#### ---- Use selenium ----

In [12]:
def scrap_financial_data(ticker):
    start_time = time.time()
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    url_income = f'https://stockanalysis.com/quote/bkk/{ticker}/financials/' # Imcome
    try:
        driver.get(url_income)
        wait = WebDriverWait(driver, 30)
        table_element = wait.until(
            EC.presence_of_element_located((By.TAG_NAME, 'table')))
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table')
        df = pd.read_html(StringIO(str(table)))[0].iloc[:, :-1] if table else None

        if df is not None and isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.get_level_values(0)

        print(f"⌛ Time taken for {ticker} : {time.time() - start_time:.2f} sec")
        return df
    except Exception as e:
        print(f"Error scraping data for {ticker} : {e}")
        return None
    finally:
        driver.quit()

In [14]:
scrap_financial_data('bbl')

⌛ Time taken for bbl : 7.80 sec


Unnamed: 0,Fiscal Year,TTM,FY 2024,FY 2023,FY 2022,FY 2021,FY 2020
0,Interest Income on Loans,205827,208029,194365,139088,114313,112524
1,Interest Income on Investments,2288,2345,2330,2306,2252,2039
2,Total Interest Income,208116,210374,196695,141394,116564,114563
3,Interest Paid on Deposits,73441,74129,63504,36865,32156,35477
4,Net Interest Income,134674,136245,133191,104530,84408,79086
5,Net Interest Income Growth (YoY),-1.48%,2.29%,27.42%,23.84%,6.73%,5.67%
6,Gain (Loss) on Sale of Assets,1023,913.14,893.76,1891,519.2,734.13
7,Gain (Loss) on Sale of Investments,3606,890.88,-482.23,-1454,1225,2512
8,Gain (Loss) on Sale of Equity Investments,202.89,204.59,187.38,189.56,208.86,-14.49
9,Other Non-Interest Income,40276,37558,33698,33738,48180,36411
