In [1]:
# import os
# import sys
# PROJECT_ROOT = os.path.dirname(os.path.abspath(""))

# if PROJECT_ROOT not in sys.path:
#     sys.path.insert(0, PROJECT_ROOT)
# from scripts.extract import WorldBankExtractOperator
# from scripts.transform import SparkTransformOperator
# from scripts.loads import DatabaseLoaderService
# from scripts.load_wbapi_etl_config import ETLPipelineConfig


In [3]:
import requests
from bs4 import BeautifulSoup
import re

url = "https://www.cophieu68.vn/quote/financial_detail.php?id=vpb&type=year"
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
  </title>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <meta content="654849931255-ih074eactp9a01969m3et8i16vuai71r.apps.googleusercontent.com" name="google-signin-client_id"/>
  <meta content="MOqJliEGOjPuN6S6rRC_ivdy9sgJpQ_K0Sads9VpcTQ" name="google-site-verification">
   <meta content="" name="keywords">
    <meta content="" name="description">
     <link href="https://www.cophieu68.vn/css/colorbox.css" media="screen" rel="stylesheet" type="text/css"/>
     <link href="https://www.cophieu68.vn/css/screen.css" media="screen" rel="stylesheet" type="text/css"/>
     <link href="https://www.cophieu68.vn/css/mobile.css" media="screen" rel="stylesheet" type="text/css"/>
     <script data-auto-replace-svg="nest" src="https://www.cophieu68.vn/js/fontawesome.js">
     </script>
     <script src="https://www.cophieu68.vn/js/jquery.min.js" type="text/javascript">
     </script>
     <script src="h

In [6]:
import pandas as pd
from dataclasses import dataclass
from typing import List, Optional

# ==== Dataclass ====
@dataclass
class StockFinancialReport:
    symbol: str
    report_type: str         # "income" | "balance" | "cashflow"
    table_index: int         # bảng số mấy trong page
    data: pd.DataFrame       # dữ liệu báo cáo (raw DataFrame)
    
@dataclass
class IncomeStatementReport:
    symbol: str
    period: Optional[str]   # Năm/Quý, có thể None nếu chưa parse
    table_index: int
    data: pd.DataFrame  


# ==== Hàm crawl generic ====
def crawl_financial_report(symbol: str, report_type: str) -> Optional[List[StockFinancialReport]]:
    url = f"https://www.cophieu68.vn/quote/financial_detail.php?id={symbol.upper()}&type={report_type}"
    try:
        tables = pd.read_html(url, flavor="lxml")
        reports = []
        for i, df in enumerate(tables):
            reports.append(
                StockFinancialReport(
                    symbol=symbol.upper(),
                    report_type=report_type,
                    table_index=i,
                    data=df
                )
            )
        return reports
    except Exception as e:
        print(f"❌ Error fetching {report_type} for {symbol}: {e}")
        return None


# ==== Hàm crawl Income Statement ====
def crawl_income_statement(symbol: str) -> Optional[List[IncomeStatementReport]]:
    reports = crawl_financial_report(symbol, "income")
    if not reports:
        return None
    return [
        IncomeStatementReport(
            symbol=r.symbol,
            period=None,   # bạn có thể parse từ df.columns[0] nếu muốn
            table_index=r.table_index,
            data=r.data
        )
        for r in reports
    ]


# ==== Test thử với VPB ====
income_reports = crawl_income_statement("VPB")

if income_reports:
    print(f"Số bảng lấy được: {len(income_reports)}")
    print("5 dòng đầu tiên của bảng 1:")
    display(income_reports[0].data)
else:
    print("Không crawl được báo cáo.")


Số bảng lấy được: 2
5 dòng đầu tiên của bảng 1:


Unnamed: 0,Chỉ tiêu,Qúy 4 2024,Qúy 3 2024,Qúy 2 2024,Qúy 1 2024,Qúy 4 2023,Qúy 3 2023,Qúy 2 2023,Qúy 1 2023,Qúy 4 2022,...,Qúy 2 2022,Qúy 1 2022,Qúy 4 2021,Qúy 3 2021,Qúy 2 2021,Qúy 1 2021,Qúy 4 2020,Qúy 3 2020,Qúy 2 2020,Qúy 1 2020
0,Thu nhập lãi thuần,13192997,12155772,12408240.0,11323398.0,11041836.0,8836748,8762153.0,9533939.0,10282653.0,...,10465653.0,9887714.0,8522870.0,7474232.0,9231836.0,9119891.0,8739998.0,7883845.0,7700802.0,8021178.0
1,Thu nhập từ lãi và các khoản thu nhập tương tự,21401247,19607046,19757425.0,19345927.0,20430922.0,19323045,18774908.0,18028502.0,16655830.0,...,15446690.0,14239240.0,12498074.0,11504643.0,13486135.0,13338246.0,13348549.0,12883536.0,12881819.0,13247863.0
2,Chi phí lãi và các chi phí tương tự,-8208250,-7451274,-7349185.0,-8022529.0,-9389086.0,-10486297,-10012755.0,-8494563.0,-6373177.0,...,-4981037.0,-4351526.0,-3975204.0,-4030411.0,-4254299.0,-4218355.0,-4608551.0,-4999691.0,-5181017.0,-5226685.0
3,Lãi/Lỗ thuần từ hoạt động dịch vụ,1541890,1147565,1883037.0,1553831.0,1881068.0,1889361,1657042.0,1668354.0,1881147.0,...,1538007.0,1249219.0,1196076.0,790544.0,1084925.0,987486.0,1033041.0,940521.0,687414.0,695151.0
4,Thu nhập từ hoạt động dịch vụ,3520835,2866247,3652877.0,3161701.0,3566973.0,3002653,2830715.0,2786451.0,3112294.0,...,2450690.0,2099591.0,2065323.0,1472398.0,1716381.0,1629959.0,1719173.0,1605722.0,1402594.0,1439264.0
5,Chi phí hoạt động dịch vụ,-1978945,-1718682,-1769840.0,-1607870.0,-1685905.0,-1113292,-1173673.0,-1118097.0,-1231147.0,...,-912683.0,-850372.0,-869247.0,-681854.0,-631456.0,-642473.0,-686132.0,-665201.0,-715180.0,-744113.0
6,Lãi/Lỗ thuần từ hoạt động kinh doanh ngoại hối,232871,96258,193939.0,304172.0,-185079.0,-63648,-210616.0,-346630.0,-339979.0,...,-166609.0,-82911.0,-30337.0,-9619.0,7087.0,-43525.0,-87266.0,-126783.0,-92620.0,-538.0
7,Lãi/Lỗ thuần từ mua bán chứng khoán kinh doanh,236288,116898,40310.0,-32540.0,47520.0,88897,148543.0,95213.0,68282.0,...,-232373.0,-66352.0,13522.0,-11009.0,56343.0,-49977.0,-3919.0,-4644.0,16699.0,218330.0
8,Lãi/Lỗ thuần từ mua bán chứng khoán đầu tư,3650,17135,411663.0,37219.0,-44912.0,11887,252537.0,30520.0,-22317.0,...,348602.0,171642.0,784220.0,727317.0,1390387.0,248942.0,298983.0,255196.0,95862.0,520690.0
9,Lãi/Lỗ thuần từ hoạt động khác,2427055,1526232,1199843.0,224532.0,600707.0,495639,2171847.0,1377357.0,898022.0,...,1323555.0,7110283.0,584093.0,1158668.0,276368.0,788659.0,724190.0,520855.0,540314.0,450802.0
