In [1]:
import pandas as pd
df  = pd.read_csv("final_financial_statements.csv", sep ="\t")

In [2]:
df.shape

(8096, 10)

In [24]:
pwd

'C:\\Users\\user\\Desktop\\SSID\\Project_ValuX\\Financial_Statement_Analysis\\model\\output_data'

In [31]:
# df[
#     (df['company_code'] == 'WSS')
#     & (df['report_date'] == 2015)
#     & (df['report_type'] == 'Balance Sheet')
# ]

In [10]:
df_load.to_parquet("final_financial_statements.parquet")

In [4]:
pwd

'C:\\Users\\user\\Desktop\\SSID\\Project_ValuX\\Financial_Statement_Analysis\\model\\output_data'

In [6]:
df_load = pd.read_parquet("all_financial_statements.parquet")

In [7]:
df_load.shape

(1408945, 10)

In [9]:
df_load.report_type.value_counts()

report_type
Balance Sheet          5236
Cash Flow Statement    1848
Income Statement       1012
Name: count, dtype: int64

In [9]:
financial_statement_schemas = ['company_code', 'exchange', 'company_name', 'industry', 'report_type', 'report_date', 'account', 'value', 'account_vi', 'account_en']
df_load.columns = financial_statement_schemas

In [11]:
df_load = pd.read_parquet('C:\\Users\\user\\Desktop\\SSID\\Project_ValuX\\Financial_Statement_Analysis\\apps\\data\\Financial_Statement__Full_Company_L10Y.parquet')

In [15]:
df_load['company_code']

0          AAA
1          AAA
2          AAA
3          AAA
4          AAA
          ... 
1408963    YEG
1408964    YEG
1408965    YEG
1408966    YEG
1408967    YEG
Name: company_code, Length: 1408968, dtype: object

In [1]:
import os
import logging
import datetime
import json
import argparse
import pandas as pd
from vnstock import Listing
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# ==============================================================================
# CONFIGURATION - THAY ĐỔI CÁC THAM SỐ TẠI ĐÂY
# ==============================================================================
CONFIG = {
    "start_year": 2015,
    "max_workers": 4,  # Số luồng chạy song song
    "output_dir": "output_data",
    "company_list_filename": "company_list.csv",
    "raw_data_filename": "raw_financials.parquet",
    "final_data_filename": "final_financial_statements.parquet",
    "final_data_filename_csv": "final_financial_statements.csv",
    "mapping_filepath": "account_mapping.json"
}
financial_statement_schemas = ['company_code', 'exchange', 'company_name', 'industry', 'report_type', 'report_date', 'account', 'value', 'account_vi', 'account_en']

# ==============================================================================
# LOGIC SCRAPING (CLASS) - PHIÊN BẢN ĐÃ SỬA LỖI
# ==============================================================================
class CafeFScraper:
    """Scrapes financial statements for a single company from cafef.vn."""
    BASE_URL = "https://s.cafef.vn/bao-cao-tai-chinh/{}/{}/{}/0/0/0/0/bao-cao-tai-chinh.chn"
    REPORT_TYPES = ['bsheet', 'incsta', 'cashflow']

    def __init__(self, symbol: str, start_year: int):
        self.symbol = symbol.upper()
        self.start_year = start_year
        self.end_year = datetime.datetime.now().year

    def _fetch_report_table(self, report_type: str, year: int) -> pd.DataFrame | None:
        """
        Fetches the entire financial report table for a given year.
        Returns the specific DataFrame table, or None if it fails.
        """
        url = self.BASE_URL.format(self.symbol, report_type, year)
        try:
            web_data = pd.read_html(url)
            # The actual data is usually in the 5th table (index 4)
            table = web_data[4] 
            # Check if the table is meaningful (has more than 1 row and 1 column)
            if table.shape[0] > 1 and table.shape[1] > 1:
                return table
            return None
        except Exception as e:
            logging.debug(f"Could not fetch or parse table from {url}. Error: {e}")
            return None

    def scrape_all_reports(self) -> pd.DataFrame | None:
        """
        Scrapes all three report types (BS, IS, CF) for all years for the company.
        This version is more resilient to missing data for the most recent year.
        """
        company_reports = []
        report_map = {'bsheet': 'Balance Sheet',
                      'incsta': 'Income Statement',
                      'cashflow': 'Cash Flow Statement',
                      'cashflowdirect': 'Direct Cash Flow Statement'
                     }
        
        for report_type in self.REPORT_TYPES:
            # Step 1: Fetch all available yearly tables first
            yearly_tables = {}
            for year in range(self.start_year, self.end_year + 1):
                table = self._fetch_report_table(report_type, year)
                if table is not None:
                    yearly_tables[year] = table
            
            # Step 2: If no data was found for any year, skip this report type
            if not yearly_tables:
                logging.debug(f"No data found for {self.symbol} - {report_type} in any year.")
                continue

            # Step 3: Get categories from the first successfully fetched table
            # Categories (account names) are in the first column (iloc[:, 0])
            first_available_table = list(yearly_tables.values())[0]
            categories = first_available_table.iloc[:, 0]

            # Step 4: Extract the actual data column from each fetched table
            # Data is in the 5th column (iloc[:, 4])
            yearly_data = {
                year: table.iloc[:, 4]
                for year, table in yearly_tables.items()
            }
            
            # Step 5: Construct the DataFrame for this report type
            df_wide = pd.DataFrame(yearly_data)
            df_wide['account'] = categories
            df_long = df_wide.melt(id_vars=['account'], var_name='report_date', value_name='value')
            df_long['report_type'] = report_map.get(report_type)
            company_reports.append(df_long)
        
        if not company_reports: 
            return None
            
        final_df = pd.concat(company_reports, ignore_index=True)
        final_df['symbol'] = self.symbol
        return final_df
# ==============================================================================
# LOGIC HELPER (FUNCTIONS)
# ==============================================================================
def get_company_listing() -> pd.DataFrame:
    """Fetches a list of companies from HSX and HNX."""
    logging.info("Fetching company list...")
    listing = Listing()
    df_symbols = listing.symbols_by_exchange()
    df_short = df_symbols[df_symbols['exchange'].isin(['HSX', 'HNX']) & (df_symbols['type'] == 'STOCK')]
    
    df_industries = listing.symbols_by_industries()
    df_industry_names = df_industries[['symbol', 'icb_name2']].rename(columns={'icb_name2': 'industry'})
    
    df_final = pd.merge(df_short, df_industry_names, how='left', on='symbol')
    return df_final[['symbol', 'exchange', 'organ_name', 'industry']].dropna(subset=['symbol'])

def transform_data(raw_df: pd.DataFrame, company_info_df: pd.DataFrame, mapping_dict: dict) -> pd.DataFrame:
    """Cleans and transforms raw scraped data."""
    logging.info("Transforming raw data...")
    df = pd.merge(raw_df, company_info_df, on='symbol', how='left')
    df.dropna(subset=['symbol', 'report_date', 'account'], inplace=True)
    df['value'] = pd.to_numeric(df['value'], errors='coerce')
    df['report_date'] = df['report_date'].astype(str)
    
    df['account_vi'] = df['account']
    df['account_en'] = df['account'].apply(lambda x: mapping_dict.get(x, {}).get('english'))
    df['account'] = df['account'].apply(lambda x: mapping_dict.get(x, {}).get('english_format'))
    
    final_cols = ['symbol', 'exchange', 'organ_name', 'industry', 'report_type', 'report_date', 'account', 'value', 'account_vi', 'account_en']
    return df[final_cols].sort_values(by=['symbol', 'report_type', 'report_date'])


In [13]:

output_dir = CONFIG['output_dir']
os.makedirs(output_dir, exist_ok=True)

# --- Step 1: Get Company List ---
try:
    company_df = get_company_listing()
    company_list_path = os.path.join(output_dir, CONFIG['company_list_filename'])
    company_df.to_csv(company_list_path, index=False)
    logging.info(f"Saved company list of {len(company_df)} companies to {company_list_path}")
except:
    company_df = pd.read_csv(company_list_path)

# --- Step 2: Scrape Data Concurrently ---
symbols_to_scrape = company_df['symbol'].tolist()

In [2]:
symbols_to_scrape

NameError: name 'symbols_to_scrape' is not defined

In [14]:
all_results = []
with ThreadPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
    future_to_symbol = {
        executor.submit(CafeFScraper(symbol, CONFIG['start_year']).scrape_all_reports): symbol
        for symbol in symbols_to_scrape
    }
    progress = tqdm(as_completed(future_to_symbol), total=len(symbols_to_scrape), desc="Scraping Financials")
    for future in progress:
        result_df = future.result()
        if result_df is not None:
            all_results.append(result_df)

if not all_results:
    logging.warning("Scraping finished, but no data was collected.")
    return

raw_df = pd.concat(all_results, ignore_index=True)
raw_data_path = os.path.join(output_dir, CONFIG['raw_data_filename'])
# raw_df.to_parquet(raw_data_path, index=False)
logging.info(f"Saved raw scraped data to {raw_data_path}")

# --- Step 3: Transform Data ---
with open(CONFIG['mapping_filepath'], 'r', encoding='utf-8') as f:
    account_map = json.load(f)
    
final_df = transform_data(raw_df, company_df, account_map)
final_df.columns = financial_statement_schemas
final_data_path = os.path.join(output_dir, CONFIG['final_data_filename'])
final_csv_data_path = os.path.join(output_dir, CONFIG['final_data_filename_csv'])

# add conversion here for parquet saving
# final_df.value = final_df.value.astype(float)
final_df.to_csv(final_csv_data_path, sep="\t", index=False)
final_df.to_parquet(final_data_path, index=False)

logging.info(f"Successfully transformed data and saved to {final_data_path}")
logging.info("Pipeline finished.")


Scraping Financials: 100%|█████████████████████████████████████████████████████████| 696/696 [5:08:56<00:00, 26.63s/it]


SyntaxError: 'return' outside function (2597924215.py, line 15)

In [16]:
all_results

[]

In [17]:
company_df = pd.read_csv(company_list_path)

In [18]:
list_company = list(set(company_df.symbol))

In [21]:
import os
import requests
from urllib.parse import urlparse

OUTPUT_FOLDER = "vietstock_reports_hnx"
# --- Main Script ---
def download_financial_reports(tickers, output_dir):
    """
    Downloads financial reports for a list of company tickers.

    Args:
        tickers (list): A list of company ticker strings.
        output_dir (str): The name of the folder to save files to.
    """
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created directory: '{output_dir}'")

    # The base URL structure. The '{}' is a placeholder for the company ticker.
    url_template = "https://static2.vietstock.vn/data/HNX/2024/BCTC/VN/NAM/{}_Baocaotaichinh_2024_Kiemtoan_Congtyme.pdf"

    session = requests.Session()
    # Add a user-agent header to mimic a real browser request
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    })

    print("\n--- Starting Download Process ---")

    for company_ticker in tickers:
        # Format the URL with the current company ticker
        file_url = url_template.format(company_ticker)

        # Generate a local file path
        # We can get the filename directly from the URL
        filename = os.path.basename(urlparse(file_url).path)
        local_filepath = os.path.join(output_dir, filename)

        print(f"\nProcessing Ticker: {company_ticker}")
        print(f" -> URL: {file_url}")

        try:
            # Make the GET request to download the file
            # stream=True is important for downloading large files efficiently
            response = session.get(file_url, stream=True, timeout=30)

            # Check if the request was successful (HTTP status code 200)
            if response.status_code == 200:
                # Open the local file in binary write mode
                with open(local_filepath, 'wb') as f:
                    # Write the content of the response to the file in chunks
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                print(f" ✔ Success: Report saved to '{local_filepath}'")
            else:
                # If the status code is not 200, the file likely doesn't exist or there was an error
                print(f" ❌ Failed: Could not download file. Server responded with status code {response.status_code}.")

        except requests.exceptions.RequestException as e:
            # Handle network-related errors (e.g., connection timeout)
            print(f" ❌ Error: A network error occurred for {company_ticker}. Details: {e}")

    print("\n--- Download Process Finished ---")


# download_financial_reports(list_company, OUTPUT_FOLDER)

In [22]:
download_financial_reports(list_company, OUTPUT_FOLDER)

Created directory: 'vietstock_reports_hnx'

--- Starting Download Process ---

Processing Ticker: PHR
 -> URL: https://static2.vietstock.vn/data/HNX/2024/BCTC/VN/NAM/PHR_Baocaotaichinh_2024_Kiemtoan_Congtyme.pdf
 ❌ Failed: Could not download file. Server responded with status code 404.

Processing Ticker: TBX
 -> URL: https://static2.vietstock.vn/data/HNX/2024/BCTC/VN/NAM/TBX_Baocaotaichinh_2024_Kiemtoan_Congtyme.pdf
 ❌ Failed: Could not download file. Server responded with status code 404.

Processing Ticker: BNA
 -> URL: https://static2.vietstock.vn/data/HNX/2024/BCTC/VN/NAM/BNA_Baocaotaichinh_2024_Kiemtoan_Congtyme.pdf
 ❌ Failed: Could not download file. Server responded with status code 404.

Processing Ticker: BKC
 -> URL: https://static2.vietstock.vn/data/HNX/2024/BCTC/VN/NAM/BKC_Baocaotaichinh_2024_Kiemtoan_Congtyme.pdf
 ❌ Failed: Could not download file. Server responded with status code 404.

Processing Ticker: DCM
 -> URL: https://static2.vietstock.vn/data/HNX/2024/BCTC/VN/NA