# 원본

In [3]:
import os
import json
import datetime
import sqlite3
import configparser

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate

def log_started():
    """Append a start-of-run banner to ``etl_project_log.txt``.

    The banner is a blank line, a rule of 50 ``=`` characters, a
    ``New Execution at <timestamp>`` line, a second rule and a trailing
    blank line, so consecutive runs are visually separated in the log.
    """
    separator = "=" * 50
    with open('etl_project_log.txt', 'a') as log_file:
        # The timestamp is taken after the file is open, matching the write order.
        timestamp = datetime.datetime.now().strftime('%Y-%B-%d-%H-%M-%S')
        log_file.writelines([
            "\n" + separator + "\n",
            f"New Execution at {timestamp}\n",
            separator + "\n\n",
        ])


def log_message(message, level="INFO"):
    """Append one timestamped line to ``etl_project_log.txt``.

    Parameters:
        message (str): Text to record.
        level (str): Severity label written between the timestamp and the
            message (default: "INFO").
    """
    timestamp = datetime.datetime.now().strftime('%Y-%b-%d %H:%M:%S')
    # Exception text is routinely logged here; pin UTF-8 so a non-ASCII
    # message cannot fail on a platform whose default codec is narrower.
    with open('etl_project_log.txt', 'a', encoding='utf-8') as log_file:
        log_file.write(f"{timestamp} - {level} - {message}\n")


def load_config(config_path='config.ini'):
    """Read the scrape target from the ``[DEFAULT]`` section of an INI file.

    Parameters:
        config_path (str): Path to the configuration file (default: 'config.ini').

    Returns:
        tuple: ``(URL, TABLE_CLASS)`` as read from the file.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        ValueError: If ``URL`` or ``TABLE_CLASS`` is not defined.
    """
    if not os.path.exists(config_path):
        log_message(f"Configuration file '{config_path}' not found.", level="ERROR")
        raise FileNotFoundError(f"Configuration file '{config_path}' not found.")

    config = configparser.ConfigParser()
    config.read(config_path)
    # ConfigParser always exposes a DEFAULT section, so only the keys need checking.
    defaults = config['DEFAULT']

    if 'URL' not in defaults or 'TABLE_CLASS' not in defaults:
        # Name the file that was actually read rather than a hard-coded 'config.ini'.
        error_text = f"Invalid or missing configuration values in '{config_path}'."
        log_message(error_text, level="ERROR")
        raise ValueError(error_text)

    return defaults['URL'], defaults['TABLE_CLASS']


def extract_gdp_data(url, table_class):
    """Download the page at ``url`` and return its GDP table, cleaned.

    Parameters:
        url (str): Address of the page to fetch.
        table_class (str): CSS class used to locate the ``<table>`` element.

    Returns:
        pd.DataFrame: Columns ``Country``, ``GDP (B USD)`` and ``Year``,
        sorted by ``GDP (B USD)`` descending. GDP is the scraped figure
        divided by 1e3 and rounded to two decimals; cells with no digits
        become 0.

    Raises:
        ValueError: If no table with ``table_class`` is found.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    from io import StringIO  # local import: the file's import block is left untouched

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'class': table_class})

        if table is None:
            log_message("No table found with the specified class.", level="ERROR")
            raise ValueError("Failed to locate the GDP table on the webpage.")

        # read_html deprecates literal HTML strings; hand it a file-like object.
        df = pd.read_html(StringIO(str(table)))[0]
        # Explicit copies keep the column assignments below off pandas'
        # chained-assignment (SettingWithCopy) path.
        df = df.iloc[:, [0, 1, 2]].copy()
        df.columns = ['Country', 'GDP (Nominal)', 'Year']
        df = df.dropna(subset=['Country', 'GDP (Nominal)']).copy()
        df['GDP (B USD)'] = (
            df['GDP (Nominal)']
            .str.replace(r'[^\d.]', '', regex=True)  # keep digits and the decimal point
            .replace('', '0')  # placeholder cells with no digits count as 0
            .astype(float) / 1e3
        )
        # Drop bracketed footnote markers from the year text.
        df['Year'] = df['Year'].str.replace(r'\[.*?\]', '', regex=True)
        df = df[['Country', 'GDP (B USD)', 'Year']]
        df = df.sort_values(by='GDP (B USD)', ascending=False)
        df['GDP (B USD)'] = df['GDP (B USD)'].round(2)
        return df
    except Exception as e:
        log_message(f"Error during data extraction: {str(e)}", level="ERROR")
        raise


def transform_gdp_data(df):
    """Attach a ``Region`` column to the extracted GDP data.

    Regions are looked up by ``Country`` in ``country_region_table.json``;
    countries without an entry get a missing value.

    Parameters:
        df (pd.DataFrame): Extracted data containing a ``Country`` column.

    Returns:
        pd.DataFrame: A new frame with the added ``Region`` column. The
        input frame is left unmodified.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Starting Data Transformation")
    try:
        with open('country_region_table.json', 'r', encoding='utf-8') as region_file:
            region_data = json.load(region_file)
        # assign() builds a new frame, so the caller's DataFrame is not mutated.
        transformed = df.assign(Region=df['Country'].map(region_data))
        log_message("Data Transformation Completed")
        return transformed
    except Exception as e:
        log_message(f"Error during data transformation: {str(e)}", level="ERROR")
        raise


def load_gdp_data(df):
    """Write the transformed data to the ``Countries_by_GDP`` SQLite table.

    The table in ``World_Economies.db`` is replaced on every call and the
    ``GDP (B USD)`` column is stored as ``GDP_USD_billion``.

    Parameters:
        df (pd.DataFrame): Frame with ``Country``, ``GDP (B USD)``, ``Year``
            and ``Region`` columns.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Loading data into SQLite database")
    try:
        conn = sqlite3.connect('World_Economies.db')
        try:
            table = df[['Country', 'GDP (B USD)', 'Year', 'Region']].rename(
                columns={'GDP (B USD)': 'GDP_USD_billion'}
            )
            table.to_sql('Countries_by_GDP', conn, if_exists='replace', index=False)
        finally:
            # Release the database handle even when the write fails.
            conn.close()
        log_message("Data successfully loaded into SQLite database")
    except Exception as e:
        log_message(f"Error while loading data into SQLite database: {str(e)}", level="ERROR")
        raise


def display_countries_with_gdp_over_100():
    """Print every country whose GDP is at least 100 billion USD.

    Reads ``Country`` and ``GDP_USD_billion`` from the ``Countries_by_GDP``
    table in ``World_Economies.db`` and prints the rows as a psql-style table.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Displaying countries with GDP over 100B USD")
    try:
        query = """
        SELECT Country, GDP_USD_billion
        FROM Countries_by_GDP
        WHERE GDP_USD_billion >= 100
        """
        conn = sqlite3.connect('World_Economies.db')
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Release the database handle even when the query fails.
            conn.close()
        print("Countries with GDP >= 100B USD:")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for countries with GDP >= 100B: {str(e)}", level="ERROR")
        raise


def display_region_top5_average_gdp():
    """Print, per region, the average GDP of its five largest economies.

    Reads the ``Countries_by_GDP`` table in ``World_Economies.db``, ignores
    rows whose ``Region`` is NULL, and prints one row per region as a
    psql-style table.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Calculating average GDP of top 5 countries by region")
    try:
        # ROW_NUMBER (not RANK) caps each region at exactly five rows even
        # when GDP values tie; Country breaks ties deterministically.
        query = """
        WITH RankedCountries AS (
            SELECT Country, GDP_USD_billion, Region,
                   ROW_NUMBER() OVER (
                       PARTITION BY Region
                       ORDER BY GDP_USD_billion DESC, Country
                   ) AS RowNum
            FROM Countries_by_GDP
            WHERE Region IS NOT NULL
        )
        SELECT Region, AVG(GDP_USD_billion) AS Avg_Top5_GDP
        FROM RankedCountries
        WHERE RowNum <= 5
        GROUP BY Region
        """
        conn = sqlite3.connect('World_Economies.db')
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Release the database handle even when the query fails.
            conn.close()
        print("Average GDP of top 5 countries by region (excluding None):")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for top 5 average GDP: {str(e)}", level="ERROR")
        raise


def save_gdp_data(df, output_csv_file='extracted_gdp_data.csv', output_json_file='extracted_gdp_data.json'):
    """Write ``df`` to a CSV file and a JSON file.

    Parameters:
        df (pd.DataFrame): Data to persist.
        output_csv_file (str): Destination CSV path, written without the index.
        output_json_file (str): Destination JSON path, written as a list of
            records with 4-space indentation and non-ASCII text kept as-is.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Saving Extracted Data")
    json_options = {'orient': 'records', 'force_ascii': False, 'indent': 4}
    try:
        df.to_csv(output_csv_file, index=False)
        df.to_json(output_json_file, **json_options)
        log_message(f"Data saved: CSV ({output_csv_file}), JSON ({output_json_file})")
    except Exception as e:
        log_message(f"Failed to save data: {str(e)}", level="ERROR")
        raise
    
# # GDP가 100B USD 이상인 국가 필터링
# def filtered_100USD(df):
#     filtered_100 = df[df['GDP (B USD)'] >= 100]
#     print("Countries with a GDP of over 100B USD")
#     print(filtered_100)
#     return filtered_100


# # Region별 상위 5개 국가의 GDP 평균 계산
# def region_top5_calculate(df):
#     region_top5_avg = (
#         df.groupby('Region')
#         .apply(lambda x: x.nlargest(5, 'GDP (B USD)')['GDP (B USD)'].mean())
#         .reset_index(name='Top 5 Avg GDP (B USD)')
#     )
#     print("Average GDP of top 5 countries by region")
#     print(region_top5_avg)
#     return region_top5_avg


def etl_process():
    """Run the GDP pipeline end to end: config, extract, transform, load, report.

    A failure in any step is written to the log and is not re-raised.
    """
    try:
        log_started()
        log_message("ETL Process Started")

        url, table_class = load_config()
        extracted_data = extract_gdp_data(url, table_class)
        # Optional CSV/JSON snapshot, currently disabled:
        # save_gdp_data(extracted_data)
        transformed_data = transform_gdp_data(extracted_data)
        # save_gdp_data(transformed_data, 'transformed_gdp_data.csv', 'transformed_gdp_data.json')
        load_gdp_data(transformed_data)
        display_countries_with_gdp_over_100()
        display_region_top5_average_gdp()

        # Earlier pandas-based reporting (before the additional requirements), kept for reference:
        # filtered_data = filtered_100USD(transformed_data)
        # region_top5_data = region_top5_calculate(transformed_data)

        log_message("ETL Process Completed Successfully")
    except Exception as e:
        # str(e) alone can be cryptic (a KeyError prints only the key), so
        # record the exception type alongside the message.
        log_message(f"ETL Process Failed: {type(e).__name__}: {e}", level="ERROR")


# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    etl_process()


Countries with GDP >= 100B USD:
+----------------------+-------------------+
| Country              |   GDP_USD_billion |
|----------------------+-------------------|
| World                |         115494    |
| United States        |          30337.2  |
| China                |          19534.9  |
| Germany              |           4921.56 |
| Japan                |           4389.33 |
| India                |           4271.92 |
| United Kingdom       |           3730.26 |
| France               |           3283.43 |
| Italy                |           2459.6  |
| Canada               |           2330.31 |
| Brazil               |           2307.16 |
| Russia               |           2195.71 |
| South Korea          |           1947.13 |
| Australia            |           1881.14 |
| Spain                |           1827.58 |
| Mexico               |           1817.82 |
| Indonesia            |           1492.62 |
| Turkey               |           1455.41 |
| Netherlands          

  df = pd.read_html(str(table))[0]


---


# 리팩토링 최종

In [None]:
import os
import json
import datetime
import sqlite3
import configparser

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate


def log_started():
    """
    Append a start-of-run banner to ``etl_project_log.txt``.

    The banner is a blank line, a rule of 50 ``=`` characters, a
    ``New Execution at <timestamp>`` line, a second rule and a trailing
    blank line, so consecutive runs are visually separated in the log.
    """
    separator = "=" * 50
    with open('etl_project_log.txt', 'a') as log_file:
        # The timestamp is taken after the file is open, matching the write order.
        timestamp = datetime.datetime.now().strftime('%Y-%B-%d-%H-%M-%S')
        log_file.writelines([
            "\n" + separator + "\n",
            f"New Execution at {timestamp}\n",
            separator + "\n\n",
        ])


def log_message(message, level="INFO"):
    """
    Append one timestamped line to ``etl_project_log.txt``.

    Parameters:
        message (str): The message to log.
        level (str): The severity level of the message (default: "INFO").
    """
    timestamp = datetime.datetime.now().strftime('%Y-%b-%d %H:%M:%S')
    # Exception text is routinely logged here; pin UTF-8 so a non-ASCII
    # message cannot fail on a platform whose default codec is narrower.
    with open('etl_project_log.txt', 'a', encoding='utf-8') as log_file:
        log_file.write(f"{timestamp} - {level} - {message}\n")


def extract_gdp_data(config_path='config.ini'):
    """
    Extract the raw GDP table from the webpage named in a configuration file.

    The ``[DEFAULT]`` section must provide ``URL`` and ``TABLE_CLASS``. The
    raw table is also written to ``raw_gdp_data.csv`` and ``raw_gdp_data.json``.

    Parameters:
        config_path (str): Path to the configuration file (default: 'config.ini').

    Returns:
        pd.DataFrame: Raw GDP data extracted from the webpage.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        ValueError: If no table with the configured class is found.
        Exception: Configuration, request, parsing and saving errors are
            logged and re-raised unchanged.
    """
    from io import StringIO  # local import: the file's import block is left untouched

    log_message("Starting data extraction process")

    # Step 1: Read and validate the configuration file
    if not os.path.exists(config_path):
        log_message(f"Configuration file '{config_path}' not found.", level="ERROR")
        raise FileNotFoundError(f"Configuration file '{config_path}' not found.")

    try:
        config = configparser.ConfigParser()
        config.read(config_path)
        url = config['DEFAULT']['URL']
        table_class = config['DEFAULT']['TABLE_CLASS']
        log_message("Configuration file read successfully")
    except Exception as e:
        log_message(f"Error reading configuration file: {str(e)}", level="ERROR")
        raise

    # Step 2: Fetch the webpage data
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        log_message(f"Successfully fetched data from URL: {url}")
    except requests.exceptions.RequestException as e:
        log_message(f"Error during requests to the URL: {str(e)}", level="ERROR")
        raise

    # Step 3: Parse the HTML and extract the table
    try:
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'class': table_class})
    except Exception as e:
        log_message(f"Error parsing HTML or finding table: {str(e)}", level="ERROR")
        raise
    # Checked outside the try block so a missing table is logged once, not twice.
    if table is None:
        log_message(f"No table found with the specified class '{table_class}'.", level="ERROR")
        raise ValueError("Failed to locate the GDP table on the webpage.")
    log_message("Successfully located the GDP table in the HTML")

    # Step 4: Convert the table to a DataFrame and save raw data
    try:
        # read_html deprecates literal HTML strings; hand it a file-like object.
        raw_df = pd.read_html(StringIO(str(table)))[0]
        raw_df.to_csv('raw_gdp_data.csv', index=False)
        raw_df.to_json('raw_gdp_data.json', orient='records', force_ascii=False, indent=4)
        log_message("Raw data saved successfully")
    except Exception as e:
        log_message(f"Error converting table to DataFrame or saving raw data: {str(e)}", level="ERROR")
        raise

    log_message("Data extraction process completed successfully")
    return raw_df


def transform_gdp_data(df):
    """
    Clean the raw GDP table, attach regions and save the result.

    The first three columns are kept as ``Country``, ``GDP (Nominal)`` and
    ``Year``. GDP is divided by 1e3 and rounded to two decimals, regions are
    merged in from ``country_region_table.json``, and the result is written
    to ``transformed_gdp_data.csv`` and ``transformed_gdp_data.json``.

    Parameters:
        df (pd.DataFrame): Raw GDP data.

    Returns:
        pd.DataFrame: Columns ``Country``, ``GDP (B USD)``, ``Year`` and
        ``Region``, sorted by ``GDP (B USD)`` descending.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Starting data transformation process")
    try:
        # Step 1: Clean and process the data
        # Explicit copies keep the column assignments below off pandas'
        # chained-assignment (SettingWithCopy) path.
        df = df.iloc[:, [0, 1, 2]].copy()
        df.columns = ['Country', 'GDP (Nominal)', 'Year']
        df = df.dropna(subset=['Country', 'GDP (Nominal)']).copy()
        df['GDP (B USD)'] = (
            df['GDP (Nominal)']
            .str.replace(r'[^\d.]', '', regex=True)  # Remove non-numeric characters
            .replace('', '0')  # cells with no digits would otherwise break astype(float)
            .astype(float) / 1e3
        ).round(2)  # two decimals, as in the other revisions of this pipeline
        # Drop bracketed footnote markers from the year text.
        df['Year'] = df['Year'].str.replace(r'\[.*?\]', '', regex=True)
        df = df[['Country', 'GDP (B USD)', 'Year']].sort_values(
            by='GDP (B USD)', ascending=False
        )
        log_message("Data cleaned and transformed successfully")

        # Step 2: Merge with region data
        with open('country_region_table.json', 'r', encoding='utf-8') as region_file:
            region_data = json.load(region_file)
        region_df = pd.DataFrame(list(region_data.items()), columns=['Country', 'Region'])
        # Left join: countries without a region entry are kept with a missing Region.
        df = pd.merge(df, region_df, on='Country', how='left')

        # Step 3: Save transformed data
        df.to_csv('transformed_gdp_data.csv', index=False)
        df.to_json('transformed_gdp_data.json', orient='records', force_ascii=False, indent=4)
        log_message("Transformed data saved successfully")

    except Exception as e:
        log_message(f"Error in data transformation: {str(e)}", level="ERROR")
        raise

    return df


def load_gdp_data(df):
    """
    Load transformed GDP data into an SQLite database.

    The ``Countries_by_GDP`` table in ``World_Economies.db`` is replaced on
    every call, and ``GDP (B USD)`` is stored as ``GDP_USD_billion``.

    Parameters:
        df (pd.DataFrame): Transformed GDP data.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Loading data into SQLite database")
    try:
        conn = sqlite3.connect('World_Economies.db')
        try:
            # The display queries select GDP_USD_billion, so the column must
            # be stored under that name or they fail with "no such column".
            df.rename(columns={'GDP (B USD)': 'GDP_USD_billion'}).to_sql(
                'Countries_by_GDP', conn, if_exists='replace', index=False
            )
        finally:
            # Release the database handle even when the write fails.
            conn.close()
        log_message("Data successfully loaded into SQLite database")
    except Exception as e:
        log_message(f"Error while loading data into SQLite database: {str(e)}", level="ERROR")
        raise


def display_countries_with_gdp_over_100():
    """
    Display countries with GDP of at least 100 billion USD.

    Reads ``Country`` and ``GDP_USD_billion`` from the ``Countries_by_GDP``
    table in ``World_Economies.db`` and prints the rows as a psql-style table.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Displaying countries with GDP over 100B USD")
    try:
        query = """
        SELECT Country, GDP_USD_billion
        FROM Countries_by_GDP
        WHERE GDP_USD_billion >= 100
        """
        conn = sqlite3.connect('World_Economies.db')
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Release the database handle even when the query fails.
            conn.close()
        print("Countries with GDP >= 100B USD:")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for countries with GDP >= 100B: {str(e)}", level="ERROR")
        raise


def display_region_top5_average_gdp():
    """
    Calculate and display the average GDP of the top 5 countries by region.

    Reads the ``Countries_by_GDP`` table in ``World_Economies.db``, ignores
    rows whose ``Region`` is NULL, and prints one row per region as a
    psql-style table.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Calculating average GDP of top 5 countries by region")
    try:
        # ROW_NUMBER (not RANK) caps each region at exactly five rows even
        # when GDP values tie; Country breaks ties deterministically.
        query = """
        WITH RankedCountries AS (
            SELECT Country, GDP_USD_billion, Region,
                   ROW_NUMBER() OVER (
                       PARTITION BY Region
                       ORDER BY GDP_USD_billion DESC, Country
                   ) AS RowNum
            FROM Countries_by_GDP
            WHERE Region IS NOT NULL
        )
        SELECT Region, AVG(GDP_USD_billion) AS Avg_Top5_GDP
        FROM RankedCountries
        WHERE RowNum <= 5
        GROUP BY Region
        """
        conn = sqlite3.connect('World_Economies.db')
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Release the database handle even when the query fails.
            conn.close()
        print("Average GDP of top 5 countries by region (excluding None):")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for top 5 average GDP: {str(e)}", level="ERROR")
        raise


def etl_process():
    """
    Execute the ETL process for GDP data.

    Steps:
        1. Extract raw data from a webpage.
        2. Transform the data into the cleaned (required) format.
        3. Load the data into an SQLite database.
        4. Display additional analytical results.

    A failure in any step is written to the log and is not re-raised.
    """
    try:
        log_started()
        log_message("ETL Process Started")

        # Extract
        raw_data = extract_gdp_data()

        # Transform
        transformed_data = transform_gdp_data(raw_data)

        # Load
        load_gdp_data(transformed_data)

        # Display analysis
        display_countries_with_gdp_over_100()
        display_region_top5_average_gdp()

        log_message("ETL Process Completed Successfully")
    except Exception as e:
        # str(e) alone can be cryptic (a KeyError prints only the key), so
        # record the exception type alongside the message.
        log_message(f"ETL Process Failed: {type(e).__name__}: {e}", level="ERROR")


# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    etl_process()

---

# load_config()함수에서 진행하는 것을 load_config()함수를 없애고 extract_gdp_data()함수에서 진행할 수 있도록 옮김

In [4]:
import os
import json
import datetime
import sqlite3
import configparser

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate

def log_started():
    """Append a start-of-run banner to ``etl_project_log.txt``.

    The banner is a blank line, a rule of 50 ``=`` characters, a
    ``New Execution at <timestamp>`` line, a second rule and a trailing
    blank line, so consecutive runs are visually separated in the log.
    """
    separator = "=" * 50
    with open('etl_project_log.txt', 'a') as log_file:
        # The timestamp is taken after the file is open, matching the write order.
        timestamp = datetime.datetime.now().strftime('%Y-%B-%d-%H-%M-%S')
        log_file.writelines([
            "\n" + separator + "\n",
            f"New Execution at {timestamp}\n",
            separator + "\n\n",
        ])

def log_message(message, level="INFO"):
    """Append one timestamped line to ``etl_project_log.txt``.

    Parameters:
        message (str): Text to record.
        level (str): Severity label written between the timestamp and the
            message (default: "INFO").
    """
    timestamp = datetime.datetime.now().strftime('%Y-%b-%d %H:%M:%S')
    # Exception text is routinely logged here; pin UTF-8 so a non-ASCII
    # message cannot fail on a platform whose default codec is narrower.
    with open('etl_project_log.txt', 'a', encoding='utf-8') as log_file:
        log_file.write(f"{timestamp} - {level} - {message}\n")

def extract_gdp_data(config_path='config.ini'):
    """Fetch and clean the GDP table named in the configuration file.

    The ``[DEFAULT]`` section of ``config_path`` must provide ``URL`` and
    ``TABLE_CLASS``.

    Parameters:
        config_path (str): Path to the configuration file (default: 'config.ini').

    Returns:
        pd.DataFrame: Columns ``Country``, ``GDP (B USD)`` and ``Year``,
        sorted by ``GDP (B USD)`` descending. GDP is the scraped figure
        divided by 1e3 and rounded to two decimals; cells with no digits
        become 0.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        ValueError: If a required setting is missing or the table is not found.
        Exception: Any other extraction failure is logged and re-raised unchanged.
    """
    from io import StringIO  # local import: the file's import block is left untouched

    if not os.path.exists(config_path):
        log_message(f"Configuration file '{config_path}' not found.", level="ERROR")
        raise FileNotFoundError(f"Configuration file '{config_path}' not found.")

    config = configparser.ConfigParser()
    config.read(config_path)
    # ConfigParser always exposes a DEFAULT section, so only the keys need checking.
    defaults = config['DEFAULT']

    if 'URL' not in defaults or 'TABLE_CLASS' not in defaults:
        # Name the file that was actually read rather than a hard-coded 'config.ini'.
        error_text = f"Invalid or missing configuration values in '{config_path}'."
        log_message(error_text, level="ERROR")
        raise ValueError(error_text)

    url = defaults['URL']
    table_class = defaults['TABLE_CLASS']

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'class': table_class})

        if table is None:
            log_message("No table found with the specified class.", level="ERROR")
            raise ValueError("Failed to locate the GDP table on the webpage.")

        # read_html deprecates literal HTML strings; hand it a file-like object.
        df = pd.read_html(StringIO(str(table)))[0]
        # Explicit copies keep the column assignments below off pandas'
        # chained-assignment (SettingWithCopy) path.
        df = df.iloc[:, [0, 1, 2]].copy()
        df.columns = ['Country', 'GDP (Nominal)', 'Year']
        df = df.dropna(subset=['Country', 'GDP (Nominal)']).copy()
        df['GDP (B USD)'] = (
            df['GDP (Nominal)']
            .str.replace(r'[^\d.]', '', regex=True)  # keep digits and the decimal point
            .replace('', '0')  # placeholder cells with no digits count as 0
            .astype(float) / 1e3
        )
        # Drop bracketed footnote markers from the year text.
        df['Year'] = df['Year'].str.replace(r'\[.*?\]', '', regex=True)
        df = df[['Country', 'GDP (B USD)', 'Year']]
        df = df.sort_values(by='GDP (B USD)', ascending=False)
        df['GDP (B USD)'] = df['GDP (B USD)'].round(2)
        return df
    except Exception as e:
        log_message(f"Error during data extraction: {str(e)}", level="ERROR")
        raise

def transform_gdp_data(df):
    """Attach a ``Region`` column to the extracted GDP data.

    Regions are looked up by ``Country`` in ``country_region_table.json``;
    countries without an entry get a missing value.

    Parameters:
        df (pd.DataFrame): Extracted data containing a ``Country`` column.

    Returns:
        pd.DataFrame: A new frame with the added ``Region`` column. The
        input frame is left unmodified.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Starting Data Transformation")
    try:
        with open('country_region_table.json', 'r', encoding='utf-8') as region_file:
            region_data = json.load(region_file)
        # assign() builds a new frame, so the caller's DataFrame is not mutated.
        transformed = df.assign(Region=df['Country'].map(region_data))
        log_message("Data Transformation Completed")
        return transformed
    except Exception as e:
        log_message(f"Error during data transformation: {str(e)}", level="ERROR")
        raise

def load_gdp_data(df):
    """Write the transformed data to the ``Countries_by_GDP`` SQLite table.

    The table in ``World_Economies.db`` is replaced on every call and the
    ``GDP (B USD)`` column is stored as ``GDP_USD_billion``.

    Parameters:
        df (pd.DataFrame): Frame with ``Country``, ``GDP (B USD)``, ``Year``
            and ``Region`` columns.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Loading data into SQLite database")
    try:
        conn = sqlite3.connect('World_Economies.db')
        try:
            table = df[['Country', 'GDP (B USD)', 'Year', 'Region']].rename(
                columns={'GDP (B USD)': 'GDP_USD_billion'}
            )
            table.to_sql('Countries_by_GDP', conn, if_exists='replace', index=False)
        finally:
            # Release the database handle even when the write fails.
            conn.close()
        log_message("Data successfully loaded into SQLite database")
    except Exception as e:
        log_message(f"Error while loading data into SQLite database: {str(e)}", level="ERROR")
        raise

def display_countries_with_gdp_over_100():
    """Print every country whose GDP is at least 100 billion USD.

    Reads ``Country`` and ``GDP_USD_billion`` from the ``Countries_by_GDP``
    table in ``World_Economies.db`` and prints the rows as a psql-style table.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Displaying countries with GDP over 100B USD")
    try:
        query = """
        SELECT Country, GDP_USD_billion
        FROM Countries_by_GDP
        WHERE GDP_USD_billion >= 100
        """
        conn = sqlite3.connect('World_Economies.db')
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Release the database handle even when the query fails.
            conn.close()
        print("Countries with GDP >= 100B USD:")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for countries with GDP >= 100B: {str(e)}", level="ERROR")
        raise

def display_region_top5_average_gdp():
    """Print, per region, the average GDP of its five largest economies.

    Reads the ``Countries_by_GDP`` table in ``World_Economies.db``, ignores
    rows whose ``Region`` is NULL, and prints one row per region as a
    psql-style table.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Calculating average GDP of top 5 countries by region")
    try:
        # ROW_NUMBER (not RANK) caps each region at exactly five rows even
        # when GDP values tie; Country breaks ties deterministically.
        query = """
        WITH RankedCountries AS (
            SELECT Country, GDP_USD_billion, Region,
                   ROW_NUMBER() OVER (
                       PARTITION BY Region
                       ORDER BY GDP_USD_billion DESC, Country
                   ) AS RowNum
            FROM Countries_by_GDP
            WHERE Region IS NOT NULL
        )
        SELECT Region, AVG(GDP_USD_billion) AS Avg_Top5_GDP
        FROM RankedCountries
        WHERE RowNum <= 5
        GROUP BY Region
        """
        conn = sqlite3.connect('World_Economies.db')
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Release the database handle even when the query fails.
            conn.close()
        print("Average GDP of top 5 countries by region (excluding None):")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for top 5 average GDP: {str(e)}", level="ERROR")
        raise

def save_gdp_data(df, output_csv_file='extracted_gdp_data.csv', output_json_file='extracted_gdp_data.json'):
    """Write ``df`` to a CSV file and a JSON file.

    Parameters:
        df (pd.DataFrame): Data to persist.
        output_csv_file (str): Destination CSV path, written without the index.
        output_json_file (str): Destination JSON path, written as a list of
            records with 4-space indentation and non-ASCII text kept as-is.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Saving Extracted Data")
    json_options = {'orient': 'records', 'force_ascii': False, 'indent': 4}
    try:
        df.to_csv(output_csv_file, index=False)
        df.to_json(output_json_file, **json_options)
        log_message(f"Data saved: CSV ({output_csv_file}), JSON ({output_json_file})")
    except Exception as e:
        log_message(f"Failed to save data: {str(e)}", level="ERROR")
        raise

def etl_process():
    """Run the GDP pipeline end to end: extract, save, transform, save, load, report.

    A failure in any step is written to the log and is not re-raised.
    """
    try:
        log_started()
        log_message("ETL Process Started")

        extracted_data = extract_gdp_data()
        save_gdp_data(extracted_data)
        transformed_data = transform_gdp_data(extracted_data)
        save_gdp_data(transformed_data, 'transformed_gdp_data.csv', 'transformed_gdp_data.json')
        load_gdp_data(transformed_data)
        display_countries_with_gdp_over_100()
        display_region_top5_average_gdp()

        log_message("ETL Process Completed Successfully")
    except Exception as e:
        # str(e) alone can be cryptic (a KeyError prints only the key), so
        # record the exception type alongside the message.
        log_message(f"ETL Process Failed: {type(e).__name__}: {e}", level="ERROR")

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    etl_process()


Countries with GDP >= 100B USD:
+----------------------+-------------------+
| Country              |   GDP_USD_billion |
|----------------------+-------------------|
| World                |         115494    |
| United States        |          30337.2  |
| China                |          19534.9  |
| Germany              |           4921.56 |
| Japan                |           4389.33 |
| India                |           4271.92 |
| United Kingdom       |           3730.26 |
| France               |           3283.43 |
| Italy                |           2459.6  |
| Canada               |           2330.31 |
| Brazil               |           2307.16 |
| Russia               |           2195.71 |
| South Korea          |           1947.13 |
| Australia            |           1881.14 |
| Spain                |           1827.58 |
| Mexico               |           1817.82 |
| Indonesia            |           1492.62 |
| Turkey               |           1455.41 |
| Netherlands          

  df = pd.read_html(str(table))[0]


---

# extract_gdp_data()에서 BeautifulSoup를 활용해 웹스크래핑을 한 뒤, 스크래핑한 데이터를 수정('Country', 'GDP (Nominal)', 'Year'만 남겨놓는 것)하기 전에 raw data를 CSV 와 JSON files 로 저장하는 작업을 추가


In [None]:

import os
import json
import datetime
import sqlite3
import configparser

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate

def log_started():
    """Append a start-of-run banner to ``etl_project_log.txt``.

    The banner is a blank line, a rule of 50 ``=`` characters, a
    ``New Execution at <timestamp>`` line, a second rule and a trailing
    blank line, so consecutive runs are visually separated in the log.
    """
    separator = "=" * 50
    with open('etl_project_log.txt', 'a') as log_file:
        # The timestamp is taken after the file is open, matching the write order.
        timestamp = datetime.datetime.now().strftime('%Y-%B-%d-%H-%M-%S')
        log_file.writelines([
            "\n" + separator + "\n",
            f"New Execution at {timestamp}\n",
            separator + "\n\n",
        ])

def log_message(message, level="INFO"):
    """Append one timestamped line to ``etl_project_log.txt``.

    Parameters:
        message (str): Text to record.
        level (str): Severity label written between the timestamp and the
            message (default: "INFO").
    """
    timestamp = datetime.datetime.now().strftime('%Y-%b-%d %H:%M:%S')
    # Exception text is routinely logged here; pin UTF-8 so a non-ASCII
    # message cannot fail on a platform whose default codec is narrower.
    with open('etl_project_log.txt', 'a', encoding='utf-8') as log_file:
        log_file.write(f"{timestamp} - {level} - {message}\n")

def extract_gdp_data(config_path='config.ini'):
    """Fetch the GDP table named in the configuration file, save it raw, then clean it.

    The ``[DEFAULT]`` section of ``config_path`` must provide ``URL`` and
    ``TABLE_CLASS``. The unmodified table is written to ``raw_gdp_data.csv``
    and ``raw_gdp_data.json`` before any cleaning.

    Parameters:
        config_path (str): Path to the configuration file (default: 'config.ini').

    Returns:
        pd.DataFrame: Columns ``Country``, ``GDP (B USD)`` and ``Year``,
        sorted by ``GDP (B USD)`` descending. GDP is the scraped figure
        divided by 1e3 and rounded to two decimals; cells with no digits
        become 0.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        ValueError: If a required setting is missing or the table is not found.
        Exception: Any other extraction failure is logged and re-raised unchanged.
    """
    from io import StringIO  # local import: the file's import block is left untouched

    if not os.path.exists(config_path):
        log_message(f"Configuration file '{config_path}' not found.", level="ERROR")
        raise FileNotFoundError(f"Configuration file '{config_path}' not found.")

    config = configparser.ConfigParser()
    config.read(config_path)
    # ConfigParser always exposes a DEFAULT section, so only the keys need checking.
    defaults = config['DEFAULT']

    if 'URL' not in defaults or 'TABLE_CLASS' not in defaults:
        # Name the file that was actually read rather than a hard-coded 'config.ini'.
        error_text = f"Invalid or missing configuration values in '{config_path}'."
        log_message(error_text, level="ERROR")
        raise ValueError(error_text)

    url = defaults['URL']
    table_class = defaults['TABLE_CLASS']

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'class': table_class})

        if table is None:
            log_message("No table found with the specified class.", level="ERROR")
            raise ValueError("Failed to locate the GDP table on the webpage.")

        # read_html deprecates literal HTML strings; hand it a file-like object.
        raw_df = pd.read_html(StringIO(str(table)))[0]
        raw_output_csv_file = 'raw_gdp_data.csv'
        raw_output_json_file = 'raw_gdp_data.json'

        # Save raw data before any processing
        raw_df.to_csv(raw_output_csv_file, index=False)
        raw_df.to_json(raw_output_json_file, orient='records', force_ascii=False, indent=4)
        log_message(f"Raw data saved: CSV ({raw_output_csv_file}), JSON ({raw_output_json_file})")

        # Explicit copies keep the column assignments below off pandas'
        # chained-assignment (SettingWithCopy) path.
        df = raw_df.iloc[:, [0, 1, 2]].copy()
        df.columns = ['Country', 'GDP (Nominal)', 'Year']
        df = df.dropna(subset=['Country', 'GDP (Nominal)']).copy()
        df['GDP (B USD)'] = (
            df['GDP (Nominal)']
            .str.replace(r'[^\d.]', '', regex=True)  # keep digits and the decimal point
            .replace('', '0')  # placeholder cells with no digits count as 0
            .astype(float) / 1e3
        )
        # Drop bracketed footnote markers from the year text.
        df['Year'] = df['Year'].str.replace(r'\[.*?\]', '', regex=True)
        df = df[['Country', 'GDP (B USD)', 'Year']]
        df = df.sort_values(by='GDP (B USD)', ascending=False)
        df['GDP (B USD)'] = df['GDP (B USD)'].round(2)
        return df
    except Exception as e:
        log_message(f"Error during data extraction: {str(e)}", level="ERROR")
        raise

def transform_gdp_data(df):
    """Attach a ``Region`` column to the extracted GDP data.

    Regions are looked up by ``Country`` in ``country_region_table.json``;
    countries without an entry get a missing value.

    Parameters:
        df (pd.DataFrame): Extracted data containing a ``Country`` column.

    Returns:
        pd.DataFrame: A new frame with the added ``Region`` column. The
        input frame is left unmodified.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Starting Data Transformation")
    try:
        with open('country_region_table.json', 'r', encoding='utf-8') as region_file:
            region_data = json.load(region_file)
        # assign() builds a new frame, so the caller's DataFrame is not mutated.
        transformed = df.assign(Region=df['Country'].map(region_data))
        log_message("Data Transformation Completed")
        return transformed
    except Exception as e:
        log_message(f"Error during data transformation: {str(e)}", level="ERROR")
        raise

def load_gdp_data(df):
    """Write the transformed data to the ``Countries_by_GDP`` SQLite table.

    The table in ``World_Economies.db`` is replaced on every call and the
    ``GDP (B USD)`` column is stored as ``GDP_USD_billion``.

    Parameters:
        df (pd.DataFrame): Frame with ``Country``, ``GDP (B USD)``, ``Year``
            and ``Region`` columns.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Loading data into SQLite database")
    try:
        conn = sqlite3.connect('World_Economies.db')
        try:
            table = df[['Country', 'GDP (B USD)', 'Year', 'Region']].rename(
                columns={'GDP (B USD)': 'GDP_USD_billion'}
            )
            table.to_sql('Countries_by_GDP', conn, if_exists='replace', index=False)
        finally:
            # Release the database handle even when the write fails.
            conn.close()
        log_message("Data successfully loaded into SQLite database")
    except Exception as e:
        log_message(f"Error while loading data into SQLite database: {str(e)}", level="ERROR")
        raise

def display_countries_with_gdp_over_100():
    """Print every country whose GDP is at least 100 billion USD.

    Reads ``Country`` and ``GDP_USD_billion`` from the ``Countries_by_GDP``
    table in ``World_Economies.db`` and prints the rows as a psql-style table.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Displaying countries with GDP over 100B USD")
    try:
        query = """
        SELECT Country, GDP_USD_billion
        FROM Countries_by_GDP
        WHERE GDP_USD_billion >= 100
        """
        conn = sqlite3.connect('World_Economies.db')
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Release the database handle even when the query fails.
            conn.close()
        print("Countries with GDP >= 100B USD:")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for countries with GDP >= 100B: {str(e)}", level="ERROR")
        raise

def display_region_top5_average_gdp():
    """Print, per region, the average GDP of its five largest economies.

    Reads the ``Countries_by_GDP`` table in ``World_Economies.db``, ignores
    rows whose ``Region`` is NULL, and prints one row per region as a
    psql-style table.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    log_message("Calculating average GDP of top 5 countries by region")
    try:
        # ROW_NUMBER (not RANK) caps each region at exactly five rows even
        # when GDP values tie; Country breaks ties deterministically.
        query = """
        WITH RankedCountries AS (
            SELECT Country, GDP_USD_billion, Region,
                   ROW_NUMBER() OVER (
                       PARTITION BY Region
                       ORDER BY GDP_USD_billion DESC, Country
                   ) AS RowNum
            FROM Countries_by_GDP
            WHERE Region IS NOT NULL
        )
        SELECT Region, AVG(GDP_USD_billion) AS Avg_Top5_GDP
        FROM RankedCountries
        WHERE RowNum <= 5
        GROUP BY Region
        """
        conn = sqlite3.connect('World_Economies.db')
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Release the database handle even when the query fails.
            conn.close()
        print("Average GDP of top 5 countries by region (excluding None):")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for top 5 average GDP: {str(e)}", level="ERROR")
        raise

def save_gdp_data(df, output_csv_file='extracted_gdp_data.csv', output_json_file='extracted_gdp_data.json'):
    """Write a DataFrame to a CSV file and to a JSON file.

    Args:
        df: DataFrame to persist.
        output_csv_file: Destination CSV path; the index is not written.
        output_json_file: Destination JSON path; written as a list of
            records, non-ASCII characters kept as-is, indented by 4 spaces.

    Raises:
        Exception: Any error raised while writing is logged and re-raised.
    """
    log_message("Saving Extracted Data")
    json_options = {'orient': 'records', 'force_ascii': False, 'indent': 4}
    try:
        df.to_csv(output_csv_file, index=False)
        df.to_json(output_json_file, **json_options)
        log_message(f"Data saved: CSV ({output_csv_file}), JSON ({output_json_file})")
    except Exception as exc:
        log_message(f"Failed to save data: {exc!s}", level="ERROR")
        raise

def etl_process():
    """Run the GDP pipeline: extract, save, transform, save, load, report.

    A failure in any stage is written to the log as "ETL Process Failed" and
    is not propagated to the caller.
    """
    try:
        log_started()
        log_message("ETL Process Started")

        # Extract, keeping a copy of the extracted data on disk.
        raw_frame = extract_gdp_data()
        save_gdp_data(raw_frame)

        # Transform, keeping a copy of the transformed data on disk.
        clean_frame = transform_gdp_data(raw_frame)
        save_gdp_data(
            clean_frame,
            'transformed_gdp_data.csv',
            'transformed_gdp_data.json',
        )

        # Load into SQLite, then print the two reports from the database.
        load_gdp_data(clean_frame)
        display_countries_with_gdp_over_100()
        display_region_top5_average_gdp()

        log_message("ETL Process Completed Successfully")
    except Exception as exc:
        log_message(f"ETL Process Failed: {exc!s}", level="ERROR")


if __name__ == "__main__":
    etl_process()


---

# extract_gdp_data()에서 진행하던 데이터프레임 전처리 과정을 transform_gdp_data() 함수에서 진행하도록 변경

In [None]:
import os
import json
import datetime
import sqlite3
import configparser

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate

def log_started():
    """Append a banner marking the start of a new ETL run to the log file.

    The banner consists of a blank line, a 50-character ``=`` rule, a
    ``New Execution at <timestamp>`` line, a second rule and a trailing blank
    line, appended to ``etl_project_log.txt`` in the working directory.
    """
    rule = "=" * 50
    with open('etl_project_log.txt', 'a') as log_file:
        started_at = datetime.datetime.now().strftime('%Y-%B-%d-%H-%M-%S')
        log_file.write(f"\n{rule}\nNew Execution at {started_at}\n{rule}\n\n")

def log_message(message, level="INFO"):
    """Append one ``<timestamp> - <level> - <message>`` line to the ETL log.

    Args:
        message: Text to record.
        level: Severity label written verbatim into the line.
    """
    stamp = datetime.datetime.now().strftime('%Y-%b-%d %H:%M:%S')
    entry = f"{stamp} - {level} - {message}\n"
    with open('etl_project_log.txt', 'a') as log_file:
        log_file.write(entry)

def extract_gdp_data(config_path='config.ini'):
    """Fetch the GDP table named in the config file and return it unprocessed.

    The ``DEFAULT`` section of ``config_path`` must provide ``URL`` (page to
    download) and ``TABLE_CLASS`` (CSS class of the ``<table>`` to parse). The
    parsed table is also written to ``raw_gdp_data.csv`` and
    ``raw_gdp_data.json`` before it is returned.

    Args:
        config_path: Path to the INI configuration file.

    Returns:
        The first DataFrame pandas parses from the matching table.

    Raises:
        FileNotFoundError: ``config_path`` does not exist.
        ValueError: A required config value is missing, or the page has no
            table with the configured class.
        Exception: Any other download, parsing or file error is logged and
            re-raised unchanged.
    """
    # Function-scope import so no new file-level import is required.
    from io import StringIO

    if not os.path.exists(config_path):
        log_message(f"Configuration file '{config_path}' not found.", level="ERROR")
        raise FileNotFoundError(f"Configuration file '{config_path}' not found.")

    config = configparser.ConfigParser()
    config.read(config_path)

    if 'DEFAULT' not in config or 'URL' not in config['DEFAULT'] or 'TABLE_CLASS' not in config['DEFAULT']:
        # Name the file that was actually read instead of a fixed 'config.ini'.
        problem = f"Invalid or missing configuration values in '{config_path}'."
        log_message(problem, level="ERROR")
        raise ValueError(problem)

    url = config['DEFAULT']['URL']
    table_class = config['DEFAULT']['TABLE_CLASS']

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'class': table_class})

        if table is None:
            log_message("No table found with the specified class.", level="ERROR")
            raise ValueError("Failed to locate the GDP table on the webpage.")

        # Passing literal HTML to read_html is deprecated in pandas; hand it a
        # file-like object instead.
        raw_df = pd.read_html(StringIO(str(table)))[0]
        raw_output_csv_file = 'raw_gdp_data.csv'
        raw_output_json_file = 'raw_gdp_data.json'

        # Save raw data before any processing
        raw_df.to_csv(raw_output_csv_file, index=False)
        raw_df.to_json(raw_output_json_file, orient='records', force_ascii=False, indent=4)
        log_message(f"Raw data saved: CSV ({raw_output_csv_file}), JSON ({raw_output_json_file})")

        return raw_df
    except Exception as e:
        log_message(f"Error during data extraction: {str(e)}", level="ERROR")
        raise

def transform_gdp_data(df):
    """Clean the raw GDP table and attach a region to every country.

    Keeps the first three columns (renamed Country / GDP (Nominal) / Year),
    drops rows lacking a country or GDP value, converts the GDP text to a
    float divided by 1e3 (presumably millions to billions of USD), strips
    bracketed footnote markers from Year, sorts by GDP descending, rounds to
    two decimals and maps each country to a region using
    ``country_region_table.json``.

    Args:
        df: Raw DataFrame as produced by the extraction step.

    Returns:
        DataFrame with columns Country, GDP (B USD), Year and Region.

    Raises:
        Exception: Any error during the transformation is logged and re-raised.
    """
    log_message("Starting Data Transformation")
    try:
        df = df.iloc[:, [0, 1, 2]]
        df.columns = ['Country', 'GDP (Nominal)', 'Year']
        # Take an explicit copy: the column assignments below would otherwise
        # target a filtered slice and can raise SettingWithCopyWarning.
        df = df.dropna(subset=['Country', 'GDP (Nominal)']).copy()
        # Keep only digits and dots; values with nothing left become 0.
        df['GDP (B USD)'] = (
            df['GDP (Nominal)']
            .str.replace(r'[^\d.]', '', regex=True)
            .replace('', '0')
            .astype(float) / 1e3
        )
        # Drop bracketed annotations (e.g. footnote markers) from the year.
        df['Year'] = df['Year'].str.replace(r'\[.*?\]', '', regex=True)
        df = df[['Country', 'GDP (B USD)', 'Year']]
        df = df.sort_values(by='GDP (B USD)', ascending=False)
        df['GDP (B USD)'] = df['GDP (B USD)'].round(2)

        with open('country_region_table.json', 'r', encoding='utf-8') as region_file:
            region_data = json.load(region_file)
        # Countries missing from the mapping receive NaN as their region.
        df['Region'] = df['Country'].map(region_data)

        log_message("Data Transformation Completed")
        return df
    except Exception as e:
        log_message(f"Error during data transformation: {str(e)}", level="ERROR")
        raise

def load_gdp_data(df):
    """Write the transformed GDP table to the ``Countries_by_GDP`` SQLite table.

    The Country, GDP (B USD), Year and Region columns are stored in
    ``World_Economies.db`` with ``GDP (B USD)`` renamed to
    ``GDP_USD_billion``; an existing table of the same name is replaced.

    Args:
        df: Transformed DataFrame containing at least those four columns.

    Raises:
        Exception: Any error while selecting columns or writing to the
            database is logged and re-raised.
    """
    log_message("Loading data into SQLite database")
    try:
        conn = sqlite3.connect('World_Economies.db')
        try:
            df[['Country', 'GDP (B USD)', 'Year', 'Region']].rename(
                columns={'GDP (B USD)': 'GDP_USD_billion'}
            ).to_sql('Countries_by_GDP', conn, if_exists='replace', index=False)
        finally:
            # Release the connection even when the write fails.
            conn.close()
        log_message("Data successfully loaded into SQLite database")
    except Exception as e:
        log_message(f"Error while loading data into SQLite database: {str(e)}", level="ERROR")
        raise

def display_countries_with_gdp_over_100():
    """Print every country whose GDP is at least 100 billion USD.

    Reads ``Country`` and ``GDP_USD_billion`` from the ``Countries_by_GDP``
    table in ``World_Economies.db`` and prints the matching rows as a
    psql-style table. Despite the function name, the filter is inclusive
    (``>= 100``).

    Raises:
        Exception: Any database, pandas or formatting error is logged and
            re-raised.
    """
    log_message("Displaying countries with GDP over 100B USD")
    try:
        conn = sqlite3.connect('World_Economies.db')
        query = """
        SELECT Country, GDP_USD_billion
        FROM Countries_by_GDP
        WHERE GDP_USD_billion >= 100
        """
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Release the connection even when the query fails.
            conn.close()
        print("Countries with GDP >= 100B USD:")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for countries with GDP >= 100B: {str(e)}", level="ERROR")
        raise

def display_region_top5_average_gdp():
    """Print, per region, the average GDP of its five highest-ranked countries.

    Rows with a NULL region are excluded. Countries are ranked inside each
    region by ``GDP_USD_billion`` (descending) with ``RANK()``, so countries
    tied at the cutoff share a rank and a region can contribute more than
    five rows to its average. The result is printed as a psql-style table.

    Raises:
        Exception: Any database, pandas or formatting error is logged and
            re-raised.
    """
    log_message("Calculating average GDP of top 5 countries by region")
    try:
        conn = sqlite3.connect('World_Economies.db')
        query = """
        WITH RankedCountries AS (
            SELECT Country, GDP_USD_billion, Region,
                   RANK() OVER (PARTITION BY Region ORDER BY GDP_USD_billion DESC) AS Rank
            FROM Countries_by_GDP
            WHERE Region IS NOT NULL
        )
        SELECT Region, AVG(GDP_USD_billion) AS Avg_Top5_GDP
        FROM RankedCountries
        WHERE Rank <= 5
        GROUP BY Region
        """
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Release the connection even when the query fails.
            conn.close()
        print("Average GDP of top 5 countries by region (excluding None):")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for top 5 average GDP: {str(e)}", level="ERROR")
        raise

def save_gdp_data(df, output_csv_file='extracted_gdp_data.csv', output_json_file='extracted_gdp_data.json'):
    """Write a DataFrame to a CSV file and to a JSON file.

    Args:
        df: DataFrame to persist.
        output_csv_file: Destination CSV path; the index is not written.
        output_json_file: Destination JSON path; written as a list of
            records, non-ASCII characters kept as-is, indented by 4 spaces.

    Raises:
        Exception: Any error raised while writing is logged and re-raised.
    """
    log_message("Saving Extracted Data")
    json_options = {'orient': 'records', 'force_ascii': False, 'indent': 4}
    try:
        df.to_csv(output_csv_file, index=False)
        df.to_json(output_json_file, **json_options)
        log_message(f"Data saved: CSV ({output_csv_file}), JSON ({output_json_file})")
    except Exception as exc:
        log_message(f"Failed to save data: {exc!s}", level="ERROR")
        raise

def etl_process():
    """Run the GDP pipeline: extract, transform, save, load, report.

    A failure in any stage is written to the log as "ETL Process Failed" and
    is not propagated to the caller.
    """
    try:
        log_started()
        log_message("ETL Process Started")

        # Extract the raw table, then clean it.
        raw_frame = extract_gdp_data()
        clean_frame = transform_gdp_data(raw_frame)

        # Persist the cleaned data to files and to SQLite.
        save_gdp_data(
            clean_frame,
            'transformed_gdp_data.csv',
            'transformed_gdp_data.json',
        )
        load_gdp_data(clean_frame)

        # Print the two reports from the database that was just loaded.
        display_countries_with_gdp_over_100()
        display_region_top5_average_gdp()

        log_message("ETL Process Completed Successfully")
    except Exception as exc:
        log_message(f"ETL Process Failed: {exc!s}", level="ERROR")


if __name__ == "__main__":
    etl_process()


---

# save_gdp_data() 함수를 없애고, save_gdp_data()에서 하던 저장 과정을 transform_gdp_data(df) 함수 안에 넣기

In [None]:
import os
import json
import datetime
import sqlite3
import configparser

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate


def log_started():
    """Append a banner marking the start of a new ETL run to the log file.

    The banner consists of a blank line, a 50-character ``=`` rule, a
    ``New Execution at <timestamp>`` line, a second rule and a trailing blank
    line, appended to ``etl_project_log.txt`` in the working directory.
    """
    rule = "=" * 50
    with open('etl_project_log.txt', 'a') as log_file:
        started_at = datetime.datetime.now().strftime('%Y-%B-%d-%H-%M-%S')
        log_file.write(f"\n{rule}\nNew Execution at {started_at}\n{rule}\n\n")
        

def log_message(message, level="INFO"):
    """Append one ``<timestamp> - <level> - <message>`` line to the ETL log.

    Args:
        message: Text to record.
        level: Severity label written verbatim into the line.
    """
    stamp = datetime.datetime.now().strftime('%Y-%b-%d %H:%M:%S')
    entry = f"{stamp} - {level} - {message}\n"
    with open('etl_project_log.txt', 'a') as log_file:
        log_file.write(entry)
        

def extract_gdp_data(config_path='config.ini'):
    """Fetch the GDP table named in the config file and return it unprocessed.

    The ``DEFAULT`` section of ``config_path`` must provide ``URL`` (page to
    download) and ``TABLE_CLASS`` (CSS class of the ``<table>`` to parse). The
    parsed table is also written to ``raw_gdp_data.csv`` and
    ``raw_gdp_data.json`` before it is returned.

    Args:
        config_path: Path to the INI configuration file.

    Returns:
        The first DataFrame pandas parses from the matching table.

    Raises:
        FileNotFoundError: ``config_path`` does not exist.
        ValueError: A required config value is missing, or the page has no
            table with the configured class.
        Exception: Any other download, parsing or file error is logged and
            re-raised unchanged.
    """
    # Function-scope import so no new file-level import is required.
    from io import StringIO

    if not os.path.exists(config_path):
        log_message(f"Configuration file '{config_path}' not found.", level="ERROR")
        raise FileNotFoundError(f"Configuration file '{config_path}' not found.")

    config = configparser.ConfigParser()
    config.read(config_path)

    if 'DEFAULT' not in config or 'URL' not in config['DEFAULT'] or 'TABLE_CLASS' not in config['DEFAULT']:
        # Name the file that was actually read instead of a fixed 'config.ini'.
        problem = f"Invalid or missing configuration values in '{config_path}'."
        log_message(problem, level="ERROR")
        raise ValueError(problem)

    url = config['DEFAULT']['URL']
    table_class = config['DEFAULT']['TABLE_CLASS']

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'class': table_class})

        if table is None:
            log_message("No table found with the specified class.", level="ERROR")
            raise ValueError("Failed to locate the GDP table on the webpage.")

        # Passing literal HTML to read_html is deprecated in pandas; hand it a
        # file-like object instead.
        raw_df = pd.read_html(StringIO(str(table)))[0]
        raw_output_csv_file = 'raw_gdp_data.csv'
        raw_output_json_file = 'raw_gdp_data.json'

        # Save raw data before any processing
        raw_df.to_csv(raw_output_csv_file, index=False)
        raw_df.to_json(raw_output_json_file, orient='records', force_ascii=False, indent=4)
        log_message(f"Raw data saved: CSV ({raw_output_csv_file}), JSON ({raw_output_json_file})")

        return raw_df
    except Exception as e:
        log_message(f"Error during data extraction: {str(e)}", level="ERROR")
        raise
    

def transform_gdp_data(df):
    """Clean the raw GDP table, attach regions and save the result.

    Keeps the first three columns (renamed Country / GDP (Nominal) / Year),
    drops rows lacking a country or GDP value, converts the GDP text to a
    float divided by 1e3 (presumably millions to billions of USD), strips
    bracketed footnote markers from Year, sorts by GDP descending, rounds to
    two decimals and maps each country to a region using
    ``country_region_table.json``. The result is written to
    ``transformed_gdp_data.csv`` and ``transformed_gdp_data.json``.

    Args:
        df: Raw DataFrame as produced by the extraction step.

    Returns:
        DataFrame with columns Country, GDP (B USD), Year and Region.

    Raises:
        Exception: Any error during the transformation or while saving is
            logged and re-raised.
    """
    log_message("Starting Data Transformation")
    try:
        df = df.iloc[:, [0, 1, 2]]
        df.columns = ['Country', 'GDP (Nominal)', 'Year']
        # Take an explicit copy: the column assignments below would otherwise
        # target a filtered slice and can raise SettingWithCopyWarning.
        df = df.dropna(subset=['Country', 'GDP (Nominal)']).copy()
        # Keep only digits and dots; values with nothing left become 0.
        df['GDP (B USD)'] = (
            df['GDP (Nominal)']
            .str.replace(r'[^\d.]', '', regex=True)
            .replace('', '0')
            .astype(float) / 1e3
        )
        # Drop bracketed annotations (e.g. footnote markers) from the year.
        df['Year'] = df['Year'].str.replace(r'\[.*?\]', '', regex=True)
        df = df[['Country', 'GDP (B USD)', 'Year']]
        df = df.sort_values(by='GDP (B USD)', ascending=False)
        df['GDP (B USD)'] = df['GDP (B USD)'].round(2)

        with open('country_region_table.json', 'r', encoding='utf-8') as region_file:
            region_data = json.load(region_file)
        # Countries missing from the mapping receive NaN as their region.
        df['Region'] = df['Country'].map(region_data)

        transformed_output_csv_file = 'transformed_gdp_data.csv'
        transformed_output_json_file = 'transformed_gdp_data.json'

        # Save transformed data
        df.to_csv(transformed_output_csv_file, index=False)
        df.to_json(transformed_output_json_file, orient='records', force_ascii=False, indent=4)
        log_message(f"Transformed data saved: CSV ({transformed_output_csv_file}), JSON ({transformed_output_json_file})")

        log_message("Data Transformation Completed")
        return df
    except Exception as e:
        log_message(f"Error during data transformation: {str(e)}", level="ERROR")
        raise


def load_gdp_data(df):
    """Write the transformed GDP table to the ``Countries_by_GDP`` SQLite table.

    The Country, GDP (B USD), Year and Region columns are stored in
    ``World_Economies.db`` with ``GDP (B USD)`` renamed to
    ``GDP_USD_billion``; an existing table of the same name is replaced.

    Args:
        df: Transformed DataFrame containing at least those four columns.

    Raises:
        Exception: Any error while selecting columns or writing to the
            database is logged and re-raised.
    """
    log_message("Loading data into SQLite database")
    try:
        conn = sqlite3.connect('World_Economies.db')
        try:
            df[['Country', 'GDP (B USD)', 'Year', 'Region']].rename(
                columns={'GDP (B USD)': 'GDP_USD_billion'}
            ).to_sql('Countries_by_GDP', conn, if_exists='replace', index=False)
        finally:
            # Release the connection even when the write fails.
            conn.close()
        log_message("Data successfully loaded into SQLite database")
    except Exception as e:
        log_message(f"Error while loading data into SQLite database: {str(e)}", level="ERROR")
        raise


def display_countries_with_gdp_over_100():
    """Print every country whose GDP is at least 100 billion USD.

    Reads ``Country`` and ``GDP_USD_billion`` from the ``Countries_by_GDP``
    table in ``World_Economies.db`` and prints the matching rows as a
    psql-style table. Despite the function name, the filter is inclusive
    (``>= 100``).

    Raises:
        Exception: Any database, pandas or formatting error is logged and
            re-raised.
    """
    log_message("Displaying countries with GDP over 100B USD")
    try:
        conn = sqlite3.connect('World_Economies.db')
        query = """
        SELECT Country, GDP_USD_billion
        FROM Countries_by_GDP
        WHERE GDP_USD_billion >= 100
        """
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Release the connection even when the query fails.
            conn.close()
        print("Countries with GDP >= 100B USD:")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for countries with GDP >= 100B: {str(e)}", level="ERROR")
        raise


def display_region_top5_average_gdp():
    """Print, per region, the average GDP of its five highest-ranked countries.

    Rows with a NULL region are excluded. Countries are ranked inside each
    region by ``GDP_USD_billion`` (descending) with ``RANK()``, so countries
    tied at the cutoff share a rank and a region can contribute more than
    five rows to its average. The result is printed as a psql-style table.

    Raises:
        Exception: Any database, pandas or formatting error is logged and
            re-raised.
    """
    log_message("Calculating average GDP of top 5 countries by region")
    try:
        conn = sqlite3.connect('World_Economies.db')
        query = """
        WITH RankedCountries AS (
            SELECT Country, GDP_USD_billion, Region,
                   RANK() OVER (PARTITION BY Region ORDER BY GDP_USD_billion DESC) AS Rank
            FROM Countries_by_GDP
            WHERE Region IS NOT NULL
        )
        SELECT Region, AVG(GDP_USD_billion) AS Avg_Top5_GDP
        FROM RankedCountries
        WHERE Rank <= 5
        GROUP BY Region
        """
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Release the connection even when the query fails.
            conn.close()
        print("Average GDP of top 5 countries by region (excluding None):")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for top 5 average GDP: {str(e)}", level="ERROR")
        raise

def etl_process():
    """Run the GDP pipeline: extract, transform, load, then print the reports.

    A failure in any stage is written to the log as "ETL Process Failed" and
    is not propagated to the caller.
    """
    try:
        log_started()
        log_message("ETL Process Started")

        # Extract -> Transform -> Load
        raw_frame = extract_gdp_data()
        clean_frame = transform_gdp_data(raw_frame)
        load_gdp_data(clean_frame)

        # Required console output, read back from the loaded database.
        display_countries_with_gdp_over_100()
        display_region_top5_average_gdp()

        log_message("ETL Process Completed Successfully")
    except Exception as exc:
        log_message(f"ETL Process Failed: {exc!s}", level="ERROR")


if __name__ == "__main__":
    etl_process()


Countries with GDP >= 100B USD:
+----------------------+-------------------+
| Country              |   GDP_USD_billion |
|----------------------+-------------------|
| World                |         115494    |
| United States        |          30337.2  |
| China                |          19534.9  |
| Germany              |           4921.56 |
| Japan                |           4389.33 |
| India                |           4271.92 |
| United Kingdom       |           3730.26 |
| France               |           3283.43 |
| Italy                |           2459.6  |
| Canada               |           2330.31 |
| Brazil               |           2307.16 |
| Russia               |           2195.71 |
| South Korea          |           1947.13 |
| Australia            |           1881.14 |
| Spain                |           1827.58 |
| Mexico               |           1817.82 |
| Indonesia            |           1492.62 |
| Turkey               |           1455.41 |
| Netherlands          

  raw_df = pd.read_html(str(table))[0]


# Python의 반복문은 Pandas의 병합 연산보다 속도가 느리므로 transform에 해당 부분 개선
```python
with open('country_region_table.json', 'r', encoding='utf-8') as region_file:
    region_data = json.load(region_file)
df['Region'] = df['Country'].map(region_data)
```

### Pandas 병합의 장점
- 최적화된 C 라이브러리 사용: Pandas는 내부적으로 C로 구현된 병합 연산을 사용하기 때문에 반복문에 비해 훨씬 빠르게 동작합니다.
- 벡터화 연산: Pandas 병합은 벡터화 방식으로 작동하므로, 각 행에 대해 개별적으로 처리하지 않고 한 번에 데이터프레임 전체를 처리합니다.
- 큰 데이터셋에 적합: 수천에서 수백만 개의 행이 있는 데이터셋에서도 효율적입니다.
- Python 반복문의 단점: 인터프리터 방식으로 실행됩니다. Python 반복문은 한 행씩 처리하며, 각 반복이 Python 인터프리터에서 실행되기 때문에 속도가 상대적으로 느립니다.

1. Python 반복문 vs Pandas 병합

	- Python 반복문
		-	반복문 방식(map 또는 apply)은 각 행마다 순차적으로 데이터를 처리합니다.
		-	각 국가에 대해 JSON 데이터를 검색하고 매칭해야 하므로 $O(n \times k)$의 시간 복잡도가 발생합니다.
		-	여기서 $n$은 데이터프레임의 행 수, $k$는 JSON 딕셔너리 키의 수입니다. (단, 딕셔너리 조회는 해시 기반이라 평균 $O(1)$이므로, 딕셔너리를 그대로 조회하는 경우 실제 비용은 $O(n)$에 가깝습니다.)
		-	Python의 반복문은 인터프리터 방식으로 실행되기 때문에, 대량 데이터셋에서 비효율적입니다.

	- Pandas 병합
		-	Pandas 병합은 벡터화 연산을 사용하며, 내부적으로 최적화된 C 라이브러리를 통해 동작합니다.
		-	병합 연산의 시간 복잡도는 $O(n + m)$으로, $n$은 데이터프레임의 행 수, $m$은 병합 대상(region_df) 데이터프레임의 행 수입니다.
		-	대량 데이터셋에서도 병합 속도가 훨씬 빠릅니다.

2. JSON 데이터를 Pandas DataFrame으로 변환

	- 변경된 코드에서 json.load()로 읽은 데이터를 Pandas DataFrame으로 변환했기 때문에:
		1. 	JSON 데이터를 DataFrame으로 한 번만 변환하면 됩니다.
		2.	Pandas 병합 연산으로 모든 국가와 지역 정보를 한 번에 매칭할 수 있습니다.

3. 연산 속도의 차이
	- 기존 방식 (Python 반복문):
		- 각 행마다 JSON 딕셔너리를 검색하며, 반복적으로 키를 확인해야 하므로 시간이 오래 걸립니다.
	- 변경된 방식 (Pandas 병합):
		- 두 DataFrame을 효율적으로 병합하며, 전체 데이터를 벡터화 연산으로 처리하기 때문에 실행 속도가 매우 빨라집니다.

4. 결론
- 변경된 코드는 Python 반복문 대신 Pandas의 병합 연산을 사용하여 연산 속도가 크게 개선되었습니다. 특히 데이터셋이 커질수록 성능 차이가 더 두드러질 것입니다. 따라서 Pandas 병합 방식을 사용하는 것이 더 효율적이고 확장성이 뛰어난 선택입니다.

In [11]:
import os
import json
import datetime
import sqlite3
import configparser

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate


def log_started():
    """Append a banner marking the start of a new ETL run to the log file.

    The banner consists of a blank line, a 50-character ``=`` rule, a
    ``New Execution at <timestamp>`` line, a second rule and a trailing blank
    line, appended to ``etl_project_log.txt`` in the working directory.
    """
    rule = "=" * 50
    with open('etl_project_log.txt', 'a') as log_file:
        started_at = datetime.datetime.now().strftime('%Y-%B-%d-%H-%M-%S')
        log_file.write(f"\n{rule}\nNew Execution at {started_at}\n{rule}\n\n")
        

def log_message(message, level="INFO"):
    """Append one ``<timestamp> - <level> - <message>`` line to the ETL log.

    Args:
        message: Text to record.
        level: Severity label written verbatim into the line.
    """
    stamp = datetime.datetime.now().strftime('%Y-%b-%d %H:%M:%S')
    entry = f"{stamp} - {level} - {message}\n"
    with open('etl_project_log.txt', 'a') as log_file:
        log_file.write(entry)
        

def extract_gdp_data(config_path='config.ini'):
    """Fetch the GDP table named in the config file and return it unprocessed.

    The ``DEFAULT`` section of ``config_path`` must provide ``URL`` (page to
    download) and ``TABLE_CLASS`` (CSS class of the ``<table>`` to parse). The
    parsed table is also written to ``raw_gdp_data.csv`` and
    ``raw_gdp_data.json`` before it is returned.

    Args:
        config_path: Path to the INI configuration file.

    Returns:
        The first DataFrame pandas parses from the matching table.

    Raises:
        FileNotFoundError: ``config_path`` does not exist.
        ValueError: A required config value is missing, or the page has no
            table with the configured class.
        Exception: Any other download, parsing or file error is logged and
            re-raised unchanged.
    """
    # Function-scope import so no new file-level import is required.
    from io import StringIO

    if not os.path.exists(config_path):
        log_message(f"Configuration file '{config_path}' not found.", level="ERROR")
        raise FileNotFoundError(f"Configuration file '{config_path}' not found.")

    config = configparser.ConfigParser()
    config.read(config_path)

    if 'DEFAULT' not in config or 'URL' not in config['DEFAULT'] or 'TABLE_CLASS' not in config['DEFAULT']:
        # Name the file that was actually read instead of a fixed 'config.ini'.
        problem = f"Invalid or missing configuration values in '{config_path}'."
        log_message(problem, level="ERROR")
        raise ValueError(problem)

    url = config['DEFAULT']['URL']
    table_class = config['DEFAULT']['TABLE_CLASS']

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'class': table_class})

        if table is None:
            log_message("No table found with the specified class.", level="ERROR")
            raise ValueError("Failed to locate the GDP table on the webpage.")

        # Passing literal HTML to read_html is deprecated in pandas; hand it a
        # file-like object instead.
        raw_df = pd.read_html(StringIO(str(table)))[0]
        raw_output_csv_file = 'raw_gdp_data.csv'
        raw_output_json_file = 'raw_gdp_data.json'

        # Save raw data before any processing
        raw_df.to_csv(raw_output_csv_file, index=False)
        raw_df.to_json(raw_output_json_file, orient='records', force_ascii=False, indent=4)
        log_message(f"Raw data saved: CSV ({raw_output_csv_file}), JSON ({raw_output_json_file})")

        return raw_df
    except Exception as e:
        log_message(f"Error during data extraction: {str(e)}", level="ERROR")
        raise
    

def transform_gdp_data(df):
    """Clean the raw GDP table, merge in regions and save the result.

    Keeps the first three columns (renamed Country / GDP (Nominal) / Year),
    drops rows lacking a country or GDP value, converts the GDP text to a
    float divided by 1e3 (presumably millions to billions of USD), strips
    bracketed footnote markers from Year, sorts by GDP descending and rounds
    to two decimals. Regions from ``country_region_table.json`` are attached
    with a left merge on Country, and the result is written to
    ``transformed_gdp_data.csv`` and ``transformed_gdp_data.json``.

    Args:
        df: Raw DataFrame as produced by the extraction step.

    Returns:
        DataFrame with columns Country, GDP (B USD), Year and Region.

    Raises:
        Exception: Any error during the transformation or while saving is
            logged and re-raised.
    """
    log_message("Starting Data Transformation")
    try:
        # Basic cleaning and conversion.
        df = df.iloc[:, [0, 1, 2]]
        df.columns = ['Country', 'GDP (Nominal)', 'Year']
        # Take an explicit copy: the column assignments below would otherwise
        # target a filtered slice and can raise SettingWithCopyWarning.
        df = df.dropna(subset=['Country', 'GDP (Nominal)']).copy()
        # Keep only digits and dots; values with nothing left become 0.
        df['GDP (B USD)'] = (
            df['GDP (Nominal)']
            .str.replace(r'[^\d.]', '', regex=True)
            .replace('', '0')
            .astype(float) / 1e3
        )
        # Drop bracketed annotations (e.g. footnote markers) from the year.
        df['Year'] = df['Year'].str.replace(r'\[.*?\]', '', regex=True)
        df = df[['Country', 'GDP (B USD)', 'Year']]
        df = df.sort_values(by='GDP (B USD)', ascending=False)
        df['GDP (B USD)'] = df['GDP (B USD)'].round(2)

        # Read the country -> region mapping from JSON.
        with open('country_region_table.json', 'r', encoding='utf-8') as region_file:
            region_data = json.load(region_file)

        # Turn the mapping into a two-column DataFrame so it can be merged.
        region_df = pd.DataFrame(list(region_data.items()), columns=['Country', 'Region'])

        # Left merge: countries missing from the mapping keep a NaN region.
        df = pd.merge(df, region_df, on='Country', how='left')

        # Save the transformed data.
        transformed_output_csv_file = 'transformed_gdp_data.csv'
        transformed_output_json_file = 'transformed_gdp_data.json'

        df.to_csv(transformed_output_csv_file, index=False)
        df.to_json(transformed_output_json_file, orient='records', force_ascii=False, indent=4)

        log_message(f"Transformed data saved: CSV ({transformed_output_csv_file}), JSON ({transformed_output_json_file})")
        log_message("Data Transformation Completed")
        return df
    except Exception as e:
        log_message(f"Error during data transformation: {str(e)}", level="ERROR")
        raise


def load_gdp_data(df):
    """Write the transformed GDP table to the ``Countries_by_GDP`` SQLite table.

    The Country, GDP (B USD), Year and Region columns are stored in
    ``World_Economies.db`` with ``GDP (B USD)`` renamed to
    ``GDP_USD_billion``; an existing table of the same name is replaced.

    Args:
        df: Transformed DataFrame containing at least those four columns.

    Raises:
        Exception: Any error while selecting columns or writing to the
            database is logged and re-raised.
    """
    log_message("Loading data into SQLite database")
    try:
        conn = sqlite3.connect('World_Economies.db')
        try:
            df[['Country', 'GDP (B USD)', 'Year', 'Region']].rename(
                columns={'GDP (B USD)': 'GDP_USD_billion'}
            ).to_sql('Countries_by_GDP', conn, if_exists='replace', index=False)
        finally:
            # Release the connection even when the write fails.
            conn.close()
        log_message("Data successfully loaded into SQLite database")
    except Exception as e:
        log_message(f"Error while loading data into SQLite database: {str(e)}", level="ERROR")
        raise


def display_countries_with_gdp_over_100():
    """Print every country whose GDP is at least 100 billion USD.

    Reads ``Country`` and ``GDP_USD_billion`` from the ``Countries_by_GDP``
    table in ``World_Economies.db`` and prints the matching rows as a
    psql-style table. Despite the function name, the filter is inclusive
    (``>= 100``).

    Raises:
        Exception: Any database, pandas or formatting error is logged and
            re-raised.
    """
    log_message("Displaying countries with GDP over 100B USD")
    try:
        conn = sqlite3.connect('World_Economies.db')
        query = """
        SELECT Country, GDP_USD_billion
        FROM Countries_by_GDP
        WHERE GDP_USD_billion >= 100
        """
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Release the connection even when the query fails.
            conn.close()
        print("Countries with GDP >= 100B USD:")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for countries with GDP >= 100B: {str(e)}", level="ERROR")
        raise


def display_region_top5_average_gdp():
    """Print, per region, the average GDP of its five highest-ranked countries.

    Rows with a NULL region are excluded. Countries are ranked inside each
    region by ``GDP_USD_billion`` (descending) with ``RANK()``, so countries
    tied at the cutoff share a rank and a region can contribute more than
    five rows to its average. The result is printed as a psql-style table.

    Raises:
        Exception: Any database, pandas or formatting error is logged and
            re-raised.
    """
    log_message("Calculating average GDP of top 5 countries by region")
    try:
        conn = sqlite3.connect('World_Economies.db')
        query = """
        WITH RankedCountries AS (
            SELECT Country, GDP_USD_billion, Region,
                   RANK() OVER (PARTITION BY Region ORDER BY GDP_USD_billion DESC) AS Rank
            FROM Countries_by_GDP
            WHERE Region IS NOT NULL
        )
        SELECT Region, AVG(GDP_USD_billion) AS Avg_Top5_GDP
        FROM RankedCountries
        WHERE Rank <= 5
        GROUP BY Region
        """
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Release the connection even when the query fails.
            conn.close()
        print("Average GDP of top 5 countries by region (excluding None):")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for top 5 average GDP: {str(e)}", level="ERROR")
        raise

def etl_process():
    """Run the GDP pipeline: extract, transform, load, then print the reports.

    A failure in any stage is written to the log as "ETL Process Failed" and
    is not propagated to the caller.
    """
    try:
        log_started()
        log_message("ETL Process Started")

        # Extract -> Transform -> Load
        raw_frame = extract_gdp_data()
        clean_frame = transform_gdp_data(raw_frame)
        load_gdp_data(clean_frame)

        # Required console output, read back from the loaded database.
        display_countries_with_gdp_over_100()
        display_region_top5_average_gdp()

        log_message("ETL Process Completed Successfully")
    except Exception as exc:
        log_message(f"ETL Process Failed: {exc!s}", level="ERROR")


if __name__ == "__main__":
    etl_process()


Countries with GDP >= 100B USD:
+----------------------+-------------------+
| Country              |   GDP_USD_billion |
|----------------------+-------------------|
| World                |         115494    |
| United States        |          30337.2  |
| China                |          19534.9  |
| Germany              |           4921.56 |
| Japan                |           4389.33 |
| India                |           4271.92 |
| United Kingdom       |           3730.26 |
| France               |           3283.43 |
| Italy                |           2459.6  |
| Canada               |           2330.31 |
| Brazil               |           2307.16 |
| Russia               |           2195.71 |
| South Korea          |           1947.13 |
| Australia            |           1881.14 |
| Spain                |           1827.58 |
| Mexico               |           1817.82 |
| Indonesia            |           1492.62 |
| Turkey               |           1455.41 |
| Netherlands          

  raw_df = pd.read_html(str(table))[0]


---

# transform_gdp_data(df)에서 진행하는 과정 중 try문에서 오류를 더 정확히 확인하기 위해서 더 작게 쪼갬

In [13]:
import os
import json
import datetime
import sqlite3
import configparser

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate


def log_started():
    """Append a banner marking the start of a new ETL run to the log file.

    The banner consists of a blank line, a 50-character ``=`` rule, a
    ``New Execution at <timestamp>`` line, a second rule and a trailing blank
    line, appended to ``etl_project_log.txt`` in the working directory.
    """
    rule = "=" * 50
    with open('etl_project_log.txt', 'a') as log_file:
        started_at = datetime.datetime.now().strftime('%Y-%B-%d-%H-%M-%S')
        log_file.write(f"\n{rule}\nNew Execution at {started_at}\n{rule}\n\n")
        

def log_message(message, level="INFO"):
    """Append one ``<timestamp> - <level> - <message>`` line to the ETL log.

    Args:
        message: Text to record.
        level: Severity label written verbatim into the line.
    """
    stamp = datetime.datetime.now().strftime('%Y-%b-%d %H:%M:%S')
    entry = f"{stamp} - {level} - {message}\n"
    with open('etl_project_log.txt', 'a') as log_file:
        log_file.write(entry)
        

def extract_gdp_data(config_path='config.ini'):
    """Fetch the GDP table named in the config file and return it unprocessed.

    The ``DEFAULT`` section of ``config_path`` must provide ``URL`` (page to
    download) and ``TABLE_CLASS`` (CSS class of the ``<table>`` to parse). The
    parsed table is also written to ``raw_gdp_data.csv`` and
    ``raw_gdp_data.json`` before it is returned.

    Args:
        config_path: Path to the INI configuration file.

    Returns:
        The first DataFrame pandas parses from the matching table.

    Raises:
        FileNotFoundError: ``config_path`` does not exist.
        ValueError: A required config value is missing, or the page has no
            table with the configured class.
        Exception: Any other download, parsing or file error is logged and
            re-raised unchanged.
    """
    # Function-scope import so no new file-level import is required.
    from io import StringIO

    if not os.path.exists(config_path):
        log_message(f"Configuration file '{config_path}' not found.", level="ERROR")
        raise FileNotFoundError(f"Configuration file '{config_path}' not found.")

    config = configparser.ConfigParser()
    config.read(config_path)

    if 'DEFAULT' not in config or 'URL' not in config['DEFAULT'] or 'TABLE_CLASS' not in config['DEFAULT']:
        # Name the file that was actually read instead of a fixed 'config.ini'.
        problem = f"Invalid or missing configuration values in '{config_path}'."
        log_message(problem, level="ERROR")
        raise ValueError(problem)

    url = config['DEFAULT']['URL']
    table_class = config['DEFAULT']['TABLE_CLASS']

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'class': table_class})

        if table is None:
            log_message("No table found with the specified class.", level="ERROR")
            raise ValueError("Failed to locate the GDP table on the webpage.")

        # Passing literal HTML to read_html is deprecated in pandas; hand it a
        # file-like object instead.
        raw_df = pd.read_html(StringIO(str(table)))[0]
        raw_output_csv_file = 'raw_gdp_data.csv'
        raw_output_json_file = 'raw_gdp_data.json'

        # Save raw data before any processing
        raw_df.to_csv(raw_output_csv_file, index=False)
        raw_df.to_json(raw_output_json_file, orient='records', force_ascii=False, indent=4)
        log_message(f"Raw data saved: CSV ({raw_output_csv_file}), JSON ({raw_output_json_file})")

        return raw_df
    except Exception as e:
        log_message(f"Error during data extraction: {str(e)}", level="ERROR")
        raise
    

def transform_gdp_data(df):
    """Clean the raw GDP table, merge in regions and save the result.

    The work is done in three stages, each with its own error handler so the
    log identifies which stage failed:

    1. Cleaning: keep the first three columns (renamed Country /
       GDP (Nominal) / Year), drop rows lacking a country or GDP value,
       convert the GDP text to a float divided by 1e3 (presumably millions
       to billions of USD), strip bracketed markers from Year, sort by GDP
       descending and round to two decimals.
    2. Region merge: left-merge the mapping from
       ``country_region_table.json`` on Country.
    3. Save: write ``transformed_gdp_data.csv`` and
       ``transformed_gdp_data.json``.

    Args:
        df: Raw DataFrame as produced by the extraction step.

    Returns:
        DataFrame with columns Country, GDP (B USD), Year and Region.

    Raises:
        FileNotFoundError: The region mapping file is missing.
        Exception: Any other error is logged by the failing stage, logged
            again by the outer handler, and re-raised.
    """
    log_message("Starting Data Transformation")
    try:
        # 1. Basic cleaning and conversion.
        try:
            df = df.iloc[:, [0, 1, 2]]
            df.columns = ['Country', 'GDP (Nominal)', 'Year']
            # Take an explicit copy: the column assignments below would
            # otherwise target a filtered slice and can raise
            # SettingWithCopyWarning.
            df = df.dropna(subset=['Country', 'GDP (Nominal)']).copy()
            # Keep only digits and dots; values with nothing left become 0.
            df['GDP (B USD)'] = (
                df['GDP (Nominal)']
                .str.replace(r'[^\d.]', '', regex=True)
                .replace('', '0')
                .astype(float) / 1e3
            )
            # Drop bracketed annotations (e.g. footnote markers) from the year.
            df['Year'] = df['Year'].str.replace(r'\[.*?\]', '', regex=True)
            df = df[['Country', 'GDP (B USD)', 'Year']]
            df = df.sort_values(by='GDP (B USD)', ascending=False)
            df['GDP (B USD)'] = df['GDP (B USD)'].round(2)
            log_message("Data cleaning and transformation completed.")
        except Exception as e:
            log_message(f"Error in data cleaning and transformation: {str(e)}", level="ERROR")
            raise

        # 2. Read the JSON mapping and merge it in.
        try:
            with open('country_region_table.json', 'r', encoding='utf-8') as region_file:
                region_data = json.load(region_file)
            region_df = pd.DataFrame(list(region_data.items()), columns=['Country', 'Region'])
            # Left merge: countries missing from the mapping keep a NaN region.
            df = pd.merge(df, region_df, on='Country', how='left')
            log_message("Region data merged successfully.")
        except FileNotFoundError:
            log_message("Region mapping file 'country_region_table.json' not found.", level="ERROR")
            raise
        except Exception as e:
            log_message(f"Error during region data merging: {str(e)}", level="ERROR")
            raise

        # 3. Save the transformed data.
        try:
            transformed_output_csv_file = 'transformed_gdp_data.csv'
            transformed_output_json_file = 'transformed_gdp_data.json'

            df.to_csv(transformed_output_csv_file, index=False)
            df.to_json(transformed_output_json_file, orient='records', force_ascii=False, indent=4)
            log_message(f"Transformed data saved: CSV ({transformed_output_csv_file}), JSON ({transformed_output_json_file})")
        except Exception as e:
            log_message(f"Error while saving transformed data: {str(e)}", level="ERROR")
            raise

        log_message("Data Transformation Completed")
        return df

    except Exception as e:
        log_message(f"Error during data transformation: {str(e)}", level="ERROR")
        raise


def load_gdp_data(df):
    """
    Load the transformed data into an SQLite database.

    Writes the 'Country', 'GDP (B USD)', 'Year' and 'Region' columns to the
    'Countries_by_GDP' table of 'World_Economies.db', replacing the table if
    it already exists. 'GDP (B USD)' is stored under the name
    'GDP_USD_billion'.

    Parameters:
        df (pd.DataFrame): Transformed data containing the four columns above.

    Raises:
        Exception: Any error raised while connecting or writing is logged
            and re-raised.
    """
    log_message("Loading data into SQLite database")
    try:
        conn = sqlite3.connect('World_Economies.db')
        try:
            df[['Country', 'GDP (B USD)', 'Year', 'Region']].rename(
                columns={'GDP (B USD)': 'GDP_USD_billion'}
            ).to_sql('Countries_by_GDP', conn, if_exists='replace', index=False)
        finally:
            # Release the database handle even when the write fails.
            conn.close()
        log_message("Data successfully loaded into SQLite database")
    except Exception as e:
        log_message(f"Error while loading data into SQLite database: {str(e)}", level="ERROR")
        raise


def display_countries_with_gdp_over_100():
    """
    Display countries whose GDP is at least 100B USD.

    Reads 'Country' and 'GDP_USD_billion' from the 'Countries_by_GDP' table
    in 'World_Economies.db' (rows with GDP_USD_billion >= 100) and prints
    them as a psql-style table.

    Raises:
        Exception: Any error raised while querying or printing is logged
            and re-raised.
    """
    log_message("Displaying countries with GDP over 100B USD")
    try:
        conn = sqlite3.connect('World_Economies.db')
        query = """
        SELECT Country, GDP_USD_billion
        FROM Countries_by_GDP
        WHERE GDP_USD_billion >= 100
        """
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Close the connection even if the query fails.
            conn.close()
        print("Countries with GDP >= 100B USD:")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for countries with GDP >= 100B: {str(e)}", level="ERROR")
        raise


def display_region_top5_average_gdp():
    """
    Calculate and display the average GDP of the top 5 countries by region.

    Rows without a region are excluded. Within each region the rows are
    numbered by descending GDP and the first five are averaged.

    Raises:
        Exception: Any error raised while querying or printing is logged
            and re-raised.
    """
    log_message("Calculating average GDP of top 5 countries by region")
    try:
        conn = sqlite3.connect('World_Economies.db')
        # ROW_NUMBER() (not RANK()) guarantees at most five rows per region:
        # RANK() gives tied rows the same rank, so several countries sharing
        # a GDP value (e.g. 0) could all pass the "<= 5" filter and skew the
        # average. Country is a deterministic tie-breaker.
        query = """
        WITH RankedCountries AS (
            SELECT Country, GDP_USD_billion, Region,
                   ROW_NUMBER() OVER (
                       PARTITION BY Region
                       ORDER BY GDP_USD_billion DESC, Country
                   ) AS Rank
            FROM Countries_by_GDP
            WHERE Region IS NOT NULL
        )
        SELECT Region, AVG(GDP_USD_billion) AS Avg_Top5_GDP
        FROM RankedCountries
        WHERE Rank <= 5
        GROUP BY Region
        """
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Close the connection even if the query fails.
            conn.close()
        print("Average GDP of top 5 countries by region (excluding None):")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for top 5 average GDP: {str(e)}", level="ERROR")
        raise

def etl_process():
    """Run the GDP pipeline end to end: extract, transform, load, then report.

    Any exception raised by a stage is written to the log as an ERROR entry
    and is not propagated to the caller.
    """
    try:
        log_started()
        log_message("ETL Process Started")

        # Extract -> Transform -> Load
        extracted = extract_gdp_data()
        cleaned = transform_gdp_data(extracted)
        load_gdp_data(cleaned)

        # Required on-screen reports, printed in this order
        for report in (display_countries_with_gdp_over_100,
                       display_region_top5_average_gdp):
            report()

        log_message("ETL Process Completed Successfully")
    except Exception as err:
        log_message(f"ETL Process Failed: {str(err)}", level="ERROR")

# Run the ETL pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    etl_process()


Countries with GDP >= 100B USD:
+----------------------+-------------------+
| Country              |   GDP_USD_billion |
|----------------------+-------------------|
| World                |         115494    |
| United States        |          30337.2  |
| China                |          19534.9  |
| Germany              |           4921.56 |
| Japan                |           4389.33 |
| India                |           4271.92 |
| United Kingdom       |           3730.26 |
| France               |           3283.43 |
| Italy                |           2459.6  |
| Canada               |           2330.31 |
| Brazil               |           2307.16 |
| Russia               |           2195.71 |
| South Korea          |           1947.13 |
| Australia            |           1881.14 |
| Spain                |           1827.58 |
| Mexico               |           1817.82 |
| Indonesia            |           1492.62 |
| Turkey               |           1455.41 |
| Netherlands          

  raw_df = pd.read_html(str(table))[0]


---


# extract_gdp_data()에서 진행하는 과정 중 오류 위치를 더 정확히 확인하기 위해 try문을 단계별로 더 작게 쪼갬

In [14]:
import os
import json
import datetime
import sqlite3
import configparser

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate


def log_started():
    """Append a banner marking the start of a new ETL run to the log file."""
    rule = "=" * 50
    with open('etl_project_log.txt', 'a') as log_file:
        started_at = datetime.datetime.now().strftime('%Y-%B-%d-%H-%M-%S')
        # Blank line, rule, headline, rule, blank line.
        log_file.write(f"\n{rule}\nNew Execution at {started_at}\n{rule}\n\n")
        

def log_message(message, level="INFO"):
    """
    Log a message with a timestamp and severity level.

    Appends one line of the form '<timestamp> - <level> - <message>' to
    'etl_project_log.txt'.

    Parameters:
        message (str): The message to log.
        level (str): The severity level of the message (default: "INFO").
    """
    timestamp = datetime.datetime.now().strftime('%Y-%b-%d %H:%M:%S')
    # Pin UTF-8 so a non-ASCII message (e.g. text from an exception) cannot
    # raise UnicodeEncodeError under a narrower platform default encoding,
    # which would mask the error being logged.
    with open('etl_project_log.txt', 'a', encoding='utf-8') as log_file:
        log_file.write(f"{timestamp} - {level} - {message}\n")
        

def extract_gdp_data(config_path='config.ini'):
    """
    Extract GDP data from the URL and table class specified in the config file.

    Reads 'URL' and 'TABLE_CLASS' from the DEFAULT section of the config
    file, downloads the page, locates the first <table> with that class,
    converts it to a DataFrame and saves the unprocessed result to
    'raw_gdp_data.csv' and 'raw_gdp_data.json'.

    Parameters:
        config_path (str): Path to the configuration file (default: 'config.ini').

    Returns:
        pd.DataFrame: The table exactly as parsed from the page.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
        ValueError: If a required config value is missing or the table is not found.
        requests.exceptions.RequestException: If the HTTP request fails or times out.
    """
    # Local import: pandas expects a file-like object for HTML text (see step 4).
    from io import StringIO

    log_message("Starting data extraction process")

    # 1. Check that the configuration file exists, then read it
    if not os.path.exists(config_path):
        log_message(f"Configuration file '{config_path}' not found.", level="ERROR")
        raise FileNotFoundError(f"Configuration file '{config_path}' not found.")

    try:
        config = configparser.ConfigParser()
        config.read(config_path)
        if 'DEFAULT' not in config or 'URL' not in config['DEFAULT'] or 'TABLE_CLASS' not in config['DEFAULT']:
            log_message("Invalid or missing configuration values in 'config.ini'.", level="ERROR")
            raise ValueError("Invalid or missing configuration values in 'config.ini'.")
        url = config['DEFAULT']['URL']
        table_class = config['DEFAULT']['TABLE_CLASS']
        log_message("Configuration file read successfully")
    except Exception as e:
        log_message(f"Error reading configuration file: {str(e)}", level="ERROR")
        raise

    # 2. Request the web page and fetch its HTML
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        log_message(f"Successfully fetched data from URL: {url}")
    except requests.exceptions.Timeout:
        log_message("Request timed out while accessing the URL.", level="ERROR")
        raise
    except requests.exceptions.RequestException as e:
        log_message(f"Error during requests to the URL: {str(e)}", level="ERROR")
        raise

    # 3. Parse the HTML and locate the table
    try:
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'class': table_class})
        if table is None:
            log_message(f"No table found with the specified class '{table_class}'.", level="ERROR")
            raise ValueError("Failed to locate the GDP table on the webpage.")
        log_message("Successfully located the GDP table in the HTML")
    except Exception as e:
        log_message(f"Error parsing HTML or finding table: {str(e)}", level="ERROR")
        raise

    # 4. Convert the table to a DataFrame
    try:
        # Passing literal HTML to read_html is deprecated; wrap it in StringIO.
        raw_df = pd.read_html(StringIO(str(table)))[0]
        raw_output_csv_file = 'raw_gdp_data.csv'
        raw_output_json_file = 'raw_gdp_data.json'

        # Save raw data before any processing
        raw_df.to_csv(raw_output_csv_file, index=False)
        raw_df.to_json(raw_output_json_file, orient='records', force_ascii=False, indent=4)
        log_message(f"Raw data saved: CSV ({raw_output_csv_file}), JSON ({raw_output_json_file})")
    except Exception as e:
        log_message(f"Error converting table to DataFrame or saving raw data: {str(e)}", level="ERROR")
        raise

    log_message("Data extraction process completed successfully")
    return raw_df
    

def transform_gdp_data(df):
    """
    Transform the extracted GDP data and save it.

    Keeps the first three columns (named 'Country', 'GDP (Nominal)' and
    'Year'), drops rows missing a country or GDP, derives 'GDP (B USD)' by
    dividing the parsed GDP number by 1e3 (assumes the source values are in
    millions of USD - TODO confirm), strips bracketed markers such as '[n 1]'
    from 'Year', sorts by GDP descending, left-joins a 'Region' per country
    from 'country_region_table.json', and writes the result to
    'transformed_gdp_data.csv' and 'transformed_gdp_data.json'.

    Parameters:
        df (pd.DataFrame): Raw data as returned by the extraction step.

    Returns:
        pd.DataFrame: Columns 'Country', 'GDP (B USD)', 'Year', 'Region'.

    Raises:
        FileNotFoundError: If 'country_region_table.json' does not exist.
        Exception: Any other failure is logged and re-raised.
    """
    log_message("Starting Data Transformation")
    try:
        # 1. Basic data cleaning and conversion
        try:
            # Work on explicit copies so the column assignments below never
            # write into a view of the caller's frame (SettingWithCopyWarning).
            df = df.iloc[:, [0, 1, 2]].copy()
            df.columns = ['Country', 'GDP (Nominal)', 'Year']
            df = df.dropna(subset=['Country', 'GDP (Nominal)']).copy()
            df['GDP (B USD)'] = (
                df['GDP (Nominal)']
                .astype(str)  # .str is unavailable if the column was parsed as numeric
                .str.replace(r'[^\d.]', '', regex=True)
                .replace('', '0')  # cells with no digits count as 0
                .astype(float) / 1e3
            )
            df['Year'] = df['Year'].str.replace(r'\[.*?\]', '', regex=True)
            df = df[['Country', 'GDP (B USD)', 'Year']]
            df = df.sort_values(by='GDP (B USD)', ascending=False).copy()
            df['GDP (B USD)'] = df['GDP (B USD)'].round(2)
            log_message("Data cleaning and transformation completed.")
        except Exception as e:
            log_message(f"Error in data cleaning and transformation: {str(e)}", level="ERROR")
            raise

        # 2. Read the JSON region mapping and merge it in
        try:
            with open('country_region_table.json', 'r', encoding='utf-8') as region_file:
                region_data = json.load(region_file)
            region_df = pd.DataFrame(list(region_data.items()), columns=['Country', 'Region'])
            # Left join: countries missing from the mapping keep a null Region.
            df = pd.merge(df, region_df, on='Country', how='left')
            log_message("Region data merged successfully.")
        except FileNotFoundError:
            log_message("Region mapping file 'country_region_table.json' not found.", level="ERROR")
            raise
        except Exception as e:
            log_message(f"Error during region data merging: {str(e)}", level="ERROR")
            raise

        # 3. Save the transformed data
        try:
            transformed_output_csv_file = 'transformed_gdp_data.csv'
            transformed_output_json_file = 'transformed_gdp_data.json'

            df.to_csv(transformed_output_csv_file, index=False)
            df.to_json(transformed_output_json_file, orient='records', force_ascii=False, indent=4)
            log_message(f"Transformed data saved: CSV ({transformed_output_csv_file}), JSON ({transformed_output_json_file})")
        except Exception as e:
            log_message(f"Error while saving transformed data: {str(e)}", level="ERROR")
            raise

        log_message("Data Transformation Completed")
        return df

    except Exception as e:
        log_message(f"Error during data transformation: {str(e)}", level="ERROR")
        raise


def load_gdp_data(df):
    """
    Load the transformed data into an SQLite database.

    Writes the 'Country', 'GDP (B USD)', 'Year' and 'Region' columns to the
    'Countries_by_GDP' table of 'World_Economies.db', replacing the table if
    it already exists. 'GDP (B USD)' is stored under the name
    'GDP_USD_billion'.

    Parameters:
        df (pd.DataFrame): Transformed data containing the four columns above.

    Raises:
        Exception: Any error raised while connecting or writing is logged
            and re-raised.
    """
    log_message("Loading data into SQLite database")
    try:
        conn = sqlite3.connect('World_Economies.db')
        try:
            df[['Country', 'GDP (B USD)', 'Year', 'Region']].rename(
                columns={'GDP (B USD)': 'GDP_USD_billion'}
            ).to_sql('Countries_by_GDP', conn, if_exists='replace', index=False)
        finally:
            # Release the database handle even when the write fails.
            conn.close()
        log_message("Data successfully loaded into SQLite database")
    except Exception as e:
        log_message(f"Error while loading data into SQLite database: {str(e)}", level="ERROR")
        raise


def display_countries_with_gdp_over_100():
    """
    Display countries whose GDP is at least 100B USD.

    Reads 'Country' and 'GDP_USD_billion' from the 'Countries_by_GDP' table
    in 'World_Economies.db' (rows with GDP_USD_billion >= 100) and prints
    them as a psql-style table.

    Raises:
        Exception: Any error raised while querying or printing is logged
            and re-raised.
    """
    log_message("Displaying countries with GDP over 100B USD")
    try:
        conn = sqlite3.connect('World_Economies.db')
        query = """
        SELECT Country, GDP_USD_billion
        FROM Countries_by_GDP
        WHERE GDP_USD_billion >= 100
        """
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Close the connection even if the query fails.
            conn.close()
        print("Countries with GDP >= 100B USD:")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for countries with GDP >= 100B: {str(e)}", level="ERROR")
        raise


def display_region_top5_average_gdp():
    """
    Calculate and display the average GDP of the top 5 countries by region.

    Rows without a region are excluded. Within each region the rows are
    numbered by descending GDP and the first five are averaged.

    Raises:
        Exception: Any error raised while querying or printing is logged
            and re-raised.
    """
    log_message("Calculating average GDP of top 5 countries by region")
    try:
        conn = sqlite3.connect('World_Economies.db')
        # ROW_NUMBER() (not RANK()) guarantees at most five rows per region:
        # RANK() gives tied rows the same rank, so several countries sharing
        # a GDP value (e.g. 0) could all pass the "<= 5" filter and skew the
        # average. Country is a deterministic tie-breaker.
        query = """
        WITH RankedCountries AS (
            SELECT Country, GDP_USD_billion, Region,
                   ROW_NUMBER() OVER (
                       PARTITION BY Region
                       ORDER BY GDP_USD_billion DESC, Country
                   ) AS Rank
            FROM Countries_by_GDP
            WHERE Region IS NOT NULL
        )
        SELECT Region, AVG(GDP_USD_billion) AS Avg_Top5_GDP
        FROM RankedCountries
        WHERE Rank <= 5
        GROUP BY Region
        """
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Close the connection even if the query fails.
            conn.close()
        print("Average GDP of top 5 countries by region (excluding None):")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for top 5 average GDP: {str(e)}", level="ERROR")
        raise

def etl_process():
    """Run the GDP pipeline end to end: extract, transform, load, then report.

    Any exception raised by a stage is written to the log as an ERROR entry
    and is not propagated to the caller.
    """
    try:
        log_started()
        log_message("ETL Process Started")

        # Extract -> Transform -> Load
        extracted = extract_gdp_data()
        cleaned = transform_gdp_data(extracted)
        load_gdp_data(cleaned)

        # Required on-screen reports, printed in this order
        for report in (display_countries_with_gdp_over_100,
                       display_region_top5_average_gdp):
            report()

        log_message("ETL Process Completed Successfully")
    except Exception as err:
        log_message(f"ETL Process Failed: {str(err)}", level="ERROR")

# Run the ETL pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    etl_process()


Countries with GDP >= 100B USD:
+----------------------+-------------------+
| Country              |   GDP_USD_billion |
|----------------------+-------------------|
| World                |         115494    |
| United States        |          30337.2  |
| China                |          19534.9  |
| Germany              |           4921.56 |
| Japan                |           4389.33 |
| India                |           4271.92 |
| United Kingdom       |           3730.26 |
| France               |           3283.43 |
| Italy                |           2459.6  |
| Canada               |           2330.31 |
| Brazil               |           2307.16 |
| Russia               |           2195.71 |
| South Korea          |           1947.13 |
| Australia            |           1881.14 |
| Spain                |           1827.58 |
| Mexico               |           1817.82 |
| Indonesia            |           1492.62 |
| Turkey               |           1455.41 |
| Netherlands          

  raw_df = pd.read_html(str(table))[0]


---

# 내 코드를 모르는 사람이 처음 보았을 때, 이해 가능하도록 하는 설명 및 주석 추가하기

In [None]:
import os
import json
import datetime
import sqlite3
import configparser

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate


def log_started():
    """
    Mark the beginning of a new ETL execution in the log file.

    Appends a banner to 'etl_project_log.txt': a blank line, a 50-character
    rule, a headline carrying the current timestamp, a second rule and a
    trailing blank line.
    """
    rule = "=" * 50
    with open('etl_project_log.txt', 'a') as log_file:
        started_at = datetime.datetime.now().strftime('%Y-%B-%d-%H-%M-%S')
        log_file.write(f"\n{rule}\nNew Execution at {started_at}\n{rule}\n\n")


def log_message(message, level="INFO"):
    """
    Log a message with a timestamp and severity level.

    Appends one line of the form '<timestamp> - <level> - <message>' to
    'etl_project_log.txt'.

    Parameters:
        message (str): The message to log.
        level (str): The severity level of the message (default: "INFO").
    """
    timestamp = datetime.datetime.now().strftime('%Y-%b-%d %H:%M:%S')
    # Pin UTF-8 so a non-ASCII message (e.g. text from an exception) cannot
    # raise UnicodeEncodeError under a narrower platform default encoding,
    # which would mask the error being logged.
    with open('etl_project_log.txt', 'a', encoding='utf-8') as log_file:
        log_file.write(f"{timestamp} - {level} - {message}\n")


def extract_gdp_data(config_path='config.ini'):
    """
    Extract GDP data from a webpage based on a configuration file.

    Reads 'URL' and 'TABLE_CLASS' from the DEFAULT section of the config
    file, downloads the page, locates the first <table> with that class,
    converts it to a DataFrame and saves the unprocessed result to
    'raw_gdp_data.csv' and 'raw_gdp_data.json'.

    Parameters:
        config_path (str): Path to the configuration file (default: 'config.ini').

    Returns:
        pd.DataFrame: Raw GDP data extracted from the webpage.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
        KeyError: If 'URL' or 'TABLE_CLASS' is missing from the config.
        requests.exceptions.RequestException: If the HTTP request fails.
        ValueError: If no table with the configured class is found.
    """
    # Local import: pandas expects a file-like object for HTML text (see step 4).
    from io import StringIO

    log_message("Starting data extraction process")

    # Step 1: Read and validate the configuration file
    if not os.path.exists(config_path):
        log_message(f"Configuration file '{config_path}' not found.", level="ERROR")
        raise FileNotFoundError(f"Configuration file '{config_path}' not found.")

    try:
        config = configparser.ConfigParser()
        config.read(config_path)
        url = config['DEFAULT']['URL']
        table_class = config['DEFAULT']['TABLE_CLASS']
        log_message("Configuration file read successfully")
    except Exception as e:
        log_message(f"Error reading configuration file: {str(e)}", level="ERROR")
        raise

    # Step 2: Fetch the webpage data
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        log_message(f"Successfully fetched data from URL: {url}")
    except requests.exceptions.RequestException as e:
        log_message(f"Error during requests to the URL: {str(e)}", level="ERROR")
        raise

    # Step 3: Parse the HTML and extract the table
    try:
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'class': table_class})
        if table is None:
            log_message(f"No table found with the specified class '{table_class}'.", level="ERROR")
            raise ValueError("Failed to locate the GDP table on the webpage.")
        log_message("Successfully located the GDP table in the HTML")
    except Exception as e:
        log_message(f"Error parsing HTML or finding table: {str(e)}", level="ERROR")
        raise

    # Step 4: Convert the table to a DataFrame and save raw data
    try:
        # Passing literal HTML to read_html is deprecated; wrap it in StringIO.
        raw_df = pd.read_html(StringIO(str(table)))[0]
        raw_df.to_csv('raw_gdp_data.csv', index=False)
        raw_df.to_json('raw_gdp_data.json', orient='records', force_ascii=False, indent=4)
        log_message("Raw data saved successfully")
    except Exception as e:
        log_message(f"Error converting table to DataFrame or saving raw data: {str(e)}", level="ERROR")
        raise

    log_message("Data extraction process completed successfully")
    return raw_df


def transform_gdp_data(df):
    """
    Transform the extracted GDP data for further analysis.

    Keeps the first three columns (named 'Country', 'GDP (Nominal)' and
    'Year'), drops rows missing a country or GDP, derives 'GDP (B USD)' by
    dividing the parsed GDP number by 1e3 (assumes the source values are in
    millions of USD - TODO confirm), strips bracketed markers such as '[n 1]'
    from 'Year', sorts by GDP descending, left-joins a 'Region' per country
    from 'country_region_table.json', and writes the result to
    'transformed_gdp_data.csv' and 'transformed_gdp_data.json'.

    Parameters:
        df (pd.DataFrame): Raw GDP data.

    Returns:
        pd.DataFrame: Transformed GDP data with columns 'Country',
            'GDP (B USD)', 'Year' and 'Region'.

    Raises:
        Exception: Any failure (including a missing region file) is logged
            and re-raised.
    """
    log_message("Starting data transformation process")
    try:
        # Step 1: Clean and process the data
        # Work on explicit copies so the column assignments below never
        # write into a view of the caller's frame (SettingWithCopyWarning).
        df = df.iloc[:, [0, 1, 2]].copy()
        df.columns = ['Country', 'GDP (Nominal)', 'Year']
        df = df.dropna(subset=['Country', 'GDP (Nominal)']).copy()
        df['GDP (B USD)'] = (
            df['GDP (Nominal)']
            .astype(str)  # .str is unavailable if the column was parsed as numeric
            .str.replace(r'[^\d.]', '', regex=True)  # Remove non-numeric characters
            .replace('', '0')  # cells with no digits would break astype(float)
            .astype(float) / 1e3
        )
        df['Year'] = df['Year'].str.replace(r'\[.*?\]', '', regex=True)
        df = df[['Country', 'GDP (B USD)', 'Year']]
        # Reassign instead of sorting in place: the frame above is a
        # column selection of another frame.
        df = df.sort_values(by='GDP (B USD)', ascending=False).copy()
        df['GDP (B USD)'] = df['GDP (B USD)'].round(2)
        log_message("Data cleaned and transformed successfully")

        # Step 2: Merge with region data
        with open('country_region_table.json', 'r', encoding='utf-8') as region_file:
            region_data = json.load(region_file)
        region_df = pd.DataFrame(list(region_data.items()), columns=['Country', 'Region'])
        # Left join: countries missing from the mapping keep a null Region.
        df = pd.merge(df, region_df, on='Country', how='left')

        # Step 3: Save transformed data
        df.to_csv('transformed_gdp_data.csv', index=False)
        df.to_json('transformed_gdp_data.json', orient='records', force_ascii=False, indent=4)
        log_message("Transformed data saved successfully")

    except Exception as e:
        log_message(f"Error in data transformation: {str(e)}", level="ERROR")
        raise

    return df


def load_gdp_data(df):
    """
    Load transformed GDP data into an SQLite database.

    Writes the frame to the 'Countries_by_GDP' table of 'World_Economies.db',
    replacing the table if it already exists. The 'GDP (B USD)' column is
    stored as 'GDP_USD_billion', which is the column name the reporting
    queries in this file select.

    Parameters:
        df (pd.DataFrame): Transformed GDP data.

    Raises:
        Exception: Any error raised while connecting or writing is logged
            and re-raised.
    """
    log_message("Loading data into SQLite database")
    try:
        conn = sqlite3.connect('World_Economies.db')
        try:
            # Without this rename the table would expose 'GDP (B USD)' and
            # queries on GDP_USD_billion would fail with "no such column".
            df.rename(columns={'GDP (B USD)': 'GDP_USD_billion'}).to_sql(
                'Countries_by_GDP', conn, if_exists='replace', index=False
            )
        finally:
            # Release the database handle even when the write fails.
            conn.close()
        log_message("Data successfully loaded into SQLite database")
    except Exception as e:
        log_message(f"Error while loading data into SQLite database: {str(e)}", level="ERROR")
        raise


def display_countries_with_gdp_over_100():
    """
    Display countries with GDP of at least 100 billion USD.

    Prints a table of countries with GDP greater than or equal to 100B USD,
    read from the 'Countries_by_GDP' table in 'World_Economies.db'.

    Raises:
        Exception: Any error raised while querying or printing is logged
            and re-raised.
    """
    log_message("Displaying countries with GDP over 100B USD")
    try:
        conn = sqlite3.connect('World_Economies.db')
        query = """
        SELECT Country, GDP_USD_billion
        FROM Countries_by_GDP
        WHERE GDP_USD_billion >= 100
        """
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Close the connection even if the query fails.
            conn.close()
        print("Countries with GDP >= 100B USD:")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for countries with GDP >= 100B: {str(e)}", level="ERROR")
        raise


def display_region_top5_average_gdp():
    """
    Calculate and display the average GDP of the top 5 countries by region.

    Prints the average GDP of the top 5 countries for each region from the
    database. Rows without a region are excluded; within each region the
    rows are numbered by descending GDP and the first five are averaged.

    Raises:
        Exception: Any error raised while querying or printing is logged
            and re-raised.
    """
    log_message("Calculating average GDP of top 5 countries by region")
    try:
        conn = sqlite3.connect('World_Economies.db')
        # ROW_NUMBER() (not RANK()) guarantees at most five rows per region:
        # RANK() gives tied rows the same rank, so several countries sharing
        # a GDP value (e.g. 0) could all pass the "<= 5" filter and skew the
        # average. Country is a deterministic tie-breaker.
        query = """
        WITH RankedCountries AS (
            SELECT Country, GDP_USD_billion, Region,
                   ROW_NUMBER() OVER (
                       PARTITION BY Region
                       ORDER BY GDP_USD_billion DESC, Country
                   ) AS Rank
            FROM Countries_by_GDP
            WHERE Region IS NOT NULL
        )
        SELECT Region, AVG(GDP_USD_billion) AS Avg_Top5_GDP
        FROM RankedCountries
        WHERE Rank <= 5
        GROUP BY Region
        """
        try:
            result = pd.read_sql_query(query, conn)
        finally:
            # Close the connection even if the query fails.
            conn.close()
        print("Average GDP of top 5 countries by region (excluding None):")
        print(tabulate(result, headers='keys', tablefmt='psql', showindex=False))
    except Exception as e:
        log_message(f"Error querying database for top 5 average GDP: {str(e)}", level="ERROR")
        raise


def etl_process():
    """
    Drive the GDP ETL pipeline from start to finish.

    Order of operations:
        1. Write the run banner and a start entry to the log.
        2. Extract the raw table, transform it, and load it into SQLite.
        3. Print the two analytical reports.

    Any exception raised along the way is logged as an ERROR entry and is
    not propagated to the caller.
    """
    try:
        log_started()
        log_message("ETL Process Started")

        # Extract -> Transform -> Load
        extracted = extract_gdp_data()
        cleaned = transform_gdp_data(extracted)
        load_gdp_data(cleaned)

        # Display analysis, in a fixed order
        for report in (display_countries_with_gdp_over_100,
                       display_region_top5_average_gdp):
            report()

        log_message("ETL Process Completed Successfully")
    except Exception as err:
        log_message(f"ETL Process Failed: {str(err)}", level="ERROR")


# Run the ETL pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    etl_process()