In [None]:
!pip install pandas psycopg2-binary python-dotenv

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.11-cp312-cp312-win_amd64.whl.metadata (5.1 kB)
Downloading psycopg2_binary-2.9.11-cp312-cp312-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   --- ------------------------------------ 0.3/2.7 MB ? eta -:--:--
   ------- -------------------------------- 0.5/2.7 MB 1.1 MB/s eta 0:00:02
   ----------- ---------------------------- 0.8/2.7 MB 1.1 MB/s eta 0:00:02
   ----------- ---------------------------- 0.8/2.7 MB 1.1 MB/s eta 0:00:02
   ------------------- -------------------- 1.3/2.7 MB 1.0 MB/s eta 0:00:02
   ----------------------- ---------------- 1.6/2.7 MB 1.1 MB/s eta 0:00:02
   --------------------------- ------------ 1.8/2.7 MB 1.1 MB/s eta 0:00:01
   ------------------------------ --------- 2.1/2.7 MB 1.2 MB/s eta 0:00:01
   -----------

In [None]:
# ##  Step 1. Setup and Import Libraries
import os
import pandas as pd
import psycopg2
from psycopg2 import extras
from dotenv import load_dotenv
from io import StringIO
from pathlib import Path

In [None]:
# ##  Step 2. Load Environment Variables

# Load variables from the .env file before any of the lookups below run.
load_dotenv()

# Database settings, looked up in one pass. Each name is None when its
# variable is not set; values that are set (including the port) are strings.
DB_NAME, DB_USER, DB_PASSWORD, DB_HOST, DB_PORT = (
    os.environ.get(variable)
    for variable in ("PG_DBNAME", "PG_USER", "PG_PASSWORD", "PG_HOST", "PG_PORT")
)

In [None]:
# File settings

# Path of the CSV file; None when CSV_PATH is not set in the environment.
CSV_PATH = os.environ.get("CSV_PATH")

# Chunk size setting, parsed as an integer; falls back to 200000 when unset.
CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "200000"))

# Table name setting; falls back to "people" when unset.
TABLE_NAME = os.environ.get("TABLE_NAME", "people")

print(" Environment variables loaded successfully!")


 Environment variables loaded successfully!


In [None]:
# ##  Step 3. Define Database Table Schema

# DDL text for the destination table, with TABLE_NAME substituted into the
# statement. The nine columns appear in the same order in which
# transform_chunk emits them. IF NOT EXISTS leaves an already-existing table
# untouched, so the statement can be executed repeatedly.
# NOTE(review): TABLE_NAME is read from the environment and is inserted here
# unquoted and unescaped -- confirm it only ever holds a trusted, plain
# identifier.
create_table_query = f"""
CREATE TABLE IF NOT EXISTS {TABLE_NAME} (
    row_index INTEGER,
    user_id VARCHAR(50),
    first_name TEXT,
    last_name TEXT,
    sex VARCHAR(10),
    email TEXT,
    phone TEXT,
    date_of_birth DATE,
    job_title TEXT
);
"""


In [None]:
# ##  Step 4. Define ETL Functions

def extract_csv(csv_path, chunk_size):
    """Open a CSV file for chunked reading.

    Args:
        csv_path: Location of the CSV file, passed straight to ``pd.read_csv``.
        chunk_size: Number of rows per chunk (the ``chunksize`` argument).

    Returns:
        The chunked reader produced by ``pd.read_csv``; iterating it yields
        one DataFrame per chunk.
    """
    print(f"Extracting data from: {csv_path}")

    reader_options = {
        "chunksize": chunk_size,
        # Read every column as text; type conversion is left to later steps.
        "dtype": str,
        # Extra markers to treat as missing, on top of pandas' defaults.
        "na_values": ["", "NA", "N/A"],
    }
    return pd.read_csv(csv_path, **reader_options)


def transform_chunk(chunk):
    """Clean one CSV chunk and return it in the target column layout.

    The input frame is never modified: every step builds a new DataFrame, so
    the caller's chunk keeps its original column names and values and the
    function can safely be re-run on the same input.

    Steps:
        1. Rename the CSV headers to the snake_case target names.
        2. Parse ``date_of_birth`` with ``pd.to_datetime``; values that cannot
           be parsed become ``NaT``.
        3. Replace missing ``first_name``/``last_name`` with ``"Unknown"`` and
           missing ``job_title`` with ``"Unspecified"``.
        4. Keep only the target columns that are present, in target order.

    Args:
        chunk: DataFrame holding one chunk of the source CSV.

    Returns:
        A new DataFrame with the renamed, cleaned and reordered columns.
        Target columns absent from the input are simply omitted.
    """
    column_map = {
        "Index": "row_index",
        "User Id": "user_id",
        "First Name": "first_name",
        "Last Name": "last_name",
        "Sex": "sex",
        "Email": "email",
        "Phone": "phone",
        "Date of birth": "date_of_birth",
        "Job Title": "job_title"
    }
    # Target layout: the mapped names, in the order they are declared above.
    expected_columns = list(column_map.values())

    # rename() without inplace returns a new frame and leaves `chunk` alone.
    renamed = chunk.rename(columns=column_map)

    # Convert date format; assign() again yields a new frame.
    if "date_of_birth" in renamed.columns:
        renamed = renamed.assign(
            date_of_birth=pd.to_datetime(renamed["date_of_birth"], errors="coerce")
        )

    # Fill missing values; keys for columns that are not present are ignored.
    filled = renamed.fillna(
        {"first_name": "Unknown", "last_name": "Unknown", "job_title": "Unspecified"}
    )

    # Reorder columns, dropping anything outside the target layout.
    present = [col for col in expected_columns if col in filled.columns]
    result = filled[present]

    print(f" Transformed chunk with {len(result)} rows")
    return result


def copy_from_stringio(conn, df, table):
    """Bulk-load a DataFrame into a PostgreSQL table using COPY.

    The frame is serialised with ``DataFrame.to_csv`` and streamed to the
    server with ``COPY ... FROM STDIN`` in CSV format. CSV format is required
    because ``to_csv`` wraps values containing commas, double quotes or line
    breaks in quotes; COPY's default text format does not understand that
    quoting and would split such values into extra columns.

    Args:
        conn: Open psycopg2 connection.
        df: DataFrame whose columns line up positionally with the table's.
        table: Name of the target table. It is double-quoted as a single
            identifier, which is what ``cursor.copy_from`` does in
            psycopg2 >= 2.9.

    Returns:
        bool: True when the rows were copied and committed; False when the
        COPY failed, in which case the transaction is rolled back and the
        error is printed.
    """
    buffer = StringIO()
    df.to_csv(buffer, index=False, header=False)
    buffer.seek(0)

    # Quote the identifier (doubling embedded quotes) so the table name can
    # never be read as SQL syntax.
    quoted_table = '"' + table.replace('"', '""') + '"'
    # Unquoted empty fields (how to_csv writes missing values) load as NULL.
    copy_sql = f"COPY {quoted_table} FROM STDIN WITH (FORMAT csv, NULL '')"

    cursor = conn.cursor()
    try:
        cursor.copy_expert(copy_sql, buffer)
        conn.commit()
        return True
    except Exception as e:
        # Best effort: undo the partial load and report, but do not raise.
        conn.rollback()
        print(" Error during COPY:", e)
        return False
    finally:
        cursor.close()


def load_to_postgres(conn, chunk, table_name):
    """Load one chunk of data into PostgreSQL and report the outcome.

    Args:
        conn: Open psycopg2 connection.
        chunk: DataFrame to load.
        table_name: Name of the target table.

    Returns:
        bool: True when the chunk was committed, False when the load failed
        and was rolled back.
    """
    loaded = copy_from_stringio(conn, chunk, table_name)
    # Only claim success when the COPY actually committed.
    if loaded:
        print(f" Loaded {len(chunk)} records into '{table_name}'")
    else:
        print(f" Failed to load {len(chunk)} records into '{table_name}'")
    return loaded
