# --- Step 1: Clean GDP Columns ---

In [3]:
# Core libraries
import pandas as pd
import numpy as np

# PostgreSQL connector
import psycopg2
from sqlalchemy import create_engine

# Optional: For .env config (if you want later)
# from dotenv import load_dotenv
# load_dotenv()

# STEP 2: LOAD RAW DATA (EXTRACT PHASE)

In [6]:
# Load the raw GDP dataset
df_raw = pd.read_csv("gdp.csv")

# Preview it
df_raw.head()

Unnamed: 0,Year,Nominal GDP prices (Ksh Million),Annual GDP growth (%),Real GDP prices (Ksh Million)
0,2023,15108806,5.6,10399980
1,2022,13489642,4.9,9852583
2,2021,12027662,7.6,9395942
3,2020,10715070,-0.3,8733060
4,2019,10237727,5.1,8756946


# ✅ STEP 3: CLEAN & TRANSFORM

In [11]:
# STEP 3: Clean and Transform the Data

# Make a copy
df = df_raw.copy()

# Normalize column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_").str.replace("(", "").str.replace(")", "")

# Rename to simpler column names
df.rename(columns={
    'nominal_gdp_prices_ksh_million': 'nominal_gdp',
    'real_gdp_prices_ksh_million': 'real_gdp',
    'annual_gdp_growth_%': 'annual_growth'
}, inplace=True)

# Convert types
df['year'] = df['year'].astype(int)
df['nominal_gdp'] = pd.to_numeric(df['nominal_gdp'], errors='coerce')
df['real_gdp'] = pd.to_numeric(df['real_gdp'], errors='coerce')
df['annual_growth'] = pd.to_numeric(df['annual_growth'], errors='coerce')

# Create GDP deflator
df['gdp_deflator'] = (df['nominal_gdp'] / df['real_gdp']) * 100

# Drop missing/invalid rows
df.dropna(inplace=True)

# View transformed data
df.head()


Unnamed: 0,year,nominal_gdp,annual_growth,real_gdp,gdp_deflator


In [17]:
import pandas as pd

# Step 1: Read the CSV
df = pd.read_csv("gdp.csv")

# Step 2: Optional cleanup
df.columns = [col.strip() for col in df.columns]  # remove extra spaces in column names

# Step 3: PostgreSQL connection
from sqlalchemy import create_engine

username = "etl_user"
password = "12345"
host = "localhost"
port = "5432"
database = "etl_project"

engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{host}:{port}/{database}")

# Step 4: Load into database
df.to_sql("kenya_gdp_data", engine, if_exists="replace", index=False)

24