# Simple ETL Application

In this notebook, we connect to an api, extract data, create a spark dataframe and then finally write to a postgres database. 

In [34]:
import requests
import json
import pandas as pd
import psycopg2

## Get data from Randomuser API

In [35]:
def get_data():
    """
    get data from random user api
    """
    try:
        url = "https://randomuser.me/api/"
        response = requests.get(url)
        status_code = response.status_code
        response = json.loads(response.text)
        return status_code,response['results'][0]
    except Exception as e:
        print(f"there was an error{e}")
        return None

## Create dataframe

In [36]:
people_list = []

for _ in range(5):
    people_list.append(get_data())

data = pd.DataFrame(people_list,index=None)

## Create table in postgres Database using psycopg2 library

In [37]:
query = """CREATE TABLE test_table (
    id INTEGER PRIMARY KEY,
    name TEXT,
    age INTEGER,
    email TEXT UNIQUE
)"""

In [38]:
# url
url = "http://postgres-db:45432/postgres_db"
#user
username = "postgres"
password = "postgres"

conn_details = psycopg2.connect(
    host='postgres-db',
    database="postgres_db",
    user=username,
    # secretlint-disable
    password=password,
    # secretlint-enable
    port='5432'
)

with conn_details:
    cursor = conn_details.cursor()
    query = query
    cursor.execute(query)

DuplicateTable: relation "test_table" already exists


## Insert Records

## Create a spark Session

In [39]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("YourAppName") \
    .config("spark.driver.extraClassPath", "postgresql-connector-java.jar") \
    .config("spark.executor.extraClassPath", "postgresql-connector-java.jar") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/11 21:19:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Create Spark dataframe

In [40]:
spark_df = spark.createDataFrame(data)

## Test Functions 

In [43]:
import pytest

def test_test_data():
    """
    Tests API is returning values
    """
    status_code,_ = get_data()
    
    assert status_code == 200



In [53]:
! pytest

platform linux -- Python 3.11.9, pytest-8.1.1, pluggy-1.4.0
rootdir: /app
plugins: anyio-4.3.0
collected 1 item                                                               [0m

tests/test_get_data.py [32m.[0m[32m                                                 [100%][0m

