## Proyecto Integrador
Datos de entrada:
* `sales.csv`.
  ```csv
   id,nombre,fecha,venta
   1,juan,2024-01-10,100
   2,Ana,2024-01-11,200
   3,pedro,2024/01/12,NaN
   4,JUAN,2024-01-13,150

  ```
* `clients.xlsx`.
  | id  | nombre\_cliente | ciudad   | correo                                      |
  | --- | --------------- | -------- | ------------------------------------------- |
  | 1   | Maria           | Medellín | maria\_gmail.com                            |
  | 2   | Andres          | Bogotá   | [andres@gmail.com](mailto:andres@gmail.com) |
  | 3   | carlos          | Cali     | [carlos@yahoo.com](mailto:carlos@yahoo.com) |

1. Cada minuto se ejecuta el ETL.
2. Extrae datos de **CSV (ventas) y Excel (clientes)**.
3. Transforma los datos:
   * Normalize nombres.
   * Corrige fechas y NaN en ventas.
   * Valida correos en clientes.
4. Carga los datos limpios en PostgreSQL en las tablas:
   * `ventas`
   * `clientes`
5. Ejecuta el request:
   ```sql
   SELECT * FROM ventas;
   sELECT * FROM clientes;
   ```
   Se deben ver los datos limpios y normalizados

In [1]:
# Libraries
import pandas as pd
import numpy as np
import re
from sqlalchemy import create_engine
import schedule
import time
from datetime import datetime

In [2]:
# Create Data
sales_data = {
    'id':[1,2,3,4,],
    'name':['juan','Ana','pedro','JUAN'],
    'date':['2024-01-10','2024-01-11','2024/01/12','2024-01-13'],
    'sale':[100,200,None,150]
}
clients_data = {
    'id':[1,2,3],
    'client_name':['Maria','Andres','carlos'],
    'city':['Medellín','Bogotá','Cali'],
    'email':['maria%_gmail.com','andres@gmail.com','carlos@yahoo.com']
}

sales_df = pd.DataFrame(sales_data)
clients_df = pd.DataFrame(clients_data)

sales_df.to_csv('sales.csv',index=False)
clients_df.to_excel('clients.xlsx',sheet_name='clients_data',index=False)

In [3]:
# check email function
def is_valid_email_regex(email):
    if pd.isna(email) or not isinstance(email, str):
        return False
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    return bool(re.match(pattern, email))

In [4]:
# Conection data
user = 'postgres'
password = 'root'
host = 'localhost'
port = '5432'
database = 'postgres'
# Create engine conection
engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")

In [5]:
# ETL
def etl_job():
    print(f'running: {datetime.now().strftime('%H-%M-%S')}')
    # Extract
    salesdf = pd.read_csv('sales.csv')
    clientsdf = pd.read_excel('clients.xlsx',sheet_name='clients_data')
    
    # Transform
    # Sales
    salesdf['name'] = salesdf['name'].str.title()
    salesdf['date'] = pd.to_datetime(salesdf['date'],errors='coerce')
    salesdf['sale'] = salesdf['sale'].fillna(salesdf[salesdf['sale']>0]['sale'].mean())
    # Clients
    clientsdf['client_name'] = clientsdf['client_name'].str.title()
    clientsdf['city'] = clientsdf['city'].fillna('Without city')
    clientsdf['email'] = np.where(clientsdf['email'].apply(is_valid_email_regex),clientsdf['email'],'invalid_email')
    
    # Load
    # Sales
    salesdf.to_sql('sales',engine,if_exists='replace',index=False)
    clientsdf.to_sql('clients',engine,if_exists='replace',index=False)


In [6]:
etl_job()
query_clients = 'SELECT * FROM clients'
query_sales = 'SELECT * FROM sales'

sql_clients = pd.read_sql(query_clients,engine)
sql_sales = pd.read_sql(query_sales,engine)

print(sql_clients)
print()
print(sql_sales)

running: 11-39-40
   id client_name      city             email
0   1       Maria  Medellín     invalid_email
1   2      Andres    Bogotá  andres@gmail.com
2   3      Carlos      Cali  carlos@yahoo.com

   id   name       date   sale
0   1   Juan 2024-01-10  100.0
1   2    Ana 2024-01-11  200.0
2   3  Pedro        NaT  150.0
3   4   Juan 2024-01-13  150.0


In [7]:
# Schedule
schedule.every(1).minutes.do(etl_job)
while True:
    schedule.run_pending()
    time.sleep(1)

running: 11-40-40
running: 11-41-40
running: 11-42-40


KeyboardInterrupt: 