# **Workshop -001: Extract**

**Importación de las librerías a utilizar**

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import psycopg2
import yaml
from psycopg2 import sql
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

**Creamos una función donde leemos el archivo de configuración de la DB y cargamos los datos de la conexión**

In [2]:
def load_config(file_path="config.yaml"):
    """Read a YAML configuration file and return its parsed contents.

    Args:
        file_path: Path of the YAML file to read. Defaults to
            ``"config.yaml"`` (resolved against the current working
            directory).

    Returns:
        The object produced by ``yaml.safe_load`` for the file's document.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform's locale encoding, so non-ASCII values (e.g. in a password)
    # could be read differently from one machine to another.
    with open(file_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)

**Llamamos a la función que carga los datos de conexión a la base de datos y creamos la conexión**

In [3]:
# Load the project configuration and keep only its "database" section.
config = load_config()
db_config = config["database"]

# Expose each connection setting under its own name for the cells below.
db_user, db_password = db_config["user"], db_config["password"]
db_host, db_port = db_config["host"], db_config["port"]
db_name = db_config["name"]

# Connect to the "postgres" database rather than `db_name`; the next cell
# uses this connection to issue CREATE DATABASE.
conn = psycopg2.connect(
    host=db_host,
    port=db_port,
    user=db_user,
    password=db_password,
    dbname="postgres",
)
# Run every statement on this connection outside an explicit transaction.
conn.autocommit = True

**Creamos la base de datos en caso de que no exista**

In [4]:
# Use the database name from the configuration file instead of silently
# overriding it with a literal; "etl_project" remains only as a fallback for
# a config whose "name" entry is missing or empty.
db_name = db_config.get("name") or "etl_project"

try:
    with conn.cursor() as cur:
        # sql.Identifier quotes the name, so it is never spliced into the
        # statement as raw text.
        cur.execute(sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name)))
except psycopg2.errors.DuplicateDatabase:
    # The database is already there: nothing to do, the notebook can go on.
    print(f"La base de datos '{db_name}' ya existe.")
else:
    print(f"Base de datos '{db_name}' creada exitosamente.")
finally:
    # The administrative connection is not needed past this point.
    conn.close()

Base de datos 'etl_project' creada exitosamente.


**Creamos las tablas necesarias a utilizar en el proyecto**

In [9]:
# Build the connection URL from its components rather than interpolating them
# into a string: URL.create() escapes characters such as "@", "/" or ":" in
# the user name or password that would otherwise corrupt the DSN.
engine = create_engine(
    URL.create(
        drivername="postgresql",
        username=db_user,
        password=db_password,
        host=db_host,
        port=db_port,
        database=db_name,
    )
)

# engine.begin() commits when the block exits normally and rolls back if it
# raises, so no manual commit is needed.
# IF NOT EXISTS makes the cell safe to re-run; note that an existing table is
# left exactly as it is, even if its columns differ from this definition.
with engine.begin() as conn:
    conn.execute(text("""
        CREATE TABLE IF NOT EXISTS saldos_staging (
            id BIGSERIAL PRIMARY KEY,
            documento_identidad VARCHAR(20),
            nombre VARCHAR(100),
            apellido VARCHAR(100),
            sexo CHAR(1),
            estado_civil INT,
            fecha_ingreso date,
            tipo_salario INT,
            salario NUMERIC(15, 2),
            estrato INT,
            tipovehiculo INT,
            lincred INT,
            fecsolic date,
            fecaprob date,
            fecfact date,
            fecdesc date,
            fecultcau date,
            fecultpago date,
            fecvemto date,
            plazo INT,
            vlrsolicitud NUMERIC(15, 2),
            valorob NUMERIC(15, 2),
            saldot NUMERIC(15, 2),
            cuota NUMERIC(15, 2),
            tasaint NUMERIC(3, 2),
            ciclod CHAR(1),
            periodd CHAR(1),
            clacuo CHAR(1),
            clasei CHAR(1),
            clades CHAR(1),
            periodo INT,
            saldo NUMERIC(15, 2),
            saldo_inicial NUMERIC(15, 2),
            vlr_debito NUMERIC(15, 2),
            vlr_credito NUMERIC(15, 2),
            cuopen INT,
            valor_pagado NUMERIC(15, 2),
            fecha_pago VARCHAR(20),
            mora_causado NUMERIC(15, 2),
            mora_abono NUMERIC(15, 2),
            mora_saldo NUMERIC(15, 2),
            descripcion VARCHAR(100),
            codahor CHAR(1),
            debcre CHAR(1),
            fecha_registro TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
    """))

# Reached only after the transaction above has been committed.
print("Tabla: saldos_staging creada exitosamente en PostgreSQL.")

Tabla: saldos_staging creada exitosamente en PostgreSQL.


**Leemos los datos del archivo CSV, e imprimimos 15 registros**

In [24]:
# Read the semicolon-separated export into a DataFrame.
# The amount columns are forced to float64 and `cuopen` to pandas' nullable
# Int64; every other column keeps the type pandas infers.
# NOTE(review): sampled rows show "MU OZ" where "MUÑOZ" would be expected —
# confirm that ISO-8859-1 is the file's real encoding.
df = pd.read_csv(
    'saldos.csv',
    sep=';',
    encoding='ISO-8859-1',
    low_memory=False,
    dtype={
        **dict.fromkeys(
            [
                'vlr_debito',
                'vlr_credito',
                'valor_pagado',
                'mora_causado',
                'mora_abono',
                'mora_saldo',
            ],
            'float64',
        ),
        'cuopen': 'Int64',
    },
)

In [25]:
# Leave the sample as the cell's last expression so the notebook renders it as
# a table; print() falls back to a plain-text dump that wraps wide frames
# across several column chunks.
df.sample(n=15)

        documento_identidad          nombre              apellido sexo  \
138411           1118288888      LIZA MARIA          LOZADA REYES    F   
9079                9317753  ANUAR HERNANDO        LAZARO ESTRADA    M   
70399              31290755        HEGUERIA      URBANO DE URBANO    F   
71751              31466700     MARIA ELENA        VALENCIA TELLO    F   
155631           1118304884         DANIELA      ALVAREZ  BELTRAN    F   
59005              29933524         SORAIDA        CARDOZO LOZANO    F   
660                 4655301           JARBY         VILLEGAS MERA    M   
40845              16465358       ALEXANDER       HOLGUIN HIDALGO    M   
61032              29970226   MARIA MARLENY  SEPULVEDA DE CANDELO    F   
111034             66868644       ANA LUCIA       HOLGUIN CAMACHO    F   
24047              16454342           ARLEX      GAVIRIA ARBOLEDA    M   
100791             31925379          MONICA           MU OZ LA AS    F   
103906             34595084     SANDRA

**Copiamos los datos en una nueva variable, para guardarlos en la tabla saldos_staging**

In [26]:
# Take an independent (deep) copy so the frame read from the CSV stays
# untouched by whatever is done to the staging frame.
df_staging = df.copy(deep=True)

# Preview 15 random rows of the copy.
df_staging.sample(15)

Unnamed: 0,documento_identidad,nombre,apellido,sexo,estado_civil,fecha_ingreso,tipo_salario,salario,estrato,tipovehiculo,...,vlr_credito,cuopen,valor_pagado,fecha_pago,mora_causado,mora_abono,mora_saldo,descripcion,codahor,debcre
70479,31290755,HEGUERIA,URBANO DE URBANO,F,2,6/01/1998,4,2106465,0,0,...,23303.0,3,23303.0,22/09/2023,23303.0,23303.0,0.0,BON-SOL,3,D
170786,1144139464,MICHAEL ANDRES,BARBOSA RAMIREZ,M,2,10/02/2023,4,2420000,2,0,...,48400.0,0,48400.0,31/07/2024,48400.0,48400.0,0.0,APORTES,1,C
63844,29973387,BEATRIZ,MORENO DE AMAYA,F,2,30/11/1996,2,5449714,2,0,...,163372.0,0,163372.0,29/02/2024,108994.0,108994.0,0.0,A/PERM,2,C
143786,1118292516,DIANA MARCELA,BUSTOS ESCOBAR,F,1,21/03/2023,4,1677500,2,0,...,33550.0,0,33550.0,31/07/2023,33550.0,33550.0,0.0,APORTES,1,C
95595,31487565,PAOLA ANDREA,CASTILLO POLANCO,F,1,28/07/2016,4,2329646,2,0,...,50079.0,0,50079.0,31/07/2023,46592.0,46592.0,0.0,A/PERM,2,C
33559,16452859,EINER,MOLINA,M,1,11/09/2020,4,1525000,2,0,...,102632.0,3,110397.0,31/08/2023,110397.0,110397.0,0.0,PTMO CONSUMO,4,D
119386,94437459,URIEL,URBANO URBANO,M,2,25/05/2004,2,13900000,0,0,...,4060.0,14,4060.0,31/05/2024,4060.0,4060.0,0.0,FONDOS SOCIAL,3,D
63932,29973387,BEATRIZ,MORENO DE AMAYA,F,2,30/11/1996,2,5449714,2,0,...,986.0,0,986.0,30/09/2023,0.0,0.0,0.0,03 INT. VISTA,2,C
79861,31470460,MARTHA CECILIA,BURBANO VELASQUEZ,F,2,5/03/1998,2,1886316,0,0,...,1104.0,1,1104.0,30/11/2023,1104.0,1104.0,0.0,PAPELERIA,3,D
38842,16465442,VICTOR MARIO,MORENO ESCOBAR,M,4,11/03/2020,4,1677500,2,0,...,1206.0,1,1206.0,29/11/2024,1206.0,1206.0,0.0,PAPELERIA,3,D


In [27]:
# Write through the connection opened here: the original opened `conn` but
# handed `engine` to to_sql, leaving the opened connection unused.
# engine.begin() wraps the whole insert in one transaction that is committed
# on success and rolled back if the load fails part-way.
# Note: if_exists="append" adds the rows again every time this cell runs.
with engine.begin() as conn:
    df_staging.to_sql("saldos_staging", con=conn, if_exists="append", index=False)

print("Los datos cargados desde archivo CSV, se almacenaron correctamente en la tabla: saldos_staging")

Los datos cargados desde archivo CSV, se almacenaron correctamente en la tabla: saldos_staging


**Consultamos los datos de la tabla de *saldos_staging* para realizar el proceso de transformación**

In [29]:
# Pull the full staging table back into a DataFrame; the connection is
# released as soon as the read finishes.
with engine.connect() as conn:
    db_staging_df = pd.read_sql(
        sql="SELECT * FROM saldos_staging",
        con=conn,
    )

# Preview 15 random rows of what was read back from the database.
db_staging_df.sample(15)

Unnamed: 0,id,documento_identidad,nombre,apellido,sexo,estado_civil,fecha_ingreso,tipo_salario,salario,estrato,...,cuopen,valor_pagado,fecha_pago,mora_causado,mora_abono,mora_saldo,descripcion,codahor,debcre,fecha_registro
7415,7416,6558019,SERGIO,RESTREPO RAMIREZ,M,1,2013-03-07,4,3512000.0,3,...,87.0,481522.0,30/09/2024,481522.0,481522.0,0.0,PTMO DE CONSUMO,4,D,2025-02-27 22:57:28.210469
44888,44889,16642992,ABELARDO,TELLO GARCIA,M,1,2000-01-25,2,1868400.0,0,...,1.0,,,,,,A FAVOR,3,C,2025-02-27 22:57:28.210469
58549,58550,29939762,SANDRA,HOLGUIN DAZA,F,4,2011-04-05,4,2100000.0,0,...,3.0,,,2617.0,0.0,2617.0,SERVICIO EXEQUIAL,3,D,2025-02-27 22:57:28.210469
69409,69410,31246131,SONIA,NAVARRETE,F,1,1996-11-30,2,5443467.0,0,...,6.0,25466.0,21/06/2024,25466.0,25466.0,0.0,BON-SOL,3,D,2025-02-27 22:57:28.210469
25628,25629,16453763,NODIER,PULIDO ALVAREZ,M,4,2003-11-28,4,3927000.0,3,...,0.0,78540.0,30/04/2023,78540.0,78540.0,0.0,APORTES,1,C,2025-02-27 22:57:28.210469
30518,30518,16454682,MARCO FIDEL,SUAREZ ARCINIEGAS,M,4,1999-05-15,4,2300000.0,3,...,1.0,,,,,,REVALORIZACION,3,C,2025-02-27 22:57:28.210469
13694,13695,16203921,MILTON,BECERRA SALDARRIAGA,M,1,2021-04-27,4,1700000.0,2,...,0.0,34156.0,31/12/2024,34000.0,34000.0,0.0,A/PERM,2,C,2025-02-27 22:57:28.210469
166989,166990,1118314105,VALERIA,PASSOS NIETO,F,1,2024-05-15,4,3900000.0,4,...,5.0,185072.0,2/08/2024,370144.0,185072.0,185072.0,PTMO DE CONSUMO,4,D,2025-02-27 22:57:28.210469
117076,117077,94364300,JUAN ISMAEL,ERAZO ORDO EZ,M,2,2007-10-25,4,4001802.0,2,...,26.0,2212.0,30/09/2023,2212.0,2212.0,0.0,SERVICIO EXEQUIAL,3,D,2025-02-27 22:57:28.210469
147383,147384,1118295643,BRIAN RICK,MU OZ RUBIO,M,4,2011-05-23,4,1980000.0,2,...,0.0,35474.0,31/10/2024,34000.0,34000.0,0.0,A/PERM,2,C,2025-02-27 22:57:28.210469


**### Hasta aquí la primera fase del proyecto ###**