# Primera Entrega Curso Data Engineering Coderhouse

## Importamos Librerías

In [11]:
# Import the libraries used by the notebook
import pandas as pd  # Data manipulation
import numpy as np  # Numerical operations

import matplotlib.pyplot as plt  # Data visualization
import seaborn as sns  # Data visualization

sns.set_style(
    "whitegrid"
)  # Set the seaborn plot style to 'whitegrid'
plt.style.use(
    "fivethirtyeight"
)  # Set the matplotlib plot style to 'fivethirtyeight'

# Reading stock data from Yahoo
from pandas_datareader.data import DataReader
import yfinance as yf  # yfinance is a tool for downloading historical data from Yahoo Finance
from pandas_datareader import data as pdr

# NOTE(review): the download cell calls yf.download() directly, so this override
# may be unnecessary; pdr_override() is reportedly deprecated in recent yfinance
# releases -- confirm against the installed version.
yf.pdr_override()  # Override the pandas_datareader.data methods so they use yfinance

# Working with timestamps
from datetime import datetime
import os

## La API que será consumida es de Yahoo Finance, para ver el comportamiento financiero de algunas empresas TOP del mercado.

In [12]:
# Tickers of the top companies to download (23 in total)
tech_list = [
    "AAPL",
    "MSFT",
    "AMZN",
    "JPM",
    "COST",
    "GOOGL",
    "AXP",
    "WMT",
    "NVDA",
    "DAL",
    "DIS",
    "MAR",
    "NKE",
    "KO",
    "SBUX",
    "FDX",
    "PG",
    "HD",
    "PFE",
    "CRM",
    "TGT",
    "NFLX",
    "TM",
]

# Company names, listed in the same order as `tech_list`
company_name = [
    "APPLE",
    "MICROSOFT",
    "AMAZON.COM",
    "JPMORGAN CHASE",
    "COSTCO WHOLESALE",
    "ALPHABET",
    "AMERICAN EXPRESS",
    "WALMART",
    "NVIDIA",
    "DELTA AIR LINES",
    "WALT DISNEY",
    "MARRIOTT INTERNATIONAL",
    "NIKE",
    "COCA-COLA",
    "STARBUCKS",
    "FEDEX",
    "PROCTER & GAMBLE",
    "HOME DEPOT",
    "PFIZER",
    "SALESFORCE",
    "TARGET",
    "NETFLIX",
    "TOYOTA MOTOR",
]

# The two lists are maintained by hand; zip() below would silently drop the
# unmatched entries if they ever drifted apart, so fail loudly instead.
assert len(tech_list) == len(company_name), (
    "tech_list and company_name must have the same length"
)

# Take data from 10 years ago up to the current date
end = datetime.now()
try:
    start = datetime(end.year - 10, end.month, end.day)
except ValueError:
    # Running on Feb 29: that day does not exist ten years earlier,
    # so fall back to Feb 28 instead of crashing.
    start = datetime(end.year - 10, end.month, end.day - 1)

# Download the price history of every ticker, keyed by ticker symbol
stock_frames = {}
for stock in tech_list:
    stock_frames[stock] = yf.download(stock, start, end)

# Backward compatibility: earlier versions of this cell exposed one global
# variable per ticker (AAPL, MSFT, ...); keep publishing them.
globals().update(stock_frames)

# DataFrames in the same order as `tech_list` (derived, not hand-typed)
company_list = [stock_frames[stock] for stock in tech_list]

# Tag every DataFrame with the name of its company
for company, com_name in zip(company_list, company_name):
    company["company_name"] = com_name

# Stack all per-company frames into a single DataFrame
df = pd.concat(company_list, axis=0)
print(df.shape)
df.tail(10)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

(57914, 7)





Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,company_name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-04-30,229.940002,229.940002,227.0,227.309998,227.309998,334200,TOYOTA MOTOR
2024-05-01,228.309998,228.839996,226.559998,226.830002,226.830002,264600,TOYOTA MOTOR
2024-05-02,230.300003,231.410004,229.369995,230.800003,230.800003,228100,TOYOTA MOTOR
2024-05-03,232.059998,233.350006,231.360001,232.869995,232.869995,165200,TOYOTA MOTOR
2024-05-06,233.619995,235.679993,233.570007,235.600006,235.600006,208200,TOYOTA MOTOR
2024-05-07,233.279999,233.279999,230.910004,231.259995,231.259995,248900,TOYOTA MOTOR
2024-05-08,231.119995,232.429993,227.0,231.779999,231.779999,371600,TOYOTA MOTOR
2024-05-09,227.509995,228.029999,226.729996,227.240005,227.240005,329000,TOYOTA MOTOR
2024-05-10,220.929993,221.059998,218.139999,218.779999,218.779999,563900,TOYOTA MOTOR
2024-05-13,217.100006,217.130005,215.300003,215.639999,215.639999,466000,TOYOTA MOTOR


In [13]:
# Copy the index (the DatetimeIndex shown by df.info() below) into a regular
# "date" column so the date is available as an ordinary column.
df["date"] = df.index

df.info()  # DataFrame summary: columns, non-null counts, dtypes

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 57914 entries, 2014-05-13 to 2024-05-13
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Open          57914 non-null  float64       
 1   High          57914 non-null  float64       
 2   Low           57914 non-null  float64       
 3   Close         57914 non-null  float64       
 4   Adj Close     57914 non-null  float64       
 5   Volume        57914 non-null  int64         
 6   company_name  57914 non-null  object        
 7   date          57914 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(5), int64(1), object(1)
memory usage: 4.0+ MB


In [14]:
# Display the combined DataFrame (the notebook renders a truncated preview)
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,company_name,date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-05-13,21.142857,21.233570,21.096430,21.205713,18.757195,159737200,APPLE,2014-05-13
2014-05-14,21.158215,21.335714,21.133572,21.209642,18.760677,166404000,APPLE,2014-05-14
2014-05-15,21.239286,21.307142,21.001429,21.029285,18.601152,230846000,APPLE,2014-05-15
2014-05-16,21.022499,21.340357,20.907143,21.339643,18.875668,276256400,APPLE,2014-05-16
2014-05-19,21.351786,21.690357,21.333214,21.592501,19.099335,317755200,APPLE,2014-05-19
...,...,...,...,...,...,...,...,...
2024-05-07,233.279999,233.279999,230.910004,231.259995,231.259995,248900,TOYOTA MOTOR,2024-05-07
2024-05-08,231.119995,232.429993,227.000000,231.779999,231.779999,371600,TOYOTA MOTOR,2024-05-08
2024-05-09,227.509995,228.029999,226.729996,227.240005,227.240005,329000,TOYOTA MOTOR,2024-05-09
2024-05-10,220.929993,221.059998,218.139999,218.779999,218.779999,563900,TOYOTA MOTOR,2024-05-10


In [15]:
# Rename the price/volume columns to snake_case; these are the column names
# used by the INSERT statement of the Redshift load step.
# rename() returns a new frame, so `df` is rebound instead of mutated with
# inplace=True. The identity mapping for "company_name" was a no-op and is gone.
df = df.rename(
    columns={
        "Open": "open",
        "High": "high",
        "Low": "low",
        "Close": "close",
        "Adj Close": "adj_close",
        "Volume": "volume",
    }
)

In [16]:
# Persist the combined DataFrame as a parquet file for the load step.
# to_parquet() does not create missing parent directories, so make sure the
# target directory exists (a fresh checkout may not have ../data yet).
os.makedirs("../data", exist_ok=True)
df.to_parquet("../data/data_extraida.parquet")

## Carga en Amazon Redshift

Verificamos la conexión.

In [26]:
import psycopg2

# Connection parameters; the password comes from the REDSHIFT_PASSWORD
# environment variable so it is never stored in the notebook.
host = "data-engineer-cluster.cyhh5bfevlmn.us-east-1.redshift.amazonaws.com"
username = "criparrame_coderhouse"
password = os.getenv("REDSHIFT_PASSWORD")
database = "data-engineer-database"
port = "5439"

# Connectivity check only: open a connection, report the outcome and always
# release the connection so no session is left open on the cluster.
conn = None
try:
    conn = psycopg2.connect(
        # Use the `port` variable instead of repeating the literal
        host=host, dbname=database, user=username, password=password, port=port
    )
    print("Connected to Redshift successfully!")

except Exception as e:
    # Best-effort check: report the failure instead of aborting the notebook
    print("Unable to connect to Redshift.")
    print(e)

finally:
    if conn is not None:
        conn.close()

Connected to Redshift successfully!


In [27]:
# Importamos las librerías necesarias
from sqlalchemy import create_engine, text
import pandas as pd
import os

# Connection settings for the Redshift database; the password is read from
# the REDSHIFT_PASSWORD environment variable.
redshift_conn = dict(
    host="data-engineer-cluster.cyhh5bfevlmn.us-east-1.redshift.amazonaws.com",
    username="criparrame_coderhouse",
    password=os.getenv("REDSHIFT_PASSWORD"),
    database="data-engineer-database",
    port="5439",
)


# Function that loads the extracted data into Redshift
def cargar_data(file_path, redshift_conn, batch_size=1000):
    """Load a parquet file of stock prices into the Redshift table ``stock_data``.

    The table is created if it does not exist and every row of the file is
    appended to it. Rows are only ever inserted, so calling this function
    twice with the same file stores the rows twice.

    Parameters
    ----------
    file_path : str
        Path of the parquet file to load. Its columns must include ``date``,
        ``open``, ``high``, ``low``, ``close``, ``adj_close``, ``volume`` and
        ``company_name``.
    redshift_conn : dict
        Connection settings with the keys ``username``, ``password``,
        ``host``, ``port`` and ``database``.
    batch_size : int, default 1000
        Number of rows sent per ``executemany`` call.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 or the password is missing.
    """
    # Local import: only this function needs it.
    from urllib.parse import quote

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    password = redshift_conn["password"]
    if password is None:
        raise ValueError(
            "redshift_conn['password'] is not set "
            "(is the REDSHIFT_PASSWORD environment variable defined?)"
        )

    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # cannot corrupt the connection URL.
    user = quote(redshift_conn["username"], safe="")
    secret = quote(password, safe="")
    engine = create_engine(
        f"redshift+psycopg2://{user}:{secret}"
        f'@{redshift_conn["host"]}:{redshift_conn["port"]}/{redshift_conn["database"]}'
    )

    # volume is BIGINT: the DataFrame column is int64 and a 32-bit INT would
    # overflow for values above 2,147,483,647.
    create_table = text(
        """
        CREATE TABLE IF NOT EXISTS stock_data (
            date DATE NOT NULL,
            "open" FLOAT,
            high FLOAT,
            low FLOAT,
            close FLOAT,
            adj_close FLOAT,
            volume BIGINT,
            company_name VARCHAR(70)
        )
        """
    )
    insert_row = text(
        """
        INSERT INTO stock_data (date, "open", high, low, close, adj_close, volume, company_name)
        VALUES (:date, :open, :high, :low, :close, :adj_close, :volume, :company_name)
        """
    )

    try:
        # Read the data and turn it into one parameter dict per row
        df = pd.read_parquet(file_path)
        records = df.to_dict(orient="records")

        # engine.begin() commits when the block succeeds and rolls back on
        # error, so a failed load never leaves a partially written table.
        with engine.begin() as conn:
            conn.execute(create_table)

            # Passing a list of dicts makes each batch a single executemany
            # call instead of one execute() per row.
            for offset in range(0, len(records), batch_size):
                conn.execute(insert_row, records[offset : offset + batch_size])
    finally:
        # Release the pooled connections held by the engine
        engine.dispose()

    # Report success once every batch has been committed
    print("Los datos se han cargado correctamente en la tabla de Redshift.")

In [None]:
# Run the load: push the exported parquet file into Redshift
cargar_data(
    file_path="../data/data_extraida.parquet",
    redshift_conn=redshift_conn,
)