# Extracción de datos desde archivos

## Librerías

In [None]:
import pandas as pd
import polars as pl
import openpyxl
import os

## Funciones

In [None]:
# Folder that holds the input files: "data" under the current working directory.
path_data = os.path.join(os.getcwd(), "data")


def path_file(file: str) -> str:
    """Return the path of ``file`` inside ``path_data``."""
    return os.path.join(path_data, file)


def read_csv(file: str) -> pd.DataFrame:
    """Read the CSV named ``file`` from ``path_data`` into a pandas DataFrame."""
    return pd.read_csv(path_file(file))

# Pandas

## CSV

In [None]:
# Read the comma-separated file with pandas and preview its first rows.
pd.read_csv(path_file("dataset.csv")).head()

## TXT

In [None]:
# Tab-separated text file: same reader as CSV, with an explicit tab separator.
pd.read_csv(path_file("dataset.txt"), sep="\t").head()

## Excel

In [None]:
# Read only the sheet named "data" from the workbook and preview its first rows.
pd.read_excel(path_file("dataset_tipo2.xlsx"), sheet_name="data").head()

In [None]:
# Excel workbook whose data is spread over several sheets with a common layout.
# The workbook is opened once and every sheet is parsed from that handle; the
# context manager closes the file as soon as all sheets have been read.
with pd.ExcelFile(path_file("datos_historicos.xlsx")) as df_xlsx:
    df_historicos = pd.concat(
        [pd.read_excel(df_xlsx, sheet_name=sheet) for sheet in df_xlsx.sheet_names]
    )

# Kept outside the `with` block so the notebook still displays the preview.
df_historicos.head()

In [None]:
# Excel sheet whose table does not start at the top: skip the first 3 rows
# before parsing, then preview the first 10 rows.
pd.read_excel(path_file("data.xlsx"), skiprows=3).head(10)

# Polars

## CSV

In [None]:
# Read the comma-separated file with polars and preview its first rows.
pl.read_csv(source=path_file("dataset.csv")).head()

In [None]:
# Read several CSV files in one call by passing a glob pattern as the source.
pl.read_csv(source=path_file("datos_historicos/*.csv")).head()

## TXT

In [None]:
# Tab-separated text file: same reader as CSV, with an explicit tab separator.
pl.read_csv(source=path_file("dataset.txt"), separator="\t").head()

## Excel

In [None]:
# Read the workbook through the xlsx2csv engine, skipping the first 3 rows
# via the engine's read options.
pl.read_excel(
    source=path_file("data.xlsx"),
    engine="xlsx2csv",
    read_options={"skip_rows": 3},
).head()

In [None]:
# Excel workbook whose data is spread over several sheets with a common layout.
# openpyxl is used here only to obtain the list of sheet names.
wb = openpyxl.load_workbook(filename=path_file("datos_historicos.xlsx"))
wb_sheets = wb.sheetnames

In [None]:
# Option 1: read each sheet with its own read_excel call and stack the frames.
pl.concat(
    [
        pl.read_excel(source=path_file("datos_historicos.xlsx"), sheet_name=name)
        for name in wb_sheets
    ]
)

In [None]:
# Option 2: pass every sheet name in a single read_excel call, which yields a
# mapping of sheet name -> frame, and stack the mapping's values.
pl.concat(
    items=pl.read_excel(
        source=path_file("datos_historicos.xlsx"),
        sheet_name=wb_sheets,
    ).values()
)

## PL Schemas
https://docs.pola.rs/api/python/stable/reference/datatypes.html

In [None]:
# Read the workbook while forcing column dtypes: the label column as text and
# each fiscal-year column ("FY '09" through "FY '18") as Float64.
pl.read_excel(
    source=path_file("data.xlsx"),
    engine="xlsx2csv",
    read_options={"skip_rows": 3, "ignore_errors": True},
    schema_overrides={
        "in million USD": pl.String,
        "FY '09": pl.Float64,
        # range(10, 19) generates the remaining keys "FY '10" ... "FY '18".
        **{f"FY '{year}": pl.Float64 for year in range(10, 19)},
    },
).head()