# Trabajo con archivos de datos

Este notebook incluye ejemplos sobre cómo trabajar con diferentes formatos de archivos de datos (JSON, CSV, Parquet y Avro) en Python.

In [1]:
# Carga de librerías
import json
import pandas as pd
import csv
import fastavro
from fastavro.schema import load_schema
import os
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# The Parquet path is configurable through the READ_FILE_PARQUET variable.
# os.getenv returns None when the variable is not set; passing None to
# DataFrame.to_parquet returns the bytes instead of writing a file, so fall
# back to a default location next to the other outputs.
READ_FILE_PARQUET = os.getenv("READ_FILE_PARQUET") or '../01datos/02outputs/01data.parquet'

# Paths to the input and output data files
path_input_json = '../01datos/01inputs/01data.json'
path_output_csv = '../01datos/02outputs/01data.csv'
path_output_parquet = READ_FILE_PARQUET
path_output_avro = '../01datos/02outputs/01data.avro'

# Path to the Avro schema definition
path_schema_avro = '../01datos/01inputs/schema.avsc'

## Trabajo con JSON

In [8]:
# Sample record to serialize; note that 'edad' is kept as a string here
data = {
    "nombre": "John",
    "edad": '30',
    "ciudad": "New York",
}

# Serialize to a JSON string first, then write that text to the file
with open(path_input_json, 'w') as out_file:
    out_file.write(json.dumps(data))

In [9]:
# Read the file's text and parse it as JSON
with open(path_input_json, 'r') as in_file:
    data = json.loads(in_file.read())

# Show the parsed object
print(data)


{'nombre': 'John', 'edad': '30', 'ciudad': 'New York'}


## Archivos CSV 

In [13]:
# Table to export: the header comes first, followed by one list per row
data = [
    ["nombre", "edad", "ciudad"],
    ["John", 30, "New York"],
    ["Anna", 25, "London"],
    ["Mike", 35, "San Francisco"],
]

# newline='' leaves line-ending handling to the csv writer
with open(path_output_csv, 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    for row_values in data:
        writer.writerow(row_values)

In [15]:
# Read the CSV file back and print each parsed row.
# The file is opened with newline='' as the csv module requires, so that
# line endings and newlines embedded in quoted fields are handled by the
# csv reader itself (this also matches how the file is opened for writing).
with open(path_output_csv, 'r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        print(row)

['nombre', 'edad', 'ciudad']
['John', '30', 'New York']
['Anna', '25', 'London']
['Mike', '35', 'San Francisco']


In [17]:
# Load the CSV file into pandas
df = pd.read_csv(filepath_or_buffer=path_output_csv)

# Show the type of the loaded object
type(df)

pandas.core.frame.DataFrame

In [18]:
# Display the DataFrame that was loaded from the CSV file
df

Unnamed: 0,nombre,edad,ciudad
0,John,30,New York
1,Anna,25,London
2,Mike,35,San Francisco


In [21]:
# Write the DataFrame back to the CSV file, leaving out the index column
df.to_csv(
    path_or_buf=path_output_csv,
    index=False,
)

## Parquet

In [3]:
# Column-oriented sample data: one list of values per column
data = {
    "nombre": ["John", "Anna", "Mike"],
    "edad": [30, 25, 35],
    "ciudad": ["New York", "London", "San Francisco"],
}
df = pd.DataFrame.from_dict(data)

# Print a summary of the columns and their dtypes
df.info()

# Write the DataFrame to the configured Parquet path
df.to_parquet(path=path_output_parquet)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   nombre  3 non-null      object
 1   edad    3 non-null      int64 
 2   ciudad  3 non-null      object
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes


In [4]:
# Read the Parquet file back; the resulting DataFrame is the cell output
pd.read_parquet(path=path_output_parquet)

Unnamed: 0,nombre,edad,ciudad
0,John,30,New York
1,Anna,25,London
2,Mike,35,San Francisco


## Avro + schema

In [31]:
# Records to serialize; each one is a dict with nombre, edad and ciudad
records = [
    {"nombre": "John", "edad": 30, "ciudad": "New York"},
    {"nombre": "Anna", "edad": 25, "ciudad": "London"},
    {"nombre": "Mike", "edad": 35, "ciudad": "San Francisco"},
]

# Schema loaded from the .avsc file
schema = load_schema(path_schema_avro)

# Write the records to the Avro file (binary mode) using that schema
with open(path_output_avro, 'wb') as out_file:
    fastavro.writer(out_file, schema, records)


In [32]:
# Read the Avro file back (binary mode) and print each record
with open(path_output_avro, 'rb') as in_file:
    reader = fastavro.reader(in_file)
    for entry in reader:
        print(entry)


{'nombre': 'John', 'edad': 30, 'ciudad': 'New York'}
{'nombre': 'Anna', 'edad': 25, 'ciudad': 'London'}
{'nombre': 'Mike', 'edad': 35, 'ciudad': 'San Francisco'}
