In [2]:
!pip install boto3
!pip install pyyaml



In [3]:
import pickle
import boto3 
import yaml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import date

In [4]:
# Parse the local YAML settings file; later cells read config['s3'] and
# config['iexe'] from the resulting mapping.
with open("credentials.yaml", "r") as credentials_file:
    config = yaml.safe_load(credentials_file)

### 1. Cargar datos históricos de S3

In [5]:
def cargar_datos_s3(bucket, bucket_path):
    """Download one object from S3 and unpickle it.

    Credentials are taken from the module-level ``config['s3']`` mapping.

    Args:
        bucket: Name of the S3 bucket.
        bucket_path: Key of the object inside the bucket.

    Returns:
        The Python object obtained by unpickling the object's bytes.
    """
    s3_settings = config['s3']
    resource = boto3.Session(
        aws_access_key_id=s3_settings['aws_access_key_id'],
        aws_secret_access_key=s3_settings['aws_secret_access_key'],
        aws_session_token=s3_settings['aws_session_token'],
    ).resource('s3')

    response = resource.Object(bucket, bucket_path).get()
    raw_bytes = response['Body'].read()

    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # objects written by a trusted process.
    return pickle.loads(raw_bytes)

In [6]:
# Client-style S3 handle built from the credentials in config['s3'].
session = boto3.Session(**{
    option: config['s3'][option]
    for option in ('aws_access_key_id', 'aws_secret_access_key', 'aws_session_token')
})

s3 = session.client('s3')

In [7]:
# str() keeps the concatenation working when YAML parsed the student id as an
# integer instead of a string (plain `+` would raise TypeError).
bucket = "aplicaciones-cd-1-" + str(config['iexe']['matricula'])
# Prefix that is listed below to locate the object to load.
key = "ingesta/inicial/"

In [8]:
# list_objects_v2 leaves 'Contents' out of the response when nothing matches
# the prefix, so check first and fail with a message naming bucket and prefix
# instead of a bare KeyError.
listing = s3.list_objects_v2(Bucket=bucket, Prefix=key)
objetos = listing.get('Contents', [])
if not objetos:
    raise FileNotFoundError(f"No objects found under s3://{bucket}/{key}")
# Use the first key returned for the prefix.
bucket_path = objetos[0]['Key']

In [9]:
bucket_path

'ingesta/inicial/inspecciones-historicas-2024-10-20.pkl'

In [10]:
dataset = cargar_datos_s3(bucket, bucket_path)

In [11]:
len(dataset)

280438

### 2. Transformar datos a DF

In [18]:
def transformar_ingesta(dataset):
    """Build a DataFrame from the ingested data.

    Args:
        dataset: Data accepted by the DataFrame constructor, e.g. a list of
            record dicts or a dict of columns.

    Returns:
        The resulting pandas DataFrame.
    """
    tabla = pd.DataFrame(dataset)
    return tabla

In [19]:
df_inspecciones = transformar_ingesta(dataset)

In [20]:
df_inspecciones.shape

(249327, 17)

In [21]:
df_inspecciones.head()

Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,results,latitude,longitude,location,violations
0,52234,Cafe 608,Cafe 608,2013328,Restaurant,Risk 1 (High),608 W BARRY AVE,CHICAGO,IL,60657,2010-01-04T00:00:00.000,License Re-Inspection,Pass,41.938006880423615,-87.6447545707008,"{'latitude': '41.938006880423615', 'longitude'...",
1,67733,WOLCOTT'S,TROQUET,1992040,Restaurant,Risk 1 (High),1834 W MONTROSE AVE,CHICAGO,IL,60613,2010-01-04T00:00:00.000,License Re-Inspection,Pass,41.961605669949854,-87.67596676683779,"{'latitude': '41.961605669949854', 'longitude'...",
2,70269,mr.daniel's,mr.daniel's,1899292,Restaurant,Risk 1 (High),5645 W BELMONT AVE,CHICAGO,IL,60634,2010-01-04T00:00:00.000,License Re-Inspection,Pass,41.93844282365204,-87.76831838068422,"{'latitude': '41.93844282365204', 'longitude':...",
3,67757,DUNKIN DONUTS/BASKIN-ROBBINS,DUNKIN DONUTS/BASKIN-ROBBINS,1380279,Restaurant,Risk 2 (Medium),100 W RANDOLPH ST,CHICAGO,IL,60601,2010-01-04T00:00:00.000,Tag Removal,Pass,41.88458626715456,-87.63101044588599,"{'latitude': '41.88458626715456', 'longitude':...",
4,67732,WOLCOTT'S,TROQUET,1992039,Restaurant,Risk 1 (High),1834 W MONTROSE AVE,CHICAGO,IL,60613,2010-01-04T00:00:00.000,License Re-Inspection,Pass,41.961605669949854,-87.67596676683779,"{'latitude': '41.961605669949854', 'longitude'...",


### 3. Identificar faltantes

In [22]:
def faltantes(df):
    """Count the missing values in each column.

    Args:
        df: DataFrame to inspect.

    Returns:
        Series indexed by column name holding the number of NA entries.
    """
    mascara_nulos = df.isna()
    conteo_por_columna = mascara_nulos.sum()
    return conteo_por_columna

In [23]:
faltantes(df_inspecciones)

inspection_id          0
dba_name               0
aka_name            2493
license_               8
facility_type       5038
risk                  76
address                0
city                 176
state                 47
zip                   80
inspection_date        0
inspection_type        1
results                0
latitude             857
longitude            857
location             857
violations         67855
dtype: int64

### 4. Eliminar inspecciones que no tienen latitud o longitud

In [24]:
def elimina_faltantes_latitud_logintud(cols, df):
    """Drop the rows that have a missing value in any of the given columns.

    Args:
        cols: Column names that must all be non-null for a row to be kept.
        df: DataFrame to filter; it is not modified.

    Returns:
        A new, independent DataFrame containing only the complete rows.
    """
    # One mask for all columns instead of re-filtering the frame per column.
    filas_completas = df[list(cols)].notnull().all(axis=1)
    # Explicit copy: callers assign columns on the result, and doing that on a
    # filtered slice triggers SettingWithCopyWarning and is not guaranteed to
    # take effect.
    return df.loc[filas_completas].copy()

In [25]:
inspecciones = elimina_faltantes_latitud_logintud(['latitude', 'longitude'], df_inspecciones)

In [26]:
faltantes(inspecciones)

inspection_id          0
dba_name               0
aka_name            2481
license_               8
facility_type       5023
risk                  74
address                0
city                 172
state                 47
zip                   76
inspection_date        0
inspection_type        1
results                0
latitude               0
longitude              0
location               0
violations         67626
dtype: int64

### 5. Imputar faltantes

In [27]:
def imputar_faltantes(col, value, df):
    """Replace the missing values of one column with a fixed value.

    The filled column is assigned back onto ``df`` itself, so callers that
    ignore the return value still see the change.

    Args:
        col: Name of the column to fill.
        value: Value written wherever the column is NA.
        df: DataFrame to update.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the in-place call acts on an intermediate Series, which pandas warns
    # about and which no longer reaches `df` under copy-on-write.
    df[col] = df[col].fillna(value)
    return df

In [28]:
# Fill the gaps of these columns with their most frequent value.
# `Series.mode()` returns a Series (ties are possible), so take its first
# entry; wrapping the whole Series in `str()` imputes its printed
# representation ("0    ...\ndtype: object") instead of the value itself.
for col in ['license_', 'zip', 'state', 'facility_type', 'risk']:
    imputar_faltantes(col, inspecciones[col].mode().iloc[0], inspecciones)

In [29]:
faltantes(inspecciones)

inspection_id          0
dba_name               0
aka_name            2481
license_               0
facility_type          0
risk                   0
address                0
city                 172
state                  0
zip                    0
inspection_date        0
inspection_type        1
results                0
latitude               0
longitude              0
location               0
violations         67626
dtype: int64

### 6. Transformación de enteros

In [30]:
def transformar_enteros(cols, df):
    """Cast the given columns to ``int``.

    Args:
        cols: Names of the columns to convert.
        df: Source DataFrame; it is not modified.

    Returns:
        A new DataFrame with the listed columns cast to ``int`` and every
        other column unchanged.

    Raises:
        KeyError: If a name in ``cols`` is not a column of ``df``.
        ValueError: If a value cannot be converted to an integer.
    """
    # astype with a mapping returns a new frame, so the caller's DataFrame is
    # not mutated and re-running the cell stays idempotent.
    return df.astype({col: int for col in cols})

In [31]:
inspecciones = transformar_enteros(['inspection_id'], inspecciones)

In [32]:
inspecciones.dtypes

inspection_id       int64
dba_name           object
aka_name           object
license_           object
facility_type      object
risk               object
address            object
city               object
state              object
zip                object
inspection_date    object
inspection_type    object
results            object
latitude           object
longitude          object
location           object
violations         object
dtype: object

### 7. Transformación de flotantes

In [33]:
def transformar_flotantes(cols, df):
    """Cast the given columns to ``float``.

    Args:
        cols: Names of the columns to convert.
        df: Source DataFrame; it is not modified.

    Returns:
        A new DataFrame with the listed columns cast to ``float`` and every
        other column unchanged.

    Raises:
        KeyError: If a name in ``cols`` is not a column of ``df``.
        ValueError: If a value cannot be converted to a float.
    """
    # astype with a mapping returns a new frame, so the caller's DataFrame is
    # not mutated and re-running the cell stays idempotent.
    return df.astype({col: float for col in cols})

In [34]:
inspecciones = transformar_flotantes(['latitude', 'longitude'], inspecciones)

In [35]:
inspecciones.dtypes

inspection_id        int64
dba_name            object
aka_name            object
license_            object
facility_type       object
risk                object
address             object
city                object
state               object
zip                 object
inspection_date     object
inspection_type     object
results             object
latitude           float64
longitude          float64
location            object
violations          object
dtype: object

### 8. Transformación de fechas

In [36]:
def transformar_fechas(cols, df):
    """Parse the given columns into pandas datetimes.

    Args:
        cols: Names of the columns to convert with ``pd.to_datetime``.
        df: Source DataFrame; it is not modified.

    Returns:
        A new DataFrame with the listed columns converted to datetimes and
        every other column unchanged.
    """
    # Work on a copy so the caller's DataFrame is not mutated and re-running
    # the cell stays idempotent.
    convertido = df.copy()
    for col in cols:
        convertido[col] = pd.to_datetime(convertido[col])

    return convertido

In [37]:
inspecciones = transformar_fechas(['inspection_date'], inspecciones)

In [38]:
inspecciones.dtypes

inspection_id               int64
dba_name                   object
aka_name                   object
license_                   object
facility_type              object
risk                       object
address                    object
city                       object
state                      object
zip                        object
inspection_date    datetime64[ns]
inspection_type            object
results                    object
latitude                  float64
longitude                 float64
location                   object
violations                 object
dtype: object

### 9. Data profiling categorías

In [39]:
def data_profiling_string(cols, df):
    data_profiling = {}
    for col in cols:
        data_profiling[col] = {'uniques': df[col].nunique(), 
                               'prop_uniques': df[col].nunique()/df[col].shape[0],
                              'mode': df[col].mode()} 

    return pd.DataFrame.from_dict(data_profiling)

In [40]:
data_profiling_string(['dba_name', 'aka_name', 'license_'], inspecciones)

Unnamed: 0,dba_name,aka_name,license_
uniques,30877,29373,42494
prop_uniques,0.124269,0.118215,0.171023
mode,0 SUBWAY dtype: object,0 SUBWAY dtype: object,0 0 dtype: object


### 10. Data profiling fechas

In [41]:
def data_profiling_fechas(cols, df):
    """Profile datetime columns.

    Args:
        cols: Names of the datetime columns to profile.
        df: DataFrame holding those columns.

    Returns:
        DataFrame with one column per profiled column. Its rows are the
        distinct-value count and share, the earliest and latest timestamp,
        the number of distinct years, and ``dias_diferentes``, which is the
        latest timestamp minus the earliest.
    """
    def _perfil(serie):
        # Summary of a single datetime Series.
        n_distintos = serie.nunique()
        primera = serie.min()
        ultima = serie.max()
        return {
            'uniques': n_distintos,
            'prop_uniques': n_distintos / serie.shape[0],
            'fecha_minima': primera,
            'fecha_maxima': ultima,
            'anios_diferentes': serie.dt.year.nunique(),
            'dias_diferentes': ultima - primera,
        }

    return pd.DataFrame.from_dict({col: _perfil(df[col]) for col in cols})

In [42]:
data_profiling_fechas(['inspection_date'], inspecciones)

Unnamed: 0,inspection_date
anios_diferentes,14
dias_diferentes,4785 days 00:00:00
fecha_maxima,2023-02-10 00:00:00
fecha_minima,2010-01-04 00:00:00
prop_uniques,0.013334
uniques,3313


### 11. Guardar datos limpios en s3

In [43]:
def guardar_datos_s3(bucket, bucket_path, dataset):
    """Upload a payload to S3 under the given key.

    Credentials are taken from the module-level ``config['s3']`` mapping.

    Args:
        bucket: Name of the destination S3 bucket.
        bucket_path: Key to write inside the bucket.
        dataset: Payload passed unchanged as the object's ``Body``.
    """
    s3_settings = config['s3']
    session = boto3.Session(
        aws_access_key_id=s3_settings['aws_access_key_id'],
        aws_secret_access_key=s3_settings['aws_secret_access_key'],
        aws_session_token=s3_settings['aws_session_token'],
    )

    destino = session.resource('s3').Object(bucket, bucket_path)
    destino.put(Body=dataset)

In [44]:
TODAY = date.today()

In [45]:
pickle_data = pickle.dumps(inspecciones)

In [46]:
# str() keeps the concatenation working when YAML parsed the student id as an
# integer instead of a string (plain `+` would raise TypeError).
bucket = "aplicaciones-cd-1-" + str(config['iexe']['matricula'])
# TODAY is a datetime.date, so str(TODAY) is its ISO form (YYYY-MM-DD).
key = "limpieza/datos-limpios-" + str(TODAY) + ".pkl"

guardar_datos_s3(bucket, key, pickle_data)