In [None]:
!pip install boto3

## Solución al proyecto 4

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import boto3
import yaml

from datetime import date
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

### 1. Cargar los datos ingestados del bucket de S3

In [None]:
# Parse the local credentials file; the resulting `config` mapping is read by
# the S3 cells and helper functions below.
with open("credentials.yaml", "r") as credentials_file:
    config = yaml.safe_load(credentials_file)

In [None]:
def cargar_datos_s3(bucket, bucket_path):
    """Download an object from S3 and unpickle it.

    The AWS credentials are read from the module-level ``config`` mapping
    (``config['s3']``).

    Args:
        bucket: Name of the S3 bucket.
        bucket_path: Key of the object inside the bucket.

    Returns:
        The Python object obtained by unpickling the object's bytes.
    """
    s3_credentials = config['s3']
    resource = boto3.Session(
        aws_access_key_id=s3_credentials['aws_access_key_id'],
        aws_secret_access_key=s3_credentials['aws_secret_access_key'],
        aws_session_token=s3_credentials['aws_session_token'],
    ).resource('s3')

    response = resource.Object(bucket, bucket_path).get()
    raw_bytes = response['Body'].read()

    # NOTE(review): pickle.loads can execute arbitrary code; only read objects
    # from a bucket whose contents are trusted.
    return pickle.loads(raw_bytes)

In [None]:
# Build a boto3 session from the credentials in `config['s3']` and derive the
# low-level S3 client used by the listing cell below.
session = boto3.Session(**{
    option: config['s3'][option]
    for option in ('aws_access_key_id', 'aws_secret_access_key', 'aws_session_token')
})

s3 = session.client('s3')

In [None]:
bucket = "aplicaciones-cd-1-" + config['iexe']['matricula']
key = "limpieza/"

In [None]:
listing = s3.list_objects_v2(Bucket=bucket, Prefix=key)

# The response has no 'Contents' entry when nothing matches the prefix; fail
# with an explicit message instead of a bare KeyError.
if 'Contents' not in listing:
    raise FileNotFoundError(f"No objects found in s3://{bucket}/{key}")

# Use the last key of the listing.
# NOTE(review): a single list_objects_v2 call returns at most 1,000 keys;
# paginate if the prefix can grow beyond that.
bucket_path = listing['Contents'][-1]['Key']

In [None]:
bucket_path

In [None]:
dataset = cargar_datos_s3(bucket, bucket_path)

### 2. Eliminar observaciones con resultados inválidos 

In [None]:
dataset.head()

In [None]:
dataset.dtypes

In [None]:
dataset.groupby(['results'], as_index=False)['inspection_id'].count()

In [None]:
def delete_inspections(df):
    """Keep only the inspections whose result is one of the valid outcomes.

    Args:
        df: DataFrame with a ``results`` column.

    Returns:
        A new, independent DataFrame containing only the rows whose ``results``
        value is 'Fail', 'Not Ready', 'Pass' or 'Pass w/ Conditions'.
    """
    valid_results = ['Fail', 'Not Ready', 'Pass', 'Pass w/ Conditions']
    # .copy() makes the result an independent frame, so the in-place column
    # transformations applied to it later do not operate on a slice of `df`.
    return df[df['results'].isin(valid_results)].copy()

In [None]:
inspections_wo_not_valid_results = delete_inspections(dataset)

In [None]:
inspections_wo_not_valid_results.shape

In [None]:
inspections_wo_not_valid_results.results.value_counts()

### 3. Modificar etiquetas de resultados

In [None]:
def transform_result(df):
    """Collapse the inspection results into the two labels 'pass' and 'fail'.

    'Pass' and 'Pass w/ Conditions' become 'pass'; 'Fail' and 'Not Ready'
    become 'fail'. Any other value is left unchanged.

    Args:
        df: DataFrame with a ``results`` column. The column is replaced on
            this same object.

    Returns:
        The same DataFrame ``df``, with the relabelled ``results`` column.
    """
    results = df['results']
    results = results.mask(results.isin(['Pass w/ Conditions', 'Pass']), 'pass')
    results = results.mask(results.isin(['Fail', 'Not Ready']), 'fail')

    # Assign the column back explicitly: `df.results.mask(..., inplace=True)`
    # is a chained in-place update, which does not modify `df` when pandas
    # Copy-on-Write is enabled.
    df['results'] = results

    return df

In [None]:
inspections = transform_result(inspections_wo_not_valid_results)

In [None]:
inspections.results.value_counts()

### 4. Modificar etiquetas de riesgo

In [None]:
def transform_risk(df):
    """Normalise the risk labels to 'low', 'medium', 'high' and 'all'.

    A value containing 'Low', 'Medium', 'High' or 'All' (case-sensitive) is
    replaced by the corresponding lowercase label. Missing values and values
    matching none of the patterns are left unchanged.

    Args:
        df: DataFrame with a string ``risk`` column. The column is replaced on
            this same object.

    Returns:
        The same DataFrame ``df``, with the relabelled ``risk`` column.
    """
    replacements = (
        ('Low', 'low'),
        ('Medium', 'medium'),
        ('High', 'high'),
        ('All', 'all'),
    )

    risk = df['risk']
    for pattern, label in replacements:
        # na=False: a missing risk counts as "no match" instead of producing a
        # NaN entry in the boolean condition.
        risk = risk.mask(risk.str.contains(pattern, na=False), label)

    # Explicit assignment instead of a chained `inplace=True` update, which
    # does not modify `df` when pandas Copy-on-Write is enabled.
    df['risk'] = risk

    return df

In [None]:
inspections = transform_risk(inspections)

### 5. Modificar las etiquetas de facility_type

In [None]:
def transform_facility(df):
    """Group facility types into 'restaurant', 'daycare' and 'mobile food'.

    A value containing 'Restaurant', 'Daycare' or 'Mobile Food'
    (case-sensitive) is replaced by the corresponding lowercase label. Missing
    values and values matching none of the patterns are left unchanged.

    Args:
        df: DataFrame with a string ``facility_type`` column. The column is
            replaced on this same object.

    Returns:
        The same DataFrame ``df``, with the relabelled ``facility_type`` column.
    """
    replacements = (
        ('Restaurant', 'restaurant'),
        ('Daycare', 'daycare'),
        ('Mobile Food', 'mobile food'),
    )

    facility_type = df['facility_type']
    for pattern, label in replacements:
        # na=False: a missing facility type counts as "no match" instead of
        # producing a NaN entry in the boolean condition.
        facility_type = facility_type.mask(
            facility_type.str.contains(pattern, na=False), label
        )

    # Explicit assignment instead of a chained `inplace=True` update, which
    # does not modify `df` when pandas Copy-on-Write is enabled.
    df['facility_type'] = facility_type

    return df

In [None]:
inspections = transform_facility(inspections)

In [None]:
# The 20 facility types with the most inspections (count of non-null
# inspection_id per facility_type), in descending order of count.
aux = (
    inspections
    .groupby(['facility_type'], as_index=False)['inspection_id']
    .count()
    .rename(columns={'inspection_id': 'count'})
    .sort_values(by="count", ascending=False)
    .head(20)
)

In [None]:
def transform_facility_other(df, top_types=None):
    """Relabel every facility type outside a set of kept types as 'other'.

    Args:
        df: DataFrame with a ``facility_type`` column. The column is replaced
            on this same object.
        top_types: Iterable of facility types to keep as they are. When
            omitted, the ``facility_type`` values of the module-level ``aux``
            frame are used.

    Returns:
        The same DataFrame ``df``; every ``facility_type`` not in ``top_types``
        (missing values included) is set to 'other'.
    """
    if top_types is None:
        top_types = aux.facility_type.values

    facility_type = df['facility_type']
    # Explicit assignment instead of a chained `inplace=True` update, which
    # does not modify `df` when pandas Copy-on-Write is enabled.
    df['facility_type'] = facility_type.mask(~facility_type.isin(top_types), 'other')

    return df

In [None]:
inspections = transform_facility_other(inspections)

In [None]:
# Inspection counts per facility type after regrouping (top 30 rows).
(
    inspections
    .groupby(['facility_type'], as_index=False)['inspection_id']
    .count()
    .sort_values(by="inspection_id", ascending=False)
    .head(30)
)

### 6. Creación de features

In [None]:
def generate_features(df):
    """Add calendar features derived from ``inspection_date``.

    Adds the columns ``month``, ``year``, ``day_of_month``, ``week_of_year``
    (ISO week number), ``day_of_week`` (0 = Monday ... 6 = Sunday),
    ``week_day`` (1 for Monday-Friday, else 0) and ``weekend`` (1 for
    Saturday/Sunday, else 0).

    Args:
        df: DataFrame with a datetime ``inspection_date`` column. The new
            columns are added to this same object.

    Returns:
        The same DataFrame ``df`` with the feature columns added.
    """
    inspection_date = df['inspection_date']

    df['month'] = inspection_date.dt.month
    df['year'] = inspection_date.dt.year
    df['day_of_month'] = inspection_date.dt.day

    # Series.dt.week was removed in pandas 2.0; isocalendar().week replaces it.
    iso_week = inspection_date.dt.isocalendar().week
    # isocalendar() yields a nullable UInt32 column; convert it to a plain
    # numpy dtype (float when dates are missing) like the other date parts.
    df['week_of_year'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    # dayofweek: 0 is Monday, so 5 and 6 are Saturday and Sunday.
    df['day_of_week'] = inspection_date.dt.dayofweek
    is_weekend = df['day_of_week'].isin([5, 6])

    # Both flags derive from day_of_week. Previously the masks tested the
    # constant week_day column itself, so week_day was always 1 and weekend
    # always 0.
    df['week_day'] = (~is_weekend).astype('int64')
    df['weekend'] = is_weekend.astype('int64')

    return df

In [None]:
inspections = generate_features(inspections)

In [None]:
inspections.head()

### 7. Crear la matriz de diseño

In [None]:
df = inspections[['facility_type', 'risk', 'latitude', 'longitude', 'month', 'year', 'day_of_month',
                 'week_of_year', 'day_of_week', 'week_day', 'weekend']]

In [None]:
# Each transformer name is used as the prefix of the output feature names, so
# it must describe the column it encodes (the two names were swapped before).
transformers = [('one_hot_facility_type', OneHotEncoder(), ['facility_type']),
                ('one_hot_risk', OneHotEncoder(), ['risk'])]

# sparse_threshold=0 makes the transformer always return a dense array, which
# a later cell needs to build a DataFrame from the result.
ct = ColumnTransformer(transformers, remainder="passthrough", sparse_threshold=0, n_jobs=-1)

In [None]:
fm = ct.fit_transform(df)

In [None]:
fm.shape

In [None]:
np.array(ct.get_feature_names_out())

In [None]:
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# (already used in the previous cell) is the supported API.
# A sparse result is densified first, since pd.DataFrame cannot build the frame
# directly from a scipy sparse matrix.
feature_matrix = pd.DataFrame(
    fm.toarray() if hasattr(fm, "toarray") else fm,
    columns=ct.get_feature_names_out(),
)

In [None]:
feature_matrix.head()

### 8. Guardar la matriz de diseño

In [None]:
def save_feature_matrix(bucket, bucket_path, dataset):
    """Upload ``dataset`` to S3 as the body of an object.

    The AWS credentials are read from the module-level ``config`` mapping
    (``config['s3']``).

    Args:
        bucket: Name of the destination S3 bucket.
        bucket_path: Key under which the object is stored.
        dataset: Payload passed as the ``Body`` of the upload (the notebook
            passes the pickled feature matrix).

    Returns:
        None.
    """
    s3_credentials = config['s3']
    resource = boto3.Session(
        aws_access_key_id=s3_credentials['aws_access_key_id'],
        aws_secret_access_key=s3_credentials['aws_secret_access_key'],
        aws_session_token=s3_credentials['aws_session_token'],
    ).resource('s3')

    target = resource.Object(bucket, bucket_path)
    target.put(Body=dataset)

In [None]:
TODAY = date.today()

In [None]:
pickle_data = pickle.dumps(feature_matrix)

In [None]:
bucket = "aplicaciones-cd-1-" + config['iexe']['matricula']
key = "feature-matrix/feature-matrix-" + str(TODAY) + ".pkl"

In [None]:
save_feature_matrix(bucket, key, pickle_data)