## Install libraries

In [2]:
import os
import subprocess
import sys

# Extra packages this notebook installs at start-up, in the original install order.
REQUIRED_PACKAGES = ("openpyxl", "unidecode", "redshift_connector")

for package in REQUIRED_PACKAGES:
    # An argument list avoids shell parsing of sys.executable (paths with spaces),
    # and check_call raises CalledProcessError when pip exits with a non-zero
    # status, so a failed install stops the run here instead of surfacing later
    # as an ImportError.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", package])

0


## Import libraries

In [3]:
# NOTE(review): wildcard import; no name from awsglue.transforms appears to be
# used in the visible cells -- confirm before narrowing or removing it.
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from awsglue.job import Job
import pandas as pd
# NOTE(review): redshift_connector is installed and imported, but the connection
# cell uses psycopg2 -- this import looks unused in the visible cells.
import redshift_connector
import psycopg2
from unidecode import unidecode
import warnings
# Silences every Python warning for the rest of the run (including deprecation
# warnings raised by pandas).
warnings.filterwarnings('ignore')




In [4]:
# Set up the Spark / Glue handles for this job.
# Reuse the running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
# SparkSession exposed by the Glue context.
spark = glueContext.spark_session
# NOTE(review): `job` is created here but never referenced again in the visible
# cells (no init/commit call) -- confirm whether that is intentional.
job = Job(glueContext)




## Initial definitions:

### Variable types:

In [5]:
# Columns read from the input file, paired with the dtype pandas parses them as.
# The pair order defines the order of `ls_read_values`.
dict_types = dict([
    ('AÑOMES', int),
    ('NATURALEZA', str),
    ('CODIGO_CARGUE', int),
    ('CODIGO_DESCARGUE', int),
    ('HORAS_VIAJE', float),
    ('HORAS_ESPERA_CARGUE', float),
    ('HORAS_CARGUE', float),
    ('HORAS_ESPERA_DESCARGUE', float),
    ('HORAS_DESCARGUE', float),
    ('CONFIGURACION', str),
])

# Column names only; passed to read_csv as `usecols`.
ls_read_values = [*dict_types]




### Dimensions and Redshift tables dictionary:

In [6]:
# Columns of the input file that must be resolved against a dimension table.
# Each entry maps the file's code column to a two-element list:
#   [0] the dimension table that is queried for that code
#   [1] the alias given to the dimension's `id` column, i.e. the name of the
#       identifier column that replaces the code in the fact table
_dimension_specs = (
    # (file column,      dimension table,                id column alias)
    ('CONFIGURACION',    'DIM_CONFIGURACIONES_VEHICULO', 'ID_CONFIGURACION_VEHICULO'),
    ('CODIGO_CARGUE',    'DIM_MUNICIPIOS',               'ID_MUNICIPIO_ORIGEN'),
    ('CODIGO_DESCARGUE', 'DIM_MUNICIPIOS',               'ID_MUNICIPIO_DESTINO'),
    ('NATURALEZA',       'DIM_NATURALEZAS_CARGA',        'ID_NATURALEZA_CARGA'),
)

dict_dimensions = {
    file_column: [dimension_table, id_alias]
    for file_column, dimension_table, id_alias in _dimension_specs
}




### Fact table columns:

In [7]:
# One row per fact-table column:
#   file column -> [column name carried by df_fact after the dimension lookups,
#                   matching Redshift column name]
_fact_columns = (
    ('AÑOMES',                 'AÑOMES',                    'ANO_MES'),
    ('CONFIGURACION',          'ID_CONFIGURACION_VEHICULO', 'ID_CONFIGURACION_VEHICULO'),
    ('NATURALEZA',             'ID_NATURALEZA_CARGA',       'ID_NATURALEZA_CARGA'),
    ('CODIGO_CARGUE',          'ID_MUNICIPIO_ORIGEN',       'ID_MUNICIPIO_ORIGEN'),
    ('CODIGO_DESCARGUE',       'ID_MUNICIPIO_DESTINO',      'ID_MUNICIPIO_DESTINO'),
    ('HORAS_VIAJE',            'HORAS_VIAJE',               'PROMEDIO_HORAS_VIAJE'),
    ('HORAS_ESPERA_CARGUE',    'HORAS_ESPERA_CARGUE',       'PROMEDIO_HORAS_ESPERA_CARGUE'),
    ('HORAS_CARGUE',           'HORAS_CARGUE',              'PROMEDIO_HORAS_CARGUE'),
    ('HORAS_ESPERA_DESCARGUE', 'HORAS_ESPERA_DESCARGUE',    'PROMEDIO_HORAS_ESPERA_DESCARGUE'),
    ('HORAS_DESCARGUE',        'HORAS_DESCARGUE',           'PROMEDIO_HORAS_DESCARGUE'),
)

dict_fact = {
    file_column: [frame_column, redshift_column]
    for file_column, frame_column, redshift_column in _fact_columns
}

# Parallel views over dict_fact, all in the same (insertion) order.
ls_fact_keys = [*dict_fact]
ls_fact_values = [*dict_fact.values()]
ls_order_values = [frame_column for frame_column, _ in ls_fact_values]
ls_redshift_values = [redshift_column for _, redshift_column in ls_fact_values]
dict_redshift = dict(zip(ls_fact_keys, ls_redshift_values))




### Redshift connection:

In [8]:
# Open the Redshift connection and cursor shared by the cells below.
#
# Connection settings are read from environment variables so that no secret
# lives in the notebook. Non-secret settings fall back to the values that were
# previously hardcoded here; the password has no fallback on purpose.
# NOTE(review): the password that used to be committed in this cell should be
# treated as leaked and rotated.
_redshift_password = os.environ.get('REDSHIFT_PASSWORD')
if not _redshift_password:
    raise RuntimeError(
        "REDSHIFT_PASSWORD is not set; export it (or inject it from your secrets "
        "store) before running this notebook."
    )

conn = psycopg2.connect(
    host=os.environ.get(
        'REDSHIFT_HOST',
        'redshift-cluster-2.cg5i3fotr9gy.sa-east-1.redshift.amazonaws.com',
    ),
    database=os.environ.get('REDSHIFT_DATABASE', 'dev'),
    port=int(os.environ.get('REDSHIFT_PORT', 5439)),
    user=os.environ.get('REDSHIFT_USER', 'admin'),
    password=_redshift_password,
)

cursor = conn.cursor()




## Retrieve parameters from Lambda function:

In [None]:
# Resolve the two job arguments that identify the input file in S3.
args = getResolvedOptions(sys.argv, ['bucket', 'object_key'])
bucket, object_key = args['bucket'], args['object_key']

# Example values for a manual run (kept commented out):
#   bucket     = 'rndc-raw'
#   object_key = 'tiempos_logisticos/RemesasRNDC_202207.txt'

## Read the dataset

In [10]:
# Load only the columns declared in dict_types from the pipe-delimited,
# latin-1 encoded file stored in S3.
df = pd.read_csv(
    f"s3://{bucket}/{object_key}",
    dtype=dict_types,
    delimiter='|',
    usecols=ls_read_values,
    encoding='latin-1',
)




## Transformations

In [11]:
def _normalize_cell(value):
    """Upper-case, transliterate to ASCII and strip a string cell; pass other values through."""
    if not isinstance(value, str):
        return value
    return unidecode(value.upper()).strip()


# Remove exact duplicate rows, then normalise every text cell.
df = df.drop_duplicates().applymap(_normalize_cell)

# The municipality codes are cast to text before they are used as group keys.
for code_column in ('CODIGO_CARGUE', 'CODIGO_DESCARGUE'):
    df[code_column] = df[code_column].astype(str)

# Average the remaining (hour) columns per month / cargo type / origin /
# destination / vehicle configuration.
group_keys = ['AÑOMES', 'NATURALEZA', 'CODIGO_CARGUE', 'CODIGO_DESCARGUE', 'CONFIGURACION']
df_grouped = df.groupby(by=group_keys, as_index=False).mean()

source_file = object_key.split('/')[-1]
print(f"file: {source_file} | status: refined")

file: RemesasRNDC_202207.txt | status: refined


## Fact table:

In [12]:
# Start from the grouped data, restricted to the columns the fact table needs.
df_fact = df_grouped[ls_fact_keys]

for source_column, (dimension_table, id_column) in dict_dimensions.items():
    # The cargo-type dimension stores its code in `naturaleza_carga`; the other
    # dimensions expose it as `codigo`. Either way it is selected as `codigo`.
    code_expr = 'naturaleza_carga AS codigo' if source_column == 'NATURALEZA' else 'codigo'
    cursor.execute(f'SELECT id AS {id_column}, {code_expr} FROM {dimension_table}')

    # Load the dimension rows into a frame with upper-cased column names so the
    # join key is always 'CODIGO'.
    lookup = pd.DataFrame(
        cursor.fetchall(),
        columns=[description[0].upper() for description in cursor.description],
    )

    # Swap the code column for the dimension id; the left join keeps every fact
    # row, so codes without a match end up with a missing id.
    df_fact = (
        df_fact
        .merge(lookup, left_on=source_column, right_on='CODIGO', how='left')
        .drop(columns=[source_column, 'CODIGO'])
    )

# Put the columns in the order declared by dict_fact.
df_fact = df_fact[ls_order_values].copy()




In [14]:
# Insert the fact rows into Redshift.
#
# Each pair is (df_fact column, tiempos_logisticos column). Both the value
# tuples and the INSERT column list are derived from this single mapping so
# they cannot drift apart. Previously the two were written by hand in different
# orders, which stored HORAS_VIAJE in promedio_horas_espera_descargue,
# HORAS_ESPERA_DESCARGUE in promedio_horas_cargue, HORAS_CARGUE in
# promedio_horas_espera_cargue and HORAS_ESPERA_CARGUE in promedio_horas_viaje.
insert_columns = [
    ('ID_MUNICIPIO_DESTINO', 'id_municipio_destino'),
    ('ID_MUNICIPIO_ORIGEN', 'id_municipio_origen'),
    ('ID_NATURALEZA_CARGA', 'id_naturaleza_carga'),
    ('ID_CONFIGURACION_VEHICULO', 'id_configuracion_vehiculo'),
    ('AÑOMES', 'ano_mes'),
    ('HORAS_DESCARGUE', 'promedio_horas_descargue'),
    ('HORAS_ESPERA_DESCARGUE', 'promedio_horas_espera_descargue'),
    ('HORAS_CARGUE', 'promedio_horas_cargue'),
    ('HORAS_ESPERA_CARGUE', 'promedio_horas_espera_cargue'),
    ('HORAS_VIAJE', 'promedio_horas_viaje'),
]
source_columns = [source for source, _ in insert_columns]
target_columns = [target for _, target in insert_columns]

# One tuple per fact row, with values in the same order as target_columns.
dataset = list(zip(*(df_fact[column] for column in source_columns)))

# NOTE(review): running this cell again for the same input file inserts the
# same rows a second time. A commented-out DELETE template used to sit here --
# consider removing the existing rows for this ano_mes before inserting.

placeholders = ', '.join(['%s'] * len(target_columns))
consulta = (
    f"INSERT INTO tiempos_logisticos ({', '.join(target_columns)}) "
    f"VALUES ({placeholders});"
)
cursor.executemany(consulta, dataset)
conn.commit()

Index(['AÑOMES', 'ID_CONFIGURACION_VEHICULO', 'ID_NATURALEZA_CARGA',
       'ID_MUNICIPIO_ORIGEN', 'ID_MUNICIPIO_DESTINO', 'HORAS_VIAJE',
       'HORAS_ESPERA_CARGUE', 'HORAS_CARGUE', 'HORAS_ESPERA_DESCARGUE',
       'HORAS_DESCARGUE'],
      dtype='object')


In [86]:
# Release the Redshift resources: the cursor first, then its connection.
for resource in (cursor, conn):
    resource.close()


