# Prueba Técnica Arkon

Autor: Juan Carlos Hernández Rangel<br>
Desarrollo: 30-noviembre-2022

## Problema

En Los Ángeles existe un sistema de bicicletas compartidas que publica datos anónimos sobre
el uso del servicio. La tabla proporcionada contiene el histórico de los viajes realizados
desde 2016 e incluye una columna de particular interés, que se analizará con mayor
profundidad: `passholder_type`.

## Metodología

### Importar librerías

In [15]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import yaml
import joblib

from feature_engine import transformation as vt
from scipy.stats import zscore
from scipy.stats import skew, kurtosis

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

### Importar conjunto de datos

In [16]:
# Read the test-set CSV into a DataFrame. low_memory=False makes pandas parse
# the file in a single pass instead of in chunks, which avoids mixed-type
# inference within a column.
path_prueba = '../Data/test_set.csv'
dataFrame = pd.read_csv(path_prueba, low_memory=False)
# Keep the trip identifiers in their own variable; they are paired with the
# predictions when the output table is built further down.
trip_id = dataFrame["trip_id"]

In [17]:
# Load the preprocessing configuration (columns to drop, value mappings,
# capping limits, columns to impute and to transform) into ``val``.
# The encoding is stated explicitly so that non-ASCII text in the YAML file is
# decoded the same way on every platform instead of using the locale default.
# NOTE(review): yaml.FullLoader can construct more than plain data types, so
# config.yml must be a trusted project file. If it only holds plain
# mappings/lists/scalars, yaml.safe_load would be sufficient — confirm.
with open(r'../Modelo/config.yml', encoding='utf-8') as file:
    val = yaml.load(file, Loader=yaml.FullLoader)

In [4]:
# Load the persisted model used for the predictions below (presumably the
# trained random-forest classifier, judging by the file name).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
modeloRF = joblib.load('../Modelo/modelo_random_forest.joblib')

### Funciones

In [5]:
def crear_variables(data):
    """Parse the trip timestamps and derive date/time feature columns.

    ``start_time`` and ``end_time`` are converted to datetimes. The day,
    month, year, hour and minute of ``start_time`` and the hour and minute
    of ``end_time`` are then stored in new columns named
    ``start_time_<part>`` / ``end_time_<part>``.

    Args:
        data: DataFrame with ``start_time`` and ``end_time`` columns whose
            values ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame object, modified in place.

    Raises:
        KeyError: If ``start_time`` or ``end_time`` is missing.
    """
    # ``infer_datetime_format`` is deprecated since pandas 2.0 (strict format
    # inference is the default there), so it is intentionally not passed.
    for columna in ("start_time", "end_time"):
        data[columna] = pd.to_datetime(data[columna])

    inicio = data["start_time"].dt
    data["start_time_day"] = inicio.day
    data["start_time_month"] = inicio.month
    data["start_time_year"] = inicio.year
    data["start_time_hour"] = inicio.hour
    data["start_time_minute"] = inicio.minute

    fin = data["end_time"].dt
    data["end_time_hour"] = fin.hour
    data["end_time_minute"] = fin.minute
    return data

def eliminar_variables(data, variables):
    """Return a copy of ``data`` without the listed columns.

    Args:
        data: DataFrame to drop columns from. It is not modified.
        variables: Column name or list of column names to remove. Names
            that are not present in ``data`` are silently ignored.

    Returns:
        A new DataFrame containing every column of ``data`` except those
        named in ``variables``.
    """
    # Work on the argument itself rather than on a notebook-level global so
    # the function gives the right result for whatever frame it receives.
    return data.drop(variables, axis=1, errors='ignore')

def mapeo_variables(data, variable, diccionario):
    """Replace the values of one column using a lookup dictionary.

    Args:
        data: DataFrame holding the column. It is modified in place.
        variable: Name of the column to re-encode.
        diccionario: Mapping from current values to replacement values.
            Values that are not keys of the mapping become NaN, as with
            ``Series.map``.

    Returns:
        The same DataFrame object, with ``variable`` re-encoded.

    Raises:
        KeyError: If ``variable`` is not a column of ``data``.
    """
    # Read from and write to the argument, not a notebook-level global, so
    # the function can be applied to any frame.
    data[variable] = data[variable].map(diccionario)
    return data

def capping_variables(data, variable, limite_inferior, limite_superior):
    """Cap the values of one column to the range given by two limits.

    Values above ``limite_superior`` are replaced by ``limite_superior``;
    of the remaining values, those below ``limite_inferior`` are replaced
    by ``limite_inferior``. Everything else is kept as is.

    Args:
        data: DataFrame holding the column. It is modified in place.
        variable: Name of the column to cap.
        limite_inferior: Lower bound.
        limite_superior: Upper bound.

    Returns:
        The same DataFrame object, with ``variable`` capped.
    """
    valores = data[variable]
    # First apply the lower bound ...
    con_piso = np.where(valores < limite_inferior, limite_inferior, valores)
    # ... then let the upper bound take precedence, based on the original values.
    con_techo = np.where(valores > limite_superior, limite_superior, con_piso)
    data[variable] = con_techo
    return data

def imputador(data, variable):
    """Fill the missing values of one column with that column's mean.

    The mean is computed from the non-missing values of ``data[variable]``
    itself at call time.

    Args:
        data: DataFrame holding the column. It is modified in place.
        variable: Name of the column to impute.

    Returns:
        The same DataFrame object, with ``variable`` imputed.
    """
    columna = data[variable]
    data[variable] = columna.fillna(columna.mean())
    return data

def transformador_YJ(data, variables):
    """Apply a Yeo-Johnson transformation to the given columns.

    Args:
        data: DataFrame to transform.
        variables: Columns passed to ``YeoJohnsonTransformer`` as the
            variables to transform.

    Returns:
        Whatever ``YeoJohnsonTransformer.fit_transform`` returns for
        ``data``.
    """
    transformador = vt.YeoJohnsonTransformer(variables=variables)
    # NOTE(review): the transformer is fitted on the frame it is given, so
    # its parameters come from this data rather than from a previously
    # fitted transformer — confirm this is intended for inference.
    data_transformada = transformador.fit_transform(data)
    return data_transformada

### Transformar Datos Prueba

#### Crear Variables

In [6]:
# Parse start_time/end_time and add the derived day/month/year/hour/minute columns.
dataFrame = crear_variables(dataFrame)

#### Eliminar Variables

In [7]:
# Drop the columns listed under "variables_eliminar" in the config; names that
# are not present in the frame are ignored.
dataFrame = eliminar_variables(dataFrame, val["variables_eliminar"])

#### Mapeo

In [8]:
# Re-encode every column that has a value mapping in the config.
# NOTE(review): each key of "mapeo_variables" is used as a column name, so a
# key with no matching column in the frame raises KeyError — confirm that all
# configured columns exist in the test set.
for variable, diccionario in val["mapeo_variables"].items():
    dataFrame = mapeo_variables(dataFrame, variable, diccionario)

#### Capping

In [9]:
# Cap each configured column to its (lower, upper) pair stored under
# "limites_rango_intercuartil" (presumably IQR-based limits, per the key name).
for variable, (li, ls) in val["limites_rango_intercuartil"].items():
    dataFrame = capping_variables(dataFrame, variable, li, ls)

#### Imputador

In [10]:
# Fill the missing values of every column listed under "variables_con_nulos"
# with that column's mean, computed on this frame.
for variable in val["variables_con_nulos"]:
    dataFrame = imputador(dataFrame, variable)

#### Transformación

In [11]:
# Apply the Yeo-Johnson transformation to the columns listed under
# "variables_transformadas_YJ" in the config.
dataFrame = transformador_YJ(dataFrame, val['variables_transformadas_YJ'])

### Predicciones

In [12]:
# Predict with the loaded model on the fully preprocessed frame.
# NOTE(review): assumes the remaining columns match the features the model
# was fitted on — confirm.
pred = modeloRF.predict(dataFrame)

In [13]:
# Pair every trip identifier saved at load time with its prediction.
columnas_salida = {"trip_id": trip_id, "passholder_type": pred}
dataFinal = pd.DataFrame(columnas_salida)

In [19]:
# Translate the predicted values with the "passholder_type" mapping from the config.
# NOTE(review): this is the same dictionary the "Mapeo" step applies to input
# columns. If it encodes label -> code, the predictions need the inverse
# mapping (code -> label) and .map() here would yield NaN for every row —
# confirm the direction stored in config.yml.
mapeo = val['mapeo_variables']['passholder_type']
dataFinal["passholder_type"] = dataFinal['passholder_type'].map(mapeo)

In [21]:
# Write the predictions to submission1.csv (relative to the current working
# directory) without the DataFrame index column.
dataFinal.to_csv("submission1.csv", index=False)