In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pickle
import pandas as pd
from operator import getitem
import logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

import mlflow
import mlflow.h2o
import mlflow.sklearn
from mlflow.tracking import MlflowClient

import h2o
from h2o.automl import H2OAutoML
h2o.init()

import plotly.graph_objects as go

from src.models.train import train_h2o_models

from src.models.constant_models import list_features
from src.models.constant_models import list_categories
from src.models.constant_models import list_categories_financial_features
from src.models.constant_models import list_financial_features
from src.models.constant_models import list_categories_purchase_features
from src.models.constant_models import list_purchase_features
from src.models.constant_models import financial_rating 
from src.models.constant_models import purchase_rating

from src.models.predict import get_kbins_discretizer_probability
from src.models.predict import get_h2o_model_object

from src.models.qualification_priority import get_deal_qualification

from src.connections.manage_connecting_mlflow import send_to_mlflow_metric
from src.connections.manage_connecting_mlflow import get_mlflow_experiment

from src.models.transform_data import split_data
from src.models.transform_data import get_label_encoder
from src.models.transform_data import get_label_encoder_features
from src.models.transform_data import resample_data
from src.models.transform_data import get_famd
from src.models.transform_data import get_label_encoder_inverse
from src.models.transform_data import get_one_hot_encoder
from src.models.transform_data import get_one_hot_encoder_features

from src.visualization.graphs import get_famd_graph

In [None]:


def train_model_by_set(client, experiment_name, exp_id, list_features_name, list_categories_features, dict_qualification):
    """Train an H2O model on one feature set, log it to MLflow and bin its scores.

    Must be called while an MLflow run is active, because artifacts are logged
    through the module-level ``mlflow`` API.

    Besides its arguments, the function reads these notebook-level globals:
    ``df_train``, ``df_test``, ``features_name``, ``MODELS_PATH`` and
    ``ROOT_PATH``. They have to be set by earlier cells before calling it.

    Args:
        client: MLflow client, forwarded to ``get_h2o_model_object``.
        experiment_name: Not used by the body; kept for interface compatibility.
        exp_id: MLflow experiment id, forwarded to ``get_h2o_model_object``.
        list_features_name: Feature names forwarded to
            ``get_one_hot_encoder_features``.
        list_categories_features: Categorical feature names; used to fit the
            one-hot encoder and to pick the columns converted to H2O factors.
        dict_qualification: Mapping applied (via ``Series.map``) to the bin
            index produced by the discretizer to obtain the qualification.

    Returns:
        tuple: ``(df_data, probability_bins)`` where ``df_data`` holds the
        columns ``probability_<features_name>``,
        ``probability_<features_name>_bin`` and
        ``qualification_<features_name>`` for the training rows, and
        ``probability_bins`` is the bin mapping returned by
        ``get_kbins_discretizer_probability``.
    """
    logging.info("""One Hot_encoding""")
    df_train_analysis = df_train.copy()
    one_hot_enc = get_one_hot_encoder(df_train_analysis, list_categories_features)
    df_train_analysis = get_one_hot_encoder_features(one_hot_enc, df_train_analysis, list_features_name, list_categories_features)
    df_test_analysis = get_one_hot_encoder_features(one_hot_enc, df_test, list_features_name, list_categories_features)

    # Drop the complementary dummy column when the feature set produces it.
    # errors="ignore" skips only the "column is absent" case instead of
    # silencing every possible exception like a bare `except: pass` would.
    redundant_dummy = "status_form_NOT_PRE_APPROVED"
    df_train_analysis = df_train_analysis.drop(columns=redundant_dummy, errors="ignore")
    df_test_analysis = df_test_analysis.drop(columns=redundant_dummy, errors="ignore")
    mlflow.sklearn.log_model(sk_model=one_hot_enc, artifact_path="processig_data/one_hot_enc")

    logging.info("""Training model""")
    ### https://medium.com/@shalinisinghzoots/the-mlflow-tracking-component-is-an-api-and-ui-for-logging-parameters-code-versions-metrics-and-24f078acb55e
    train = h2o.H2OFrame.from_python(df_train_analysis)
    test = h2o.H2OFrame.from_python(df_test_analysis)
    # Every encoded column whose name contains a categorical feature name is
    # treated as a factor in both frames.
    feature_factors = [s for s in df_train_analysis.columns if any(xs in s for xs in list_categories_features)]
    for feature in feature_factors:
        train[feature] = train[feature].asfactor()
        test[feature] = test[feature].asfactor()
    train.describe()
    model = train_h2o_models(train)

    logging.info("""Sending metrics and artifacts to MLFlows""")
    y_pred = model.predict(test)["predict"].as_data_frame()
    y_test = df_test_analysis["approved"]
    send_to_mlflow_metric(mlflow, model, test, y_test, y_pred, MODELS_PATH)

    logging.info("""Getting object priorization model""")
    # NOTE(review): presumably reloads the model that was just logged for this
    # experiment — confirm against get_h2o_model_object.
    model = get_h2o_model_object(client, exp_id, ROOT_PATH)

    logging.info("""Getting data preditions""")
    result = model.predict(train)
    df_result = result.as_data_frame()
    # Keep only the score column named "True", rounded to 6 decimals.
    df_result = df_result[["True"]].astype(float).round(6)

    logging.info("""Getting KBinsDiscretizer probability""") #strategy{‘uniform’, ‘quantile’, ‘kmeans’},
    strategy = "kmeans"
    kbins_object, probability_bins, _qualification_bins = get_kbins_discretizer_probability(
        df_result, features_name, n_bins=3, strategy=strategy
    )
    mlflow.sklearn.log_model(
        sk_model=kbins_object,
        artifact_path="processig_data/probability_kbins_object"
    )

    probability_col = f"probability_{features_name}"
    probability_bin_col = f"probability_{features_name}_bin"

    data_trans = kbins_object.transform(df_result)
    df_data_trans = pd.DataFrame(data=data_trans, columns=[probability_col])
    df_data_trans[probability_bin_col] = df_data_trans[probability_col].map(probability_bins)
    # NOTE(review): this logs the same kbins_object a second time under a
    # different artifact path; possibly probability_bins was intended — confirm.
    mlflow.sklearn.log_model(
        sk_model=kbins_object,
        artifact_path="processig_data/probability_bins"
    )

    df_data_trans[f"qualification_{features_name}"] = df_data_trans[probability_col].map(dict_qualification)

    # Snapshot returned to the caller; the frame below is reduced to counts.
    df_data = df_data_trans.copy()

    logging.info("""Getting and count KBinsDiscretizer probability df""")

    df_data_trans = pd.DataFrame(data=df_data_trans, columns=[probability_bin_col])
    df_data_trans["counts"] = 1
    df_data_trans = df_data_trans.groupby([probability_bin_col]).count().reset_index()
    fig = go.Figure([go.Bar(x=df_data_trans[probability_bin_col], y=df_data_trans.counts,
                            text=df_data_trans.counts,
                            textposition='auto')])
    title = f'PROBABILITY BINS - {features_name}'
    fig.update_layout(title=title)
    fig.show()
    html_path = os.path.join(MODELS_PATH, f"{title}.html")
    fig.write_html(html_path)
    mlflow.log_artifact(html_path)

    return df_data, probability_bins

### Load Data

In [None]:
# Feature set used while preparing the data; features_name also names the
# output folder built in the paths cell below.
list_features_name = list_financial_features
list_categories_features = list_categories_financial_features
features_name = "financial_features"

In [None]:
# Open the MLflow run that collects the data-preparation artifacts; it is
# closed by the standalone mlflow.end_run() cell after the processing step.
logging.info("""Getting experiment MLFlow""")
client = MlflowClient()
experiment_name = "my_priorization_model_data"
exp_id = get_mlflow_experiment(experiment_name, client)
run_name = mlflow.start_run(run_name=experiment_name, experiment_id=exp_id)
exp_id

In [None]:
# Folder layout, relative to the directory the notebook is run from.
ROOT_PATH = os.getcwd()
OUTPUT_PATH = os.path.join(ROOT_PATH, f"output_{features_name}")
DATA_PATH = os.path.join(OUTPUT_PATH, "data")
MODELS_PATH = os.path.join(OUTPUT_PATH, "models")
FILE_PATH = os.path.join(ROOT_PATH, "output_form.csv")

# Later cells write CSV, HTML and pickle files into these folders, so make
# sure they exist on a fresh checkout (no-op when already present).
os.makedirs(DATA_PATH, exist_ok=True)
os.makedirs(MODELS_PATH, exist_ok=True)

logging.info("""Getting deals information""")
df = pd.read_csv(FILE_PATH).set_index("id_row"). \
    rename(columns={'comimments_net_income': 'comimments_gross_income'})

df["payment_terms"] = df["payment_terms"].astype(str)

# Row filters, applied in sequence:
# 1. keep deals with net_income below 200000;
df = df[df["net_income"] < 200000]
# 2. drop non-approved deals created before the cut-off timestamp
#    (NOTE(review): "created" is compared against a string — presumably the
#    column holds ISO-formatted text; confirm);
df = df[~((df["created"] < '2020-05-15 00:00:00.834166') & (df["approved"] == False))]
# 3. drop rows whose email contains the internal domain
#    (NOTE(review): rows with a missing email presumably fail this test and are
#    dropped as well — confirm that is intended);
df = df[df.email.str.find("@truehome.com.mx") == -1]
# 4. drop non-approved deals that have no "causas_perdida" value. The original
#    expression also OR-ed in `approved == True`, which that mask already
#    implies, so the result is unchanged.
df = df[~(df["causas_perdida"].isnull() & (df["approved"] == False))]

logging.info("""Deals Approved: {}""".format(df["approved"].value_counts()))

### Split data

In [None]:
logging.info("""Splitting data in train set and test set""")
# Restrict the frame to the modelling columns, then separate target and inputs.
df = df[list_features]
X, y = df.drop(columns="approved"), df[["approved"]]
X_train, X_test, y_train, y_test = split_data(X, y)

# Re-attach the target so each split is a single frame.
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

# Persist both raw splits and attach them to the active MLflow run.
train_csv_path = os.path.join(DATA_PATH, "df_train_raw.csv")
df_train.to_csv(train_csv_path)
mlflow.log_artifact(train_csv_path)

test_csv_path = os.path.join(DATA_PATH, "df_test_raw.csv")
df_test.to_csv(test_csv_path)
mlflow.log_artifact(test_csv_path)


### Processing data

In [None]:
logging.info("""Processing data in train set and test set""")

logging.info("""Resampling train data""")
# Label-encode the categorical columns so the training data can be resampled.
label_encoder, object_default_dict = get_label_encoder(df_train, list_categories)
df_train_label = get_label_encoder_features(object_default_dict, df_train, list_categories)

X_train = df_train_label[list_features].drop(columns="approved")
y_train = df_train_label[list_features][['approved']]

df_resampled_train = resample_data(X_train, y_train)
df_famd = get_famd(df_resampled_train, list_features)


fig = get_famd_graph(df_famd)
famd_html_path = os.path.join(DATA_PATH, "famd_features_file.html")
fig.write_html(famd_html_path)
mlflow.log_artifact(famd_html_path)

# NOTE: df_train is replaced by the resampled data with the label encoding
# reverted, so re-running this cell without re-running the split cell starts
# from already-resampled data.
df_train = get_label_encoder_inverse(df_resampled_train, object_default_dict, list_categories)


logging.info("""Saving data preprocessing steps""")
data_preprocessing_steps = {
    "data_preprocessing": {
        "label_features": {
            "label_encoder": label_encoder,
            "object_defaultdict": object_default_dict
        },
        #"one_hot_enc": one_hot_enc
    }
}

pickle_preprocessing_steps = os.path.join(MODELS_PATH, "data_preprocessing_steps.pickle")

# Close the file before logging it so the artifact is fully flushed to disk.
with open(pickle_preprocessing_steps, "wb") as pickle_file:
    pickle.dump(data_preprocessing_steps, pickle_file)
mlflow.log_artifact(pickle_preprocessing_steps)

In [None]:
# Close the data-preparation run opened by the earlier mlflow.start_run() cell.
mlflow.end_run()

## Train model - PURCHASE

In [None]:
# train_model_by_set reads features_name from the notebook namespace, so it
# must be set before the call.
features_name = "purchase_features"
list_categories_features = list_categories_purchase_features
list_features_name = list_purchase_features

client = MlflowClient()
experiment_name = f"my_priorization_model_with_{features_name}"
exp_id = get_mlflow_experiment(experiment_name, client)

logging.info("""Getting experiment MLFlow""")
# The context manager ends the run even when training raises, so a failure
# here does not leave a run active and block the next mlflow.start_run().
with mlflow.start_run(experiment_id=exp_id, run_name=experiment_name) as run_name:
    df_data_trans_purchase, probability_purchase_bins = train_model_by_set(
        client, experiment_name, exp_id, list_features_name, list_categories_features, purchase_rating
    )

## Train model - FINANCIAL

### Processing data by feature set

In [None]:
# train_model_by_set reads features_name from the notebook namespace, so it
# must be set before the call.
features_name = "financial_features"
list_categories_features = list_categories_financial_features
list_features_name = list_financial_features

client = MlflowClient()
experiment_name = f"my_priorization_model_with_{features_name}"
exp_id = get_mlflow_experiment(experiment_name, client)

logging.info("""Getting experiment MLFlow""")
# The context manager ends the run even when training raises, so a failure
# here does not leave a run active and block the next mlflow.start_run().
with mlflow.start_run(experiment_id=exp_id, run_name=experiment_name) as run_name:
    # Pass the aliases assigned above (same objects as the financial lists),
    # matching how the purchase cell calls train_model_by_set.
    df_data_trans_financial, probability_financial_bins = train_model_by_set(
        client, experiment_name, exp_id, list_features_name, list_categories_features, financial_rating
    )

In [None]:
logging.info("""Getting experiment MLFlow""")
client = MlflowClient()
experiment_name = "my_priorization_model_qualification_priority"
exp_id = get_mlflow_experiment(experiment_name, client)

# The context manager ends the run even if building or logging the mapping
# raises, so no run is left active afterwards.
with mlflow.start_run(experiment_id=exp_id, run_name=experiment_name) as run_name:
    # Combine the bins of both models into the final priority object, which
    # is later applied with Series.map to the combined qualification.
    qualification_priority = get_deal_qualification(probability_financial_bins, probability_purchase_bins)

    mlflow.sklearn.log_model(
        sk_model=qualification_priority,
        artifact_path="qualification_priority"
    )

In [None]:
# Display the id of the experiment selected by the most recently run setup cell.
exp_id 

In [None]:
# NOTE(review): the qualification-priority cell already calls mlflow.end_run(),
# so this looks like a leftover safety net for runs left open by a failed cell.
mlflow.end_run()

In [None]:
# Display the object returned by get_deal_qualification for inspection.
qualification_priority

In [None]:
# Put the per-model results side by side, combine the two qualifications with
# `+`, and translate the combined value into its priority.
df_qualification = pd.concat([df_data_trans_financial, df_data_trans_purchase], axis=1)
df_qualification = df_qualification.assign(
    qualification=lambda frame: frame["qualification_financial_features"] + frame["qualification_purchase_features"]
)
df_qualification = df_qualification.assign(
    qualification_priority=lambda frame: frame["qualification"].map(qualification_priority)
)
df_qualification


In [None]:
"""
- Confusion Matrix: a table showing correct predictions and types of incorrect predictions.
- Precision: the number of true positives divided by all positive predictions. Precision is also called Positive Predictive Value. It is a measure of a classifier’s exactness. Low precision indicates a high number of false positives.
- Recall: the number of true positives divided by the number of positive values in the test data. Recall is also called Sensitivity or the True Positive Rate. It is a measure of a classifier’s completeness. Low recall indicates a high number of false negatives.
- F1 Score: the harmonic mean (a weighted average) of precision and recall.

Compute the balanced accuracy

The balanced accuracy is used in binary and multiclass classification problems to deal with imbalanced datasets. It is defined as the average of recall obtained on each class.
"""

# Disabled scratch code kept as a bare string literal, so none of it executes.
# NOTE(review): it refers to `model`, `test` and `lb`, none of which is defined
# at notebook level in the visible cells — confirm before re-enabling.
"""
aml = model
perf = aml.leader.model_performance(test)
#perf.auc()
perf

m = h2o.get_model(lb[1,"model_id"])
m.as_data_frame()
"""