# Generación de resultados

## Preámbulo

### Imports

In [4]:
# -*- coding: utf-8 -*-

# Creada por Maximiliano Jones

# Data handling
from ast import parse
import pandas as pd

# Application features
import streamlit as st
import base64
import pandas_profiling
from streamlit_pandas_profiling import st_profile_report

# Time / date handling
import pytz
import time

# Automated Classification
from pycaret import classification as supervised
# import pycaret.anomaly as unsupervised

# Plotting
import plotly.express as px
import plotly.graph_objects as go

2022-10-12 11:33:26.422 INFO    visions.backends: Pandas backend loaded 1.3.5
2022-10-12 11:33:26.442 INFO    visions.backends: Numpy backend loaded 1.19.5
2022-10-12 11:33:26.443 INFO    visions.backends: Pyspark backend NOT loaded
2022-10-12 11:33:26.444 INFO    visions.backends: Python backend loaded


### Funciones auxiliares

In [5]:
# @st.cache(suppress_st_warning=True)
def load_data(path):
    '''
    Load a delimited text file and, when possible, index it by timestamp.

    The delimiter is sniffed by pandas (``sep=None`` with the python engine)
    and the content is decoded as ``utf-8-sig``. The columns ``Date_Time`` and
    then ``Datetime`` are tried in that order; the first one that parses as
    datetimes becomes the index. Naive timestamps are interpreted as UTC and
    the index is converted to the ``Chile/Continental`` time zone.

    ARGS:
        path: location of the .csv file; it is handed to ``pd.read_csv`` as is.

    RETURNS:
        pd.DataFrame indexed by the converted timestamps, or the frame exactly
        as read when no usable timestamp column is found.
    '''

    data = pd.read_csv(path, sep=None, engine='python', encoding='utf-8-sig', parse_dates=True)

    # Candidate timestamp columns in priority order, with the message printed
    # when each one is used (texts kept as the notebook has always printed them).
    candidates = (
        ('Date_Time', 'Se encontró una columa "Date_time"'),
        ('Datetime', 'Se encontró una columa "Datetime"'),
    )

    for column, message in candidates:
        if column not in data.columns:
            continue
        try:
            # Work on a new frame so a failure leaves `data` exactly as read.
            indexed = data.assign(**{column: pd.to_datetime(data[column])}).set_index(column)
            # Only naive timestamps need localizing; tz-aware ones would raise.
            if indexed.index.tz is None:
                indexed.index = indexed.index.tz_localize('UTC')
            indexed.index = indexed.index.tz_convert('Chile/Continental')
        except (ValueError, TypeError, AttributeError) as error:
            # Unparseable values or a non-datetime index: try the next candidate.
            print('No se pudo usar la columna "{}" como índice temporal: {}'.format(column, error))
            continue
        print(message)
        return indexed

    print("Se entró en el tercer except")
    return data

# @st.cache(allow_output_mutation=True,suppress_st_warning=True)
def entrenar_modelos(df, etiqueta, metrica, ensamble=True, debug=True):

    '''
    Set up a PyCaret classification experiment and train either a stacked
    ensemble or a short list of the best single models.

    ARGS:
        df (pd.DataFrame): data passed to ``supervised.setup``.
        etiqueta (str): name of the target column in ``df``.
        metrica (str): documented values are 'f1', 'accuracy' or 'recall'.
            It is only used to sort the candidates when ``ensamble`` is False;
            the ensemble branch ranks with ``compare_models`` defaults and
            tunes for 'F1'.
        ensamble (bool, default True): when True, tune the five best models
            and stack them (best one as meta model, the other four as base
            estimators); when False, return the three best models.
        debug (bool, default True): currently unused.

    RETURNS:
        tuple ``(model, grid, grid)``: the stacked model (or the list of the
        three best models) followed by the comparison grid obtained from
        ``supervised.pull()``, repeated twice.
    '''

    # setup (the returned object is not needed afterwards)
    supervised.setup(df, target=etiqueta, session_id=123, silent=True, use_gpu=False,
                     profile=False, log_experiment=False, fix_imbalance=True)

    if not ensamble:
        best = supervised.compare_models(sort=metrica, n_select=3)
        grid = supervised.pull()
        return (best, grid, grid)

    # model training and selection
    top10 = supervised.compare_models(n_select=10)
    # Five models; the previous slice [0:4] kept only four.
    top5 = top10[:5]
    # pull() is called before tuning, so the grid is the comparison table.
    grid_a = supervised.pull()
    # tune top 5 base models
    tuned_top5 = [
        supervised.tune_model(modelo, fold=5, optimize='F1', search_library='scikit-optimize')
        for modelo in top5
    ]
    stacker = supervised.stack_models(estimator_list=tuned_top5[1:], meta_model=tuned_top5[0])
    return (stacker, grid_a, grid_a)


def deteccion_no_supervisada(df, metrica, etiqueta=None, ensamble=True):
    """Placeholder for unsupervised detection: ignores every argument and returns None."""
    return

def cargar_modelo(df, modelo):
    """
    Load a saved PyCaret model by name.

    ARGS:
        df: unused; kept so existing callers keep working.
        modelo: name of the saved model, as expected by
            ``supervised.load_model``. Any value that is not a non-empty
            string falls back to 'stack inicial', the name this function
            always loaded before.

    RETURNS:
        Whatever ``supervised.load_model`` returns for that name.
    """
    # Previously the argument was overwritten and 'stack inicial' was always loaded.
    nombre = modelo if isinstance(modelo, str) and modelo else 'stack inicial'
    return supervised.load_model(nombre)

In [6]:
# Generación de gráficos

# Five-colour palettes used by the plotting helpers, ordered from darkest to lightest.
colors_blue = ['#132C33', '#264D58', '#17869E', '#51C4D3', '#B4DBE9']
colors_dark = ['#1F1F1F', '#313131', '#636363', '#AEAEAE', '#DADADA']
colors_green = ['#01411C', '#4B6F44', '#4F7942', '#74C365', '#D0F0C0']

def generar_distrib(loaded_df, etiqueta,target):
    """
    Build a grouped count histogram of one column, split by the target column.

    ARGS:
        loaded_df: data handed to ``px.histogram``.
        etiqueta: column plotted on the x axis (also used as figure title).
        target: column used to colour the bars.

    RETURNS:
        The figure returned by ``px.histogram`` after its layout is updated.
    """
    paleta = [colors_green[3], colors_blue[3]]
    opciones_histograma = dict(
        x=etiqueta,
        y=loaded_df[etiqueta],
        color=target,
        template='plotly_white',
        marginal='box',
        opacity=0.7,
        nbins=100,
        color_discrete_sequence=paleta,
        barmode='group',
        histfunc='count',
    )
    figura = px.histogram(loaded_df, **opciones_histograma)

    fuente_titulo = dict(color=colors_dark[2], size=20)
    titulo = dict(text=etiqueta, x=0.53, y=0.95, font=fuente_titulo)
    leyenda = dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5)

    figura.update_layout(
        font_family='monospace',
        title=titulo,
        xaxis_title_text=etiqueta,
        yaxis_title_text='Count',
        legend=leyenda,
        bargap=0.3,
    )
    return figura

def generar_graficos(selected_df, etiqueta):
    """
    Show a donut chart of the class balance and one histogram per feature.

    The values 0 and 1 of the target column are displayed as 'normal' and
    'anomalía'. The caller's frame is not modified: the relabelling is applied
    to a copy that is used only for plotting.

    ARGS:
        selected_df (pd.DataFrame): features plus the target column.
        etiqueta (str): name of the target column.

    RETURNS:
        None. Figures are rendered with ``.show()``.
    """
    etiquetas = selected_df[etiqueta].replace([0,1],['normal','anomalía'])

    # Take slice names from the counted values themselves: value_counts()
    # orders by frequency, so a fixed ['normal', 'anomalía'] list mislabels
    # the slices whenever anomalies are the majority class.
    conteo = etiquetas.value_counts()
    columna_nombres = 'label' if etiqueta != 'label' else 'label_'
    d = pd.DataFrame({columna_nombres: conteo.index.to_list(), etiqueta: conteo.to_list()})

    fig = px.pie(d,values=etiqueta,names=columna_nombres,hole=0.4,opacity=0.6,
                color=columna_nombres,
                # Pin each class to its colour regardless of slice order.
                color_discrete_map={'normal':colors_blue[3],'anomalía':colors_green[3]},
                color_discrete_sequence=[colors_blue[3],colors_green[3]],
                labels={columna_nombres:etiqueta,etiqueta:'No. Of Samples'})

    fig.add_annotation(text='Los resultados sugieren un set de datos desbalanceados',
                    x=1.3,y=0.9,showarrow=False,font_size=18,opacity=0.7,font_family='monospace')
    fig.add_annotation(text='Etiquetado <br> Experto',
                    x=0.5,y=0.5,showarrow=False,font_size=14,opacity=0.7,font_family='monospace')

    fig.update_layout(
        font_family='monospace',
        title=dict(text='. Cuántos datos corresponden a datos normales?',x=0.47,y=0.98,
                font=dict(color=colors_dark[2],size=28)),
        legend=dict(x=0.37,y=-0.05,orientation='h',traceorder='reversed'),
        hoverlabel=dict(bgcolor='white'))

    fig.update_traces(textposition='outside', textinfo='percent+label')
    fig.show()

    # Relabelled copy for the histograms; the input frame keeps its 0/1 values.
    etiquetado = selected_df.assign(**{etiqueta: etiquetas})
    selected_features = selected_df.drop(columns=[etiqueta]).columns.to_list()
    for label in selected_features:
        figura = generar_distrib(etiquetado, label, etiqueta)
        figura.show()



In [7]:
# Forward slash keeps the path portable and avoids the invalid '\H' escape
# sequence that the backslash form contains.
f = load_data('data/Horcon_1L_full.csv')
f

Se encontró una columa "Date_time"


Unnamed: 0_level_0,Pression [cm H2O],Temperatura [°C],EC [µs/cm],etiqueta conjunta
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-02-03 10:00:00-03:00,981.5,17.81,257,1
2013-02-03 11:00:00-03:00,871.1,17.86,9,1
2013-02-03 12:00:00-03:00,870.7,17.80,9,1
2013-02-03 13:00:00-03:00,1025.5,17.71,256,1
2013-02-03 14:00:00-03:00,870.3,17.84,9,1
...,...,...,...,...
2017-02-20 08:00:00-03:00,1034.4,22.92,0,1
2017-02-20 09:00:00-03:00,1034.0,23.44,0,1
2017-02-20 10:00:00-03:00,1034.2,22.66,0,1
2017-02-20 11:00:00-03:00,1034.6,22.51,0,1


In [8]:
column_names = f.columns.to_list()
print(column_names)
# Features are every column but the last; the last column becomes 'Target'.
# assign() returns a new frame, so nothing is written into a slice of `f`
# (the previous column assignment on the slice was chained assignment).
selected_df = f[column_names[:-1]].assign(Target=f[column_names[-1]])
column_names = selected_df.columns.to_list()

['Pression [cm H2O]', 'Temperatura [°C]', 'EC [µs/cm]', 'etiqueta conjunta']


In [9]:
# Display the frame as the cell output (pandas truncates long frames).
selected_df

Unnamed: 0_level_0,Pression [cm H2O],Temperatura [°C],EC [µs/cm],Target
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-02-03 10:00:00-03:00,981.5,17.81,257,1
2013-02-03 11:00:00-03:00,871.1,17.86,9,1
2013-02-03 12:00:00-03:00,870.7,17.80,9,1
2013-02-03 13:00:00-03:00,1025.5,17.71,256,1
2013-02-03 14:00:00-03:00,870.3,17.84,9,1
...,...,...,...,...
2017-02-20 08:00:00-03:00,1034.4,22.92,0,1
2017-02-20 09:00:00-03:00,1034.0,23.44,0,1
2017-02-20 10:00:00-03:00,1034.2,22.66,0,1
2017-02-20 11:00:00-03:00,1034.6,22.51,0,1


# Gráficos de distribuciones

In [10]:
# Histogram of the first column, coloured by the last column of `column_names`.
generar_distrib(selected_df, column_names[0],column_names[-1])

In [11]:
# Class-balance chart plus one histogram per feature, using 'Target' as the label column.
generar_graficos(selected_df, 'Target')

In [12]:
# NOTE(review): this import replaces the `load_data` defined earlier in the
# notebook with the project module's version - confirm that is intended.
from src.data.load_dataset import load_data
# NOTE(review): `data_directory` is not defined anywhere in the visible
# notebook; it must be set (presumably in a configuration cell) before this runs.
# Forward slash avoids the invalid '\H' escape and works on every platform.
df2 = load_data(data_directory + '/Horcon-etiquetado_v2.csv')
# Drop every column that contains at least one missing value (non in-place,
# so re-running the cell is idempotent).
df2 = df2.dropna(axis="columns", how="any")
df2.head(20)

NameError: name 'df2' is not defined

In [13]:
# Per-variable plots: one time-series figure each for pressure, temperature and EC.
# Requires `plotly.graph_objects` imported as `go` in the imports cell.
datas=df2


def graficar_serie(datos, columna, titulo, color, anomalias=None):
    """
    Build a line chart of one column against the frame's index.

    ARGS:
        datos (pd.DataFrame): full data set; its index is used as x axis.
        columna (str): column drawn as the 'operación normal' line.
        titulo (str): text used for both the figure title and the y axis.
        color (str): line colour.
        anomalias (pd.DataFrame, optional): rows to overlay as
            'anomalía etiquetada' markers. Default None draws no overlay,
            which is what the three calls in this cell do.

    RETURNS:
        The plotly figure (not yet shown).
    """
    figura = go.Figure()
    figura.add_trace(go.Scatter(x=datos.index, y=datos[columna],
                        mode='lines',
                        name='operación normal',
                        line_color=color))
    if anomalias is not None:
        figura.add_trace(go.Scatter(x=anomalias.index, y=anomalias[columna],
                            mode='markers',
                            name='anomalía etiquetada',
                            marker_color='cyan',
                            marker_line_width=0.5))
    figura.update_layout(title=titulo,
                        yaxis_title=titulo,
                        xaxis_title='Fecha'
    )
    return figura


# Rows whose label column equals 1, kept under their original names
# (they are not drawn unless passed as `anomalias=`).
p = datas.loc[datas['Etiqueta P'] == 1] #anomaly
figg = graficar_serie(datas, 'Pression [cm H2O]', 'Presión [cm H2O]', 'cadetblue')
# st.plotly_chart(figg, use_container_width=True)
figg.show()

t = datas.loc[datas['Etiqueta T'] == 1] #anomaly
figg2 = graficar_serie(datas, 'Temperatura [°C]', 'Temperatura [°C]', 'darkolivegreen')
figg2.show()

e = datas.loc[datas['Etiqueta EC'] == 1] #anomaly
figg3 = graficar_serie(datas, 'EC [µs/cm]', 'EC [µs/cm]', 'darkgoldenrod')
figg3.show()

# # %% Anomalías
# with st.beta_expander("Procesar Anomalías",expanded=True):

#     '''
#     ## Detección de anomalías

#     Se utiliza un modelo pre-entrenado basado en LightGBM sobre toda la data cargada para detectar y visualizar anomalías.
#     '''
#     loaded_lgbm = lgbm.Booster(model_file='lgb_classifier.txt')

#     prob_output=loaded_lgbm.predict(datas_unl.to_numpy())
#     output = np.int8(prob_output >= 0.5)

#     new_data = datas_unl.copy()
#     # st.dataframe(data=new_data)
#     # new_data =new_data['label']=np.array(output)

#     b=pd.DataFrame(output,columns=['label'])
#     # st.write(b)
#     # st.write(datas_unl)
#     # st.write(b.columns)
#     datas_unl['etiqueta_anomalía'] = b.values
#     new_data.insert(3,'etiqueta_anomalia', b.to_numpy(),True)
#     # st.write(new_data.columns,new_data.shape)
#     import matplotlib.pyplot as plt

#     def read_anomalies(new_data):
#         a = new_data.loc[new_data['etiqueta_anomalia'] == 1] #anomaly
#         return a

#     a = read_anomalies(new_data)

#     st.write(new_data)

#     p = datas.loc[datas['Etiqueta P'] == 1] #anomaly

#     import plotly.graph_objects as go

#     figg = go.Figure()

#     figg.add_trace(go.Scatter(x=datas.index, y=datas['Pression [cm H2O]'],
#                         mode='lines',
#                         name='operación normal',
#                         line_color='cadetblue'))
#     figg.add_trace(go.Scatter(x=p.index, y=p['Pression [cm H2O]'],
#                         mode='markers',
#                         name='anomalía etiquetada',
#                         marker_color='cyan',
#                         marker_line_width=0.5,
#                         opacity=0.5))
#     figg.add_trace(go.Scatter(x=a.index, y=a['Pression [cm H2O]'],
#                         mode='markers',
#                         name='anomalía detectada',
#                         marker_color='red',
#                         marker_line_width=0.5,
#                         opacity=0.7))

#     # figg.update_traces(mode='markers', marker_line_width=2, marker_size=10)
#     figg.update_layout(title='Presión [cm H2O]',
#                         yaxis_title='Presión [cm H2O]',
#                         xaxis_title='Fecha'
#     )

#     st.plotly_chart(figg, use_container_width=True)

#     t = datas.loc[datas['Etiqueta T'] == 1] #anomaly

#     figg2 = go.Figure()

#     figg2.add_trace(go.Scatter(x=datas.index, y=datas['Temperatura [°C]'],
#                         mode='lines',
#                         name='operación normal',
#                         line_color='darkolivegreen'))
#     figg2.add_trace(go.Scatter(x=t.index, y=t['Temperatura [°C]'],
#                         mode='markers',
#                         name='anomalía etiquetada',
#                         marker_color='cyan',
#                         marker_line_width=0.5,
#                         opacity=0.5))
#     figg2.add_trace(go.Scatter(x=a.index, y=a['Temperatura [°C]'],
#                         mode='markers',
#                         name='anomalía detectada',
#                         marker_color='red',
#                         marker_line_width=0.5,
#                         opacity=0.7))
#     # figg.update_traces(mode='markers', marker_line_width=2, marker_size=10)
#     figg2.update_layout(title='Temperatura [°C]',
#                         yaxis_title='Temperatura [°C]',
#                         xaxis_title='Fecha'
#     )

#     st.plotly_chart(figg2, use_container_width=True)

#     e = datas.loc[datas['Etiqueta EC'] == 1] #anomaly
#     figg3 = go.Figure()

#     figg3.add_trace(go.Scatter(x=datas.index, y=datas['EC [µs/cm]'],
#                         mode='lines',
#                         name='operación normal',
#                         line_color='darkgoldenrod'))
#     figg3.add_trace(go.Scatter(x=e.index, y=e['EC [µs/cm]'],
#                         mode='markers',
#                         name='anomalía etiquetada',
#                         marker_color='cyan',
#                         marker_line_width=0.5,
#                         opacity=0.5))
#     figg3.add_trace(go.Scatter(x=a.index, y=a['EC [µs/cm]'],
#                         mode='markers',
#                         name='anomalía detectada',
#                         marker_color='red',
#                         marker_line_width=0.5,
#                         opacity=0.7))
#     # figg.update_traces(mode='markers', marker_line_width=2, marker_size=10)
#     figg3.update_layout(title='EC [µs/cm]',
#                         yaxis_title='EC [µs/cm]',
#                         xaxis_title='Fecha'
#     )

#     st.plotly_chart(figg3, use_container_width=True)


NameError: name 'df2' is not defined