# Función para obtener información sin límite de datos

In [1]:
import requests
import json
from requests.auth import HTTPBasicAuth
import time
import pandas as pd
import re

# Configuración warnings
import warnings
warnings.filterwarnings('ignore')

def elasticScroll(elasticParameters, query, pages):
    """Run ``query`` against an Elasticsearch index and collect hits via the scroll API.

    Parameters
    ----------
    elasticParameters : dict
        Must contain non-empty ``"elasticURL"``, ``"elasticIndex"``,
        ``"elasticUser"`` and ``"elasticPassword"`` entries. A trailing ``/``
        on the URL is tolerated.
    query : dict
        Request body sent as JSON to ``<elasticURL>/<elasticIndex>/_search``.
    pages : dict or None
        When truthy, a mapping with ``"from"`` and ``"size"``: scrolling stops
        as soon as ``from + size`` hits were fetched and only the hits at
        positions ``from`` .. ``from + size - 1`` are kept. When falsy, every
        hit is fetched.

    Returns
    -------
    dict
        The decoded body of the initial search response, with
        ``["hits"]["hits"]`` holding the collected hits.

    Raises
    ------
    Exception
        If any connection parameter is empty.
    KeyError
        If a connection parameter is missing or a response lacks the
        ``_scroll_id`` / ``hits`` keys this function reads.
    """
    elasticURL = elasticParameters["elasticURL"]
    elasticIndex = elasticParameters["elasticIndex"]
    elasticUser = elasticParameters["elasticUser"]
    elasticPassword = elasticParameters["elasticPassword"]

    if not (elasticURL and elasticIndex and elasticUser and elasticPassword):
        raise Exception("Revisa los parametros")

    # Strip a trailing slash so the endpoints never contain "//".
    base_url = elasticURL.rstrip("/")
    url_search = f"{base_url}/{elasticIndex}/_search?scroll=1m"
    url_scroll = f"{base_url}/_search/scroll"
    auth = HTTPBasicAuth(elasticUser, elasticPassword)

    # Initial search: opens the scroll context and returns the first page.
    response = requests.get(url_search, json=query, auth=auth)
    search = json.loads(response.text)
    scroll_id = search["_scroll_id"]

    try:
        hits = search["hits"]["hits"]
        # Number of hits needed to serve the requested window (None = all).
        wanted = pages["from"] + pages["size"] if pages else None

        batch = hits
        # An empty page means the scroll is exhausted.
        while batch and (wanted is None or len(hits) < wanted):
            scroll_response = requests.get(
                url_scroll,
                json={"scroll": "1m", "scroll_id": scroll_id},
                auth=auth,
            )
            scroll_search = json.loads(scroll_response.text)
            # Always continue with the id returned by the latest response.
            scroll_id = scroll_search.get("_scroll_id", scroll_id)
            batch = scroll_search["hits"]["hits"]
            hits.extend(batch)

        if pages:
            # Exactly `size` hits starting at `from` (end index is exclusive).
            search["hits"]["hits"] = hits[pages["from"]:wanted]
    finally:
        # Release the scroll context on the server even if a request failed.
        requests.delete(url_scroll, json={"scroll_id": scroll_id}, auth=auth)

    return search

In [2]:
# Definición de los parámetros de entrada

In [3]:
import os

# Connection settings for the Elasticsearch index.
# The password is read from the ELASTIC_PASSWORD environment variable so it is
# not stored in the notebook; if it is unset the value is empty and
# elasticScroll rejects the parameters ("Revisa los parametros").
# ELASTIC_USER may override the default user name.
elasticParameters = {
    "elasticURL": "https://kibana-prd.e-contact.cl/",
    "elasticIndex": "lea_sequences-events-banco_de_chile",
    "elasticUser": os.environ.get("ELASTIC_USER", "jcalderon"),
    "elasticPassword": os.environ.get("ELASTIC_PASSWORD", ""),
}

# Custom query: every document whose interactionData.dateTimeUTC lies inside
# the closed range [gte, lte].
query = {
    "query": {
        "bool": {
            "must": [],
            "filter": [
                {
                    "match_all": {}
                },
                {
                    "range": {
                        "interactionData.dateTimeUTC": {
                            "format": "strict_date_optional_time",
                            "gte": "2022-05-26T00:00:00.000Z",
                            "lte": "2022-05-27T00:00:00.000Z"
                        }
                    }
                }
            ],
            "should": [],
            "must_not": []
        }
    }
}

# Window of hits to keep: skip the first `from` hits, then take `size` hits.
pages = {
    "from": 0,
    "size": 5000
}

In [4]:
def limpiar_texto(texto):
    """Clean a text value for display/grouping.

    The value is converted with ``str()`` and then:

    1. One-letter ASCII words surrounded by whitespace are removed.
    2. Every run of whitespace is collapsed into a single space.
    3. Lower-case accented vowels and ``ñ`` are replaced by their plain
       letters, and a fixed set of punctuation/symbol characters is deleted.

    Letter case is preserved.

    Parameters
    ----------
    texto : object
        Value to clean; non-string values are stringified first.

    Returns
    -------
    str
        The cleaned text.
    """
    texto = str(texto)
    # The lookahead leaves the following whitespace unconsumed, so it can also
    # open the next match: consecutive one-letter words ("a b") are all removed.
    texto = re.sub(r'\s+[a-zA-Z](?=\s)', '', texto)
    # Collapse whitespace runs into one space.
    texto = re.sub(r'\s+', ' ', texto)
    # Single pass: map accented letters to plain ones and delete the symbols.
    tabla = str.maketrans(
        "áéíóúñ",
        "aeioun",
        "?¿%$#&()=¡!*+~[]}{^<>¬¨_",
    )
    return texto.translate(tabla)

# Consulta al índice

In [5]:
# Execute the query through the scroll helper. `response` is the decoded body
# of the initial search, with its ["hits"]["hits"] list filled from the
# scrolled pages and limited according to `pages`.
response = elasticScroll(elasticParameters, query, pages)

In [6]:
from pandas import DataFrame, json_normalize

# Flatten each hit into one row; nested keys become dotted column names.
# `response` is already a decoded dict, so it is passed directly instead of
# being serialized to JSON and parsed back.
df = json_normalize(response["hits"]["hits"])


# Transformar response (dict) a DataFrame

In [7]:
from pandas import DataFrame, json_normalize
import re


def _topic_fields(topics_text):
    """Split the text form of ``topics_gcloud`` into its three quoted values.

    Assumes every entry renders as ``[{'k1': 'v1', 'k2': 'v2', 'k3': 'v3'}]``
    (a one-element list, three comma-separated ``key: value`` fragments, the
    last one followed by ``}]``) -- TODO confirm against the index mapping.
    Fields are taken by position, in the order topic name, topic phrase,
    transcript phrase.

    Parameters
    ----------
    topics_text : pandas.Series
        String series holding the text form of ``topics_gcloud``.

    Returns
    -------
    dict
        Maps ``topicNameGen``, ``topicPhrase`` and ``transcriptPhrase`` to a
        series with the unquoted value of the matching fragment.

    Raises
    ------
    ValueError
        If the entries do not split into exactly three comma-separated fragments.
    """
    pieces = topics_text.str.split(',', expand=True)
    if pieces.shape[1] != 3:
        raise ValueError(
            "topics_gcloud must split into 3 comma-separated fields, "
            f"found {pieces.shape[1]}"
        )

    def quoted_value(fragment, closing):
        # n=1: only the first ':' separates key from value, so a ':' inside
        # the value is kept.
        value = fragment.str.split(':', n=1, expand=True)[1].str.strip()
        # Drop the opening quote and the `closing` trailing characters
        # (closing quote, plus "}]" on the last fragment).
        return value.str[1:-closing]

    return {
        'topicNameGen': quoted_value(pieces[0], 1),
        'topicPhrase': quoted_value(pieces[1], 1),
        'transcriptPhrase': quoted_value(pieces[2], 3),
    }


# Flatten each hit into one row; nested keys become dotted column names.
df = json_normalize(response["hits"]["hits"])

# Keep the three fields used below under short names. Selecting with a list
# and renaming both return new frames, so later assignments never write into
# a slice of `df`.
df_00 = df[[
    '_source.conversationSequence.topicName',
    '_source.interactionData.interactionId',
    '_source.conversationSequence.topics_gcloud',
]].rename(columns={
    '_source.conversationSequence.topicName': 'topicName',
    '_source.interactionData.interactionId': 'interactionId',
    '_source.conversationSequence.topics_gcloud': 'topics_gcloud',
})
# Blank topic names are labelled as not identified.
df_00['topicName'] = df_00['topicName'].replace({"": "NO_IDENTIFICADO", " ": "NO_IDENTIFICADO"})

# Drop the not-identified rows. na=False treats a missing topicName as "no
# match", so the negation does not fail on NaN and those rows are kept.
new_df = df_00[~df_00['topicName'].str.contains("NO_IDENTIFICADO", na=False)]
orderbook = new_df[['interactionId', 'topics_gcloud']].copy()
new_df = new_df[['topicName', 'interactionId']].copy()

# Work on the text form of topics_gcloud and extract its three values.
orderbook['topics_gcloud'] = orderbook['topics_gcloud'].astype("string")
orderbook = orderbook.assign(**_topic_fields(orderbook['topics_gcloud']))

# NOTE(review): both frames come from the same rows, so this join can repeat
# rows for an interactionId that appears more than once; the groupby below
# collapses the repeats again.
df_04 = new_df.merge(orderbook, how='inner', on='interactionId')

# One row per (interaction, topic text, phrases); topicNameGen keeps the max.
agrupadoFechaHora_A = (
    df_04
    .groupby(['interactionId', 'topics_gcloud', 'topicPhrase', 'transcriptPhrase'])
    .agg({'topicNameGen': 'max'})
    .reset_index()
)

# Normalize the three text columns.
agrupadoFechaHora_A['topicPhrase'] = agrupadoFechaHora_A['topicPhrase'].apply(limpiar_texto)
agrupadoFechaHora_A['transcriptPhrase'] = agrupadoFechaHora_A['transcriptPhrase'].apply(limpiar_texto)
agrupadoFechaHora_A['topicNameGen'] = agrupadoFechaHora_A['topicNameGen'].apply(limpiar_texto)
agrupadoFechaHora_A
# Optional export: agrupadoFechaHora_A.to_csv(<output path>, index=False)

Unnamed: 0,interactionId,topics_gcloud,topicPhrase,transcriptPhrase,topicNameGen
0,025d5bed-0aa0-41de-95a1-0b75f0bf47a6,"[{'topicName': 'Clave Digipass', 'topicPhrase'...",dillipas,dillipas,Clave Digipass
1,087cea1d-ff73-40f1-b089-d2d25e328e42,"[{'topicName': 'Consulta por cheques', 'topicP...",pago de cheques,de pago cheque,Consulta por cheques
2,0d5d753b-99ab-4895-8d04-e87e7dbc7f51,[{'topicName': 'Consulta estado de cuenta y fa...,monto facturado,monto facturado,Consulta estado de cuenta facturacion
3,157e241b-6e15-4099-9f4d-130b386f01be,"[{'topicName': 'Clave dinamica', 'topicPhrase'...",dinamica,dinamica,Clave dinamica
4,157e241b-6e15-4099-9f4d-130b386f01be,"[{'topicName': 'Transferencias', 'topicPhrase'...",hacer una transferencia,hacer la transferencia,Transferencias
...,...,...,...,...,...
64,f7f45bfc-4cd2-4b1a-ad91-991319b29710,"[{'topicName': 'Transferencias', 'topicPhrase'...",por que no puedo transferir,que no no puedo transferir,Transferencias
65,f8abdc10-1bde-49b5-8ec7-b36e9c910fc3,"[{'topicName': 'Baja de productos', 'topicPhra...",quiero anular,quiero anular,Baja de productos
66,f8abdc10-1bde-49b5-8ec7-b36e9c910fc3,"[{'topicName': 'Consulta inversiones', 'topicP...",invertir,invertir,Consulta inversiones
67,f8abdc10-1bde-49b5-8ec7-b36e9c910fc3,"[{'topicName': 'Consulta inversiones', 'topicP...",tengo un deposito plazo,tengo un deposito plazo,Consulta inversiones
