# 1 Librerías y plantillas

In [3]:
import pandas as pd
import random
from datetime import datetime
import json
import pickle
import re
import os
import sys

# Resolve the project root: the parent of the current working directory.
proyecto_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Put the project root on the module search path.
sys.path.append(proyecto_path)

# Now the project-local Utils.sentences module can be imported
import Utils.sentences as sentences

In [4]:
# Sanity check: count the distinct sentence templates.
unique_template_count = len(set(sentences.template_sentences))
print(unique_template_count, 'examples of range sentences')

233 examples of range sentences


# 2 Creación de rangos de variables

In [3]:
# Pools of values drawn from when filling the sentence templates.
adults = range(1, 15)    # 1 to 14
children = range(1, 10)  # 1 to 9
days = range(1, 31)      # 1 to 30

# Full month names as rendered by strftime('%B').
months = [
    datetime(year=2000, month=month_number, day=1).strftime('%B')
    for month_number in range(1, 13)
]

# Special dates are anchored to the year in which this cell is executed.
year = datetime.now().year

# NOTE(review): 'summer' maps to the integer 1 while every other entry maps to
# a datetime -- confirm this is intended.
special_dates_dictionary = dict([
    ('summer', 1),
    ('Christmas', datetime(year=year, month=12, day=25)),
    ('Boxing Day', datetime(year=year, month=12, day=26)),
    ('New Year', datetime(year=year + 1, month=1, day=1)),
    ('Halloween', datetime(year=year, month=10, day=31)),
    ('Valentine', datetime(year=year + 1, month=2, day=14)),
])

# Only the names are needed when sampling.
special_dates = list(special_dates_dictionary)

# 3 Creación de funciones de generación de registros y etiquetas

In [46]:
def create_labels_NER(text):
    """Build the BIO label string for a sentence template.

    Every whitespace-separated word of ``text`` gets one label: ``O`` for
    ordinary words, ``B-<ENTITY>`` for the word in which a ``{entity}``
    placeholder starts, and ``I-<ENTITY>`` for any further words that the same
    placeholder spans.

    Args:
        text: Template sentence containing ``{name}`` placeholders.

    Returns:
        The labels joined by single spaces, one label per word of ``text``.
    """
    # Start with every word marked as 'O' (Outside).
    labels = ['O'] * len(text.split())

    for match in re.finditer(r'{(.*?)}', text):
        # Collapse whitespace inside the entity name to "_" so that a single
        # label can never break into several space-separated tokens.
        tag = '_'.join(match.group(1).upper().split())
        start, end = match.span()

        # Index of the word holding the opening brace. When the brace is glued
        # to preceding characters (e.g. "({adults}"), the prefix already counts
        # as a word of text[:start], so step back onto that same word.
        start_word_index = len(text[:start].split())
        if start > 0 and not text[start - 1].isspace():
            start_word_index -= 1

        # Index of the word holding the closing brace.
        end_word_index = len(text[:end].split()) - 1

        # BIO notation: first word is B-, every following word (up to and
        # including the one with the closing brace) is I-.
        labels[start_word_index] = f'B-{tag}'
        for i in range(start_word_index + 1, end_word_index + 1):
            labels[i] = f'I-{tag}'

    return ' '.join(labels)

def _align_labels_to_text(template, template_labels, values):
    """Expand per-template-word labels to one label per word of the filled text."""
    aligned = []
    for word, label in zip(template.split(), template_labels.split()):
        try:
            filled_word_count = len(word.format(**values).split())
        except (KeyError, IndexError, ValueError):
            # The word cannot be formatted on its own; keep its single label.
            filled_word_count = 1
        aligned.append(label)
        if filled_word_count > 1:
            # Extra words produced by a multi-word value continue the entity.
            follow_up = 'I-' + label[2:] if label.startswith('B-') else label
            aligned.extend([follow_up] * (filled_word_count - 1))
    return ' '.join(aligned)


def create_samples(template_list, num):
    """Generate ``num`` synthetic sentences together with their annotations.

    Each sample fills a randomly chosen template with values drawn at random
    from the module-level ``adults``, ``children``, ``days``, ``months`` and
    ``special_dates`` pools.

    Args:
        template_list: Sequence of template strings using the placeholders
            ``{adults}``, ``{children}``, ``{days}``, ``{month}`` and
            ``{special_date}``.
        num: Number of samples to generate.

    Returns:
        A list of ``num`` dictionaries with the keys:
        ``text`` (the filled sentence), ``entities`` (value used for each
        entity, or ``None`` when the template has no placeholder for it) and
        ``label_NER`` (space-separated BIO labels, one per word of ``text``).
    """
    data = []
    # Template labels depend only on the template, so compute them once per
    # distinct template rather than once per sample.
    template_labels_cache = {}

    for _ in range(num):
        sentence_template = random.choice(template_list)
        # The order of the draws determines the result under a fixed seed.
        num_adults = random.choice(adults)
        num_children = random.choice(children)
        special_date = random.choice(special_dates)
        num_days = random.choice(days)
        chosen_month = random.choice(months)

        values = {
            "adults": num_adults,
            "children": num_children,
            "days": num_days,
            "month": chosen_month,
            "special_date": special_date,
        }
        example = sentence_template.format(**values)

        # Neural-network approach: keep a value only if the template uses it.
        entities = {
            name: (value if "{" + name + "}" in sentence_template else None)
            for name, value in values.items()
        }

        # Transformer / NER approach: BIO labels aligned with the words of `example`.
        if sentence_template not in template_labels_cache:
            template_labels_cache[sentence_template] = create_labels_NER(sentence_template)
        template_labels = template_labels_cache[sentence_template]

        if len(example.split()) == len(template_labels.split()):
            # Every placeholder expanded to a single word: labels already line up.
            entities_NER = template_labels
        else:
            # A multi-word value (e.g. 'Boxing Day') added words to the text;
            # give each of them a label so text and labels stay the same length.
            entities_NER = _align_labels_to_text(sentence_template, template_labels, values)

        data.append({"text": example, "entities": entities, "label_NER": entities_NER})

    return data

# 4 Generación de dataset de entrenamiento

In [47]:
# Generate 5,000,000 random samples from the sentence templates.
etiquetas = create_samples(template_list=sentences.template_sentences, num=5_000_000)

In [53]:
# Inspect the first five generated samples.
etiquetas[:5]

[{'text': 'Can you provide room options for Valentine?',
  'entities': {'adults': None,
   'children': None,
   'days': None,
   'month': None,
   'special_date': 'Valentine'},
  'label_NER': 'O O O O O O B-SPECIAL_DATE'},
 {'text': 'I wish to make a reservation for 4 children.',
  'entities': {'adults': None,
   'children': 4,
   'days': None,
   'month': None,
   'special_date': None},
  'label_NER': 'O O O O O O O B-CHILDREN O'},
 {'text': 'Can I get a room for 7 adults for 1 days?',
  'entities': {'adults': 7,
   'children': None,
   'days': 1,
   'month': None,
   'special_date': None},
  'label_NER': 'O O O O O O B-ADULTS O O B-DAYS O'},
 {'text': 'I need a room for 9 children for 29 days.',
  'entities': {'adults': None,
   'children': 9,
   'days': 29,
   'month': None,
   'special_date': None},
  'label_NER': 'O O O O O B-CHILDREN O O B-DAYS O'},
 {'text': 'Can I get a booking for Halloween with 8 teenagers?',
  'entities': {'adults': None,
   'children': 8,
   'days': None,
 

## Eliminar registros repetidos:

In [54]:
def delete_duplicates(df):
    """Remove repeated records, keeping the first occurrence of each.

    Records are compared through their JSON serialisation with sorted keys, so
    two dictionaries with the same content but a different key order count as
    duplicates.

    Args:
        df: Iterable of JSON-serialisable records (dictionaries).

    Returns:
        A list with the distinct records, rebuilt from their JSON form, in
        order of first appearance.
    """
    # dict.fromkeys de-duplicates like a set but remembers insertion order,
    # so the output order is the same on every run.
    unique_serialised = dict.fromkeys(
        json.dumps(record, sort_keys=True) for record in df
    )
    return [json.loads(serialised) for serialised in unique_serialised]

In [56]:
# Replace the generated samples with their de-duplicated version.
etiquetas = delete_duplicates(etiquetas)

# Report how many distinct samples remain.
print(f'tamaño final del dataset de entrenamiento: {len(etiquetas)}')

tamaño final del dataset de entrenamiento: 194828


## Guardado de datos

In [58]:
# One row per sample; the 'entities' column still holds a dict per row.
samples_frame = pd.DataFrame(etiquetas)

# Spread each entities dict into its own columns.
entity_columns = pd.json_normalize(samples_frame['entities'])

# Final table: the remaining sample columns followed by one column per entity.
data = pd.concat(
    [samples_frame.drop(columns='entities'), entity_columns],
    axis=1,
)

In [62]:
# Show four randomly sampled rows of `data`.
data.sample(4)

Unnamed: 0,label_NER,text,adults,children,days,month,special_date
124835,O O O O O O B-ADULTS O O B-CHILDREN O O B-DAYS...,Could I reserve a room for 8 adults and 5 chil...,8.0,5.0,20.0,June,
150675,O O O O O B-DAYS O O B-ADULTS O O B-MONTH,I need a room for 29 days for 1 adults in Febr...,1.0,,29.0,February,
183662,O O O O O O O B-ADULTS O O B-DAYS O O B-MONTH,I'd like to book a room for 1 adults for 6 day...,1.0,,6.0,December,
48193,O O O O O O O O B-ADULTS O O B-CHILDREN O O B-...,Is there any availability for a room for 1 adu...,1.0,4.0,,May,


In [63]:
# Persist the training DataFrame as a pickle file.
with open('../Data/train_data.pickle', 'wb') as pickle_file:
    pickle.dump(data, pickle_file)