In [1]:
import json

class DataPreprocessor:
    """Flatten a SQuAD-style JSON file into parallel question/answer lists."""

    # Answer recorded when a question has no usable answer candidate.
    FALLBACK_ANSWER = "I don't know"

    def _pick_answer(self, qa):
        """Return the answer text to use for a single ``qas`` record.

        Answerable questions use their first entry in ``answers``; questions
        flagged ``is_impossible`` use their first entry in
        ``plausible_answers``. A missing flag is treated as "answerable", and
        a missing or empty candidate list yields ``FALLBACK_ANSWER`` instead
        of raising ``KeyError`` / ``IndexError``.
        """
        if qa.get('is_impossible', False):
            candidates = qa.get('plausible_answers') or []
        else:
            candidates = qa.get('answers') or []
        return candidates[0]['text'] if candidates else self.FALLBACK_ANSWER

    def getDataFromEntry(self, entry):
        """Collect contexts, questions and answers from one dataset entry.

        Args:
            entry: dict with a ``'title'`` and a list of ``'paragraphs'``;
                each paragraph has a ``'context'`` and a list of ``'qas'``.

        Returns:
            dict with keys ``'title'``, ``'contexts'``, ``'questions'`` and
            ``'answers'``. ``'questions'`` and ``'answers'`` are parallel
            lists (one answer per question).
        """
        data = {'title': entry['title'], 'contexts': [], 'questions': [], 'answers': []}
        for paragraph in entry['paragraphs']:
            data['contexts'].append(paragraph['context'])
            for qa in paragraph['qas']:
                data['questions'].append(qa['question'])
                data['answers'].append(self._pick_answer(qa))
        return data

    def remove_non_ascii(self, text):
        """Return ``text`` with every character of code point >= 128 dropped."""
        return ''.join(char for char in text if ord(char) < 128)

    def load_data(self, file_path):
        """Read a JSON file and return ``(questions, answers)`` as ASCII-only lists.

        Args:
            file_path: path to a JSON file whose top-level ``'data'`` key
                holds a list of entries accepted by ``getDataFromEntry``.

        Returns:
            Tuple of two equally long lists of strings.
        """
        with open(file_path, 'rb') as file:
            dataset = json.load(file)['data']
        questions, answers = [], []
        for entry in dataset:
            parsed = self.getDataFromEntry(entry)
            questions.extend(self.remove_non_ascii(q) for q in parsed['questions'])
            answers.extend(self.remove_non_ascii(a) for a in parsed['answers'])
        return questions, answers


In [6]:
import numpy as np
from sklearn.model_selection import KFold
from keras.models import Model
from keras.layers import Input, LSTM, Dense

class ModelCreator:
    """Builds, trains and cross-validates a character-level LSTM encoder-decoder."""

    def __init__(self):
        # No instance state is needed; every method works on its arguments.
        pass

    def createEncoders(self, questions, answers, input_chars=None, output_chars=None):
        """One-hot encode questions and answers at character level.

        Characters are lower-cased before being looked up in the vocabulary.

        Args:
            questions: sequence of question strings (encoder inputs).
            answers: sequence of answer strings, parallel to ``questions``.
            input_chars: optional iterable of characters to use as the
                encoder vocabulary. Defaults to the characters found in
                ``questions``. Pass it to share one vocabulary between
                several subsets (for example cross-validation folds).
            output_chars: optional iterable of characters to use as the
                decoder vocabulary. Defaults to the characters found in
                ``answers``.

        Returns:
            ``[x_encoder, x_decoder, y_decoder], [label_dictionary,
            n_decoder_tokens, n_encoder_tokens]`` where the arrays have shape
            ``(len(questions), max_len, n_tokens)`` and ``label_dictionary``
            maps a decoder token index back to its character.

        Raises:
            ValueError: if ``questions`` or ``answers`` is empty.
            KeyError: if an explicit vocabulary is given and a text contains
                a character that is not part of it.
        """
        if len(questions) == 0 or len(answers) == 0:
            raise ValueError("questions and answers must both be non-empty")

        if input_chars is None:
            input_chars = (char for text in questions for char in text)
        if output_chars is None:
            output_chars = (char for text in answers for char in text)
        # Sorting makes the character -> index mapping deterministic.
        input_chars = sorted({char.lower() for char in input_chars})
        output_chars = sorted({char.lower() for char in output_chars})

        n_encoder_tokens, n_decoder_tokens = len(input_chars), len(output_chars)
        max_encoder_len = max(len(text) for text in questions)
        max_decoder_len = max(len(text) for text in answers)

        input_dictionary = {char: index for index, char in enumerate(input_chars)}
        output_dictionary = {char: index for index, char in enumerate(output_chars)}
        label_dictionary = {index: char for index, char in enumerate(output_chars)}

        x_encoder = np.zeros((len(questions), max_encoder_len, n_encoder_tokens))
        x_decoder = np.zeros((len(questions), max_decoder_len, n_decoder_tokens))
        y_decoder = np.zeros((len(questions), max_decoder_len, n_decoder_tokens))

        for row, (question, answer) in enumerate(zip(questions, answers)):
            for step, char in enumerate(question):
                x_encoder[row, step, input_dictionary[char.lower()]] = 1.

            for step, char in enumerate(answer):
                token = output_dictionary[char.lower()]
                x_decoder[row, step, token] = 1.
                # Teacher forcing: the target at step t is the decoder input
                # at step t + 1, so the target sequence is the answer shifted
                # one step to the left (the first character has no target slot).
                if step > 0:
                    y_decoder[row, step - 1, token] = 1.
        return [x_encoder, x_decoder, y_decoder], [label_dictionary, n_decoder_tokens, n_encoder_tokens]

    def encoder_decoder(self, n_encoder_tokens, n_decoder_tokens, n_units):
        """Build and compile the LSTM encoder-decoder model.

        The encoder LSTM's final hidden and cell states initialise the decoder
        LSTM; a softmax ``Dense`` layer maps every decoder step to a
        distribution over ``n_decoder_tokens``. The model is compiled with the
        Adam optimizer and categorical cross-entropy, and its summary is
        printed.

        Args:
            n_encoder_tokens: size of the encoder one-hot vocabulary.
            n_decoder_tokens: size of the decoder one-hot vocabulary.
            n_units: number of units in each LSTM.

        Returns:
            The compiled Keras ``Model`` taking ``[encoder_input, decoder_input]``.
        """
        encoder_input = Input(shape=(None, n_encoder_tokens))
        encoder = LSTM(n_units, return_state=True)
        encoder_output, hidden_state, cell_state = encoder(encoder_input)
        encoder_states = [hidden_state, cell_state]
        decoder_input = Input(shape=(None, n_decoder_tokens))
        decoder = LSTM(n_units, return_state=True, return_sequences=True)
        decoder_output, _, _ = decoder(decoder_input, initial_state=encoder_states)
        decoder_dense = Dense(n_decoder_tokens, activation='softmax')(decoder_output)
        model = Model([encoder_input, decoder_input], decoder_dense)
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        model.summary()
        return model

    def train_encoder_decoder(self, questions, answers, n_units, batch_size, epochs):
        """Encode the data, build the model and fit it on the full dataset.

        Returns:
            ``(model, questions, answers, x_encoder, x_decoder, y_decoder,
            label_dictionary)``; ``questions`` and ``answers`` are returned
            unchanged.
        """
        input_data, variables = self.createEncoders(questions, answers)
        x_encoder, x_decoder, y_decoder = input_data
        label_dictionary, n_decoder_tokens, n_encoder_tokens = variables
        seq2seq_model = self.encoder_decoder(n_encoder_tokens, n_decoder_tokens, n_units)
        seq2seq_model.fit([x_encoder, x_decoder], y_decoder, batch_size=batch_size, epochs=epochs, shuffle=True)

        return seq2seq_model, questions, answers, x_encoder, x_decoder, y_decoder, label_dictionary

    def cross_validate(self, questions, answers, n_units, n_folds=5, batch_size=32, epochs=10):
        """Run K-fold cross-validation and return the mean evaluation scores.

        Every fold is encoded with the vocabulary of the *whole* dataset so
        the train and test one-hot widths agree, without altering the
        question or answer texts themselves. The folds are shuffled without a
        fixed seed, so results vary between runs.

        Args:
            questions: sequence of question strings.
            answers: sequence of answer strings, parallel to ``questions``.
            n_units: number of LSTM units for each fold's model.
            n_folds: number of folds passed to ``KFold``.
            batch_size: batch size used for fitting.
            epochs: number of epochs used for fitting.

        Returns:
            Array with the element-wise mean of ``model.evaluate`` over all
            folds (loss followed by the compiled metrics).
        """
        # Vocabulary shared by all folds.
        complete_input_chars = {char.lower() for question in questions for char in question}
        complete_output_chars = {char.lower() for answer in answers for char in answer}

        kf = KFold(n_splits=n_folds, shuffle=True)
        fold_results = []

        for train_index, test_index in kf.split(questions):
            train_questions = [questions[i] for i in train_index]
            train_answers = [answers[i] for i in train_index]
            test_questions = [questions[i] for i in test_index]
            test_answers = [answers[i] for i in test_index]

            train_data, variables = self.createEncoders(
                train_questions, train_answers, complete_input_chars, complete_output_chars)
            test_data, _ = self.createEncoders(
                test_questions, test_answers, complete_input_chars, complete_output_chars)
            x_encoder_train, x_decoder_train, y_decoder_train = train_data
            x_encoder_test, x_decoder_test, y_decoder_test = test_data

            _, n_decoder_tokens, n_encoder_tokens = variables
            model = self.encoder_decoder(n_encoder_tokens, n_decoder_tokens, n_units)
            model.fit([x_encoder_train, x_decoder_train], y_decoder_train, batch_size=batch_size, epochs=epochs, shuffle=True)

            scores = model.evaluate([x_encoder_test, x_decoder_test], y_decoder_test, verbose=0)
            fold_results.append(scores)

        avg_scores = np.mean(fold_results, axis=0)
        return avg_scores


In [3]:
def predictQuestion(model, question, x_encoder, x_decoder, label_dictionary, real_answer = ''):
    """Decode model predictions for the first ``len(question)`` encoded samples.

    NOTE(review): ``question`` itself is never encoded. Its character
    positions (0 .. len(question) - 1) are used as row indices into the
    pre-encoded ``x_encoder`` / ``x_decoder`` arrays, so the printed
    prediction belongs to sample ``len(question) - 1``, not to the question
    text. Fixing this needs the question's row index or the input
    dictionary, neither of which this signature provides.

    Args:
        model: object exposing ``predict([encoder_batch, decoder_batch])``.
        question: string; printed, and its length sets how many rows are decoded.
        x_encoder: encoded encoder inputs, indexed by sample row.
        x_decoder: encoded decoder inputs, indexed by sample row.
        label_dictionary: maps a decoder token index to its character.
        real_answer: reference answer, printed after the prediction.

    Returns:
        ``(answer, questions)``: the decoded prediction for each row and the
        decoded decoder input (argmax of ``x_decoder``) for each row.
    """
    decoded_predictions = []
    decoded_decoder_inputs = []
    print("Question: " + question)

    for row in range(len(question)):
        # Predict on a batch holding only this row.
        prediction = model.predict([x_encoder[row:row+1], x_decoder[row:row+1]])
        n_steps = len(prediction[0])

        predicted_ids = [np.argmax(prediction[0][step]) for step in range(n_steps)]
        decoder_input_ids = [np.argmax(x_decoder[row][step]) for step in range(n_steps)]

        decoded_predictions.append(''.join(label_dictionary[token] for token in predicted_ids))
        decoded_decoder_inputs.append(''.join(label_dictionary[token] for token in decoder_input_ids))

    print(decoded_predictions[-1])
    print(real_answer)
    return decoded_predictions, decoded_decoder_inputs

In [9]:
# Hyperparameters for the single training run on the full dataset.
n_units, epochs, batch_size = 300, 50, 64

# Flatten the dataset file into parallel question/answer lists.
preprocessor = DataPreprocessor()
questions, answers = preprocessor.load_data('/kaggle/input/dev-v2-0/dev-v2.0.json')

model_creator = ModelCreator()

In [10]:
# Train on the full dataset; the encoded arrays are kept for later prediction.
(
    seq2seq_model,
    questions,
    answers,
    x_encoder,
    x_decoder,
    y_decoder,
    label_dictionary,
) = model_creator.train_encoder_decoder(questions, answers, n_units, batch_size, epochs)

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_11 (InputLayer)       [(None, None, 55)]           0         []                            
                                                                                                  
 input_12 (InputLayer)       [(None, None, 60)]           0         []                            
                                                                                                  
 lstm_10 (LSTM)              [(None, 300),                427200    ['input_11[0][0]']            
                              (None, 300),                                                        
                              (None, 300)]                                                        
                                                                                            

In [8]:
# Cross-validation settings: LSTM units and number of K-Fold splits.
# NOTE: these rebind the n_units / batch_size / epochs globals used by the training cell.
n_units, n_folds = 256, 5
batch_size, epochs = 32, 10

# Run K-Fold cross-validation and report the scores averaged over the folds.
average_scores = model_creator.cross_validate(
    questions,
    answers,
    n_units,
    n_folds=n_folds,
    batch_size=batch_size,
    epochs=epochs,
)

print("Average Scores:", average_scores)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None, 55)]           0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None, 60)]           0         []                            
                                                                                                  
 lstm (LSTM)                 [(None, 256),                319488    ['input_1[0][0]']             
                              (None, 256),                                                        
                              (None, 256)]                                                        
                                                                                              

In [None]:
import pickle
import os

# Path of the file inside the Kaggle working directory
file_path = '/kaggle/working/qa_model.pickle'

# Make sure the file exists. Append mode creates it when missing and leaves an
# existing file untouched ('wb' would truncate a previously saved model).
with open(file_path, 'ab') as f:
    pass

# The file 'qa_model.pickle' now exists in '/kaggle/working/'


In [None]:
# Bundle the trained model with the data needed to reuse it and pickle everything.
path = "/kaggle/working/"
encoders = {'x_encoder': x_encoder, 'x_decoder': x_decoder, 'y_decoder': y_decoder, 'label_dictionary': label_dictionary}
data = {'model': seq2seq_model, 'questions': questions, 'answers': answers, 'encoders': encoders}

# The context manager closes the file, so the dump is flushed to disk
# before any later cell reads it back.
with open(path + 'qa_model.pickle', 'wb') as model_file:
    pickle.dump(data, model_file)

In [None]:
from keras.models import load_model

# File location
path = "/kaggle/working/"
file_path = path + 'qa_model.pickle'

# Load the pickled bundle (model plus data). Only unpickle files this notebook wrote itself.
with open(file_path, 'rb') as model_file:
    loaded_data = pickle.load(model_file)

# Pull the model and the accompanying data out of the bundle
seq2seq_model, questions, answers, encoders = (
    loaded_data[key] for key in ('model', 'questions', 'answers', 'encoders')
)

# Pull out the individual encoded arrays and the label dictionary
x_encoder, x_decoder, y_decoder, label_dictionary = (
    encoders[key] for key in ('x_encoder', 'x_decoder', 'y_decoder', 'label_dictionary')
)


In [7]:
# Show the first question/answer pair as a sanity check.
(questions[0], answers[0])

('In what country is Normandy located?', 'France')

In [11]:
# Use the first question of the dataset as an example
test_question = questions[0]
real_answer = answers[0]

# Call predictQuestion. The second return value is bound to a new name so the
# global `questions` list is not overwritten (re-running this or any earlier
# cell would otherwise operate on decoded strings instead of the dataset).
answer, decoded_inputs = predictQuestion(seq2seq_model, test_question, x_encoder, x_decoder, label_dictionary, real_answer)


Question: In what country is Normandy located?
old nor ee                                                                                                                                                      
France
