<a href="https://colab.research.google.com/github/Kennyy127/AI-AOL/blob/main/Wingman.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Bidirectional, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

In [None]:
def load_datasets(data_dir='.'):
    """Read the raw CSV corpora used to train the chatbot.

    Args:
        data_dir: Directory that contains the CSV files. Defaults to the
            current working directory, which is where the files were
            previously hard-coded to live.

    Returns:
        A tuple ``(conversational_dataset, sentiment_dataset, cornell_dataset,
        synthetic_persona_chat)`` of DataFrames. The three
        Synthetic-Persona-Chat splits are concatenated in train, valid, test
        order with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when a file is missing.
    """
    import os

    def read(filename):
        # All corpora live side by side in ``data_dir``.
        return pd.read_csv(os.path.join(data_dir, filename))

    conversational_dataset = read('conversational_dataset_25k.csv')
    sentiment_dataset = read('sentiment_analysis.csv')
    cornell_dataset = read('Cornell_Movie_Dialog_Structured_Dataset.csv')

    synthetic_persona_chat = pd.concat(
        [
            read('Synthetic-Persona-Chat_train.csv'),
            read('Synthetic-Persona-Chat_valid.csv'),
            read('Synthetic-Persona-Chat_test.csv'),
        ],
        ignore_index=True,
    )
    # Expose a 'context' column (if the files have one) under the shared name
    # 'text'. A 'response' column already carries its final name, so it needs
    # no rename entry.
    synthetic_persona_chat = synthetic_persona_chat.rename(columns={'context': 'text'})

    return conversational_dataset, sentiment_dataset, cornell_dataset, synthetic_persona_chat


In [None]:
def clean_and_convert_to_string(dataset):
    """Return a copy of ``dataset`` whose 'text'/'response' columns are strings.

    Missing values in those columns become empty strings and every remaining
    value is converted with ``str``. Columns that are absent are skipped, and
    all other columns are passed through untouched.

    The input frame is NOT modified: the work happens on a copy, so re-running
    a cell never finds a half-cleaned frame left over from a previous call.

    Args:
        dataset: DataFrame that may contain 'text' and/or 'response' columns.

    Returns:
        A new DataFrame with the cleaned columns.
    """
    cleaned = dataset.copy()
    for column in ('text', 'response'):
        if column in cleaned.columns:
            # Fill first so NaN becomes '' rather than the literal string 'nan'.
            cleaned[column] = cleaned[column].fillna('').astype(str)
    return cleaned

In [None]:
def preprocess_datasets(conversational_dataset, sentiment_dataset, cornell_dataset, synthetic_persona_chat):
    """Normalise the four corpora to ``text``/``response`` pairs and merge them.

    Column mapping applied per corpus:
        * conversational: 'Input' -> 'text', 'Response' -> 'response'
        * sentiment:      'text' kept,       'sentiment' -> 'response'
        * cornell:        'text' kept,       'label' -> 'response'
        * synthetic:      'user 1 personas' -> 'text',
                          'Best Generated Conversation' -> 'response'

    Every corpus is validated before any cleaning happens, so a schema problem
    is reported with the name of the offending dataset instead of surfacing
    later as a generic pandas indexing error.

    NOTE(review): the sentiment and Cornell corpora contribute label columns as
    "responses"; confirm that is intended for a conversational model.

    Args:
        conversational_dataset: Frame with 'Input'/'Response' columns (or
            already-normalised 'text'/'response' columns).
        sentiment_dataset: Frame with 'text' and 'sentiment' columns.
        cornell_dataset: Frame with 'text' and 'label' columns.
        synthetic_persona_chat: Frame with 'Best Generated Conversation' and
            either 'user 1 personas' or an existing 'text' column.

    Returns:
        One shuffled DataFrame (fixed seed 42) with exactly the columns
        ``['text', 'response']`` and a fresh 0..n-1 index.

    Raises:
        KeyError: If any corpus lacks the columns listed above.
    """
    pair_columns = ['text', 'response']

    conversational_dataset = conversational_dataset.rename(columns={'Input': 'text', 'Response': 'response'})
    if not set(pair_columns).issubset(conversational_dataset.columns):
        raise KeyError("Conversational dataset is missing required columns 'Input' and 'Response'.")

    if 'text' in sentiment_dataset.columns and 'sentiment' in sentiment_dataset.columns:
        sentiment_dataset = sentiment_dataset.rename(columns={'sentiment': 'response'})
    else:
        raise KeyError("Sentiment dataset is missing required columns 'text' and 'sentiment'.")

    if 'text' in cornell_dataset.columns and 'label' in cornell_dataset.columns:
        cornell_dataset = cornell_dataset.rename(columns={'label': 'response'})
    else:
        raise KeyError("Cornell dataset is missing required columns 'text' and 'label'.")

    # The prompt side may arrive as 'user 1 personas' or already be called
    # 'text'; either is accepted, but one of them has to exist.
    synthetic_columns = synthetic_persona_chat.columns
    has_prompt_column = 'user 1 personas' in synthetic_columns or 'text' in synthetic_columns
    if 'Best Generated Conversation' in synthetic_columns and has_prompt_column:
        synthetic_persona_chat = synthetic_persona_chat.rename(
            columns={'user 1 personas': 'text', 'Best Generated Conversation': 'response'}
        )
    else:
        raise KeyError("Synthetic Persona Chat dataset is missing required columns.")

    # Clean each corpus (NaN -> '', everything to str), keep only the pair
    # columns, then stack them in a fixed order.
    normalised = [
        clean_and_convert_to_string(frame)[pair_columns]
        for frame in (conversational_dataset, sentiment_dataset, cornell_dataset, synthetic_persona_chat)
    ]
    combined_dataset = pd.concat(normalised, ignore_index=True)

    # Shuffle so that validation_split (which takes the tail of the arrays)
    # sees a mix of all corpora; the seed keeps runs reproducible.
    combined_dataset = shuffle(combined_dataset, random_state=42).reset_index(drop=True)

    return combined_dataset

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def tokenize_and_pad(dataset, vocab_size=10000, max_length=50):
    """Tokenize prompt/response pairs and build teacher-forcing arrays.

    The tokenizer is fitted on BOTH columns so that words which only occur in
    responses still receive their own id instead of collapsing to ``<OOV>``.
    Each response is framed as ``<start> ... <end>`` so the decoder has a
    defined first input and an explicit stop signal. The response body is cut
    to ``max_length - 2`` tokens so the markers always fit.

    Args:
        dataset: DataFrame with string columns 'text' and 'response'.
        vocab_size: ``num_words`` for the Keras tokenizer; less frequent words
            are mapped to the ``<OOV>`` id.
        max_length: Padded length of the encoder input and of the framed
            response.

    Returns:
        ``(text_padded, decoder_input, decoder_target, tokenizer)`` where
        ``text_padded`` has ``max_length`` columns and the two decoder arrays
        have ``max_length - 1`` columns; ``decoder_target`` is the framed
        response shifted one step to the left of ``decoder_input``.

    Raises:
        ValueError: If the marker tokens fall outside the first ``vocab_size``
            ids and therefore could not be embedded.
    """
    start_token, end_token = '<start>', '<end>'
    # Keras' default filter set minus '<' and '>' so the markers (and the
    # '<OOV>' token) survive tokenization as single tokens.
    marker_safe_filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=vocab_size, oov_token="<OOV>", filters=marker_safe_filters
    )

    texts = list(dataset['text'].values)
    responses = list(dataset['response'].values)

    # Fit on prompts and framed responses together so both vocabularies and
    # the two markers are counted.
    tokenizer.fit_on_texts(texts + [f'{start_token} {reply} {end_token}' for reply in responses])

    start_id = tokenizer.word_index.get(start_token)
    end_id = tokenizer.word_index.get(end_token)
    if responses:
        for marker, marker_id in ((start_token, start_id), (end_token, end_id)):
            if marker_id is None or (vocab_size is not None and marker_id >= vocab_size):
                raise ValueError(
                    f"vocab_size={vocab_size} is too small to hold the {marker} marker token."
                )

    text_sequences = tokenizer.texts_to_sequences(texts)

    # Frame in id space so truncation can never drop the end marker.
    body_budget = max(max_length - 2, 0)
    response_sequences = [
        [start_id] + body[:body_budget] + [end_id]
        for body in tokenizer.texts_to_sequences(responses)
    ]

    text_padded = pad_sequences(text_sequences, maxlen=max_length, padding='post')
    # truncating='post' keeps the beginning of the response, which is what the
    # decoder has to produce first.
    response_padded = pad_sequences(
        response_sequences, maxlen=max_length, padding='post', truncating='post'
    )

    # Teacher forcing: at step t the decoder sees token t and must predict t+1.
    decoder_input = response_padded[:, :-1]
    decoder_target = response_padded[:, 1:]

    return text_padded, decoder_input, decoder_target, tokenizer


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

def build_model(vocab_size, input_length):
    """Build and compile the LSTM encoder-decoder used for training.

    Both embeddings use ``mask_zero=True`` so that padding (id 0) is skipped by
    the LSTMs and excluded from the loss. Without the mask the network is
    rewarded for predicting padding everywhere, which inflates accuracy and
    yields a model that only ever emits token 0.

    Args:
        vocab_size: Size of the token id space; used as the embedding input
            dimension and as the width of the softmax output.
        input_length: Length of the padded encoder input. The decoder input is
            one step shorter (``input_length - 1``).

    Returns:
        A compiled ``Model`` that maps ``[encoder_inputs, decoder_inputs]`` to
        a per-step softmax distribution over ``vocab_size`` tokens.
    """
    # Encoder: only the final hidden/cell states are kept; they summarise the prompt.
    encoder_inputs = Input(shape=(input_length,), name='encoder_inputs')
    # `input_length` is no longer passed to Embedding: the Input shape already
    # fixes it and the argument is deprecated in Keras 3.
    encoder_embedding = Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True)(encoder_inputs)
    _, state_h, state_c = LSTM(128, return_state=True, name='encoder_lstm')(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder: starts from the encoder states and emits one distribution per step.
    decoder_inputs = Input(shape=(input_length - 1,), name='decoder_inputs')
    decoder_embedding = Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True)(decoder_inputs)
    decoder_lstm = LSTM(128, return_sequences=True, return_state=False, name='decoder_lstm')
    decoder_outputs = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_dense = Dense(vocab_size, activation='softmax', name='decoder_dense')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Targets are integer ids, hence the sparse categorical loss.
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs, name='seq2seq_model')
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


In [None]:
if __name__ == "__main__":
    # Single source of truth: the tokenizer and the model must agree on both
    # values, otherwise the embedding/softmax sizes or the input shapes drift apart.
    VOCAB_SIZE = 10000
    MAX_LENGTH = 50

    datasets = load_datasets()
    combined_dataset = preprocess_datasets(*datasets)

    text_padded, decoder_input, decoder_target, tokenizer = tokenize_and_pad(
        combined_dataset, vocab_size=VOCAB_SIZE, max_length=MAX_LENGTH
    )

    model = build_model(vocab_size=VOCAB_SIZE, input_length=MAX_LENGTH)

    # validation_split holds out the last 20% of the (already shuffled) samples.
    history = model.fit(
        [text_padded, decoder_input],
        decoder_target,
        batch_size=32,
        epochs=10,
        validation_split=0.2,
    )

    # The '.keras' extension selects the native format by itself; the explicit
    # `save_format` argument is deprecated in Keras 3.
    model.save("Wingman.keras")
    print("Model training complete and saved at 'Wingman.keras'")


Epoch 1/10




[1m4863/4863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 32ms/step - accuracy: 0.9189 - loss: 0.6620 - val_accuracy: 0.9669 - val_loss: 0.1547
Epoch 2/10
[1m4863/4863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 32ms/step - accuracy: 0.9680 - loss: 0.1459 - val_accuracy: 0.9701 - val_loss: 0.1315
Epoch 3/10
[1m4863/4863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 32ms/step - accuracy: 0.9708 - loss: 0.1248 - val_accuracy: 0.9718 - val_loss: 0.1218
Epoch 4/10
[1m4863/4863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 32ms/step - accuracy: 0.9730 - loss: 0.1117 - val_accuracy: 0.9728 - val_loss: 0.1168
Epoch 5/10
[1m4863/4863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 32ms/step - accuracy: 0.9736 - loss: 0.1062 - val_accuracy: 0.9734 - val_loss: 0.1138
Epoch 6/10
[1m4863/4863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 32ms/step - accuracy: 0.9743 - loss: 0.1018 - val_accuracy: 0.9741 - val_loss: 0.1119
Epoch 7/1



Model training complete and saved at 'Wingman.keras'


In [None]:
def chat(model, tokenizer, max_input_length=50, max_response_length=49, debug=False):
    """Run an interactive console chat with greedy decoding.

    For every user line the prompt is tokenized and padded, then the reply is
    generated one token at a time: at step ``i`` the model is run on the tokens
    produced so far and the most likely token at position ``i`` is taken.
    Decoding stops at the ``<end>`` marker, at the padding id 0, at an id the
    tokenizer cannot map back to a word, or after ``max_response_length`` steps.

    Args:
        model: Trained model whose ``predict([encoder, decoder])`` returns an
            array indexed as ``[batch, step, token_id]``.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``,
            ``word_index`` and ``index_word``.
        max_input_length: Padded length of the encoder input.
        max_response_length: Number of decoder steps (decoder input width).
        debug: When True, print the chosen token id for every decoding step.

    Returns:
        None. Replies are printed; typing 'exit' ends the loop.
    """
    # Use the start marker when the tokenizer knows one; otherwise fall back to
    # an all-padding decoder input.
    start_id = tokenizer.word_index.get('<start>', 0)
    end_id = tokenizer.word_index.get('<end>', None)

    print("Type 'exit' to end the chat.")
    while True:
        user_input = input("You: ")
        if user_input.strip().lower() == 'exit':
            print("Goodbye!")
            break

        user_sequence = tokenizer.texts_to_sequences([user_input])
        user_padded = pad_sequences(user_sequence, maxlen=max_input_length, padding='post')

        decoder_input = np.zeros((1, max_response_length), dtype='int32')
        decoder_input[0, 0] = start_id

        words = []
        for i in range(max_response_length):
            prediction = model.predict([user_padded, decoder_input], verbose=0)

            # Only the distribution at step i matters here; an argmax over the
            # whole (steps, vocab) array would yield a flattened index, not a token id.
            next_token = int(np.argmax(prediction[0, i]))
            if debug:
                print(f"Step {i}: Next token = {next_token}")

            # Id 0 is padding: the model has nothing more to say.
            if next_token == 0 or next_token == end_id:
                break

            predicted_word = tokenizer.index_word.get(next_token)
            if predicted_word is None:
                print(f"Unknown token: {next_token}")
                break

            words.append(predicted_word)

            # Feed the chosen token back in as the next decoder input.
            if i < max_response_length - 1:
                decoder_input[0, i + 1] = next_token

        print("Bot:", ' '.join(words))


# Start the interactive session with the model and tokenizer from the training cell.
chat(model=model, tokenizer=tokenizer)

Type 'exit' to end the chat.
You: hi
Step 0: Prediction array = [[1.0000000e+00 8.6693266e-09 1.4099787e-11 ... 1.6321144e-14
  1.8169321e-14 1.8641820e-14]
 [1.0000000e+00 4.2315829e-10 7.4147476e-12 ... 7.5480258e-15
  8.3583128e-15 8.6155135e-15]
 [1.0000000e+00 4.0426440e-10 7.9556830e-12 ... 4.6547650e-15
  5.2591090e-15 5.4114165e-15]
 ...
 [1.0000000e+00 2.2388133e-12 1.3808893e-14 ... 1.6813026e-16
  1.8920564e-16 1.8565865e-16]
 [1.0000000e+00 2.2464487e-12 1.3608141e-14 ... 1.6852967e-16
  1.8967175e-16 1.8604786e-16]
 [1.0000000e+00 2.2539466e-12 1.3409923e-14 ... 1.6889780e-16
  1.9010130e-16 1.8640662e-16]]
Step 0: Next token = 0
Unknown token: 0
Bot: 
You: how are you
Step 0: Prediction array = [[9.99999881e-01 2.13630305e-08 8.18170184e-11 ... 3.92766386e-13
  4.82460805e-13 4.70225963e-13]
 [1.00000000e+00 8.18560830e-10 5.68574571e-12 ... 4.78186826e-14
  5.45396587e-14 5.43599894e-14]
 [1.00000000e+00 9.47155909e-10 2.82863034e-12 ... 2.55474048e-14
  2.85646242e-14 2

KeyboardInterrupt: Interrupted by user

/bin/bash: line 1: gh: command not found
