In [1]:
import os
import sys

import pandas as pd
import numpy as np
import random
import gc

In [2]:
# The project root is the parent of the current working directory; it is put on
# sys.path so that the `src` package can be imported by later cells.
root = os.path.dirname(os.getcwd())
if root not in sys.path:  # re-running this cell must not pile up duplicate entries
    sys.path.append(root)

# os.path.join avoids doubled separators when `root` already ends with one.
df = pd.read_csv(os.path.join(root, 'data', 'BASE.csv'))

In [3]:
from src.utils.models import GAN

In [4]:
# Build the GAN on word tokens (option='word') with network variant 3.
gan = GAN(
    df,
    maxlen=20,
    step=3,
    option='word',
    model_type=3,
)

Corpus length: 630844
Total words: 118223
Unique words before ignoring: 11540
Ignoring words with frequency < 2
Unique words after ignoring: 5173
Number of sequences: 6693
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 20, 5173)          80284960  
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 20, 5173)          0         
_________________________________________________________________
dropout (Dropout)            (None, 20, 5173)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 20, 5173)          80284960  
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 20, 5173)          0         
_________________________________________________________________
dropout_1 (Dropo

In [5]:
# One epoch in batches of 100, with progress output enabled (verbose=1).
gan.train(
    epochs=1,
    batch_size=100,
    sample_interval=1,
    verbose=1,
)

0:1/66: [DR_acc: 0.269, DF_acc: 0.0] [G loss: 0.691007673740387]
0:2/66: [DR_acc: 1.0, DF_acc: 0.9] [G loss: 0.7085055708885193]
0:3/66: [DR_acc: 1.0, DF_acc: 1.0] [G loss: 0.7558914422988892]
0:4/66: [DR_acc: 1.0, DF_acc: 1.0] [G loss: 0.8614627122879028]
0:5/66: [DR_acc: 1.0, DF_acc: 1.0] [G loss: 1.1011285781860352]
0:6/66: [DR_acc: 1.0, DF_acc: 1.0] [G loss: 1.6441595554351807]
0:7/66: [DR_acc: 1.0, DF_acc: 1.0] [G loss: 2.7439534664154053]
0:8/66: [DR_acc: 1.0, DF_acc: 1.0] [G loss: 4.443102836608887]
0:9/66: [DR_acc: 1.0, DF_acc: 1.0] [G loss: 6.588928699493408]
0:10/66: [DR_acc: 1.0, DF_acc: 1.0] [G loss: 8.870881080627441]
0:11/66: [DR_acc: 1.0, DF_acc: 1.0] [G loss: 11.198172569274902]
0:12/66: [DR_acc: 1.0, DF_acc: 1.0] [G loss: 13.217643737792969]
0:13/66: [DR_acc: 1.0, DF_acc: 1.0] [G loss: 14.475693702697754]
0:14/66: [DR_acc: 1.0, DF_acc: 1.0] [G loss: 14.681601524353027]
0:15/66: [DR_acc: 1.0, DF_acc: 1.0] [G loss: 14.712143898010254]


KeyboardInterrupt: 

In [8]:
# Persist only the generator. The target folder is created first so that the
# save cannot fail just because `models/` does not exist yet.
models_dir = os.path.join(root, 'models')
os.makedirs(models_dir, exist_ok=True)
gan.generator.save(os.path.join(models_dir, 'Word_Type3_Gan.h5'))



In [45]:
# NOTE(review): `sequence` is defined but not passed on - the call below runs
# with sentence=False. Confirm whether it was meant to seed the generation.
sequence = 'certainty of death. small chance of succ'
gan.generate(
    gan.generator,
    mode='gan',
    sentence=False,
    temperature=1,
)

' hhhhl lor neer l le lolo hiuayl wea n i'

### Predict Word-Level GAN

In [6]:
# Feed the first training window to the generator as a batch of one and keep
# the single prediction that comes back.
preds = gan.generator.predict(np.expand_dims(gan.X[0], axis=0))[0]

In [28]:
# For every row of `preds`, draw an index with gan.sample and decode it through
# gan.indices_token; each decoded token is followed by a single space.
generated = ''.join(
    gan.indices_token[gan.sample(pred, temperature=1)] + ' '
    for pred in preds
)
generated

' the to to okay all to we to me want to to to to in me to have part gotta me to they to me to to me to they to me we to to we to back me to'

### Predict Character-Level GAN

In [46]:
# Feed the first training window to the generator as a batch of one and keep
# the single prediction that comes back.
preds = gan.generator.predict(np.expand_dims(gan.X[0], axis=0))[0]

In [50]:
# For every row of `preds`, draw an index with gan.sample and decode it through
# gan.indices_token; the decoded pieces are concatenated without a separator.
generated = ''.join(
    gan.indices_token[gan.sample(pred, temperature=1)]
    for pred in preds
)
generated

" hhe p'e p i iu weae dmd her l hld hl hh"

In [None]:
#model = keras.models.load_model(root + os.sep + 'models'+ os.sep + 'Type3_Gan.h5')

In [None]:
from src.utils.models import Preprocessor

class GAN(Preprocessor):
    """Generative adversarial network over token sequences.

    After running ``self.preprocess`` the constructor builds three Keras models:
    a discriminator, a generator, and the stacked generator -> discriminator
    model (``self.gan``). All of them take inputs of shape
    ``(maxlen, len(self.tokens))``.

    NOTE(review): ``keras``, ``layers`` and ``Adam`` are referenced here but are
    not imported in the visible notebook cells - confirm they are in scope.
    NOTE(review): ``self.tokens``, ``self.X``, ``generate`` and the
    ``generate_*_samples`` helpers are not defined in this class; presumably
    they are provided by ``Preprocessor`` - confirm.
    """

    # Network variants accepted by build_discriminator / build_generator.
    _SUPPORTED_MODES = (1, 2, 3)

    def __init__(self, df, maxlen=40, step=3, option='character', min_word_frequency=2, model_type=1):
        """Preprocess ``df`` and build/compile the discriminator, generator and GAN.

        Args:
            df: data handed to ``Preprocessor.__init__``.
            maxlen: sequence length used for preprocessing and as the first
                dimension of every model input.
            step: forwarded to ``self.preprocess``.
            option: forwarded to ``self.preprocess`` (the notebook uses
                'character' and 'word').
            min_word_frequency: forwarded to ``self.preprocess``.
            model_type: network variant (1, 2 or 3) passed as ``mode`` to the
                two ``build_*`` methods.

        Raises:
            ValueError: if ``model_type`` is not a supported variant.
        """
        # Fail before the preprocessing work when the variant is unknown.
        self._check_mode(model_type)

        Preprocessor.__init__(self, df)
        self.maxlen = maxlen
        self.step = step

        self.preprocess(maxlen=self.maxlen, step=self.step, option=option, mode='gan',
                        min_word_frequency=min_word_frequency)

        # train() appends to these lists; create them only when the base class
        # has not already done so.
        if not hasattr(self, 'disc_loss'):
            self.disc_loss = []
        if not hasattr(self, 'gen_loss'):
            self.gen_loss = []

        # Each compiled model gets its own Adam instance: the two models update
        # different sets of weights, and a single optimizer object shared
        # between them can be rejected by newer Keras optimizer implementations.
        self.discriminator = self.build_discriminator(mode=model_type)
        self.discriminator.compile(loss='binary_crossentropy',
                                   optimizer=Adam(learning_rate=0.0002),
                                   metrics=['accuracy'])

        self.generator = self.build_generator(mode=model_type)

        # For the combined model only the generator is trained, so the
        # discriminator is frozen before it is stacked into self.gan.
        self.discriminator.trainable = False

        self.gan = self.build_gan(self.generator, self.discriminator)
        self.gan.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0002))

    def _check_mode(self, mode):
        """Raise ``ValueError`` unless ``mode`` is one of the supported variants."""
        if mode not in self._SUPPORTED_MODES:
            raise ValueError(
                f'Unsupported model mode {mode!r}; expected one of {self._SUPPORTED_MODES}'
            )

    def build_discriminator(self, mode=1):
        """Build (but do not compile) the discriminator and print its summary.

        Args:
            mode: 1 = LSTM, 2 = one Conv1D block followed by Dense layers,
                3 = three Conv1D/LeakyReLU/Dropout blocks.

        Returns:
            A ``keras.Sequential`` model ending in a sigmoid ``Dense(1)``.

        Raises:
            ValueError: if ``mode`` is not supported.
        """
        self._check_mode(mode)
        n_tokens = len(self.tokens)
        seq_shape = (self.maxlen, n_tokens)

        # NOTE(review): in modes 2 and 3 nothing flattens or pools the time axis
        # before the final Dense(1), so the output appears to be one score per
        # time step, i.e. (batch, maxlen, 1) - confirm this matches the label
        # shape returned by the generate_*_samples helpers.
        if mode == 1:
            stack = [
                layers.InputLayer(input_shape=seq_shape),
                layers.LSTM(128),
                layers.Dense(1, activation='sigmoid'),
            ]
        elif mode == 2:
            stack = [
                layers.Conv1D(filters=n_tokens, kernel_size=3, input_shape=seq_shape, padding='same'),
                layers.Dropout(0.25),
                layers.Dense(128),
                layers.LeakyReLU(alpha=0.3),
                layers.Dense(1, activation='sigmoid'),
            ]
        else:  # mode == 3
            # Only the first layer declares the input shape; the later Conv1D
            # layers take theirs from the layer before them.
            stack = [
                layers.Conv1D(filters=n_tokens, kernel_size=3, input_shape=seq_shape, padding='same'),
                layers.LeakyReLU(alpha=0.2),
                layers.Dropout(0.2),
                layers.Conv1D(filters=n_tokens, kernel_size=3, padding='same'),
                layers.LeakyReLU(alpha=0.2),
                layers.Dropout(0.2),
                layers.Conv1D(filters=n_tokens, kernel_size=3, padding='same'),
                layers.LeakyReLU(alpha=0.2),
                layers.Dropout(0.2),
                layers.Dense(1, activation='sigmoid'),
            ]

        discriminator = keras.Sequential(stack)
        discriminator.summary()
        return discriminator

    def build_generator(self, mode=1):
        """Build the generator and print its summary.

        Args:
            mode: 1 = LSTM, 2 = Conv1D followed by Dense layers,
                3 = Dense/BatchNormalization stack ending in Conv1DTranspose.

        Returns:
            A ``keras.Sequential`` model whose last layer is a softmax over
            ``len(self.tokens)`` units.

        Raises:
            ValueError: if ``mode`` is not supported.
        """
        self._check_mode(mode)
        n_tokens = len(self.tokens)
        seq_shape = (self.maxlen, n_tokens)

        if mode == 1:
            stack = [
                layers.InputLayer(input_shape=seq_shape),
                layers.LSTM(128, return_sequences=True),
                layers.Dense(n_tokens, activation='softmax'),
            ]
        elif mode == 2:
            stack = [
                layers.Conv1D(filters=n_tokens, kernel_size=3, input_shape=seq_shape, padding='same'),
                layers.Dropout(0.25),
                layers.Dense(128),
                layers.LeakyReLU(alpha=0.3),
                # The output width must be len(self.tokens) (not self.chars) so
                # that it matches the (maxlen, len(self.tokens)) input the
                # discriminator is built with.
                layers.Dense(n_tokens, activation='softmax'),
            ]
        else:  # mode == 3
            stack = [
                layers.Dense(n_tokens, input_shape=seq_shape),
                layers.LeakyReLU(alpha=0.2),
                layers.BatchNormalization(momentum=0.8),
                layers.Dense(512),
                layers.LeakyReLU(alpha=0.2),
                layers.BatchNormalization(momentum=0.8),
                layers.Dense(n_tokens),
                layers.LeakyReLU(alpha=0.2),
                layers.BatchNormalization(momentum=0.8),
                layers.Reshape(seq_shape),
                layers.Conv1DTranspose(n_tokens, 2, padding="same", activation='softmax'),
            ]

        generator = keras.Sequential(stack)
        generator.summary()
        return generator

    def build_gan(self, g_model, d_model):
        """Stack ``g_model`` and ``d_model`` into one model and print its summary.

        Args:
            g_model: the generator.
            d_model: the discriminator.

        Returns:
            An uncompiled ``keras.Sequential`` that feeds an input of shape
            ``(maxlen, len(self.tokens))`` through ``g_model`` and then ``d_model``.
        """
        gan = keras.Sequential([
            layers.InputLayer(input_shape=(self.maxlen, len(self.tokens))),
            g_model,
            d_model,
        ])
        gan.summary()

        return gan

    def train(self, epochs, batch_size=128, sample_interval=50, verbose=True):
        """Alternate discriminator and generator updates for ``epochs`` epochs.

        Each batch trains the discriminator on ``batch_size // 2`` real and
        ``batch_size // 2`` fake samples, then trains the generator through
        ``self.gan`` on ``batch_size`` samples.

        Args:
            epochs: number of passes; each pass runs ``len(self.X) // batch_size``
                batches.
            batch_size: number of samples used for the generator update.
            sample_interval: in every epoch whose index is a multiple of this
                value, the losses of each batch are appended to
                ``self.disc_loss`` / ``self.gen_loss``.
            verbose: values > 0 print one progress line per recorded batch,
                values > 1 also print each stage. ``True`` behaves like 1.

        Raises:
            ValueError: if ``sample_interval`` is zero.
        """
        # Reject a zero interval up front rather than failing with a
        # ZeroDivisionError after the first batch has already been trained.
        if not sample_interval:
            raise ValueError('sample_interval must be non-zero')

        batch_per_epoch = len(self.X) // batch_size
        half_batch = batch_size // 2

        for epoch in range(epochs):
            record = epoch % sample_interval == 0

            for n in range(batch_per_epoch):

                # --- Discriminator step ---
                if verbose > 1:
                    print(f'Generating real samples: {n+1}/{batch_per_epoch}')
                X_real, y_real = self.generate_real_samples(half_batch)

                if verbose > 1:
                    print(f'Generating fake samples: {n+1}/{batch_per_epoch}')
                X_fake, y_fake = self.generate_fake_samples(self.generator, half_batch)

                if verbose > 1:
                    print(f'Training Discriminator: {n+1}/{batch_per_epoch}')
                # With metrics=['accuracy'] each result is [loss, accuracy].
                d_loss_real = self.discriminator.train_on_batch(X_real, y_real)
                d_loss_fake = self.discriminator.train_on_batch(X_fake, y_fake)

                # --- Generator step (through the combined model) ---
                if verbose > 1:
                    print(f'Generating GAN samples: {n+1}/{batch_per_epoch}')
                X_gan, y_gan = self.generate_gan_samples(batch_size)

                if verbose > 1:
                    print(f'Training Generator: {n+1}/{batch_per_epoch}')
                g_loss = self.gan.train_on_batch(X_gan, y_gan)

                if record:
                    # The history is kept regardless of verbosity, so a silent
                    # run still leaves its losses behind for inspection.
                    self.disc_loss.append((d_loss_real, d_loss_fake))
                    self.gen_loss.append(g_loss)
                    if verbose > 0:
                        print(f"{epoch}:{n+1}/{batch_per_epoch}: [DR_acc: {round(d_loss_real[1],3)}, DF_acc: {round(d_loss_fake[1],3)}] [G loss: {g_loss}]")

    def predict(self, option='character', quote_len=40, sentence=False, temperature=1.0, verbose=False):
        """Generate output with the generator, optionally from a cleaned seed.

        Args:
            option: 'character' truncates the seed to ``quote_len`` characters;
                'word' lower-cases it, strips punctuation and keeps the first
                ``quote_len`` purely alphabetic words. Any other value leaves
                the seed untouched.
            quote_len: seed limit, also forwarded to ``self.generate``.
            sentence: seed text; a falsy value is forwarded unchanged.
            temperature: forwarded to ``self.generate``.
            verbose: forwarded to ``self.generate``.

        Returns:
            Whatever ``self.generate`` returns.
        """
        if sentence:
            if option == 'character':
                sentence = sentence[:quote_len]
            elif option == 'word':
                # Local import: the visible notebook cells do not import `string`.
                import string

                lowered = sentence.lower().replace('--', ' ')
                # Drop punctuation from every whitespace-separated token.
                strip_punct = str.maketrans('', '', string.punctuation)
                words = [w.translate(strip_punct) for w in lowered.split()]
                # isalpha() also discards tokens left empty by the stripping.
                sentence = [w for w in words if w.isalpha()][:quote_len]

        return self.generate(self.generator, mode='gan', option=option, quote_len=quote_len,
                             sentence=sentence, temperature=temperature, verbose=verbose)