In [39]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.python.keras.models import Sequential,model_from_json
from tensorflow.python.keras.layers import Dense, Activation,Dropout
from tensorflow.python.keras.layers import LSTM , Embedding
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import LambdaCallback, ModelCheckpoint
import random
import sys
import io

In [40]:
#print(tf.test.gpu_device_name())
# See https://www.tensorflow.org/tutorials/using_gpu#allowing_gpu_memory_growth
#config = tf.ConfigProto()
#config.gpu_options.allow_growth = True
filename = "../data/biographie_df.csv"

In [41]:
# Read in only the two columns we need 
chat = pd.read_csv(filename, encoding="utf-8", sep=";", usecols = ['name', 'biographie'])
# chat = pd.read_csv(filename, encoding="utf-8", sep=";")
# We don't want bots :) 
chat.head()

Unnamed: 0,name,biographie
0,Eminem,"Marshall Bruce Mathers III (born October 17, 1..."
1,Lady Gaga,Stefani Joanne Angelina Germanotta ( STEF-ən-e...
2,Justin Bieber,"Justin Drew Bieber (; born March 1, 1994) is a..."
3,Lil Wayne,"Dwayne Michael Carter Jr. (born September 27, ..."
4,Miley Cyrus,"Miley Ray Hemsworth (née Cyrus, born Destiny H..."


In [42]:
biographie = chat.biographie

n_messages = len(biographie)
n_chars = len(' '.join(map(str, biographie)))

print("DataFrame biographie for %d messages" % n_messages)
print("Their messages add up to %d characters" % n_chars)

DataFrame biographie for 144 messages
Their messages add up to 7198026 characters


In [43]:
sample_size = int(len(user) * 0.2)
biographie = biographie[:sample_size]
biographie = ''.join(map(str, biographie)).lower()
len(chat['biographie'][0])
user[:65970] # Show first 100 characters

'marshall bruce mathers iii (born october 17, 1972), known professionally as eminem (; often stylized as eminǝm), is an american rapper, songwriter, record producer, record executive, film producer, and actor. he is consistently cited as one of the greatest and most influential rappers of all time. in addition to his solo career, eminem was a member of the hip hop group d12. he is also known for his collaborations with fellow detroit-based rapper royce da 5\'9"; the two are collectively known as bad meets evil.\nafter his debut album infinite (1996) and the extended play slim shady ep (1997), eminem signed with dr. dre\'s aftermath entertainment and subsequently achieved mainstream popularity in 1999 with the slim shady lp, which earned him his first grammy award for best rap album. his next two releases, the marshall mathers lp (2000) and the eminem show (2002), were worldwide successes, with each being certified diamond by the recording industry association of america (riaa) and both

In [44]:
chars = sorted(list(set(biographie)))
print('Count of unique characters (i.e., features):', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

Count of unique characters (i.e., features): 261


In [47]:
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(biographie) - maxlen, step):
    sentences.append(biographie[i: i + maxlen])
    next_chars.append(biographie[i + maxlen])
print('Number of sequences:', len(sentences), "\n")

Number of sequences: 2399281 



In [49]:
print(sentences[:10], "\n")

['marshall bruce mathers iii (born october', 'shall bruce mathers iii (born october 17', 'll bruce mathers iii (born october 17, 1', 'bruce mathers iii (born october 17, 1972', 'ce mathers iii (born october 17, 1972), ', 'mathers iii (born october 17, 1972), kno', 'hers iii (born october 17, 1972), known ', 's iii (born october 17, 1972), known pro', 'ii (born october 17, 1972), known profes', '(born october 17, 1972), known professio'] 



In [50]:
print(next_chars[:10])

[' ', ',', '9', ')', 'k', 'w', 'p', 'f', 's', 'n']


In [51]:
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [52]:
model = Sequential()
#model.add(Embedding(len(y), 10, input_length=len(x)))
model.add(LSTM(128, input_shape=(maxlen, len(chars)),return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(chars)))
model.add(Dropout(0.2))
model.add(Activation('softmax'))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [53]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 40, 128)           199680    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 261)               33669     
_________________________________________________________________
dropout (Dropout)            (None, 261)               0         
_________________________________________________________________
activation (Activation)      (None, 261)               0         
Total params: 364,933
Trainable params: 364,933
Non-trainable params: 0
_________________________________________________________________


In [54]:
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,metrics=['accuracy'])

In [55]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [56]:
def on_epoch_end(epoch, logs):
    # Function invoked for specified epochs. Prints generated text.
    # Using epoch+1 to be consistent with the training epochs printed by Keras
    if epoch+1 == 1 or epoch+1 == 15:
        print()
        print('----- Generating text after Epoch: %d' % epoch)

        start_index = random.randint(0, len(user) - maxlen - 1)
        for diversity in [0.2, 0.5, 1.0, 1.2]:
            print('----- diversity:', diversity)

            generated = ''
            sentence = user[start_index: start_index + maxlen]
            generated += sentence
            print('----- Generating with seed: "' + sentence + '"')
            sys.stdout.write(generated)

            for i in range(400):
                x_pred = np.zeros((1, maxlen, len(chars)))
                for t, char in enumerate(sentence):
                    x_pred[0, t, char_indices[char]] = 1.

                preds = model.predict(x_pred, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = indices_char[next_index]

                generated += next_char
                sentence = sentence[1:] + next_char

                sys.stdout.write(next_char)
                sys.stdout.flush()
            print()
    else:
        print()
        print('----- Not generating text after Epoch: %d' % epoch)

In [57]:
generate_text = LambdaCallback(on_epoch_end=on_epoch_end)

# Training the model and generating predictions

In [None]:
# define the checkpoint
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, 
                             monitor='loss', 
                             verbose=1, 
                             save_best_only=True, 
                             mode='min')

# serialize model to YAML
model_yaml = model.to_yaml()
with open("model.yaml", "w") as yaml_file:
    yaml_file.write(model_yaml)
    
# fit model using our gpu
with tf.device('/gpu:0'):
    model.fit(x, y,
              batch_size=128,
              epochs=50,
              verbose=2,
              callbacks=[generate_text, checkpoint])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/50


In [None]:
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")
 
# later...
 
# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("weights.hdf5")
print("Loaded model from disk")

In [None]:
user

# Test avec une autre biographie

In [None]:
# evaluate loaded model on test data
model.compile(loss='categorical_crossentropy', optimizer=optimizer,metrics=['accuracy'])

In [None]:
score = loaded_model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

In [None]:
# create model
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# load weights
model.load_weights("weights.best.hdf5")
# Compile model (required to make predictions)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print("Created model and loaded weights from file")
# load pima indians dataset
dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",")
# split into input (X) and output (Y) variables
X = dataset[:,0:8]
Y = dataset[:,8]
# estimate accuracy on whole dataset using loaded weights
scores = model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))