# Donald Trump Twitt Generator with KERAS 

In [1]:
from __future__ import print_function
import tensorflow as tf
from tensorflow.contrib import rnn
import random
import collections
import time
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
import numpy as np

# Text file containing words for training
#training_file = 'train.txt'
training_file = 'tweets_all.txt'

def read_data(fname):
    with open(fname) as f:
        content = f.readlines()
    content = [t for t in content if 'http' not in t]
    content = [t for t in content if '&gt' not in t]
    content = [x.strip() for x in content]
    corpus = u' '.join(content)
    
    global CORPUS_LENGTH
    
    CORPUS_LENGTH = len(corpus)
    print('Corpus Length:', CORPUS_LENGTH)
        
    content = [w.replace('"',"") for w in content]
    content = [content[i].split() for i in range(len(content))]
    content = np.array(content)
    content = np.reshape(content, [-1, ])
    return content, corpus

training_data, corpus = read_data(training_file)

print("Loaded training data...")

flat_list = []
for sublist in training_data:
    for item in sublist:
        flat_list.append(item)
flat_list

def build_dataset(words):
    count = collections.Counter(words).most_common()
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary

dictionary, reverse_dictionary = build_dataset(flat_list)

  from ._conv import register_converters as _register_converters


Instructions for updating:
Use the retry module or similar alternatives.


Using TensorFlow backend.


Corpus Length: 2870760
Loaded training data...


# Keras

In [2]:
N_CHARS = None

def create_index_char_map(corpus):
    chars = sorted(list(set(corpus)))
    global N_CHARS
    N_CHARS = len(chars)
    #if verbose:
    print('No. of unique characters:', N_CHARS)
    char_to_idx = {c: i for i, c in enumerate(chars)}
    idx_to_char = {i: c for i, c in enumerate(chars)}
    return chars, char_to_idx, idx_to_char

chars, char_to_idx, idx_to_char = create_index_char_map(corpus)

No. of unique characters: 39


In [3]:
MAX_SEQ_LENGTH = 60
SEQ_STEP = 3
N_SEQS = None

def create_sequences(corpus):
    sequences, next_chars = [], []
    for i in range(0, CORPUS_LENGTH - MAX_SEQ_LENGTH, SEQ_STEP):
        sequences.append(corpus[i:i + MAX_SEQ_LENGTH])
        next_chars.append(corpus[i + MAX_SEQ_LENGTH])
    global N_SEQS
    N_SEQS = len(sequences)

    print('No. of sequences:', len(sequences))
    return np.array(sequences), np.array(next_chars)

sequences, next_chars = create_sequences(corpus)

No. of sequences: 956900


In [4]:
def one_hot_encode(sequences, next_chars, char_to_idx):
    X = np.zeros((N_SEQS, MAX_SEQ_LENGTH, N_CHARS), dtype=np.bool)
    y = np.zeros((N_SEQS, N_CHARS), dtype=np.bool)
    for i, sequence in enumerate(sequences):
        for t, char in enumerate(sequence):
            X[i, t, char_to_idx[char]] = 1
    y[i, char_to_idx[next_chars[i]]] = 1
    return X, y

X, y = one_hot_encode(sequences, next_chars, char_to_idx)

In [5]:
def build_model(hidden_layer_size=128, dropout=0.2, learning_rate=0.01):
    model = Sequential()
    model.add(LSTM(hidden_layer_size, return_sequences=True, input_shape=(MAX_SEQ_LENGTH, N_CHARS)))
    model.add(Dropout(dropout))
    model.add(LSTM(hidden_layer_size, return_sequences=False))
    model.add(Dropout(dropout))
    model.add(Dense(N_CHARS, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=learning_rate))
    print('Model Summary:')
    model.summary()
    return model

model = build_model()

Model Summary:
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 60, 128)           86016     
_________________________________________________________________
dropout_1 (Dropout)          (None, 60, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 39)                5031      
Total params: 222,631
Trainable params: 222,631
Non-trainable params: 0
_________________________________________________________________


In [None]:
def train_model(model, X, y, batch_size = 128, nb_epoch = 180):
    checkpointer = ModelCheckpoint(filepath="weights.hdf5", monitor='loss', save_best_only=True, mode='min')
    model.fit(X, y, batch_size=batch_size, nb_epoch=nb_epoch, callbacks=[checkpointer])

train_model(model, X, y)



Epoch 1/180
Epoch 2/180
Epoch 3/180
Epoch 4/180
Epoch 5/180
Epoch 6/180
Epoch 7/180
Epoch 8/180
Epoch 9/180
Epoch 10/180
Epoch 11/180
Epoch 12/180
Epoch 13/180
Epoch 14/180
Epoch 15/180
Epoch 16/180
Epoch 17/180
Epoch 18/180
Epoch 19/180
112512/956900 [==>...........................] - ETA: 1:03:00 - loss: 0.0000e+00

In [8]:
np.random.seed(1337)

In [9]:
def sample(preds):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / 0.2
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [10]:
def generate_tweets(model, corpus, char_to_idx, idx_to_char, n_tweets = 10, verbose=0): 
    model.load_weights('weights.hdf5')
    tweets = []
    spaces_in_corpus = np.array([idx for idx in range(CORPUS_LENGTH) if corpus[idx] == ' '])
    for i in range(1, n_tweets + 1):
        begin = np.random.choice(spaces_in_corpus)
        tweet = u''
        sequence = corpus[begin:begin + MAX_SEQ_LENGTH]
        tweet += sequence

        print('Tweet no. %03d' % i)
        print('=' * 13)
        print('Generating with seed:')
        print(sequence)
        print('_' * len(sequence))
        for _ in range(100):
            x = np.zeros((1, MAX_SEQ_LENGTH, N_CHARS))
            for t, char in enumerate(sequence):
                x[0, t, char_to_idx[char]] = 1.0

            preds = model.predict(x, verbose=0)[0]
            next_idx = sample(preds)
            next_char = idx_to_char[next_idx]

            tweet += next_char
            sequence = sequence[1:] + next_char
        if verbose:
            print(tweet)
            print()
        tweets.append(tweet)
    return tweets

tweets = generate_tweets(model, corpus, char_to_idx, idx_to_char)

Tweet no. 001
Generating with seed:
 truly enjoy your insite and opinions pl
________________________________________
Tweet no. 002
Generating with seed:
 hard at work" "obama killed over 100k j
________________________________________
Tweet no. 003
Generating with seed:
 " circulation is way down and all he th
________________________________________
Tweet no. 004
Generating with seed:
 in the same sentence as al sharpton lik
________________________________________
Tweet no. 005
Generating with seed:
 blow their chance to take the senate mu
________________________________________
Tweet no. 006
Generating with seed:
 a nice article in the new york times ab
________________________________________
Tweet no. 007
Generating with seed:
 to be able to prosper again" "it is a s
________________________________________
Tweet no. 008
Generating with seed:
 istheyre cowards mr trump i appreciate 
________________________________________
Tweet no. 009
Generating with seed:
 guy cant do a simpl

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances

vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(sequences)
Xval = vectorizer.transform(tweets)
print(pairwise_distances(Xval, Y=tfidf, metric='cosine').min(axis=1).mean())

0.03202742324370831


# Tensorflow -- In Progress

In [3]:
vocab_size = len(dictionary)

# number of units in RNN cell
num_hidden = 128 #512

# Parameters
learning_rate = 0.001
training_iters = 1000
display_step = 100
n_input = 3
timesteps = 3
batch_size = 3

# RNN output node weights and biases
weights = {'out': tf.Variable(tf.random_normal([num_hidden, vocab_size]))}
biases = {'out': tf.Variable(tf.random_normal([vocab_size]))}

# tf Graph input
x = tf.placeholder("float", [None, n_input, 1])
#x = tf.placeholder(tf.float32, (None, None, 3)) 
y = tf.placeholder("float", [None, vocab_size])

In [4]:
def RNN(x, weights, biases):

    # reshape to [1, n_input]
    x = tf.reshape(x, [-1, n_input])

    # Generate a n_input-element sequence of inputs
    # (eg. [had] [a] [general] -> [20] [6] [33])
    x = tf.split(x,n_input,1)
    
    # 1-layer LSTM with n_hidden units.
    #rnn_cell = rnn.BasicLSTMCell(num_hidden)
    #initial_state = rnn_cell.zero_state(batch_size, tf.float32)
    rnn_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(num_hidden*4),rnn.BasicLSTMCell(num_hidden*2),
                                rnn.BasicLSTMCell(num_hidden),rnn.BasicLSTMCell(num_hidden)])

    # generate prediction
    outputs, states = tf.nn.static_rnn(rnn_cell, inputs = x, dtype=tf.float32)

    # there are n_input outputs but
    # we only want the last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']

pred = RNN(x, weights, biases)

In [5]:
def BiRNN(x, weights, biases):

    # Prepare data shape to match `rnn` function requirements
    # Current data input shape: (batch_size, timesteps, n_input)
    # Required shape: 'timesteps' tensors list of shape (batch_size, num_input)

    # Unstack to get a list of 'timesteps' tensors of shape (batch_size, num_input)
    x = tf.unstack(x, timesteps, 1)

    # Define lstm cells with tensorflow
    # Forward direction cell
    lstm_fw_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)
    # Backward direction cell
    lstm_bw_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)

    # Get lstm cell output
    try:
        outputs, _, _ = rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, x,
                                              dtype=tf.float32)
    except Exception: # Old TensorFlow version only returns outputs not states
        outputs = rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, x,
                                        dtype=tf.float32)

    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']

#pred = BiRNN(x, weights, biases)

In [6]:
#def build_embedding_layer(inputs_, vocab_size, embed_size):
#    """
#    Create the embedding layer
#    """
#    embedding = tf.Variable(tf.random_uniform((vocab_size, embed_size), -1, 1))
#    embed = tf.nn.embedding_lookup(embedding, inputs_)
    
#def build_lstm_layers(lstm_sizes, embed, keep_prob_, batch_size):
#    """
#    Create the LSTM layers
#    """
    
#    lstms = [tf.contrib.rnn.BasicLSTMCell(size) for size in lstm_sizes]
    
    # Add dropout to the cell
#    drops = [tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob_) for lstm in lstms]
    
    # Stack up multiple LSTM layers, for deep learning
#    cell = tf.contrib.rnn.MultiRNNCell(drops)
    
    # Getting an initial state of all zeros
#    initial_state = cell.zero_state(batch_size, tf.float32)
#    lstm_outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
    
#    return tf.matmul(outputs[-1], weights['out']) + biases['out']

#pred = build_lstm_layers(inputs_, vocab_size, embed_size

In [7]:
# Loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred, labels=y))
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)

# Model evaluation
correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

acc_total = 0
loss_total = 0
step = 0

# Launch the graph
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    
    offset = random.randint(0,n_input+1)
    end_offset = n_input + 1
    
    while step < training_iters:
        if offset > (len(flat_list)-end_offset):
            offset = random.randint(0, n_input+1)
            
        symbols_in_keys = [ [dictionary[ str(flat_list[i])]] for i in range(offset, offset+n_input) ]
        symbols_in_keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])
        symbols_out_onehot = np.zeros([vocab_size], dtype=float)
        symbols_out_onehot[dictionary[str(flat_list[offset+n_input])]] = 1.0
        symbols_out_onehot = np.reshape(symbols_out_onehot,[1,-1])

        _, acc, loss, onehot_pred = session.run([optimizer, accuracy, cost, pred], feed_dict={x: symbols_in_keys, y: symbols_out_onehot})
    
        loss_total += loss
        acc_total += acc
        if (step+1) % display_step == 0:
            print("Iter= " + str(step+1) + ", Average Loss= " + "{:.6f}".format(loss_total/display_step) + ", Average Accuracy= " + "{:.2f}%".format(100*acc_total/display_step))
            acc_total = 0
            loss_total = 0
            symbols_in = [flat_list[i] for i in range(offset, offset + n_input)]
            symbols_out = flat_list[offset + n_input]
            symbols_out_pred = reverse_dictionary[int(tf.argmax(onehot_pred, 1).eval())]
            print("%s - [%s] vs [%s]" % (symbols_in,symbols_out,symbols_out_pred))
        step += 1
        offset += (n_input+1)
        
    prompt = "%s words: " % n_input
    sentence = input(prompt)
    sentence = sentence.strip()
    words = sentence.split(' ')
        
    symbols_in_keys = [dictionary[str(words[i])] for i in range(len(words))]
        
    for i in range(10):
        keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])
        onehot_pred = session.run(pred, feed_dict={x: keys})
        onehot_pred_index = int(tf.argmax(onehot_pred, 1).eval())
        sentence = "%s %s" % (sentence,reverse_dictionary[onehot_pred_index])
        symbols_in_keys = symbols_in_keys[1:]
        symbols_in_keys.append(onehot_pred_index)
    print(sentence)
            
    print("Optimization Finished!")

Iter= 100, Average Loss= 10.313104, Average Accuracy= 0.00%
['rest', 'until', 'the'] - [job] vs [prepaid]
Iter= 200, Average Loss= 10.375281, Average Accuracy= 0.00%
['do', 'we', 'work'] - [so] vs [welp]
Iter= 300, Average Loss= 10.075018, Average Accuracy= 1.00%
['s', 'and', 'attorney'] - [baker] vs [the]
Iter= 400, Average Loss= 9.641488, Average Accuracy= 6.00%
['god', 'bless', 'you'] - [and] vs [the]
Iter= 500, Average Loss= 9.156170, Average Accuracy= 4.00%
['her', 'about', 'an'] - [affair] vs [the]
Iter= 600, Average Loss= 8.576650, Average Accuracy= 5.00%
['war', 'negotiations', 'going'] - [on] vs [is]
Iter= 700, Average Loss= 8.717513, Average Accuracy= 7.00%
['more', 'representative', 'important'] - [and] vs [the]
Iter= 800, Average Loss= 8.819504, Average Accuracy= 6.00%
['for', 'this', 'kind'] - [of] vs [to]
Iter= 900, Average Loss= 8.692910, Average Accuracy= 3.00%
['would', 'have', 'been'] - [a] vs [the]
Iter= 1000, Average Loss= 8.796187, Average Accuracy= 2.00%
['history

In [8]:
#Iter= 50000, Average Loss= 0.433600, Average Accuracy= 91.70%
#"good morning jordan"