In [None]:
# Colab-only: mount Google Drive at /content/drive.
# NOTE(review): the relative 'drive/MyDrive/...' paths used later presumably
# resolve under this mount when the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import re
import sys
import csv
import time
import heapq
import tweepy # https://github.com/tweepy/tweepy
import numpy as np
import pandas as pd
import configparser
import tensorflow as tf

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, LSTM, Dropout, GRU, TimeDistributed, BatchNormalization
from keras.layers import CuDNNLSTM 
from keras.layers.core import Dense, Activation, Dropout, RepeatVector

from keras.utils import np_utils
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import Sequential
from keras.layers import Activation, BatchNormalization
from keras.callbacks import ModelCheckpoint


In [None]:
# Twitter screen name whose timeline is downloaded / loaded as the corpus.
Handler = 'elonmusk'
# Project root WITHOUT a trailing slash; other cells append '/data/...' and '/models/...'.
path = 'drive/MyDrive/Colab Notebooks/COE494_Project'

In [None]:
def authenticate(path = 'drive/MyDrive/Colab Notebooks/COE494_Project/'):
  """Build an authenticated tweepy API client from ``config.ini``.

  Args:
    path: Directory containing ``config.ini``. A trailing slash is optional.
      The file must have a ``[twitter]`` section with the keys ``api_key``,
      ``api_key_secret``, ``access_token`` and ``access_token_secret``.

  Returns:
    A ``tweepy.API`` instance created with ``wait_on_rate_limit=True``.

  Raises:
    FileNotFoundError: if ``config.ini`` cannot be read from ``path``.
    KeyError: if the ``[twitter]`` section or one of its keys is missing.
  """
  # os.path.join works with or without a trailing slash on `path`.
  config_path = os.path.join(path, 'config.ini')

  # read config
  config = configparser.ConfigParser()
  # ConfigParser.read() silently skips unreadable files and returns the list
  # of files it actually parsed, so an empty result means the file is missing.
  if not config.read(config_path):
    raise FileNotFoundError(f'Twitter config file not found: {config_path}')

  twitter = config['twitter']
  api_key = str(twitter['api_key'])
  api_key_secret = str(twitter['api_key_secret'])

  access_token = str(twitter['access_token'])
  access_token_secret = str(twitter['access_token_secret'])

  # authenticate
  auth = tweepy.OAuthHandler(api_key, api_key_secret)
  auth.set_access_token(access_token, access_token_secret)

  return tweepy.API(auth, wait_on_rate_limit = True)

In [None]:
# Module-level tweepy client; get_all_tweets() reads this global.
api = authenticate()

In [None]:
# Tweet text pre-processing
_STOPWORDS = frozenset(["for", "on", "an", "a", "of", "and", "in", "the", "to", "from"])


def clean_tweet(tweet):
    """Normalize one raw tweet into lowercase alphanumeric words.

    Steps, in order: lowercase; drop apostrophes (so contractions collapse
    into one token, e.g. "don't" -> "dont"); remove @mentions, #hashtags and
    URLs; remove [bracketed] spans; replace every remaining character outside
    a-z / 0-9 with a space; drop a small fixed set of stopwords.

    Args:
        tweet: Raw tweet text. Any non-string value (e.g. the float NaN that
            pandas uses for missing cells, or None) is treated as empty.

    Returns:
        The cleaned tweet as a single space-separated string ("" if nothing
        is left or the input was not a string).
    """
    if not isinstance(tweet, str):
        return ""
    temp = tweet.lower()
    temp = re.sub(r"'", "", temp)  # to avoid splitting contractions in english
    temp = re.sub(r"@[A-Za-z0-9_]+", "", temp)
    temp = re.sub(r"#[A-Za-z0-9_]+", "", temp)
    temp = re.sub(r"http\S+", "", temp)
    temp = re.sub(r"[()!?]", " ", temp)
    # Raw string: '\[' in a plain literal is an invalid escape sequence.
    temp = re.sub(r"\[.*?\]", " ", temp)
    temp = re.sub(r"[^a-z0-9]", " ", temp)
    return " ".join(word for word in temp.split() if word not in _STOPWORDS)

In [None]:
def get_all_tweets(handler):
    """Download a user's timeline and save it as ``<path>/data/<handler>.csv``.

    Uses the module-level ``api`` client and the module-level ``path`` root.
    Retweets are excluded and tweets are requested in extended mode so that
    ``full_text`` is available. Per the original note, Twitter only exposes a
    user's most recent ~3240 tweets through this method.

    Args:
        handler: Twitter screen name (without the leading '@').

    Returns:
        A DataFrame with columns ``id``, ``time`` and ``tweet``; it has zero
        rows when the timeline returned no tweets.
    """
    print(f'Grabbing @{handler}\'s Tweets')
    # accumulates every tweepy Tweet returned across pages
    all_tweets = []

    # 200 is the maximum allowed count per request (per the original note)
    request_kwargs = dict(screen_name = handler, count = 200, include_rts = False, tweet_mode = 'extended')

    # keep grabbing pages until a request comes back empty
    while True:
        new_tweets = api.user_timeline(**request_kwargs)
        if len(new_tweets) == 0:
            # Also covers an empty first page, where indexing all_tweets[-1]
            # would otherwise raise IndexError.
            break
        all_tweets.extend(new_tweets)
        # subsequent requests use max_id (oldest id seen, less one) to prevent duplicates
        request_kwargs['max_id'] = new_tweets[-1].id - 1

    print(f"{len(all_tweets)} tweets downloaded...")
    # transform the tweepy tweets into a 2D array that will populate the csv
    out_tweets = [[tweet.id_str, tweet.created_at, tweet.full_text] for tweet in all_tweets]
    df = pd.DataFrame(out_tweets, columns = ["id", "time", "tweet"])
    df.to_csv(path + '/data/' + handler+'.csv')
    return df

In [None]:
# To refresh the cached timeline from Twitter, run instead:
# tweets = get_all_tweets(Handler)
# Load the CSV for Handler (same '<path>/data/<handler>.csv' naming that
# get_all_tweets writes) rather than a hard-coded filename.
tweets = pd.read_csv(path + "/data/" + Handler + ".csv")
cleaned_tweets = pd.DataFrame([clean_tweet(tweet) for tweet in tweets.tweet], columns = ['tweet'])

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Removing null and empty rows
# Assign the result back instead of calling replace(..., inplace=True) on
# `cleaned_tweets.tweet`: that is a chained in-place update on an intermediate
# Series, which does not modify the DataFrame under pandas Copy-on-Write.
cleaned_tweets["tweet"] = cleaned_tweets["tweet"].replace('', np.nan)
cleaned_tweets = cleaned_tweets.dropna()
cleaned_tweets

Unnamed: 0,tweet
0,please ignore prior tweets as that was someone...
1,so true
2,if you ever wanted know real truth about moon ...
3,walked around neighborhood recently rebuilt wi...
4,it was xmas so we brought presents kids at orp...
...,...
34870,reminds me when i hex edited ultima v get out ...
34871,yay switzerland
34872,there is no way be touch with voters when you ...
34874,let s make roaring 20 s happen


In [None]:
# Flatten all cleaned tweets into one space-separated string: the training corpus.
raw_tweets_text = ' '.join(cleaned_tweets["tweet"].tolist())

In [None]:

def prepare_input(text):
    """One-hot encode ``text`` as a single-sample batch.

    Reads the module-level ``SEQUENCE_LENGTH``, ``chars`` and ``char_indices``.
    Returns a float array of shape (1, SEQUENCE_LENGTH, len(chars)) where
    position ``t`` has a 1 at the index of ``text[t]``; positions beyond
    ``len(text)`` stay all-zero.
    """
    encoded = np.zeros((1, SEQUENCE_LENGTH, len(chars)))
    positions = np.arange(len(text))
    # Explicit integer dtype so an empty `text` still yields a valid index array.
    char_ids = np.fromiter((char_indices[symbol] for symbol in text), dtype=np.intp, count=len(text))
    encoded[0, positions, char_ids] = 1.
    return encoded

def temperatureSample(preds, temperature=1.0):
    """Draw one index from ``preds`` after temperature re-scaling.

    The probabilities are re-weighted as softmax(log(preds) / temperature):
    temperatures below 1 sharpen the distribution, above 1 flatten it.

    Args:
        preds: 1-D sequence of probabilities.
        temperature: Non-negative scaling factor. ``0`` means greedy
            selection (the argmax of ``preds``).

    Returns:
        The sampled index (a NumPy integer).

    Raises:
        ValueError: if ``temperature`` is negative.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature < 0:
        raise ValueError("temperature must be non-negative")
    if temperature == 0:
        # Limit of the softmax as temperature -> 0; avoids dividing by zero.
        return np.argmax(preds)

    # log(0) = -inf is intended (zero-probability entries stay at zero), so
    # silence the divide-by-zero warning for it.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Subtract the max before exponentiating so that small temperatures do not
    # underflow every entry to 0 (which would normalize to NaN).
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def sample(preds, top_n=3):
    """Return the indices of the ``top_n`` largest entries of ``preds``.

    Indices are ordered from most to least probable. The ranking is computed
    directly on ``preds``: the previous exp(log(p)) round trip followed by
    normalization did not change the ordering and only produced a
    divide-by-zero RuntimeWarning whenever a probability was exactly 0.

    Args:
        preds: 1-D sequence of probabilities (or any scores).
        top_n: Number of indices to return.

    Returns:
        A list of at most ``top_n`` Python ints.
    """
    preds = np.asarray(preds).astype('float64')
    return heapq.nlargest(top_n, range(len(preds)), key=preds.take)

def predict_completion(text, max_length=50):
    """Greedily generate characters until the model emits a space.

    Uses the module-level ``model``, ``indices_char``, ``prepare_input`` and
    ``sample``. At every step the most probable next character is appended
    and the input window slides forward by one character.

    Args:
        text: Seed window fed to the model.
        max_length: Upper bound on the number of generated characters. It
            guards against looping forever when the (deterministic, greedy)
            model never predicts a space.

    Returns:
        The generated characters, including the terminating space when one
        was produced before ``max_length`` was reached.
    """
    completion = ''
    while len(completion) < max_length:
        x = prepare_input(text)
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, top_n=1)[0]
        next_char = indices_char[next_index]
        text = text[1:] + next_char
        completion += next_char

        # A space marks the end of the word being completed.
        if next_char == ' ':
            break
    return completion

def predict_completions(text, n=3):
    """Return ``n`` candidate continuations of ``text``.

    The ``n`` most probable next characters (per ``sample``) each start one
    candidate; the rest of each candidate comes from ``predict_completion``
    run on the window advanced by that character.
    """
    window = prepare_input(text)
    distribution = model.predict(window, verbose=0)[0]

    completions = []
    for idx in sample(distribution, n):
        first_char = indices_char[idx]
        completions.append(first_char + predict_completion(text[1:] + first_char))
    return completions


In [None]:
# Build the character vocabulary and the one-hot training tensors.
text = raw_tweets_text
print('corpus length:', len(text))

# Sorted vocabulary plus lookups in both directions.
chars = sorted(set(text))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

print(f'unique chars: {len(chars)}')

# Slide a SEQUENCE_LENGTH-character window over the corpus every `step`
# characters; the character right after each window is its target.
SEQUENCE_LENGTH = 80
step = 4
window_starts = range(0, len(text) - SEQUENCE_LENGTH, step)
sentences = [text[start: start + SEQUENCE_LENGTH] for start in window_starts]
next_chars = [text[start + SEQUENCE_LENGTH] for start in window_starts]
print(f'num training examples: {len(sentences)}')

# One-hot encode inputs (samples, time steps, vocabulary) and targets (samples, vocabulary).
X = np.zeros((len(sentences), SEQUENCE_LENGTH, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)

for i, (sentence, target) in enumerate(zip(sentences, next_chars)):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[target]] = 1

print("X.shape:", X.shape)
print("y.shape:", y.shape)

corpus length: 2241073
unique chars: 37
num training examples: 560249
X.shape: (560249, 80, 37)
y.shape: (560249, 37)


In [None]:
from tensorflow.keras.optimizers import RMSprop

# Next-character classifier: one CuDNN LSTM layer over the one-hot window,
# two Dense hidden layers (each followed by batch norm + SELU), and a softmax
# over the vocabulary.
model = Sequential()

model.add(CuDNNLSTM(len(chars) * 5, input_shape=(SEQUENCE_LENGTH, len(chars))))
model.add(BatchNormalization())
model.add(Activation('selu'))

model.add(Dense(len(chars) * 2))
model.add(BatchNormalization())
model.add(Activation('selu'))

model.add(Dense(len(chars) * 2))
model.add(BatchNormalization())
model.add(Activation('selu'))

model.add(Dense(len(chars)))
model.add(Activation('softmax'))

# `lr` is a deprecated alias that triggers a warning on construction;
# `learning_rate` is the supported argument name.
optimizer = RMSprop(learning_rate = 0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

  super(RMSprop, self).__init__(name, **kwargs)


In [None]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 cu_dnnlstm (CuDNNLSTM)      (None, 185)               165760    
                                                                 
 batch_normalization (BatchN  (None, 185)              740       
 ormalization)                                                   
                                                                 
 activation (Activation)     (None, 185)               0         
                                                                 
 dense (Dense)               (None, 74)                13764     
                                                                 
 batch_normalization_1 (Batc  (None, 74)               296       
 hNormalization)                                                 
                                                                 
 activation_1 (Activation)   (None, 74)                0

In [None]:
# Train for 50 epochs with a 5% validation split and shuffling disabled.
model.fit(
    X,
    y,
    batch_size = 124,
    epochs = 50,
    validation_split = 0.05,
    shuffle = False,
)

  super(RMSprop, self).__init__(name, **kwargs)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fab40074450>

In [None]:
# Persist the trained model under the project's models directory.
model.save(path + "/models/3_LSTM")

In [None]:
# Reload the saved model. `load_model` is imported from keras.models in the
# import cell; the bare name `keras` is never bound in this notebook, so
# `keras.models.load_model(...)` raises NameError on a fresh kernel.
model = load_model(path + "/models/3_LSTM")

In [None]:
def genSentence(text, words = 2):
    """Extend ``text`` by ``words`` model-generated words.

    The model input is the lowercased seed, left-padded with spaces (or
    truncated from the left) to exactly SEQUENCE_LENGTH characters. After each
    generated word the window is re-trimmed to its last SEQUENCE_LENGTH
    characters. The returned string keeps the seed's original casing and has
    every generated word appended to it.
    """
    sentence = text
    window = text.lower().rjust(SEQUENCE_LENGTH)[-SEQUENCE_LENGTH:]
    for _word_number in range(words):
        next_word = predict_completions(window, 1)[0]
        sentence += next_word
        window = (window + next_word)[-SEQUENCE_LENGTH:]
    return sentence

In [None]:
# Generate a tweet-like sentence starting from a given seed word.
# The seed and the length could instead be drawn from the training tweets

## (sketch kept below), but constant values are used so runs can be compared.
## Note: len(...) in the sketch measures the sampled tweet in characters, not words.
### N_WORDS = 0
### while N_WORDS < 10:
###   N_WORDS = len(cleaned_tweets.sample().tweet.item())

### seed = cleaned_tweets.sample().tweet.item().split()[0]

seed = 'Roadster'
N_WORDS = 10
print(genSentence(seed, N_WORDS))

Roadster this is also with energy probably preking i should be 


In [None]:
# Same generation with a different seed word and the same number of words.
seed = 'Finally'
N_WORDS = 10
print(genSentence(seed, N_WORDS))

Finally all good this is also all about will be fine 
