In [1]:
import numpy as np

import string
import re
import pandas as pd

from keras.models import Sequential
from keras.layers import LSTM, Dropout, Activation, Dense

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Data cleaning

In [2]:
# Load a single year's file (2017) only to inspect the raw schema. The columns
# displayed here include `is_retweet` and `text`, which the multi-year loading
# cell filters on.
temp=pd.read_json("./jsondata/condensed_2017.json")
temp.head()

Unnamed: 0,created_at,favorite_count,id_str,in_reply_to_user_id_str,is_retweet,retweet_count,source,text
0,2018-01-01 13:37:52,51473,947824196909961216,,False,8237,Twitter for iPhone,Will be leaving Florida for Washington (D.C.) ...
1,2018-01-01 12:44:40,53557,947810806430826496,25073877.0,False,14595,Twitter for iPhone,Iran is failing at every level despite the ter...
2,2018-01-01 12:12:00,138808,947802588174577664,,False,49566,Twitter for iPhone,The United States has foolishly given Pakistan...
3,2017-12-31 23:43:04,154769,947614110082043904,,False,35164,Twitter for iPhone,HAPPY NEW YEAR! We are MAKING AMERICA GREAT AG...
4,2017-12-31 22:18:20,157655,947592785519173632,,False,39428,Twitter for iPhone,As our Country rapidly grows stronger and smar...


In [3]:
file_name = "./jsondata/condensed_"

# Read every yearly file (2009-2018), keep only the text of original tweets
# (is_retweet == False), and concatenate once at the end. Collecting the frames
# in a list avoids re-copying the accumulated frame on every iteration and does
# not depend on DataFrame.append, which no longer exists in pandas >= 2.0.
yearly_frames = []
for year in range(9, 19):
    df = pd.read_json(file_name + str(2000 + year) + ".json")
    df = df.loc[df['is_retweet'] == False, ['text']]
    yearly_frames.append(df)

# ignore_index=True renumbers the rows 0..n-1 across all years.
mydf = pd.concat(yearly_frames, ignore_index=True)
mydf.head()

Unnamed: 0,text
0,From Donald Trump: Wishing everyone a wonderfu...
1,Trump International Tower in Chicago ranked 6t...
2,Wishing you and yours a very Happy and Bountif...
3,Donald Trump Partners with TV1 on New Reality ...
4,"--Work has begun, ahead of schedule, to build ..."


In [4]:
# Clean every tweet in one vectorised pass. Assigning the whole column at once
# avoids the chained `mydf['text'][i] = ...` assignment, which triggers
# SettingWithCopyWarning and is not guaranteed to write back to `mydf`.
mydf['text'] = (
    mydf['text']
    # remove urls
    .str.replace(r"http\S+", "", regex=True)
    # remove whitespace at both ends first so the leading-character strips apply
    .str.strip()
    # remove these characters (- " .) at the beginning of the string, in this order
    .str.lstrip("-")
    .str.lstrip('"')
    .str.lstrip(".")
    # remove (") at the end of the string
    .str.rstrip('"')
    # remove whitespace exposed by the strips above
    .str.strip()
)

mydf.head()

Unnamed: 0,text
0,From Donald Trump: Wishing everyone a wonderfu...
1,Trump International Tower in Chicago ranked 6t...
2,Wishing you and yours a very Happy and Bountif...
3,Donald Trump Partners with TV1 on New Reality ...
4,"Work has begun, ahead of schedule, to build th..."


In [5]:
len(mydf)  # number of tweets kept (retweets were filtered out while loading)

32858

In [6]:
# Tweet-level hold-out split: sample 90% of the rows for training (random_state
# is fixed so the same rows are drawn on every run), then keep whatever rows were
# not sampled by dropping the sampled index labels.
train=mydf.sample(frac=0.9,random_state=200)
test=mydf.drop(train.index)

In [7]:
# Flatten each split into a single string: every tweet is followed by one space,
# so consecutive tweets are separated and the text ends with a trailing space.
text_train = ''.join(tweet + ' ' for tweet in train['text'])
text_val = ''.join(tweet + ' ' for tweet in test['text'])

In [8]:
# remove non-English letters & emoji
# Encoding to ASCII with errors='ignore' silently drops every character that has
# no ASCII representation; decoding turns the remaining bytes back into a str.
text_train=text_train.encode('ascii',errors='ignore').decode()
text_val=text_val.encode('ascii',errors='ignore').decode()

# The vocabulary is taken over both splits, so it covers every character that
# appears in either text.
chars = sorted(list(set(text_train+text_val)))
','.join(chars) #show remaining letters

'\n,\r, ,!,",#,$,%,&,\',(,),*,+,,,-,.,/,0,1,2,3,4,5,6,7,8,9,:,;,=,?,@,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,[,\\,],_,`,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,{,|,},~'

In [9]:
# Remove some more special characters to shrink the vocabulary (and therefore the
# one-hot / output dimensions), which speeds up training.
# A single raw-string character class replaces the former regex plus chained
# str.replace calls: - + ~ ` * ] [ | newline CR ( ) { } < > / \ =
# The raw string avoids the invalid-escape warnings the old literal produced.
_DROP_CHARS_RE = re.compile(r"[-+~`*\]\[|\n\r(){}<>/\\=]")

text_train = _DROP_CHARS_RE.sub("", text_train)
text_val = _DROP_CHARS_RE.sub("", text_val)

# Rebuild the vocabulary from both cleaned texts.
chars = sorted(set(text_train + text_val))
','.join(chars)  # show remaining characters

' ,!,",#,$,%,&,\',,,.,0,1,2,3,4,5,6,7,8,9,:,;,?,@,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,_,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z'

In [10]:
# Character counts of the two cleaned texts; the batch-count computation later
# in the notebook is derived from these lengths.
text_train_len = len(text_train)
text_val_len = len(text_val)
print("Total of %d characters" % (text_train_len + text_val_len))

Total of 3458381 characters


In [11]:
# generic vocabulary
# `chars` is the sorted list of distinct characters left after cleaning, so each
# character's index is its position in that list. describe_batch relies on this
# when it maps an argmax index back through `chars`.
VOCABULARY_SIZE = len(chars)
characters_to_ix = {c:i for i,c in enumerate(chars)}
print("vocabulary len = %d" % VOCABULARY_SIZE)
print(chars)

vocabulary len = 77
[' ', '!', '"', '#', '$', '%', '&', "'", ',', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [12]:
# Eyeball the first 1000 characters of the cleaned training text.
text_train[0:1000]

"@A_Beil   Thanks Andrew! Thank you Greensboro, North Carolina! Will be back soon! #AmericaFirst @Mr_president706: @realDonaldTrump please run for president, and show these so called Republicans how to get the job done. RT @CPACnews: ACU Announces @realDonaldTrump will be a featured speaker at #CPAC2013!  Get tickets today at @SAVEHICNO1: MR. TRUMP, I THINK YOU OUGHT TO DO AN APPRENTICE SHOW FOR PEOPLE TO COME UP FOR A WINNING STRADEGY FOR YOU TO RUN IN 2016 @HomesWestfield @EricTrump @MyFoxNY @rosannascotto @StJude  Thank you. @MotivationIdeas: Sometimes by losing a battle you find a new way to win the war.  Donald Trump The leader and negotiators representing Mexico are far smarter and more cunning than the leader and negotiators representing the U.S.! @rdowns I never went bankrupt. Enemies love to say I did. Didn't happen. @manc999 Write to Mark Burnett. I love Mexico but not the unfair trade deals that the US so stupidly makes with them. Really bad for US jobs, only good for Mexico

In [13]:
def describe_batch(X, y, samples=3):
    """Print a short human-readable view of the first few samples in a batch.

    For each of the first `samples` rows, every one of the SEQUENCE_LEN time
    steps of `X` is decoded by taking the argmax over the last axis and looking
    the index up in `chars`; the matching row of `y` is decoded the same way.
    Only the last 20 decoded input characters are printed, followed by the
    target character.

    X -- array indexed as [sample, step, vocabulary index]
    y -- array indexed as [sample, vocabulary index]
    samples -- how many leading rows of the batch to print
    """
    for sample_no in range(samples):
        decoded = "".join(
            chars[X[sample_no, step, :].argmax()] for step in range(SEQUENCE_LEN)
        )
        target = chars[y[sample_no, :].argmax()]
        print("sample #%d: ...%s -> '%s'" % (sample_no, decoded[-20:], target))

# Generate batches for training
def batch_generator(text, count):
    """Endlessly yield one-hot (X, y) batches for next-character prediction.

    Sample k of batch b starts at text offset BATCH_SIZE * b + k. Its input is
    the SEQUENCE_LEN characters from that offset and its target is the single
    character that follows them. After `count` batches the generator wraps
    around to the start of the text, so it never terminates on its own.

    text  -- string whose characters are all keys of `characters_to_ix`
    count -- number of distinct batches per pass; the caller must choose it so
             that BATCH_SIZE * count + SEQUENCE_LEN <= len(text)

    Yields (X, y) with X of shape (BATCH_SIZE, SEQUENCE_LEN, VOCABULARY_SIZE)
    and y of shape (BATCH_SIZE, VOCABULARY_SIZE), both float arrays of 0/1.

    Raises ValueError on first use if `count` is not positive (the loop would
    otherwise spin forever without yielding), KeyError for a character missing
    from `characters_to_ix`, and IndexError if a window runs past the text.
    """
    if count <= 0:
        raise ValueError("count must be a positive number of batches, got %r" % (count,))

    # Map characters to vocabulary indices once, instead of redoing the dict
    # lookup for every character of every overlapping window on every epoch.
    encoded = np.array([characters_to_ix[ch] for ch in text], dtype=np.intp)
    sample_ix = np.arange(BATCH_SIZE)
    step_ix = np.arange(SEQUENCE_LEN)

    while True: # the generator must loop forever; Keras pulls batches across epochs
        for batch_ix in range(count):
            # start offset of every sample in this batch, shape (BATCH_SIZE,)
            sample_start = BATCH_SIZE * batch_ix + sample_ix
            # vocabulary index at every (sample, step), shape (BATCH_SIZE, SEQUENCE_LEN)
            window = encoded[sample_start[:, None] + step_ix]
            # index of the character right after each window
            target = encoded[sample_start + SEQUENCE_LEN]

            X = np.zeros((BATCH_SIZE, SEQUENCE_LEN, VOCABULARY_SIZE))
            y = np.zeros((BATCH_SIZE, VOCABULARY_SIZE))
            X[sample_ix[:, None], step_ix, window] = 1
            y[sample_ix, target] = 1

            yield X, y


In [14]:
BATCH_SIZE = 512     # samples per batch
SEQUENCE_LEN = 60    # input characters per sample

# describe some samples from the first batch
# next() pulls exactly one batch from the (endless) generator; no loop/break needed.
X, y = next(batch_generator(text_train, count=1))
describe_batch(X, y, samples=5)

sample #0: ...sboro, North Carolin -> 'a'
sample #1: ...boro, North Carolina -> '!'
sample #2: ...oro, North Carolina! -> ' '
sample #3: ...ro, North Carolina!  -> 'W'
sample #4: ...o, North Carolina! W -> 'i'


# Model building
reference: https://keras.io/getting-started/sequential-model-guide/

In [15]:
def build_model():
    """Build and compile the stacked-LSTM next-character model.

    The network is LAYER_COUNT LSTM layers of HIDDEN_LAYERS_DIM units, each
    followed by a Dropout(DROPOUT) layer. Every LSTM except the last returns
    its full output sequence so the next LSTM can consume it; the last one
    returns only its final output. A Dense layer of VOCABULARY_SIZE units with
    a softmax activation sits on top. The model is compiled with categorical
    cross-entropy loss and the "adam" optimizer.

    Returns the compiled Sequential model.
    """
    model = Sequential()
    last_layer_ix = LAYER_COUNT - 1

    for layer_ix in range(LAYER_COUNT):
        recurrent = LSTM(
            HIDDEN_LAYERS_DIM,
            return_sequences=(layer_ix != last_layer_ix),
            input_shape=(SEQUENCE_LEN, VOCABULARY_SIZE),
        )
        model.add(recurrent)
        model.add(Dropout(DROPOUT))

    # one output unit per vocabulary character, normalised by softmax
    model.add(Dense(VOCABULARY_SIZE))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer="adam")
    return model

### Training
reference for EarlyStopping: https://stackoverflow.com/questions/43906048/keras-early-stopping

In [16]:
# Hyperparameters read by build_model and by the training call.
EPOCHS = 20               # maximum number of epochs passed to fit_generator
HIDDEN_LAYERS_DIM = 512   # units in each LSTM layer
LAYER_COUNT = 4           # number of stacked LSTM layers
DROPOUT = 0.2             # rate of the Dropout layer added after each LSTM

In [17]:
# Build the network, then work out how many full batches each text provides.
# Subtracting SEQUENCE_LEN before dividing leaves room for the last window and
# its target character, so batch_generator never indexes past the end of the text.
training_model = build_model()
train_batch_count = (text_train_len - SEQUENCE_LEN) // BATCH_SIZE
val_batch_count = (text_val_len - SEQUENCE_LEN) // BATCH_SIZE
print("training batch count: %d" % train_batch_count)
print("validation batch count: %d" % val_batch_count)

training batch count: 6080
validation batch count: 673


In [18]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
# Weights filename template. The %-fields (batch size, layer count, hidden size,
# dropout, sequence length) are filled in right here; the {epoch}/{loss}/{val_loss}
# placeholders are left in the string, presumably for ModelCheckpoint to fill in
# at save time (confirm against the installed Keras version).
filepath = "./1-gpu_BS-%d_%d-%s_dp%.2f_%dS_epoch{epoch:02d}-loss{loss:.4f}-val-loss{val_loss:.4f}_weights" % (
    BATCH_SIZE,
    LAYER_COUNT,
    HIDDEN_LAYERS_DIM,
    DROPOUT,
    SEQUENCE_LEN)

# checkpoint: save weights only (not the full model) to the templated path
checkpoint = ModelCheckpoint(
    filepath,
    save_weights_only=True)

# early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
# `patience` is the number of epochs without improvement in the monitored value
# (val_loss) that are tolerated before training is stopped.
# Note: we expected to run many epochs, but the longest run was 10 epochs, so
# early stopping may never actually have triggered.

callbacks_list = [checkpoint, early_stopping]

In [22]:
# Train from the Python generators. Both generators loop forever, so the number
# of batches per pass is given explicitly: train_batch_count (second positional
# argument, presumably steps_per_epoch; confirm against the installed Keras
# version) and validation_steps=val_batch_count.
history = training_model.fit_generator(
    batch_generator(text_train, count=train_batch_count),
    train_batch_count,
    max_queue_size=1, # no more than one queued batch in RAM
    epochs=EPOCHS,
    callbacks=callbacks_list,
    validation_data=batch_generator(text_val, count=val_batch_count),
    validation_steps=val_batch_count,
    initial_epoch=0)

Epoch 1/20
   1/6080 [..............................] - ETA: 96:03:46 - loss: 4.3439

KeyboardInterrupt: 

In [56]:
!pip install --user pydotplus

Collecting pydotplus
[?25l  Downloading https://files.pythonhosted.org/packages/60/bf/62567830b700d9f6930e9ab6831d6ba256f7b0b730acb37278b0ccdffacf/pydotplus-2.0.2.tar.gz (278kB)
[K    100% |################################| 286kB 5.5MB/s 
Building wheels for collected packages: pydotplus
  Running setup.py bdist_wheel for pydotplus ... [?25ldone
[?25h  Stored in directory: /Users/Pan/Library/Caches/pip/wheels/35/7b/ab/66fb7b2ac1f6df87475b09dc48e707b6e0de80a6d8444e3628
Successfully built pydotplus
Installing collected packages: pydotplus
Successfully installed pydotplus-2.0.2


In [63]:
from keras.utils import plot_model
import pydot

In [None]:
# Write a diagram of the model to ./model.png; show_shapes=True asks for the
# layer shapes to be included in the drawing.
plot_model(training_model, to_file='./model.png', show_shapes=True)