In [453]:
import pandas as pd

In [454]:
politics_tweet_df = pd.read_csv('../data/politics_tweet.csv')
politics_tweet_df= politics_tweet_df.drop(['Unnamed: 0'], axis=1)
politics_tweet_df.head(3)

Unnamed: 0,date,name,tweet
0,2020-11-15,<JoeBiden>,congratulations to nasa and spacex on today's...
1,2020-11-14,<JoeBiden>,"to the millions of hindus, jains, sikhs, and ..."
2,2020-11-13,<JoeBiden>,"i am the president-elect, but will not be pre..."


In [455]:
politics_tweet_df[politics_tweet_df['name']=='<JoeBiden>'].count()

date     3032
name     3032
tweet    3032
dtype: int64

In [456]:
politics_tweet_df[politics_tweet_df['name']=='<realDonaldTrump>'].count()

date     4019
name     4019
tweet    4019
dtype: int64

In [457]:
politics_tweet_df[politics_tweet_df['name']=='<senatemajldr>'].count()

date     515
name     515
tweet    515
dtype: int64

In [458]:
politics_tweet_df[politics_tweet_df['name']=='<SpeakerPelosi>'].count()

date     1005
name     1005
tweet    1005
dtype: int64

In [459]:
def func(x):
    if x=='<realDonaldTrump>':
        return 0
    elif x=='<JoeBiden>':
        return 1
    elif x=='<SpeakerPelosi>':
        return 1
    else:
        return 0
politics_tweet_df['label'] = politics_tweet_df['name'].apply(func)
politics_tweet_df

Unnamed: 0,date,name,tweet,label
0,2020-11-15,<JoeBiden>,congratulations to nasa and spacex on today's...,1
1,2020-11-14,<JoeBiden>,"to the millions of hindus, jains, sikhs, and ...",1
2,2020-11-13,<JoeBiden>,"i am the president-elect, but will not be pre...",1
3,2020-11-13,<JoeBiden>,i am alarmed by the surge in reported covid-1...,1
4,2020-11-13,<JoeBiden>,as the remnants of tropical storm eta continu...,1
...,...,...,...,...
8566,2020-01-03,<senatemajldr>,"for too long, this evil man operated without ...",0
8567,2020-01-03,<senatemajldr>,soleimani made it his life’s work to take the...,0
8568,2020-01-03,<senatemajldr>,"this morning, iran’s master terrorist is dead...",0
8569,2020-01-03,<senatemajldr>,senators do not cease to be senators just bec...,0


In [460]:
y = politics_tweet_df["label"]
y_names = ['JoeBiden', 'realDonaldTrump', 'senatemajldr', 'senatemajldr']
X = politics_tweet_df["tweet"]
print(X.shape, y.shape)

(8571,) (8571,)


In [461]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [462]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist

In [463]:
# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [464]:
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)

In [465]:
type(X)

pandas.core.series.Series

In [466]:
%%time
tokenizer.fit_on_texts(X.tolist())

Wall time: 430 ms


In [468]:
x_train_tokens = tokenizer.texts_to_sequences(X_train)

In [469]:
x_test_tokens = tokenizer.texts_to_sequences(X_test)

In [470]:
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)

In [471]:
num_tokens

array([19, 31, 45, ..., 41, 42, 21])

In [472]:
np.mean(num_tokens)

30.133006650332515

In [473]:
np.max(num_tokens)

60

In [474]:
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
max_tokens

57

In [475]:
np.sum(num_tokens < max_tokens) / len(num_tokens)

0.9994166374985416

In [476]:
pad

'pre'

In [477]:
pad = 'pre'
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,
                            padding=pad, truncating=pad)


In [478]:
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)

In [479]:
x_train_pad.shape

(6428, 57)

In [480]:
x_test_pad.shape

(2143, 57)

In [481]:
idx = tokenizer.word_index
inverse_map = dict(zip(idx.values(), idx.keys()))

In [482]:
def tokens_to_string(tokens):
    # Map from tokens back to words.
    words = [inverse_map[token] for token in tokens if token != 0]
    
    # Concatenate all words.
    text = " ".join(words)

    return text

In [483]:
X_train[1]

' to the millions of hindus, jains, sikhs, and buddhists celebrating the festival of lights,  and i send our best wishes for a #happydiwali. may your new year be filled with hope, happiness, and prosperity. sal mubarak.\n'

In [484]:
tokens_to_string(x_train_tokens[1])

'i issued the following statement with on the urgent need to replenish the paycheck protection program there is no excuse for a lack of urgency american jobs are literally at stake'

In [485]:
model = Sequential()

In [486]:
embedding_size = 8

In [487]:
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

In [488]:
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=8, return_sequences=True))
model.add(GRU(units=4))
model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(lr=1e-3)


In [489]:
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])


In [490]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 57, 8)             80000     
_________________________________________________________________
gru_22 (GRU)                 (None, 57, 16)            1200      
_________________________________________________________________
gru_23 (GRU)                 (None, 57, 8)             600       
_________________________________________________________________
gru_24 (GRU)                 (None, 4)                 156       
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 5         
Total params: 81,961
Trainable params: 81,961
Non-trainable params: 0
_________________________________________________________________


In [491]:
%%time
model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=3, batch_size=64)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 6106 samples, validate on 322 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Wall time: 2min 45s


<tensorflow.python.keras.callbacks.History at 0x1d5a85d8688>

In [492]:
%%time
# result = model.evaluate(x_test_pad, y_test)
model_loss, model_accuracy = model.evaluate(
    x_test_pad, y_test, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

2143/2143 - 8s - loss: 0.2194 - accuracy: 0.9169
Normal Neural Network - Loss: 0.2193977213731949, Accuracy: 0.916938841342926
Wall time: 8.27 s


In [450]:
model.save("modelnew.h5")

TypeError: wrapped_call() takes 1 positional argument but 2 were given