In [8]:
import re
import pandas as pd
import numpy as np

from tensorflow import keras
from keras.models import Sequential
from tensorflow.keras.layers import Dense

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [16]:
# Load dataset
data = pd.read_csv('../preprocessed_congressional_tweet.csv')


data.head(10)

Unnamed: 0,favorite_count,length,COVID19,SOTU,tcot,Obamacare,ForThePeople,coronavirus,ACA,SCOTUS,...,NC13,Yes2Energy,ma09,AUMF,VA5,MN03,OH16,Other,retweet_count,party_id
0,0.0,0.028556,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,4.7e-05,R
1,0.000604,0.065386,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.00052,R
2,0.0,0.02909,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,9e-06,R
3,2.1e-05,0.022151,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1.4e-05,R
4,7e-06,0.061649,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1.4e-05,D
5,3.3e-05,0.046704,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,6.1e-05,D
6,9e-06,0.030424,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1.4e-05,R
7,0.0,0.02055,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,R
8,9e-06,0.016013,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.2e-05,R
9,1.2e-05,0.011209,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0.0,D


In [17]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Encode labels from string (neg or pos) to integer (0 or 1)
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['party_id'])

# One-hot encoding
Y = to_categorical(integer_encoded) 

In [18]:
Y[0]

array([0., 1.], dtype=float32)

In [23]:
data_array = data.values

X_train, X_test, Y_train, Y_test = train_test_split(data_array[:,:-1], Y, test_size=0.25, random_state=42)

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

print(X_train.shape)

(444602, 504)


In [25]:
model = Sequential()
model.add(Dense(512, input_dim = 504, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 512)               258560    
                                                                 
 dense_16 (Dense)            (None, 512)               262656    
                                                                 
 dense_17 (Dense)            (None, 2)                 1026      
                                                                 
Total params: 522,242
Trainable params: 522,242
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Import callbacks from keras
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

file_path = '/tmp/checkpoint'  # for ModelCheckpoint callback that needs file path to call

# Initialize the callbacks and add them to a list
checkpoint = ModelCheckpoint(file_path, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
reduce_on_plateau = ReduceLROnPlateau(monitor="loss", mode="min", factor=0.1, patience=5, verbose=1)
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

callbacks_list = [checkpoint, reduce_on_plateau, es]

In [27]:
history = model.fit(X_train, Y_train, batch_size=256, epochs=20, callbacks=callbacks_list, verbose=1, 
                    validation_data=(X_test, Y_test))

Epoch 1/20
Epoch 1: val_accuracy improved from -inf to 0.71788, saving model to /tmp/checkpoint


2022-05-08 12:34:34.721689: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /tmp/checkpoint/assets
Epoch 2/20
Epoch 2: val_accuracy improved from 0.71788 to 0.71878, saving model to /tmp/checkpoint
INFO:tensorflow:Assets written to: /tmp/checkpoint/assets
Epoch 3/20
Epoch 3: val_accuracy did not improve from 0.71878
Epoch 4/20
Epoch 4: val_accuracy improved from 0.71878 to 0.72096, saving model to /tmp/checkpoint
INFO:tensorflow:Assets written to: /tmp/checkpoint/assets
Epoch 5/20
Epoch 5: val_accuracy improved from 0.72096 to 0.72137, saving model to /tmp/checkpoint
INFO:tensorflow:Assets written to: /tmp/checkpoint/assets
Epoch 6/20
Epoch 6: val_accuracy improved from 0.72137 to 0.72268, saving model to /tmp/checkpoint
INFO:tensorflow:Assets written to: /tmp/checkpoint/assets
Epoch 7/20
Epoch 7: val_accuracy improved from 0.72268 to 0.72301, saving model to /tmp/checkpoint
INFO:tensorflow:Assets written to: /tmp/checkpoint/assets
Epoch 8/20
Epoch 8: val_accuracy improved from 0.72301 to 0.72323, saving model to /tmp/checkpo

In [28]:
scores = model.evaluate(X_test, Y_test, verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])

Test loss: 0.4799746870994568
Test accuracy: 0.723011314868927


In [29]:
i = 3
predict_array = model.predict(X_test[[i],:])

# Index of the max value (the name of the class with the highest probability)
prediction = np.argmax(predict_array)
truth = np.argmax(Y_test[i])

print(f"Prediction: {prediction}, Truth: {truth}")

Prediction: 0, Truth: 0
