In [6]:
import numpy as np
import pandas as pd
import random
import tensorflow as tf
from keras import layers

In [7]:
train_df = pd.read_csv('/Users/jacobjohnson/data_sets/congressional_tweet_training_data.csv', names=['favorite_count', 'full_text', 'hashtags', 'retweet_count', 'year', 'party_id'], skipinitialspace=True, skiprows=1, sep=',')

test_df = pd.read_csv('/Users/jacobjohnson/data_sets/congressional_tweet_test_data.csv', names=['id', 'favorite_count', 'full_text', 'hashtags', 'retweet_count', 'year', 'party_id'], skipinitialspace=True, skiprows=1, sep=',')

train_df.pop('year')
test_df.pop('year')
test_df.pop('id')

train_df.head()

Unnamed: 0,favorite_count,full_text,hashtags,retweet_count,party_id
0,0,"b""RT @KUSINews: One of our longtime viewers wa...",KUSI,10,R
1,258,"b""Today I'm urging the @CDCgov to immediately ...",Coronavirus,111,R
2,0,"b'Tomorrow, #MO03 seniors graduate from Calvar...",MO03,2,R
3,9,b'Congrats to #TeamUSA and Canton Native @JGre...,TeamUSA WorldJuniors,3,R
4,3,b'Pleased to support @amergateways at their Ju...,ImmigrantHeritageMonth,3,D


In [8]:
train_df['target'] = np.where(train_df['party_id']=='D', 0, 1)

train_df = train_df.drop(columns=['party_id'])

In [9]:
train, val, test = np.split(train_df.sample(frac=1), [int(0.8*len(train_df)), int(0.9*len(train_df))])

print(len(train), 'training examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

474242 training examples
59280 validation examples
59281 test examples


In [10]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  df = dataframe.copy()
  names = df.pop('target')
  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  ds = tf.data.Dataset.from_tensor_slices((dict(df), names))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [11]:
batch_size = 256
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}


In [12]:
[(train_features, label_batch)] = train_ds.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of hashtags:', train_features['hashtags'])
print('A batch of targets:', label_batch )

Every feature: ['favorite_count', 'full_text', 'hashtags', 'retweet_count', 'target']
A batch of hashtags: tf.Tensor(
[[b'COVID19']
 [b'ERAVote ERANow']
 [b'DontDoubleMyRate']
 [b'HurricanHarvey']
 [b'ag ag taxreform']
 [b'NFIP']
 [b'ADA DisabilityRightsAreCivilRights']
 [b'UAS']
 [b'economy immigrationreform']
 [b'mepolitics']
 [b'GOP']
 [b'delIrene']
 [b'NationalWalkoutDay NeverAgain']
 [b'IN03']
 [b'GraduateTogether']
 [b'ABetterDeal']
 [b'CarrollCo']
 [b'StateoftheUnion']
 [b'Yazidis']
 [b'Oklahoma']
 [b'HAPPENINGNOW']
 [b'HigherEd']
 [b'STTA']
 [b'Israel']
 [b'2020Census MakeNCCount']
 [b'NEheroes']
 [b'CalTrans quickmaps']
 [b'GOPShutdown DemandAVote EnoughAlready']
 [b'Enough']
 [b'SU4T']
 [b'BillofRights']
 [b'NY22 Vets']
 [b'RoshHashanah ShanaTova']
 [b'citrus Farmers Florida ag']
 [b'ia03']
 [b'DefundPP ProLife PalinOnPoint']
 [b'EndGunViolence NoBillNoBreak']
 [b'EPA Zika PuertoRico']
 [b'NJ GoTeamUSA']
 [b'PuertoRico USVI']
 [b'Enough']
 [b'mtpol']
 [b'LangevinCybersecurity

In [13]:
def get_normalization_layer(name, dataset):
  normalizer = layers.Normalization(axis=None)
  feature_ds = dataset.map(lambda x, y: x[name])
  normalizer.adapt(feature_ds)
  return normalizer

def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  if dtype == 'string':
    index = layers.StringLookup(max_tokens=max_tokens)
  else:
    index = layers.IntegerLookup(max_tokens=max_tokens)
  feature_ds = dataset.map(lambda x, y: x[name])
  index.adapt(feature_ds)
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())
  return lambda feature: encoder(index(feature))

In [14]:
# Numerical features.

all_inputs = []
encoded_features = []

for header in ['favorite_count', 'retweet_count']:
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, train_ds)
  encoded_numeric_col = normalization_layer(numeric_col)
  all_inputs.append(numeric_col)
  encoded_features.append(encoded_numeric_col)

In [15]:
text_cols = ['full_text', 'hashtags']

for header in text_cols:
  text_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
  encoding_layer = get_category_encoding_layer(name=header, dataset=train_ds, dtype='string', max_tokens=None)
  encoded_text_col = encoding_layer(text_col)
  all_inputs.append(text_col)
  encoded_features.append(encoded_text_col)

In [16]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(all_inputs, output)

In [17]:
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=["accuracy"])

In [18]:
model.fit(train_ds, epochs=1, validation_data=val_ds)

  inputs = self._flatten_to_reference_inputs(inputs)




<keras.callbacks.History at 0x159723a90>

In [27]:
randNum = random.randint(0, 265000)
sample = test_df.iloc[randNum]

print(test_df.iloc[randNum])

input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
predictions = model.predict(input_dict)
probR = tf.nn.sigmoid(predictions[0])

if probR >= 0.5:
    partyName = 'Republican'
    prob = probR
else:
    partyName = 'Democrat'
    prob = (1 - probR)

print(
    "This tweet was tweeted by a %s with a %.1f percent probability." % (partyName, (100 * prob))
)

favorite_count                                                    3
full_text         b'This invaluable program will use research to...
hashtags                                                NIH AllOfUs
retweet_count                                                     1
party_id                                                          D
Name: 234081, dtype: object
This tweet was tweeted by a Democrat with a 77.4 percent probability.
