In [20]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import layers

In [4]:
train_df = pd.read_csv('/Users/jacobjohnson/data_sets/congressional_tweet_training_data.csv', names=['favorite_count', 'full_text', 'hashtags', 'retweet_count', 'year', 'party_id'], 
                    # dtype={'favorite_count': int, 'full_text': str, 'hashtags': str, 'retweet_count': int, 'year': int, 'party_id': str}, 
                    skipinitialspace=True, skiprows=1, sep=',')

test_df = pd.read_csv('/Users/jacobjohnson/data_sets/congressional_tweet_test_data.csv', names=['id', 'favorite_count_test', 'full_text_test', 'hashtags_test', 'retweet_count_test', 'year', 'party'], 
                    # dtype={'id': int, 'favorite_count': int, 'full_text': str, 'hashtags': str, 'retweet_count': int, 'year': int, 'party': str}, 
                    skipinitialspace=True, skiprows=1, sep=',')

train_df.pop('year')
test_df.pop('year')

# test_df.head()
train_df.head()

# target variable
# party = train_df.pop('party_id')



Unnamed: 0,favorite_count,full_text,hashtags,retweet_count,party_id
0,0,"b""RT @KUSINews: One of our longtime viewers wa...",KUSI,10,R
1,258,"b""Today I'm urging the @CDCgov to immediately ...",Coronavirus,111,R
2,0,"b'Tomorrow, #MO03 seniors graduate from Calvar...",MO03,2,R
3,9,b'Congrats to #TeamUSA and Canton Native @JGre...,TeamUSA WorldJuniors,3,R
4,3,b'Pleased to support @amergateways at their Ju...,ImmigrantHeritageMonth,3,D


In [5]:
train, val, test = np.split(train_df.sample(frac=1), [int(0.8*len(train_df)), int(0.9*len(train_df))])

print(len(train), 'training examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

474242 training examples
59280 validation examples
59281 test examples


In [6]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  df = dataframe.copy()
  names = df.pop('party_id')
  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  ds = tf.data.Dataset.from_tensor_slices((dict(df), names))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [7]:
# batch_size = 5
# train_ds = df_to_dataset(train, batch_size=batch_size)

batch_size = 256
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
2022-05-06 16:14:14.098628: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}


In [8]:
[(train_features, label_batch)] = train_ds.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of hashtags:', train_features['hashtags'])
print('A batch of targets:', label_batch )

Every feature: ['favorite_count', 'full_text', 'hashtags', 'retweet_count', 'party_id']
A batch of hashtags: tf.Tensor(
[[b'HappyHolidays']
 [b'SickleCellAwareness']
 [b'Hero']
 [b'OpportunityZones']
 [b'constitutionday2020']
 [b'PittsburghKneelers']
 [b'VA10']
 [b'Gitmo']
 [b'ProtectAllWorkers']
 [b'NY13']
 [b'DREAMers DreamAct']
 [b'WA01']
 [b'InternationalWomensDay']
 [b'ACA CBCHBT']
 [b'IL02Proud']
 [b'GetCovered']
 [b'whokilledlaurapalmer twinpeaks']
 [b'NY21']
 [b'Gorsuch SCOTUS']
 [b'smallbiz']
 [b'EqualityAct']
 [b'ProtectOurCare']
 [b'Trumpcare']
 [b'NationalFarmersDay']
 [b'MI08']
 [b'COVID19 CA29']
 [b'WRRDA America']
 [b'ClintonPoliceDept ThinBlueLine ultimatesacrifice']
 [b'Trump TPS']
 [b'WRDA2020 CA38']
 [b'EstamosUnidosVE Venezuela']
 [b'NARLegislative']
 [b'SCOTUS Obamacare ACA tcot hcr']
 [b'AffordableCareAct ACAWorks 20MillionStrong']
 [b'ACA ACATurns3']
 [b'OH2']
 [b'FL25']
 [b'HurricaneDorian']
 [b'wontlast summerscomingsoon']
 [b'TrumpCaves China Trump']
 [b'Prote

In [9]:
def get_normalization_layer(name, dataset):
  normalizer = layers.Normalization(axis=None)
  feature_ds = dataset.map(lambda x, y: x[name])
  normalizer.adapt(feature_ds)
  return normalizer

def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  if dtype == 'string':
    index = layers.StringLookup(max_tokens=max_tokens)
  else:
    index = layers.IntegerLookup(max_tokens=max_tokens)
  feature_ds = dataset.map(lambda x, y: x[name])
  index.adapt(feature_ds)
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())
  return lambda feature: encoder(index(feature))

In [10]:
# Numerical features.

all_inputs = []
encoded_features = []

for header in ['favorite_count', 'retweet_count']:
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, train_ds)
  encoded_numeric_col = normalization_layer(numeric_col)
  all_inputs.append(numeric_col)
  encoded_features.append(encoded_numeric_col)

In [11]:
text_cols = ['full_text', 'hashtags']

for header in text_cols:
  text_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
  encoding_layer = get_category_encoding_layer(name=header,
                                               dataset=train_ds,
                                               dtype='string',
                                               max_tokens=None)
  encoded_text_col = encoding_layer(text_col)
  all_inputs.append(text_col)
  encoded_features.append(encoded_text_col)

In [12]:
# party_id_col = tf.keras.Input(shape=(1,), name='party_id', dtype='string')

# encoding_layer = get_category_encoding_layer(name='party_id',
#                                              dataset=train_ds,
#                                              dtype='string',
#                                              max_tokens=None)
# encoded_party_id_col = encoding_layer(party_id_col)
# all_inputs.append(party_id_col)
# encoded_features.append(encoded_party_id_col)


# test_party_id_col = train_features['party_id']
# test_party_id_layer = get_category_encoding_layer(name='party_id',
#                                              dataset=train_ds,
#                                              dtype='string',
#                                              max_tokens=None)
# test_party_id_layer(test_party_id_col)

In [13]:
# JUST TO TEST ENCODING

# test_retweet_count_col = train_features['retweet_count']
# test_retweet_count_layer = get_normalization_layer(name='retweet_count', dataset=train_ds)
# test_retweet_count_layer(test_retweet_count_col)

# test_favorite_count_col = train_features['favorite_count']
# test_favorite_count_layer = get_normalization_layer(name='favorite_count', dataset=train_ds)
# test_favorite_count_layer(test_favorite_count_col)

# test_full_text_col = train_features['full_text']
# test_full_text_layer = get_category_encoding_layer(name='full_text', dataset=train_ds, dtype='string')
# test_full_text_layer(test_full_text_col)

# test_hashtags_col = train_features['hashtags']
# test_hashtags_layer = get_category_encoding_layer(name='hashtags', dataset=train_ds, dtype='string')
# test_hashtags_layer(test_hashtags_col)

In [14]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(all_inputs, output)

In [15]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=["accuracy"])

In [19]:
model.inputs

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'favorite_count')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'retweet_count')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'full_text')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'hashtags')>]

In [16]:
model.fit(train_ds, epochs=10, validation_data=val_ds)

# model.summary()

Epoch 1/10


  inputs = self._flatten_to_reference_inputs(inputs)
2022-05-06 16:14:42.425391: W tensorflow/core/framework/op_kernel.cc:1722] OP_REQUIRES failed at cast_op.cc:121 : UNIMPLEMENTED: Cast string to float is not supported


UnimplementedError: Graph execution error:

Detected at node 'binary_crossentropy/Cast' defined at (most recent call last):
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/traitlets/config/application.py", line 846, in launch_instance
      app.start()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 595, in run_forever
      self._run_once()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 1881, in _run_once
      handle._run()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 504, in dispatch_queue
      await self.process_one()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 493, in process_one
      await dispatch(*args)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 400, in dispatch_shell
      await result
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 724, in execute_request
      reply_content = await reply_content
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 390, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2863, in run_cell
      result = self._run_cell(
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2909, in _run_cell
      return runner(coro)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3106, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3309, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3369, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/nf/brcqg8zx7xg31hjn_xbd2jc80000gp/T/ipykernel_48827/4003320285.py", line 1, in <cell line: 1>
      model.fit(train_ds, epochs=10, validation_data=val_ds)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/training.py", line 1384, in fit
      tmp_logs = self.train_function(iterator)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/training.py", line 1021, in train_function
      return step_function(self, iterator)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/training.py", line 1010, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/training.py", line 1000, in run_step
      outputs = model.train_step(data)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/training.py", line 860, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/training.py", line 918, in compute_loss
      return self.compiled_loss(
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 201, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/losses.py", line 141, in __call__
      losses = call_fn(y_true, y_pred)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/losses.py", line 245, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/losses.py", line 1922, in binary_crossentropy
      y_true = tf.cast(y_true, y_pred.dtype)
Node: 'binary_crossentropy/Cast'
Cast string to float is not supported
	 [[{{node binary_crossentropy/Cast}}]] [Op:__inference_train_function_16404]