In [1]:
import os
import pickle
import torch
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm.auto import tqdm
from tensorflow.keras import Model
import tensorflow_recommenders as tfrs
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForCausalLM
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, Reshape

2024-03-14 13:38:12.573236: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [30]:
# set seeds in order to get consistent random results
os.environ['PYTHONHASHSEED'] = str(42)
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
tqdm.pandas()

# load data
df = pd.read_parquet('data/labels.parquet')[:1000].drop_duplicates(subset=['tweet'])

In [31]:
df.head()

Unnamed: 0,user_id,label,tweet,year
0,360666534,0,Elecciones2020 | En Florida: JoeBiden dice q...,2020
1,8436472,0,"Trump: As a student I used to hear for years, ...",2020
2,828355589206056960,0,2 hours since last tweet from Trump! Maybe he...,2020
3,47413798,0,You get a tie! And you get a tie! Trump ‘s ra...,2020
4,1138416104,0,CLady62 Her 15 minutes were over long time ago...,2020


In [11]:
# encode labels
le = LabelEncoder()
df['user_id'] = le.fit_transform(df['user_id'])
df['year_order'] = le.fit_transform(df['year'])
df['tweet_id'] = le.fit_transform(df['tweet'])

# split dataset
(train_data, val_data) = train_test_split(df, test_size=0.1)
train_data.to_parquet('data/train_data.parquet')
val_data.to_parquet('data/val_data.parquet')

In [12]:
# make embeddings
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')
tweet_list = df['tweet'].drop_duplicates().tolist()
embeddings = []

In [13]:
for tweet in tqdm(tweet_list):
    prompt = \
        '''Below is a Tweet that discusses the Presidential Election. Write a response that appropriately describes the author's political position.
'''
    prompt += f"### Tweet:{tweet}\n\n### Response:"
    input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    outputs = model(input_ids, output_hidden_states=True)
    embeddings.append(outputs.hidden_states[-1][:, -1, :].detach().numpy())
pickle.dump(embeddings, open('gpt2_embeddings.pkl', 'wb'))

  0%|          | 0/988 [00:00<?, ?it/s]

In [23]:
# build the models
def get_model(dim=50):
    weights = np.vstack(pickle.load(open('gpt2_embeddings.pkl', 'rb')))
    user_id = Input(name='user_id', shape=(1, ))
    year_id = Input(name='year_id', shape=(1, ))
    tweet_id = Input(name='tweet_id', shape=(1, ))

    # first embedding: user's label
    x1 = Embedding(df['user_id'].max() + 1, dim, name='user_embedding')(user_id)
    x1 = Reshape((dim, ), name='reshape1')(x1)

    # second embedding: tweet
    tweet_embedding = Embedding(weights.shape[0], weights.shape[1], weights=[weights], trainable=False, name='tweet_embedding')(tweet_id)
    x2 = Dense(dim, name='tweet_embedding2')(tweet_embedding) 
    x2 = Reshape((dim, ), name='reshape2')(x2)

    # third embeddings: year
    x3 = Embedding(df['year_order'].max() + 1, dim, name='year_embedding')(year_id)
    x3 = Reshape((dim, ), name='reshape3')(x3)

    # combine the above three embeddings
    x = [x1, x2, x3]
    x = tf.concat(x, axis=1, name='concat1')
    for i in range(3):
        x = tfrs.layers.dcn.Cross(projection_dim=dim*3, kernel_initializer="glorot_uniform", name=f'cross_layer_{i}')(x)
        x = Dropout(0.2)(x)
    for i in range(3):
        x = Dense(dim*3, activation="relu", name=f'dense_layer_{i}')(x)
        x = Dropout(0.2)(x)
    
    # generate predictions
    out = Dense(1, activation='sigmoid', name="out")(x)

    # model building
    inputs = {'user_id': user_id, 'tweet_id': tweet_id, 'year_id': year_id}
    model = Model(inputs=inputs, outputs=out)
    return model

In [24]:
# model training
def train(train, val):
    model = get_model()
    # make the features for training and validating sets
    train_features = {
        'user_id': np.array(train['user_id']),
        'year_order': np.array(train['year_order'])
    }
    val_features = {
        'user_id': np.array(val['user_id']),
        'year_order': np.array(val['year_order'])
    }
    train_label = np.array(train['label'])
    val_label = np.array(val['label'])

    # model compiling
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=2e-5,
        decay_steps=80000,
        decay_rate=0.96,
        staircase=True
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[
            tf.keras.metrics.AUC(),
            tf.keras.metrics.BinaryAccuracy()
        ]
    )
    history = model.fit(
        train_features,
        train_label,
        validation_data=(val_features, val_label),
        batch_size=8,
        epochs=10,
        verbose=1,
        use_multiprocessing=True,
        workers=20
    )

    return model, history

In [25]:
# run the models
train_data = pd.read_parquet('data/train_data.parquet')
val_data = pd.read_parquet('data/val_data.parquet')
trained_model, training_history = train(train_data, val_data)
trained_model.save_weights('data/missing_imputation.h5')

Epoch 1/10


ValueError: in user code:

    File "/Users/haoyufu/opt/anaconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/Users/haoyufu/opt/anaconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/haoyufu/opt/anaconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/Users/haoyufu/opt/anaconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 1150, in train_step
        y_pred = self(x, training=True)
    File "/Users/haoyufu/opt/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/haoyufu/opt/anaconda3/lib/python3.9/site-packages/keras/src/engine/input_spec.py", line 197, in assert_input_compatibility
        raise ValueError(

    ValueError: Missing data for input "tweet_id". You passed a data dictionary with keys ['user_id', 'year_order']. Expected the following keys: ['tweet_id', 'user_id', 'year_id']


In [None]:
# model evaluations
prediction_model = get_model()
prediction_model.load_weights('data/missing_imputation.h5')
val_features = {
    'user_id': np.array(val_data['yearid_id']),
    'year_id': np.array(val_data['year_order'])
}
pred = prediction_model.predict(val_features)
obs = val_data['binarized'].values
# roc_auc = roc_auc_score(obs, pred)
accuracy = accuracy_score(obs, pred > 0.5)
# print(f"ROC AUC Score: {roc_auc:.3f}")
print(f"Accuracy Score: {accuracy:.3f}")