**Dataset origin:** https://www.unb.ca/cic/datasets/truthseeker-2023.html

*S. Dadkhah, X. Zhang, A. G. Weismann, A. Firouzi and A. A. Ghorbani, "The Largest Social Media Ground-Truth Dataset for Real/Fake Content: TruthSeeker," in IEEE Transactions on Computational Social Systems, 99. 1-15, Oct. 2023.*

Pytorch self-attention: https://docs.pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html

In [1]:
import nltk
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
import torch



In [82]:
PATH_TO_FILE ="/Users/mikhailleontev/PycharmProjects/Attestation/TruthSeeker2023/Truth_Seeker_Model_Dataset.csv"
df = pd.read_csv(PATH_TO_FILE)
print(df.shape)
df.head()

(134198, 9)


Unnamed: 0.1,Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer
0,0,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@POTUS Biden Blunders - 6 Month Update\n\nInfl...,Mostly Agree,Agree
1,1,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,NO MAJORITY,Agree
2,2,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",THE SUPREME COURT is siding with super rich pr...,Agree,Agree
3,3,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@POTUS Biden Blunders\n\nBroken campaign promi...,Mostly Agree,Agree
4,4,D.L. Davis,End of eviction moratorium means millions of A...,True,1.0,"Americans, eviction moratorium",@OhComfy I agree. The confluence of events rig...,Agree,Agree


In [83]:
tweets = df['tweet']
tweets.head(10)

0    @POTUS Biden Blunders - 6 Month Update\n\nInfl...
1    @S0SickRick @Stairmaster_ @6d6f636869 Not as m...
2    THE SUPREME COURT is siding with super rich pr...
3    @POTUS Biden Blunders\n\nBroken campaign promi...
4    @OhComfy I agree. The confluence of events rig...
5    I've said this before, but it really is incred...
6    As many face backlogged rent payments, America...
7    @Thomas1774Paine @JoeBiden\n#DOJ@TheJusticeDep...
8    @SocialismIsDone @TheeKHiveQueenB Its a win fo...
9    @daysofarelives2 @Sen_JoeManchin There is not ...
Name: tweet, dtype: object

In [84]:
labes = df['BinaryNumTarget']
print(labes.value_counts())
labes.head(10)

BinaryNumTarget
1.0    68930
0.0    65268
Name: count, dtype: int64


0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
5    1.0
6    1.0
7    1.0
8    1.0
9    1.0
Name: BinaryNumTarget, dtype: float64

In [85]:
tokenized_tweets = []
i = 0
for tweet in tweets:
    tokens = nltk.word_tokenize(tweet)
    tokenized_tweets.append(tokens)
    i += 1
    if i % 10000 == 0:
        print(f'Tokenized {i} tweets')


Tokenized 10000 tweets
Tokenized 20000 tweets
Tokenized 30000 tweets
Tokenized 40000 tweets
Tokenized 50000 tweets
Tokenized 60000 tweets
Tokenized 70000 tweets
Tokenized 80000 tweets
Tokenized 90000 tweets
Tokenized 100000 tweets
Tokenized 110000 tweets
Tokenized 120000 tweets
Tokenized 130000 tweets


In [86]:
tweet_lengths = [len(tokens) for tokens in tokenized_tweets]
print(f'Mean length of tokenized tweets: {np.mean(tweet_lengths)}')
print(f'Median tweet length: {np.median(tweet_lengths)}')
print(f'Max length of tokenized tweets: {max(tweet_lengths)}')
print(f'Min length of tokenized tweets: {min(tweet_lengths)}')


Mean length of tokenized tweets: 42.12054576074159
Median tweet length: 44.0
Max length of tokenized tweets: 174
Min length of tokenized tweets: 1


In [87]:
CAP_LENGTH = 50
tweets_capped = [tokens[:CAP_LENGTH] for tokens in tokenized_tweets]

In [88]:
# Word2Vec Hyper parameters
VECTOR_SIZE = 64
WINDOW = 5
WORKERS = 4

In [89]:
model_vec = Word2Vec(tweets_capped, vector_size=VECTOR_SIZE, window=WINDOW, min_count=1, workers=WORKERS)

In [90]:
embedded_tweets = []
for tokens in tweets_capped:
    tweet_vector = []
    for token in tokens:
        tweet_vector.append(model_vec.wv[token])
    # Pad with zero vectors if tweet is shorter than CAP_LENGTH
    while len(tweet_vector) < CAP_LENGTH:
        tweet_vector.append(np.zeros(VECTOR_SIZE))
    embedded_tweets.append(tweet_vector)
embedded_tweets = np.array(embedded_tweets) # turn into numpy array
print(embedded_tweets.shape)

(134198, 50, 64)


In [91]:
TEST_SIZE = 0.2
RANDOM_STATE = 20042004
SHUFFLE = True
train_x, test_x, train_y, test_y = train_test_split(embedded_tweets, labes, test_size=TEST_SIZE, random_state=RANDOM_STATE, shuffle=SHUFFLE)
print(f'Train shape: {train_x.shape}, Test shape: {test_x.shape}')
print(f'Train labels shape: {train_y.shape}, Test labels shape: {test_y.shape}')

Train shape: (107358, 50, 64), Test shape: (26840, 50, 64)
Train labels shape: (107358,), Test labels shape: (26840,)


In [93]:
# define torch model
# Hyperparameters
EPOCHS = 2
BATCH_SIZE = 128
LEARNING_RATE = 0.001
NUMBER_OF_HEADS = 4

class TweeterClassifier (torch.nn.Module):
    def __init__(self):
        super(TweeterClassifier, self).__init__()
        self.self_attention = torch.nn.MultiheadAttention(embed_dim=VECTOR_SIZE, num_heads=NUMBER_OF_HEADS)
        self.fc1 = torch.nn.Linear(CAP_LENGTH * VECTOR_SIZE, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        # x shape: (batch_size, CAP_LENGTH, VECTOR_SIZE)
        x = x.permute(1, 0, 2)  # Change to (CAP_LENGTH, batch_size, VECTOR_SIZE)
        attn_output, _ = self.self_attention(x, x, x)
        attn_output = attn_output.permute(1, 0, 2)
        # shape: (batch_size, CAP_LENGTH, VECTOR_SIZE)
        attn_output = attn_output.reshape(attn_output.size(0), -1) # flattening output
        out = self.fc1(attn_output)
        out = self.sigmoid(out)
        return out

    def get_attention_weights(self):
        return self.self_attention.in_proj_weight


In [94]:
# Initialize model, loss function, and optimizer
model = TweeterClassifier()
criterion = torch.nn.BCELoss() # Binary Cross Entropy Loss
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [56]:
# Training loop
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0
    for i in range(0, len(train_x), BATCH_SIZE):
        batch_x = torch.tensor(train_x[i:i+BATCH_SIZE], dtype=torch.float32)
        batch_y = torch.tensor(train_y[i:i+BATCH_SIZE].values, dtype=torch.float32).view(-1, 1)
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f'Epoch {epoch+1}/{EPOCHS}, Loss: {epoch_loss/len(train_x)}')

Epoch 1/5, Loss: 0.006537646255230623
Epoch 2/5, Loss: 0.005141963395650173
Epoch 3/5, Loss: 0.00453762426264793
Epoch 4/5, Loss: 0.004121792647277579
Epoch 5/5, Loss: 0.00380176213164317


In [68]:
accuracy = 0
model.eval()
with torch.no_grad(): # Disable gradient calculation for evaluation
    for i in range(0, len(test_x), BATCH_SIZE):
        batch_x = torch.tensor(test_x[i:i+BATCH_SIZE], dtype=torch.float32)
        batch_y = test_y[i:i+BATCH_SIZE].values
        outputs = model(batch_x)
        predicted = (outputs.numpy() > 0.5).astype(int)
        accuracy += sum((predicted.flatten() == batch_y))
accuracy = accuracy / len(test_y)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 90.15%
