# Training a model with all the posts for an author (with at least 5 posts)

In [None]:
# imports
import gzip
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split





import torch
import torch.nn as nn
from transformers import AutoModel, BertModel
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertTokenizerFast, DataCollatorWithPadding, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score
from torch.nn.functional import cross_entropy


In [57]:
# Helper functions
# || Load the persona data || #
def load_pickle(file_path):
    """Read a pickled object from ``file_path`` and return it.

    The file is opened in binary mode and the open handle is passed to
    ``pd.read_pickle``. Unpickling can run arbitrary code, so this should
    only be pointed at files from a trusted source.
    """
    with open(file_path, mode='rb') as handle:
        return pd.read_pickle(handle)

def load_csv(file_path):
    """Parse the CSV file at ``file_path`` into a pandas DataFrame.

    The file is opened in binary mode and the open handle is passed to
    ``pd.read_csv`` with its default options.
    """
    with open(file_path, mode='rb') as handle:
        return pd.read_csv(handle)

def save_pickle(training_data, file_path):
    """Pickle ``training_data`` to ``file_path``, overwriting any existing file.

    The destination is opened in binary write mode and the open handle is
    passed to ``pd.to_pickle``.
    """
    with open(file_path, mode='wb') as sink:
        pd.to_pickle(training_data, sink)

def save_csv(training_data, file_path):
    """Write ``training_data`` as CSV to ``file_path``, overwriting any existing file.

    ``to_csv`` is called with its defaults, so the frame's index is written
    as the first column.
    """
    with open(file_path, mode='wb') as sink:
        training_data.to_csv(sink)

def create_persona_embeddings(grouped_data, model):
    """Build one averaged embedding per author.

    For every row of ``grouped_data`` the value in ``'fulltext'`` is passed
    to ``model.encode`` and the result is averaged along axis 0.

    Args:
        grouped_data: DataFrame with an ``'author_fullname'`` column and a
            ``'fulltext'`` column (presumably a list of post texts per
            author; verify against the grouping step).
        model: Object exposing ``encode(posts)`` that returns an array-like
            with one embedding per post.

    Returns:
        dict mapping each ``author_fullname`` to its mean embedding.
    """
    return {
        record['author_fullname']: np.mean(model.encode(record['fulltext']), axis=0)
        for _, record in grouped_data.iterrows()
    }


In [58]:
# # Get data


# # Load the persona data
# data = load_pickle('../data/social_chemistry_posts.gzip')
# data = data.dropna(subset=['author_fullname'])  # Drop rows with a missing author_fullname, because no persona can be created for them

# # keep only fulltext and author_fullname columns
# data = data[['fulltext', 'author_fullname']]
# # print(data)

# # filter out authors with fewer than 5 posts
# filtered_authors_counts = data['author_fullname'].value_counts()
# filtered_authors = data[data['author_fullname'].isin(filtered_authors_counts[filtered_authors_counts >= 5].index)]
# # print(filtered_authors)

# # grouped by author_fullname
# grouped = filtered_authors.groupby('author_fullname').agg(list).reset_index()
# # print(grouped.head())


# # create an embedding for each post and then average them to get the persona embedding


# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# author_embeddings = create_persona_embeddings(grouped, embedding_model)

# print(len(author_embeddings["t2_17vpaz83"]))

# # Save the persona embeddings to a file
# with open('../data/post_aggragate_embeddings.pkl', 'wb') as f:
#     pd.to_pickle(author_embeddings, f)

# print("Persona embeddings created and saved to ../data/post_aggragate_embeddings.pkl")

In [59]:
# Per-author persona embeddings produced by the (commented-out) cell above
persona_embeddings = load_pickle(file_path='../data/post_aggragate_embeddings.pkl')

# Comment-level rows that the personas are attached to
training_data = load_csv(file_path='../data/social_comments_filtered.csv')

# append the persona embeddings to the training data
def append_persona_embeddings(training_data, persona_embeddings):
    """Return ``training_data`` with a ``'persona_embedding'`` column added.

    Each row's ``'author_fullname'`` is looked up in ``persona_embeddings``;
    authors missing from the mapping get NaN in the new column.

    The input frame is left untouched and a new frame is returned. This
    avoids surprising in-place mutation of the caller's data, and avoids
    pandas' SettingWithCopyWarning when a slice of another frame is passed.

    Args:
        training_data: DataFrame containing an ``'author_fullname'`` column.
        persona_embeddings: Mapping from author id to that author's embedding.

    Returns:
        A new DataFrame with the extra ``'persona_embedding'`` column.
    """
    persona_column = training_data['author_fullname'].map(persona_embeddings)
    return training_data.assign(persona_embedding=persona_column)

training_data = append_persona_embeddings(
    training_data=training_data,
    persona_embeddings=persona_embeddings,
)

# Persist the enriched frame twice: as a pickle and as a CSV
save_pickle(training_data=training_data, file_path='../data/post_level_model/training_data.pkl')
save_csv(training_data=training_data, file_path='../data/post_level_model/training_data.csv')




In [60]:
class SentBertClassifier(nn.Module):
    """Transformer-encoder text classifier with an optional extra embedding input.

    The text is encoded by a pretrained ``AutoModel``; the hidden state at
    sequence position 0 is passed through dropout, a linear layer and ReLU.
    When ``use_embeddings`` is true, that representation is concatenated
    with a caller-supplied embedding vector and passed through one more
    linear + ReLU stage before the output layer.

    Args:
        use_embeddings: Whether to build the layers that combine the text
            representation with an external embedding vector.
        embedding_dim: Size of the external embedding vector.
        num_outputs: Number of output logits.
        sbert_dim: Hidden size of the pretrained encoder.
        sbert_model: Name or path handed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, use_embeddings=True, embedding_dim=384,
                 num_outputs=2, sbert_dim=384,
                 sbert_model='sentence-transformers/all-MiniLM-L6-v2'):
        super().__init__()
        print("Initializing with embedding input layer:", use_embeddings)

        self.model = AutoModel.from_pretrained(sbert_model)
        self.dropout = nn.Dropout(0.2)
        self.linear1 = nn.Linear(sbert_dim, sbert_dim // 2)
        self.relu = nn.ReLU()
        self.use_embeddings = use_embeddings

        if use_embeddings:
            # Combine SBERT + embedding vector
            comb_in_dim = sbert_dim // 2 + embedding_dim
            self.combine_linear = nn.Linear(comb_in_dim, comb_in_dim // 2)
            self.output_layer = nn.Linear(comb_in_dim // 2, num_outputs)
        else:
            self.output_layer = nn.Linear(sbert_dim // 2, num_outputs)

    def forward(self, input_ids, attention_mask, embedding_vector=None):
        """Compute class logits for a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            embedding_vector: External embedding per example. Required when
                the model was built with ``use_embeddings=True``; ignored
                otherwise.

        Returns:
            Tensor of logits produced by the output layer.

        Raises:
            ValueError: If the model was built with ``use_embeddings=True``
                but no ``embedding_vector`` is given.
        """
        if self.use_embeddings and embedding_vector is None:
            # Without this check the output layer would receive the
            # un-combined representation and fail with an opaque shape error.
            raise ValueError(
                "embedding_vector is required when the model is built with "
                "use_embeddings=True"
            )

        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first token (typically [CLS] for BERT-style
        # tokenizers) serves as the sequence representation.
        pooled = outputs.last_hidden_state[:, 0]
        x = self.relu(self.linear1(self.dropout(pooled)))

        if self.use_embeddings:
            # Match the activations' dtype so e.g. float64 embeddings do not
            # cause a dtype mismatch inside the following linear layer.
            embedding_vector = embedding_vector.to(dtype=x.dtype)
            x = torch.cat([x, embedding_vector], dim=1)
            x = self.relu(self.combine_linear(x))

        logits = self.output_layer(x)
        return logits

In [61]:
# Read back the training frame that was pickled together with the persona embeddings
data = load_pickle('../data/post_level_model/training_data.pkl')

# Discard columns that are not used below, then rows lacking text, label or persona
unused_columns = ['id', 'permalink', 'parent_id', 'author_name']
required_columns = ['body', 'label', 'persona_embedding']
data = data.drop(columns=unused_columns).dropna(subset=required_columns)

# NOTE: only the first 100 remaining rows are kept, so everything below
# runs on a 100-row subset
data = data[:100]

# Use the first CUDA device when one is available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 80/20 train/test split with a fixed seed
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

print(train_data.head())

# Tokenizer for the same checkpoint name that SentBertClassifier defaults to
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Tokenize the training data
def tokenize_function(data, max_length=512, text_tokenizer=None):
    """Tokenize comment bodies and attach persona embeddings and labels.

    Args:
        data: DataFrame with ``'body'`` (text), ``'persona_embedding'``
            (one equal-length numeric vector per row) and ``'label'`` columns.
        max_length: Length every sequence is padded/truncated to.
        text_tokenizer: Callable tokenizer to use. Defaults to the
            module-level ``tokenizer``.

    Returns:
        The tokenizer's output with two extra entries:
        ``"persona_embedding"`` (float32 tensor, one row per example) and
        ``"labels"`` (long tensor; 1 where the label equals ``"YTA"``,
        0 for every other label).
    """
    if text_tokenizer is None:
        text_tokenizer = tokenizer

    tokenized = text_tokenizer(
        data['body'].tolist(),
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

    # Build one contiguous float32 matrix first: calling torch.tensor on a
    # list of ndarrays is the slow path PyTorch warns about, and it would
    # let a float64 dtype leak into the model input.
    persona_matrix = np.asarray(data["persona_embedding"].tolist(), dtype=np.float32)
    tokenized["persona_embedding"] = torch.from_numpy(persona_matrix)

    tokenized["labels"] = torch.tensor(
        [1 if label == "YTA" else 0 for label in data["label"].tolist()],
        dtype=torch.long
    )
    return tokenized


# Encode both splits
train_encodings = tokenize_function(train_data)
test_encodings = tokenize_function(test_data)

print(train_encodings["persona_embedding"].shape)

# show which fields the encoding holds
print(train_encodings.keys())

# The order of these fields fixes the positional layout of every batch
# (batch[0] .. batch[3]) consumed by the training and evaluation loops.
_DATASET_FIELDS = ("input_ids", "attention_mask", "labels", "persona_embedding")

train_dataset = torch.utils.data.TensorDataset(
    *(train_encodings[field] for field in _DATASET_FIELDS)
)
test_dataset = torch.utils.data.TensorDataset(
    *(test_encodings[field] for field in _DATASET_FIELDS)
)

# Only the training loader is shuffled
BATCH_SIZE = 16
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

Train data shape: (80, 4)
Test data shape: (20, 4)
      label                                               body  \
17251   NTA  NTA not only did you inform her about the movi...   
25565   YTA  YTA You never made any kind of formal agreemen...   
7859    YTA  YTA. You are a stay-at-home-mom. If it was agr...   
12340   NTA  NTA That is not his business. I understand he ...   
21171   YTA  YTA It can be really hard as a speaker to have...   

      author_fullname                                  persona_embedding  
17251     t2_1hu208zh  [-0.0154501535, 0.0719409, 0.006097397, -0.014...  
25565     t2_2yhxhifg  [0.015882641, 0.034143534, 0.039112315, 0.0099...  
7859      t2_17vpaz83  [0.022095518, 0.058831934, -0.03174363, -0.013...  
12340        t2_ehvk2  [0.02064454, 0.024615703, 0.004179083, 0.02008...  
21171        t2_ehvk2  [0.02064454, 0.024615703, 0.004179083, 0.02008...  
torch.Size([80, 384])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'persona_embedding',

In [62]:
# Training loop
def train_model(model, train_dataloader, criterion, optimizer, num_epochs):
    """Train ``model`` for ``num_epochs`` passes over ``train_dataloader``.

    Each batch is indexed positionally as
    ``(input_ids, attention_mask, labels, persona_embedding)``. Tensors are
    moved to the device of the model's parameters, so the function does not
    depend on a module-level ``device`` variable.

    Args:
        model: Module called as
            ``model(input_ids, attention_mask, embedding_vector=...)``.
        train_dataloader: Iterable of batches in the layout above.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over the dataloader.

    Returns:
        List with the mean batch loss of every epoch, in order.

    Raises:
        ValueError: If the dataloader yields no batches (previously this
            surfaced as a ZeroDivisionError when averaging the loss).
    """
    # Fall back to CPU only for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    epoch_losses = []
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0

        for batch in train_dataloader:
            input_ids, attention_mask, labels, persona_embedding = (
                tensor.to(target_device) for tensor in batch[:4]
            )

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, embedding_vector=persona_embedding)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_dataloader yielded no batches")

        avg_loss = total_loss / num_batches
        epoch_losses.append(avg_loss)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}")

    return epoch_losses

In [63]:
# Evaluate model
def evaluate_model(model, test_dataloader, criterion):
    """Evaluate ``model`` on ``test_dataloader`` and report loss, accuracy and F1.

    Each batch is indexed positionally as
    ``(input_ids, attention_mask, labels, persona_embedding)``. Tensors are
    moved to the device of the model's parameters, so the function does not
    depend on a module-level ``device`` variable. Gradients are disabled and
    the model is put in eval mode.

    Args:
        model: Module called as
            ``model(input_ids, attention_mask, embedding_vector=...)``.
        test_dataloader: Iterable of batches in the layout above.
        criterion: Loss callable taking ``(outputs, labels)``.

    Returns:
        dict with ``"loss"`` (mean batch loss), ``"accuracy"`` and ``"f1"``
        (weighted F1); the same values are also printed.

    Raises:
        ValueError: If the dataloader yields no batches (previously this
            surfaced as a ZeroDivisionError when averaging the loss).
    """
    # Fall back to CPU only for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    predictions = []
    true_labels = []
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels, persona_embedding = (
                tensor.to(target_device) for tensor in batch[:4]
            )

            outputs = model(input_ids, attention_mask, embedding_vector=persona_embedding)
            total_loss += criterion(outputs, labels).item()
            num_batches += 1

            # argmax over the class dimension turns logits into class ids
            predictions.extend(outputs.argmax(dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    if num_batches == 0:
        raise ValueError("test_dataloader yielded no batches")

    # Calculate accuracy and F1 score
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')
    avg_loss = total_loss / num_batches

    print(f"Test Loss: {avg_loss}")
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")

    return {"loss": avg_loss, "accuracy": accuracy, "f1": f1}


In [69]:
# Number of passes over the training dataloader handed to train_model
EPOCHS = 3
# Learning rate passed to the AdamW optimizers
LEARNING_RATE = 2e-5

In [70]:
# Train model with embeddings

# Build the classifier with the persona-embedding branch enabled and move it
# to the selected device
model = SentBertClassifier(
    use_embeddings=True,
    embedding_dim=384,
    num_outputs=2,
    sbert_dim=384,
    sbert_model='sentence-transformers/all-MiniLM-L6-v2',
).to(device)

# Cross-entropy loss, AdamW over all model parameters
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Fit on the training split, then report metrics on the held-out split
train_model(model, train_dataloader, criterion, optimizer, EPOCHS)
evaluate_model(model, test_dataloader, criterion)



Initializing with embedding input layer: True




Epoch 1/3, Loss: 0.7102759003639221
Epoch 2/3, Loss: 0.6865322709083557
Epoch 3/3, Loss: 0.6606812238693237
Test Loss: 0.6341192424297333
Accuracy: 0.85
F1 Score: 0.7810810810810811


In [67]:
# Train model without embeddings

# Same classifier with the persona-embedding branch disabled; the persona
# vectors in each batch are still passed in but ignored by forward()
model_no_emb = SentBertClassifier(
    use_embeddings=False,
    num_outputs=2,
    sbert_dim=384,
    sbert_model='sentence-transformers/all-MiniLM-L6-v2',
).to(device)

# Cross-entropy loss, AdamW over all model parameters
criterion_no_emb = nn.CrossEntropyLoss()
optimizer_no_emb = AdamW(model_no_emb.parameters(), lr=LEARNING_RATE)

# Fit on the training split, then report metrics on the held-out split
train_model(model_no_emb, train_dataloader, criterion_no_emb, optimizer_no_emb, EPOCHS)
evaluate_model(model_no_emb, test_dataloader, criterion_no_emb)


Initializing with embedding input layer: False
Epoch 1/3, Loss: 0.6198644876480103
Epoch 2/3, Loss: 0.5616430878639221
Epoch 3/3, Loss: 0.49254134893417356
Test Loss: 0.4147469401359558
Accuracy: 0.85
F1 Score: 0.7810810810810811
