In [4]:
import os

# Set the environment variable
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
# Load data into Pandas DataFrame
df = pd.read_csv('2017_1.csv', on_bad_lines='skip', engine = 'python')
print('Data loaded!')

# Wrangle DataFrame
df = df[df["body"].notnull() & (df["body"] != "")]  # Filter rows with valid text

# Load pretrained model and tokenizer
model_path = "trained_model_gral_imbd.pth"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = AutoModel.from_pretrained("distilbert-base-uncased", output_hidden_states=True)
model.load_state_dict(torch.load(model_path, map_location=torch.device(device)), strict=False)
model.eval().to(device)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Data loaded!


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  model.load_state_dict(torch.load(model_path, map_location=torch.device(device)), strict=False)


In [9]:
# Function to compute (embeddings) for text
def compute_embeddings(text_list):
  embeddings = []
  batch_size = 1  # Adjust based on memory availability
  for i in range(0, len(text_list), batch_size):
    batch = text_list[i:i+batch_size]
    inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
      outputs = model(**inputs)

        # Extract the last hidden state of the first token (CLS token)
    embeddings.append(outputs.hidden_states[-1][:, 0, :].cpu())

        # Clear CUDA cache to manage memory
    torch.cuda.empty_cache()

  return torch.cat(embeddings)

# Compute embeddings for the dataset
df["logit_embed"] = compute_embeddings(df["body"].tolist()).cpu().numpy().tolist()

In [10]:
labels = ['LEFT', 'RIGHT', 'CENTER', 'UNDEFINED']
mean_embeddings = {}
#rando comm
for label in labels:
    label_embeddings = df[df["political_leaning"] == label]["logit_embed"].apply(torch.tensor).tolist()
    if label_embeddings:
        mean_embeddings[label] = torch.mean(torch.stack(label_embeddings), dim=0)
    else:
        mean_embeddings[label] = torch.zeros(4)  # Default embedding if no data

# Function to classify based on cosine similarity using PyTorch
def classify_embedding(embedding):
    embedding_tensor = torch.tensor(embedding, device=device)
    similarities = {
        label: torch.nn.functional.cosine_similarity(embedding_tensor.unsqueeze(0), mean_embeddings[label].unsqueeze(0).to(device)).item()
        for label in mean_embeddings
    }
    return max(similarities, key=similarities.get)

# Apply classification
df["pred_class"] = df["logit_embed"].apply(classify_embedding)
print(mean_embeddings)

{'LEFT': tensor([-1.4963e-01, -2.0931e-01, -1.8463e-01, -1.6202e-01, -6.5144e-02,
        -2.7861e-02,  1.3247e-01,  6.9593e-02, -1.1476e-01, -1.8346e-01,
        -1.2039e-01, -9.8362e-02, -2.2226e-01,  2.8408e-01, -1.5265e-01,
         1.1337e-01, -4.3750e-02,  2.3086e-01,  1.9741e-01,  6.6621e-02,
        -4.5887e-03, -2.1860e-01,  3.2513e-01,  1.6916e-01,  6.6777e-02,
        -2.6662e-01,  8.6275e-02, -2.1229e-01, -9.9476e-02,  1.5161e-01,
         1.7733e-01,  2.2769e-01, -2.3935e-01, -3.1069e-01,  1.4558e-01,
        -8.5969e-02,  1.7098e-01, -1.4431e-01,  1.8692e-02,  2.3472e-01,
        -1.5090e-01,  1.3458e-01, -8.9212e-02,  1.3192e-02,  6.0995e-02,
         3.5916e-02, -3.0172e+00,  5.6869e-02, -1.0708e-01, -1.9186e-01,
         1.6389e-01,  1.4584e-01,  1.2914e-02,  3.0634e-01,  3.5728e-01,
         1.8229e-01, -4.8397e-01,  1.1201e-01, -7.2116e-02, -1.1158e-01,
         1.6798e-01,  1.6726e-01, -1.8003e-01, -1.2770e-01,  2.4625e-02,
        -1.6217e-01, -1.2826e-01,  3.9068e

In [11]:
# Initialize metrics
metrics_dict = {}
classes = ["LEFT", "RIGHT", "CENTER", "UNDEFINED"]

for label in classes:
    print(f"Calculating performance metrics for {label}...", end = "")
    TP = len(df[(df["pred_class"] == label) & (df["political_leaning"] == label)])
    FP = len(df[(df["pred_class"] == label) & (df["political_leaning"] != label)])
    FN = len(df[(df["pred_class"] != label) & (df["political_leaning"] == label)])
    TN = len(df[(df["pred_class"] != label) & (df["political_leaning"] != label)])

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    metrics_dict[label] = {
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1_score
    }
    print("COMPLETE!")

# Calculate overall accuracy
accuracy = len(df[df["pred_class"] == df["political_leaning"]]) / len(df)

# Display results
print(f"For Dataset 2017_1 Overall Accuracy: {accuracy:.2f}")
for label, metrics in metrics_dict.items():
    print(f"\nMetrics for {label}:")
    print(f"  Precision: {metrics['Precision']:.2f}")
    print(f"  Recall: {metrics['Recall']:.2f}")
    print(f"  F1 Score: {metrics['F1 Score']:.2f}")

Calculating performance metrics for LEFT...COMPLETE!
Calculating performance metrics for RIGHT...COMPLETE!
Calculating performance metrics for CENTER...COMPLETE!
Calculating performance metrics for UNDEFINED...COMPLETE!
For Dataset 2017_1 Overall Accuracy: 0.50

Metrics for LEFT:
  Precision: 0.62
  Recall: 0.47
  F1 Score: 0.53

Metrics for RIGHT:
  Precision: 0.28
  Recall: 0.53
  F1 Score: 0.36

Metrics for CENTER:
  Precision: 0.67
  Recall: 0.52
  F1 Score: 0.59

Metrics for UNDEFINED:
  Precision: 0.44
  Recall: 0.50
  F1 Score: 0.47


In [7]:
df_new = pd.read_csv('2017_2.csv', on_bad_lines='skip', engine = 'python')
# Compute embeddings for the dataset
df_new["logit_embed"] = compute_embeddings(df_new["body"].tolist()).numpy().tolist()

In [12]:
# Apply classification
df_new["pred_class"] = df_new["logit_embed"].apply(classify_embedding)

In [13]:
# Initialize metrics
metrics_dict = {}
classes = ["LEFT", "RIGHT", "CENTER", "UNDEFINED"]

for label in classes:
    print(f"Calculating performance metrics for {label}...", end = "")
    TP = len(df_new[(df_new["pred_class"] == label) & (df_new["political_leaning"] == label)])
    FP = len(df_new[(df_new["pred_class"] == label) & (df_new["political_leaning"] != label)])
    FN = len(df_new[(df_new["pred_class"] != label) & (df_new["political_leaning"] == label)])
    TN = len(df_new[(df_new["pred_class"] != label) & (df_new["political_leaning"] != label)])

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    metrics_dict[label] = {
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1_score
    }
    print("COMPLETE!")

# Calculate overall accuracy
accuracy = len(df_new[df_new["pred_class"] == df_new["political_leaning"]]) / len(df_new)

# Display results
print(f"For Dataset 2017_1 Overall Accuracy: {accuracy:.2f}")
for label, metrics in metrics_dict.items():
    print(f"\nMetrics for {label}:")
    print(f"  Precision: {metrics['Precision']:.2f}")
    print(f"  Recall: {metrics['Recall']:.2f}")
    print(f"  F1 Score: {metrics['F1 Score']:.2f}")

Calculating performance metrics for LEFT...COMPLETE!
Calculating performance metrics for RIGHT...COMPLETE!
Calculating performance metrics for CENTER...COMPLETE!
Calculating performance metrics for UNDEFINED...COMPLETE!
For Dataset 2017_1 Overall Accuracy: 0.45

Metrics for LEFT:
  Precision: 0.63
  Recall: 0.49
  F1 Score: 0.55

Metrics for RIGHT:
  Precision: 0.31
  Recall: 0.44
  F1 Score: 0.36

Metrics for CENTER:
  Precision: 0.49
  Recall: 0.52
  F1 Score: 0.50

Metrics for UNDEFINED:
  Precision: 0.35
  Recall: 0.36
  F1 Score: 0.35
