In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/distilbert3/transformers/default/1/trained_model_gral_imbd.pth
/kaggle/input/distilbert2/transformers/default/1/trained_model_gral_imbd.pth
/kaggle/input/polititcaldata/2017_2.csv
/kaggle/input/polititcaldata/2017_1.csv


In [3]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [4]:
# Load data into Pandas DataFrame
df = pd.read_csv('/kaggle/input/polititcaldata/2017_1.csv')
print('Data loaded!')

# Wrangle DataFrame: keep only rows with non-missing, non-empty text.
# The explicit .copy() makes `df` an independent frame, so adding columns to it
# in later cells does not raise SettingWithCopyWarning.
df = df[df["body"].notnull() & (df["body"] != "")].copy()

# Load pretrained model and tokenizer
model_path = "/kaggle/input/distilbert2/transformers/default/1/trained_model_gral_imbd.pth"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the 4-label classification architecture, then overwrite its weights
# with the fine-tuned checkpoint stored at `model_path`.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs, and avoids executing arbitrary pickled code.
state_dict = torch.load(model_path, map_location=torch.device(device), weights_only=True)
model.load_state_dict(state_dict)
model.eval().to(device)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Data loaded!


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path, map_location=torch.device(device)))


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [5]:
# Function to compute logits (embeddings) for text
def compute_embeddings(text_list, batch_size=32):
    """Run the classifier over ``text_list`` in batches and return the logits.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text_list: List of strings to score.
        batch_size: Number of texts sent through the model per forward pass.
            Adjust based on memory availability.

    Returns:
        A CPU tensor holding the concatenated ``outputs.logits`` of every
        batch, in input order. An empty ``text_list`` yields a tensor of shape
        ``(0, model.config.num_labels)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(text_list) == 0:
        # torch.cat refuses an empty list of tensors, so short-circuit here.
        return torch.empty((0, model.config.num_labels))

    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i+batch_size]
        # Pad to the longest text in the batch and cut anything beyond 512 tokens.
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)
        # Move each batch off the accelerator right away so results accumulate on the CPU.
        embeddings.append(outputs.logits.cpu())
    return torch.cat(embeddings)

# Score every article body and store each row's logits as a plain Python list
body_texts = df["body"].tolist()
logit_rows = compute_embeddings(body_texts).numpy().tolist()
df["logit_embed"] = logit_rows

# Class names used when averaging the logits per label
labels = ['LEFT', 'RIGHT', 'CENTER', 'UNDEFINED']

In [6]:
# Average the stored logit vectors of all rows sharing a political_leaning value
mean_embeddings = {}

for label in labels:
    rows_for_label = df.loc[df["political_leaning"] == label, "logit_embed"].tolist()
    if rows_for_label:
        mean_embeddings[label] = torch.tensor(rows_for_label).mean(dim=0)
    else:
        # NOTE(review): a label without rows falls back to an all-zero vector;
        # confirm this is intended, since such a vector can still be picked by
        # the similarity-based classifier when all other scores are negative.
        mean_embeddings[label] = torch.zeros(4)

# Function to classify based on cosine similarity using PyTorch
def classify_embedding(embedding):
    """Return the label in ``mean_embeddings`` most cosine-similar to ``embedding``.

    ``embedding`` is converted to a tensor on ``device`` and compared with each
    stored mean vector; the key with the highest similarity is returned.
    """
    query = torch.tensor(embedding, device=device).unsqueeze(0)
    scores = {}
    for name, centroid in mean_embeddings.items():
        reference = centroid.unsqueeze(0).to(device)
        scores[name] = torch.nn.functional.cosine_similarity(query, reference).item()
    return max(scores, key=scores.get)

# Assign each row the label of its most similar mean vector, then show the mean vectors
df["pred_class"] = df["logit_embed"].map(classify_embedding)
print(mean_embeddings)

{'LEFT': tensor([-0.1004, -0.0203,  0.0215,  0.0242]), 'RIGHT': tensor([-0.0958, -0.0243,  0.0276,  0.0177]), 'CENTER': tensor([-0.0934, -0.0210,  0.0144,  0.0127]), 'UNDEFINED': tensor([-0.0948, -0.0193,  0.0134,  0.0201])}


In [8]:
# Initialize metrics
metrics_dict = {}
classes = ["LEFT", "RIGHT", "CENTER", "UNDEFINED"]

for label in classes:
    print(f"Calculating performance metrics for {label}...", end = "")
    # Boolean masks are summed directly; no filtered DataFrames are built just
    # to be counted. True negatives are not needed by any metric below.
    predicted = df["pred_class"] == label
    actual = df["political_leaning"] == label
    TP = int((predicted & actual).sum())
    FP = int((predicted & ~actual).sum())
    FN = int((~predicted & actual).sum())

    # Each ratio falls back to 0 when its denominator is empty.
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    metrics_dict[label] = {
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1_score
    }
    print("COMPLETE!")

# Calculate overall accuracy (0 for an empty frame instead of dividing by zero)
n_correct = int((df["pred_class"] == df["political_leaning"]).sum())
accuracy = n_correct / len(df) if len(df) > 0 else 0

# Display results
print(f"For Dataset 2017_1 Overall Accuracy: {accuracy:.2f}")
for label, metrics in metrics_dict.items():
    print(f"\nMetrics for {label}:")
    print(f"  Precision: {metrics['Precision']:.2f}")
    print(f"  Recall: {metrics['Recall']:.2f}")
    print(f"  F1 Score: {metrics['F1 Score']:.2f}")

Calculating performance metrics for LEFT...COMPLETE!
Calculating performance metrics for RIGHT...COMPLETE!
Calculating performance metrics for CENTER...COMPLETE!
Calculating performance metrics for UNDEFINED...COMPLETE!
For Dataset 2017_1 Overall Accuracy: 0.30

Metrics for LEFT:
  Precision: 0.38
  Recall: 0.23
  F1 Score: 0.29

Metrics for RIGHT:
  Precision: 0.15
  Recall: 0.43
  F1 Score: 0.23

Metrics for CENTER:
  Precision: 0.41
  Recall: 0.29
  F1 Score: 0.34

Metrics for UNDEFINED:
  Precision: 0.32
  Recall: 0.31
  F1 Score: 0.31


## Test on a new dataset

In [9]:
# Load the second 2017 file and apply the same text filter used for 2017_1, so
# missing or empty bodies never reach the tokenizer. The .copy() keeps later
# column assignments from raising SettingWithCopyWarning.
df_new = pd.read_csv('/kaggle/input/polititcaldata/2017_2.csv')
df_new = df_new[df_new["body"].notnull() & (df_new["body"] != "")].copy()

In [10]:
# Score every body in the new dataset and store each row's logits as a plain list
new_body_texts = df_new["body"].tolist()
new_logit_rows = compute_embeddings(new_body_texts).numpy().tolist()
df_new["logit_embed"] = new_logit_rows

In [11]:
# Assign each new row the label of its most similar mean vector
df_new["pred_class"] = df_new["logit_embed"].map(classify_embedding)

In [12]:
# Initialize metrics
metrics_dict = {}
classes = ["LEFT", "RIGHT", "CENTER", "UNDEFINED"]

for label in classes:
    print(f"Calculating performance metrics for {label}...", end = "")
    # Boolean masks are summed directly; no filtered DataFrames are built just
    # to be counted. True negatives are not needed by any metric below.
    predicted = df_new["pred_class"] == label
    actual = df_new["political_leaning"] == label
    TP = int((predicted & actual).sum())
    FP = int((predicted & ~actual).sum())
    FN = int((~predicted & actual).sum())

    # Each ratio falls back to 0 when its denominator is empty.
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    metrics_dict[label] = {
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1_score
    }
    print("COMPLETE!")

# Calculate overall accuracy (0 for an empty frame instead of dividing by zero)
n_correct = int((df_new["pred_class"] == df_new["political_leaning"]).sum())
accuracy = n_correct / len(df_new) if len(df_new) > 0 else 0

# Display results (this cell evaluates the 2017_2 file loaded into df_new)
print(f"For Dataset 2017_2 Overall Accuracy: {accuracy:.2f}")
for label, metrics in metrics_dict.items():
    print(f"\nMetrics for {label}:")
    print(f"  Precision: {metrics['Precision']:.2f}")
    print(f"  Recall: {metrics['Recall']:.2f}")
    print(f"  F1 Score: {metrics['F1 Score']:.2f}")

Calculating performance metrics for LEFT...COMPLETE!
Calculating performance metrics for RIGHT...COMPLETE!
Calculating performance metrics for CENTER...COMPLETE!
Calculating performance metrics for UNDEFINED...COMPLETE!
For Dataset 2017_1 Overall Accuracy: 0.30

Metrics for LEFT:
  Precision: 0.42
  Recall: 0.18
  F1 Score: 0.25

Metrics for RIGHT:
  Precision: 0.26
  Recall: 0.54
  F1 Score: 0.35

Metrics for CENTER:
  Precision: 0.30
  Recall: 0.30
  F1 Score: 0.30

Metrics for UNDEFINED:
  Precision: 0.28
  Recall: 0.31
  F1 Score: 0.29
