# Data Analysis

In [44]:
import pandas as pd

# Load the raw records from disk; the cells below read the 'prompt' and
# 'response' columns of this frame and add their results as new columns.
df = pd.read_json('responses.json')

## Sentiment Analysis with Twitter-roBERTa-base for Sentiment Analysis

In [45]:
import os
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, pipeline
from datasets import load_dataset
from torch.utils.data import DataLoader


# The same checkpoint supplies both the model weights and the tokenizer.
SENTIMENT_CHECKPOINT = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Inputs are padded per batch and truncated to at most 512 tokens.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model=SENTIMENT_CHECKPOINT,
    tokenizer=SENTIMENT_CHECKPOINT,
    truncation=True,
    padding=True,
    max_length=512,
)

def chunk_text(text, chunk_size=512, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks share ``overlap`` words. Chunking stops as soon as a
    chunk reaches the end of the text, so no trailing chunk is emitted that
    only repeats words already contained in the previous chunk.

    Note that ``chunk_size`` counts whitespace-separated words, not model
    tokens.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.
        overlap: Number of words shared between consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # This window already covers the last word; another window would only
        # repeat the overlapping tail.
        if start + chunk_size >= len(words):
            break
    return chunks

def analyze_long_sentiment(text):
    """Classify the sentiment of a text of arbitrary length.

    The text is split with ``chunk_text`` and every chunk is passed to
    ``sentiment_pipeline``. Each chunk contributes the confidence of the label
    the pipeline returned for it; the per-label totals are then averaged over
    the number of chunks, so the reported score stays within [0, 1] however
    many chunks the text produces.

    Args:
        text: The string to analyse.

    Returns:
        A dict ``{"label": ..., "score": ...}`` where ``label`` is one of
        "positive", "neutral" or "negative" and ``score`` is that label's
        average per-chunk confidence. Text that yields no chunks is reported
        as ``{"label": "neutral", "score": 0.0}``.
    """
    chunks = chunk_text(text)
    if not chunks:
        # Nothing to classify; avoid calling the model on an empty batch.
        return {"label": "neutral", "score": 0.0}

    results = sentiment_pipeline(chunks)
    scores = {"positive": 0.0, "neutral": 0.0, "negative": 0.0}
    for result in results:
        scores[result['label']] += result['score']

    # Average over the chunks that were scored, not over the number of labels.
    chunk_count = len(results)
    for label in scores:
        scores[label] /= chunk_count

    final_label = max(scores, key=scores.get)
    return {"label": final_label, "score": scores[final_label]}


# Score every response; each cell of the new column holds a {label, score} dict.
df['sentiment'] = df['response'].apply(analyze_long_sentiment)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


## Entity recognition with BERT-base-ner

In [46]:
# Token-classification pipeline; the "simple" aggregation strategy makes it
# return grouped entities carrying an "entity_group" and a "word" field.
ner_pipeline = pipeline(
    task="ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
)

def mentions_china_pipeline(text):
    """Report whether the NER model finds a China-related entity in ``text``.

    An entity counts when its lower-cased surface form is one of the known
    keywords and it was tagged as a location or an organisation. Organisations
    are accepted because the keyword set contains party names ("cpc",
    "chinese communist party"), which an NER tagger would be expected to label
    ORG rather than LOC.

    Args:
        text: The string to scan.

    Returns:
        True if at least one matching entity is found, otherwise False.
    """
    china_keywords = {"china", "beijing", "cpc", "chinese communist party"}
    accepted_groups = {"LOC", "ORG"}

    return any(
        entity["entity_group"] in accepted_groups
        and entity["word"].lower() in china_keywords
        for entity in ner_pipeline(text)
    )


# Flag the prompts and the responses in which a China-related entity is found.
df['prompt_about_china'] = df['prompt'].apply(mentions_china_pipeline)
df['response_about_china'] = df['response'].apply(mentions_china_pipeline)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


## Zero-shot stance classification with BART-large-MNLI

In [48]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Zero-shot classifier backed by the facebook/bart-large-mnli checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

def get_stance_scores(text):
    """Decide whether ``text`` is about capitalism, communism, or neither.

    The zero-shot classifier is given the bare target words as candidate
    labels together with an explicit hypothesis template, so each hypothesis
    the model sees reads "This text is about <target>". Passing full sentences
    as candidate labels instead would have them wrapped a second time by the
    pipeline's own default template.

    Args:
        text: The string to classify.

    Returns:
        A dict ``{'label': ..., 'score': ...}``. When the two target scores
        differ by less than 0.2 the label is "neutral" and the score is one
        minus that difference. Otherwise the label is "about <target>" for
        the higher-scoring target and the score is that target's score.
    """
    targets = ["capitalism", "communism"]

    results = classifier(
        text,
        candidate_labels=targets,
        hypothesis_template="This text is about {}",
        multi_label=False,
    )

    stance_results = dict(zip(results["labels"], results["scores"]))
    highest_stance = max(stance_results, key=stance_results.get)
    lowest_stance = min(stance_results, key=stance_results.get)

    margin = stance_results[highest_stance] - stance_results[lowest_stance]
    if margin < 0.2:
        # The targets are too close to call.
        return {'label': "neutral", 'score': 1 - margin}

    return {'label': f"about {highest_stance}", 'score': stance_results[highest_stance]}


# Classify the prompts and the responses; each cell holds a {label, score} dict.
df['prompt_stance'] = df['prompt'].apply(get_stance_scores)
df['response_stance'] = df['response'].apply(get_stance_scores)


Device set to use cuda:0


## Left-Right Wing stance classification with politicalBiasBERT

In [49]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# The tokenizer is loaded from the generic "bert-base-cased" checkpoint while
# the classification weights come from "bucketresearch/politicalBiasBERT".
# NOTE(review): presumably the classifier was fine-tuned from bert-base-cased
# and shares its vocabulary - confirm against the model card.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("bucketresearch/politicalBiasBERT")

def get_stance(text):
    """Classify ``text`` as left wing, center or right wing.

    The text is tokenized (truncated to at most 512 tokens) and run through
    the module-level ``model`` in inference mode: no gradients are tracked and
    no dummy ``labels`` tensor is supplied, since the loss is never used. The
    input is a single sequence, so it is not padded.

    Args:
        text: The string to classify.

    Returns:
        A dict ``{'label': ..., 'score': ...}`` holding the name of the
        highest-probability class and its softmax probability.
    """
    # Class names in the order of the model's output logits.
    class_names = ("left wing", "center", "right wing")

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits

    # One softmax pass; row 0 is the only sequence in the batch.
    probabilities = logits.softmax(dim=-1)[0]
    best_index = probabilities.argmax().item()
    return {'label': class_names[best_index], 'score': probabilities[best_index].item()}

df['left_right_stance'] = df['response'].apply(lambda x: get_stance(x))

## Saving results

In [50]:
# Keep one generation of backup: the previous output, if any, becomes
# 'processed_results_old.json'. os.replace overwrites an existing backup in a
# single step, so no separate existence check or removal is needed.
if os.path.exists('processed_results.json'):
    os.replace('processed_results.json', 'processed_results_old.json')
df.to_json('processed_results.json', orient='records')