# Data Analysis

In [1]:
import pandas as pd

df = pd.read_json('responses.json', lines=True)

## Sentiment Analysis with Twitter-roBERTa-base for Sentiment Analysis with and without fine-tuning

In [None]:
import os
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, pipeline
from datasets import load_dataset
from torch.utils.data import DataLoader

# Inizializza la pipeline con troncamento e padding
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest",
    truncation=True,
    padding=True,
    max_length=512
)

def chunk_text(text, chunk_size=512, overlap=50):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = " ".join(words[i:i+chunk_size])
        chunks.append(chunk)
        i += chunk_size - overlap  
    return chunks

def analyze_long_sentiment(text):
    chunks = chunk_text(text)
    results = sentiment_pipeline(chunks)  
    
    scores = {"LABEL_0": 0, "LABEL_1": 0, "LABEL_2": 0}  
    for result in results:
        scores[result['label']] += result['score']

    total = sum(scores.values())
    for label in scores:
        scores[label] /= total  
    
    final_label = max(scores, key=scores.get)
    return {"label": final_label, "score": scores[final_label]}

df['sentiment'] = df['response'].apply(lambda x: analyze_long_sentiment(x))

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[{'label': 'positive', 'score': 0.5702665448188782}]
[{'label': 'positive', 'score': 0.6355164647102356}]
[{'label': 'positive', 'score': 0.9253115653991699}]
[{'label': 'positive', 'score': 0.6384999752044678}]
[{'label': 'neutral', 'score': 0.641802191734314}]
[{'label': 'positive', 'score': 0.6996219754219055}]
[{'label': 'neutral', 'score': 0.5015963315963745}]
[{'label': 'positive', 'score': 0.8814913034439087}]
[{'label': 'positive', 'score': 0.8855143785476685}]
[{'label': 'neutral', 'score': 0.5042963624000549}]
[{'label': 'positive', 'score': 0.9194397926330566}]
[{'label': 'neutral', 'score': 0.5211573243141174}]
[{'label': 'positive', 'score': 0.6978632807731628}]
[{'label': 'positive', 'score': 0.8007522225379944}]
[{'label': 'positive', 'score': 0.9374840259552002}]
[{'label': 'positive', 'score': 0.6382399201393127}]
[{'label': 'positive', 'score': 0.8974376320838928}]
[{'label': 'neutral', 'score': 0.6030388474464417}]
[{'label': 'neutral', 'score': 0.5229507088661194}]


RuntimeError: The expanded size of the tensor (613) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 613].  Tensor sizes: [1, 514]

In [None]:
dataset = load_dataset("classla/parlasent")
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

if not os.path.exists("./fine_tuned_roberta_parlasent"):
    # Definire funzione di tokenizzazione
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    # Applicare la tokenizzazione al dataset
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Preparare il dataset per il training
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
    tokenized_datasets.set_format("torch")

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
    )

    trainer.train()
    trainer.save_model("./fine_tuned_roberta_parlasent")


In [None]:
from torch import softmax

def analyze_sentiment_finetuned(text: str) -> float:
    # Tokenizza il testo
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    
    # Calcola le previsioni
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    
    # Calcola la probabilità usando softmax
    probs = softmax(logits, dim=-1).squeeze().cpu().numpy()    
    # Trova la classe con la probabilità massima
    sentiment_class = probs.argmax()
    
    return 2 * (sentiment_class - 1)  # -1, 0, 1 corrispondenti a negative, neutral, positive

df['sentiment_score_finetuned'] = df['response'].apply(lambda x: analyze_sentiment_finetuned(x))

## Entity recognition with BERT-base-ner

In [None]:
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

def mentions_china_pipeline(text):
    entities = ner_pipeline(text)
    for entity in entities:
        if entity["entity_group"] == "LOC" and entity["word"].lower() in {"china", "beijing", "cpc", "chinese communist party"}:
            return True
    return False


df['prompt_about_china'] = df['prompt'].apply(lambda x: mentions_china_pipeline(x))
df['response_about_china'] = df['response'].apply(lambda x: mentions_china_pipeline(x))

## Entity recognition with dbmdz/bert-large-cased-finetuned-conll03-english

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

china_related_keywords = {
    "China", "Chinese", "Beijing", "CPC", "Communist Party of China",
    "Huawei", "Alibaba", "Belt and Road Initiative", "Xi Jinping"
}

def mentions_china_pipeline_finetuned(text):
    """ Esegue Named Entity Recognition (NER) e filtra le entità correlate alla Cina """
    entities = ner_pipeline(text)
    for word in entities:
        if word['word'] in china_related_keywords:
            return True
    return False


df['prompt_about_china_finetuned'] = df['prompt'].apply(lambda x: mentions_china_pipeline_finetuned(x))
df['response_about_china_finetuned'] = df['response'].apply(lambda x: mentions_china_pipeline_finetuned(x))


## Pre-Trained stance classification with RoBERTa

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch.nn.functional as F

# Carica il modello e il tokenizer
MODEL_NAME = "cardiffnlp/twitter-roberta-base-stance"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Label originali del modello
LABELS = ["against", "neutral", "favor"]

def get_stance_label(scores):
    """Restituisce la label con il punteggio più alto."""
    return LABELS[torch.argmax(scores)]

def classify_stance(text):
    """Classifica il testo come Pro Communism, Neutral o Pro Capitalism, restituendo anche la probabilità."""
    topics = ["communism", "capitalism"]
    stance_results = {}

    for topic in topics:
        inputs = tokenizer(f"{text} [SEP] {topic}", return_tensors="pt")
        outputs = model(**inputs)
        scores = F.softmax(outputs.logits, dim=1).squeeze()
        label = get_stance_label(scores)
        probability = scores[torch.argmax(scores)].item()  # Estrai la probabilità della classe scelta
        stance_results[topic] = (label, probability)

    # Mappatura ai nostri label con probabilità
    if stance_results["communism"][0] == "favor" and stance_results["capitalism"][0] == "against":
        return "Pro Communism", stance_results["communism"][1]
    elif stance_results["capitalism"][0] == "favor" and stance_results["communism"][0] == "against":
        return "Pro Capitalism", stance_results["capitalism"][1]
    elif stance_results["communism"][0] == "against" and stance_results["capitalism"][0] == "against":
        return "Against both", (stance_results["communism"][1], stance_results["capitalism"][1])
    elif stance_results["communism"][0] == "favor" and stance_results["capitalism"][0] == "favor":
        return "Pro both", (stance_results["communism"][1], stance_results["capitalism"][1])
    else:
        return "Neutral", (stance_results["communism"][1], stance_results["capitalism"][1])


df['stance'] = df['response'].apply(lambda x: classify_stance(x))


## saving results

In [None]:
if os.path.exists('processed_results.json'):
    if os.path.exists('processed_results_old.json'):
        os.remove('processed_results_old.json')
    os.rename('processed_results.json', 'processed_results_old.json')
df.to_json('processed_results.json', orient='records')