In [11]:
import pandas as pd
import numpy as np
import os
from itertools import chain

In [None]:
df = pd.read_csv(os.path.join(os.path.expanduser('./datasets'), ''))
df.columns

Index(['article_id', 'paragraph', 'text', 'refined_technique'], dtype='object')

In [74]:
def create_technique_mapping():
    """Creates a dictionary that maps original techniques to their new categories"""
    mapping = {
        'Appeal_to_Authority': 'Appeal_to_Authority',
        'Appeal_to_Fear-Prejudice': 'Appeal_to_Fear',
        'Appeal_to_Hypocrisy': 'Tu_Quoque/Whataboutism',
        'Appeal_to_Popularity': 'Appeal_to_Popularity',
        'Appeal_to_Time': 'Appeal_to_Time',
        'Appeal_to_Values': 'Appeal_to_Values/Flag_Waving',
        'Causal_Oversimplification': 'Causal_Oversimplification/Consequential_Oversimplification',
        'Consequential_Oversimplification': 'Causal_Oversimplification/Consequential_Oversimplification',
        'Conversation_Killer': 'Slogan/Conversation_Killer',
        'Doubt': 'Smear/Doubt',
        'Exaggeration-Minimisation': 'Exaggeration_Minimisation',
        'False_Dilemma-No_Choice': 'False_Dilemma_No_Choice',
        'Flag_Waving': 'Appeal_to_Values/Flag_Waving',
        'Guilt_by_Association': 'Reductio_ad_Hitlerum',
        'Loaded_Language': 'Loaded_Language',
        'Name_Calling-Labeling': 'Name_Calling',
        'Obfuscation-Vagueness-Confusion': 'Intentional_Confusion_Vagueness',
        'Questioning_the_Reputation': 'Smear/Doubt',
        'Red_Herring': 'Straw_Man/Red_Herring',
        'Repetition': 'Repetition',
        'Slogans': 'Slogan/Conversation_Killer',
        'Straw_Man': 'Straw_Man/Red_Herring',
        'Whataboutism': 'Tu_Quoque/Whataboutism',
        'no_technique_detected': 'no_technique_detected'
    }
    return mapping

In [75]:
def remap_techniques(df, mapping, technique_column='refined_technique'):
    """
    Remaps the techniques in the specified column according to the new schema

    Args:
        df (pd.DataFrame): DataFrame containing the technique column
        mapping (dict): Dictionary mapping old technique names to new ones
        technique_column (str): Name of the column containing techniques to be remapped

    Returns:
        pd.DataFrame: DataFrame with updated technique names
    """
    def update_techniques(techniques):
        if pd.isna(techniques):
            return 'no_technique_detected'

        # Split multiple techniques if present
        technique_list = techniques.split(',')
        # Map each technique to its new value
        updated_techniques = [mapping.get(t.strip(), t.strip()) for t in technique_list]
        # Remove duplicates while preserving order
        seen = set()
        unique_techniques = [x for x in updated_techniques if not (x in seen or seen.add(x))]
        return ','.join(unique_techniques)

    df_copy = df.copy()
    df_copy[technique_column] = df_copy[technique_column].apply(update_techniques)
    return df_copy


In [76]:
# Create the mapping
mapping = create_technique_mapping()

# Process the DataFrame
processed_df = remap_techniques(df, mapping)
processed_df

Unnamed: 0,article_id,paragraph,text,refined_technique
0,0,1,Verde torbido: cosa si nasconde dietro il farm...,"Smear/Doubt,Loaded_Language"
1,0,2,"di Roberto Graziano, Lo psicologo olandese Joo...","Appeal_to_Fear,Smear/Doubt,Loaded_Language"
2,0,3,Sembra davvero che le ipnosi collettive e la r...,"Appeal_to_Values/Flag_Waving,Appeal_to_Fear,Na..."
3,0,4,"Quasi contemporaneamente a Meerloo, il Premio ...",no_technique_detected
4,0,5,"Gli esperimenti su dissonanza cognitiva, condi...",no_technique_detected
...,...,...,...,...
1005,69,22,Il pompaggio di enormi quantità di CO2 nel sot...,"Appeal_to_Fear,Appeal_to_Values/Flag_Waving,Ca..."
1006,69,23,Non dimentichiamoci che la CO2 ha una densità ...,"Appeal_to_Fear,Intentional_Confusion_Vagueness"
1007,69,24,Basti ricordare la liberazione improvvisa di u...,Exaggeration_Minimisation
1008,69,25,"Altri disastri, sia naturali, sia causati dall...","Name_Calling,Smear/Doubt,Loaded_Language,Inten..."


In [78]:
all_labels = set(chain.from_iterable(label.split(',') for label in processed_df['refined_technique']))

label_to_int = {label: idx for idx,label in enumerate(sorted(all_labels))}

def encode_labels(label_str):
    return [label_to_int[label] for label in label_str.split(',')]

processed_df['encoded_labels'] = processed_df['refined_technique'].apply(encode_labels)

print(label_to_int)

{'Appeal_to_Authority': 0, 'Appeal_to_Fear': 1, 'Appeal_to_Popularity': 2, 'Appeal_to_Time': 3, 'Appeal_to_Values/Flag_Waving': 4, 'Causal/Consequential_Oversimplification': 5, 'Causal_Oversimplification/Consequential_Oversimplification': 6, 'Exaggeration_Minimisation': 7, 'False_Dilemma_No_Choice': 8, 'Intentional_Confusion_Vagueness': 9, 'Loaded_Language': 10, 'Name_Calling': 11, 'Reductio_ad_Hitlerum': 12, 'Repetition': 13, 'Slogan/Conversation_Killer': 14, 'Smear/Doubt': 15, 'Straw_Man/Red_Herring': 16, 'Tu_Quoque/Whataboutism': 17, 'no_technique_detected': 18}


In [80]:
processed_df['encoded_labels'].value_counts()

encoded_labels
[18]                  350
[15]                   69
[7]                    19
[10]                   19
[6]                    18
                     ... 
[0, 1, 3, 7]            1
[3, 1, 7]               1
[0, 10, 7]              1
[7, 1, 13, 0]           1
[11, 15, 10, 9, 7]      1
Name: count, Length: 313, dtype: int64

In [81]:
processed_df = processed_df.sample(frac = 1).reset_index(drop = True)

In [82]:
processed_df.head()

Unnamed: 0,article_id,paragraph,text,refined_technique,encoded_labels
0,63,2,Basta l’aritmetica per spiega perchè l’idea di...,Causal_Oversimplification/Consequential_Oversi...,"[6, 15]"
1,69,6,Dopo essersi inventati il business della Carbo...,"Straw_Man/Red_Herring,Name_Calling,Smear/Doubt...","[16, 11, 15, 10, 7]"
2,0,113,Costruite su una giungla inestricabile di rego...,"Appeal_to_Values/Flag_Waving,Appeal_to_Fear,Fa...","[4, 1, 8, 15, 10, 11, 7]"
3,54,17,"Certo, c’è da evidenziare che buoni numeri di ...",no_technique_detected,[18]
4,62,17,Se utilizzassimo le loro posizioni non ci sare...,no_technique_detected,[18]


In [83]:
from pydpp.dpp import DPP

In [84]:
from sklearn.model_selection import train_test_split

In [85]:
df_train, df_test = train_test_split(processed_df, test_size = 0.2, random_state = 42) # We do this for sanity check

In [86]:
df_train = df_train.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [87]:
df_train['encoded_labels'].value_counts()

encoded_labels
[18]                 282
[15]                  58
[15, 10]              16
[10]                  14
[7]                   13
                    ... 
[1, 10, 7, 3]          1
[8, 14, 3, 10, 7]      1
[7, 1, 13, 0]          1
[6, 8, 15, 10]         1
[8, 6]                 1
Name: count, Length: 262, dtype: int64

In [88]:
import torch
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

In [89]:
def get_embeddings(df, model):
    sentences = df['text'].to_list()
    embeddings = model.encode(sentences)
    return embeddings

In [90]:
from sentence_transformers import SentenceTransformer

In [91]:
model = SentenceTransformer("embaas/sentence-transformers-multilingual-e5-large")

In [92]:
df_h = df_train[df_train['encoded_labels'].apply(lambda x: any(label in range(0, 18) for label in x))].reset_index(drop=True)
df_n_h = df_train[df_train['encoded_labels'].apply(lambda x: any(label == 18 for label in x))].reset_index(drop=True)


In [93]:
df_n_h

Unnamed: 0,article_id,paragraph,text,refined_technique,encoded_labels
0,43,22,"Ad esempio, i ghiacciai del Calderone (Gran Sa...",no_technique_detected,[18]
1,4,3,In questa ottava pillola su energia e clima ri...,no_technique_detected,[18]
2,56,15,Vorrei solo confutare un leitmotiv ricorrente ...,no_technique_detected,[18]
3,35,4,"In breve, secondo quanto riporta il giornale e...",no_technique_detected,[18]
4,38,14,"Infatti, una simile falsa teoria si diffuse al...",no_technique_detected,[18]
...,...,...,...,...,...
277,59,14,Naturalmente consumo e produzione energetica n...,no_technique_detected,[18]
278,34,5,Noi partiamo dalle previsioni allarmanti sul “...,no_technique_detected,[18]
279,51,11,"È stato calcolato che un solo vulcano, l’Hunga...",no_technique_detected,[18]
280,62,36,Ma la risposta è errata perché il corpo umano ...,no_technique_detected,[18]


In [94]:
embeds_h = get_embeddings(df_h, model)
embeddings_n_h = get_embeddings(df_n_h, model)

In [47]:
embeds_h.shape

(527, 1024)

In [95]:
def get_dpp_points(dpp, num_samples, df):
    
    dpp.compute_kernel(kernel_type = 'cos-sim', sigma= 0.4)
    samples_n = dpp.sample_k(num_samples) # These will return the indices 
    # Be very careful you have to use the index column not then index in itself
    df_sampled = df.iloc[samples_n]
    return df_sampled   

In [96]:
dpp_h = DPP(embeds_h) # Call by reference
dpp_n_h = DPP(embeddings_n_h) 

In [97]:
df_samples_h = get_dpp_points(dpp_h, num_samples = 20, df = df_h)
df_samples_n_h = get_dpp_points(dpp_n_h, num_samples = 10, df = df_n_h)

In [98]:
df_samples_h

Unnamed: 0,article_id,paragraph,text,refined_technique,encoded_labels
14,36,5,"Il motivo risiede:, • sia nel fatto che le pre...","Straw_Man/Red_Herring,Smear/Doubt,Loaded_Language","[16, 15, 10]"
42,5,18,"Così, come gente che passava di qua, degli ali...","Loaded_Language,Exaggeration_Minimisation,Smea...","[10, 7, 15]"
128,49,5,Ma voi ci credete ai politici di destra che di...,Smear/Doubt,[15]
167,0,115,Le persone libere da militanze ideologiche e n...,"Appeal_to_Fear,Causal_Oversimplification/Conse...","[1, 6, 11, 10, 7, 12, 15]"
169,67,2,La segretaria Pd prima vota e sostiene le camp...,"Tu_Quoque/Whataboutism,False_Dilemma_No_Choice...","[17, 8, 15, 10]"
171,37,13,Se le regole vengono scritte in maniera ideolo...,"Appeal_to_Values/Flag_Waving,Causal_Oversimpli...","[4, 6, 10, 7, 15, 5]"
212,0,34,Le preoccupazioni sul clima in ambito UE sono ...,"Loaded_Language,Smear/Doubt","[10, 15]"
247,66,38,"A proposito, le manovre per diminuire la produ...","Loaded_Language,Exaggeration_Minimisation,Smea...","[10, 7, 15, 1]"
258,55,8,"Sono essi che ci tengono al caldo d’inverno, a...","Loaded_Language,Exaggeration_Minimisation,Caus...","[10, 7, 6]"
275,62,38,Tale debolezza può essere superata solo con un...,False_Dilemma_No_Choice,[8]


In [99]:
df_samples_n_h

Unnamed: 0,article_id,paragraph,text,refined_technique,encoded_labels
44,37,21,"In soldoni, Pechino ha visto la sua produzione...",no_technique_detected,[18]
56,40,18,Allo stesso modo il mare si è innalzato più vo...,no_technique_detected,[18]
117,42,4,L’aspetto qualitativo delle curve di carico è ...,no_technique_detected,[18]
131,37,37,Nissan ha annunciato un drastico piano di rior...,no_technique_detected,[18]
141,50,60,"(**) In sintesi, la CO₂ antropogenica è un fat...",no_technique_detected,[18]
144,60,11,"In assenza di una solida ipotesi, nessuna agen...",no_technique_detected,[18]
146,54,7,"E non sorprende, dunque, se il suo utilizzo è ...",no_technique_detected,[18]
206,50,26,Questo aumento è direttamente legato alle atti...,no_technique_detected,[18]
240,47,20,Queste proposte non intendono modificare gli o...,no_technique_detected,[18]
276,40,7,Un presupposto ispirato dal fatto che Arrheniu...,no_technique_detected,[18]


In [104]:
def generate_shots(dpp, dpp_n, num_samples, df, df_n):
    """
    Generate 'shot' lists from provided DataFrames and DPP points.
    """

    # Create the 'shot' column
    df_samples_h["shot"] = df_samples_h["text"] + ','+ df_samples_h["refined_technique"].astype(str)
    h_list = df_samples_h['shot'].to_list() #hyperpartisan shots

    df_samples_n_h["shot"] = df_samples_n_h["text"] + ','+ df_samples_n_h["refined_technique"].astype(str)
    n_h_list = df_samples_n_h['shot'].to_list() #non hyperpartisan shots

    # Convert to lists
    h_list = df_samples_h['shot'].to_list()  # hyperpartisan shots
    n_h_list = df_samples_n_h['shot'].to_list()  # non-hyperpartisan shots

    return h_list, n_h_list

In [105]:
h_list, n_h_list = generate_shots(dpp_h, dpp_n_h, num_samples=5, df=df_h, df_n=df_n_h) #check function generate_shots
h_list, n_h_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_samples_h["shot"] = df_samples_h["text"] + ','+ df_samples_h["refined_technique"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_samples_n_h["shot"] = df_samples_n_h["text"] + ','+ df_samples_n_h["refined_technique"].astype(str)


(['Il motivo risiede:, • sia nel fatto che le previsioni erronee dell’IPCC provengono da un organismo delle Nazioni Unite che i decisori politici e i Media sono molto cauti nel contraddire (anche se sanno che sono discutibili); • sia nei colossali interessi economici che si sono creati attorno alla teoria dell’AGW (vedi più avanti la sezione “Il Business delle Emissioni zero”)., Oltre a questo, come spesso accade con gli atti di fede, il riscaldamento globale è entrato anche nella cultura del politicamente corretto, o cultura del piagnisteo , in cui qualcuno impedisce a qualcun altro di pensare o dire qualcosa! Di conseguenza si viene definiti negazionisti (e condannati come eretici) se:, 1.,Straw_Man/Red_Herring,Smear/Doubt,Loaded_Language',
  'Così, come gente che passava di qua, degli alieni che hanno scoperto solo oggi il pianeta terra. Paradigmatico dell’informazione che gira oggi.,Loaded_Language,Exaggeration_Minimisation,Smear/Doubt',
  'Ma voi ci credete ai politici di destra c

In [23]:
h_shot = []
n_h_shot = []
for k in range(3):
    df_samples_h = get_dpp_points(dpp_h, num_samples = 5, df = df_h)
    df_samples_n_h = get_dpp_points(dpp_n_h, num_samples = 5, df = df_n_h)

    h_list, n_h_list = generate_shots(dpp_h, dpp_n_h, num_samples=5, df=df_h, df_n=df_n_h)

    h_shot.append(h_list)
    n_h_shot.append(n_h_list)

print(h_shot, n_h_shot)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_samples_h["shot"] = df_samples_h["text"] + ','+ df_samples_h["label"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_samples_n_h["shot"] = df_samples_n_h["text"] + ','+ df_samples_n_h["label"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_samples_h["shot"] = df_sam

[['Kevin McCarthy calling corona virus Chinese is a racist ploy to make a public health issue something that can be blamed on "others" - API kids in schools already being taunted about having the virus - this kind of "leadership" by Republicans is a real danger to all of us.,1', "Let me be very clear about how I believe that wearing a mask is better than taking the COVID Vaccine. First off, vaccines AREN'T full-prooff, and with these new COVID variants, we have NO IDEA how effective ALL COVID VACCINES are against the new strains at all.,1", "WSJ's @mgordonwsj reported Russian efforts to sow disinfo regarding Pfizer &amp; other vaccines 👉Read more for insights on Russia's overt information manipulation re: vaccines from a new @SecureDemocracy report by Bret Shafer and the Hamilton team https://t.co/hPFxruP4hC,1", 'we have 1.3 million vaccine doses, but only 100k have been given. Some states are lagging badly and now not sharing data QLD only gave 22% of doses they got last week, VIC jus

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_samples_h["shot"] = df_samples_h["text"] + ','+ df_samples_h["label"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_samples_n_h["shot"] = df_samples_n_h["text"] + ','+ df_samples_n_h["label"].astype(str)


In [None]:
import json

runs = {}

for k in range(3):
    # Generate the shots
    df_samples_h = get_dpp_points(dpp_h, num_samples=15, df=df_h)
    df_samples_n_h = get_dpp_points(dpp_n_h, num_samples=5, df=df_n_h)

    h_list, n_h_list = generate_shots(dpp_h, dpp_n_h, num_samples=5, df=df_h, df_n=df_n_h)

    dic_h = {f"h_shot_run_{i+1}": el for i, el in enumerate(h_list)}
    dic_n_h = {f"h_shot_run_{i+1}": el for i, el in enumerate(n_h_list)}


    runs[f"run_n_{k+1}"] = {'dic_h': dic_h, 'dic_n_h':dic_n_h}

with open(f"DPP.json", "w") as json_file:
    json.dump(runs, json_file, indent=4)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_samples_h["shot"] = df_samples_h["text"] + ','+ df_samples_h["refined_technique"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_samples_n_h["shot"] = df_samples_n_h["text"] + ','+ df_samples_n_h["refined_technique"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sa