# Calculation of Bias Variables 

In [2]:
# Import relevant libraries
import pandas as pd
from Dbias.bias_classification import classify
import numpy as np

In [3]:
# Load the first sentiment data file (index 0) into memory with the pyarrow engine.
df0_cleaned = pd.read_parquet('data/sentiment_data_0.parquet', engine="pyarrow")

# Preview two rows to confirm the `en-title` column used for bias scoring is present.
df0_cleaned.head(2)

Unnamed: 0,ID,publishedAt,instances,source-id,source-name,en-title,language,location_code,location,category,year,month,neg,neu,pos,compound,sentiment_category
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",,Albidda.net,A doctor warns of new symptoms of “Corona” tha...,ar,ae,United Arab Emirates,general,2020,8,0.149,0.851,0.0,-0.2023,negative
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",,Middle East Online,Foldable phones lead Samsung to climb the top ...,ar,ae,United Arab Emirates,general,2020,8,0.0,0.893,0.107,0.2023,positive


In [6]:
# add bias category and probability to each dataframe based on the 'en-title' column

def add_bias_cols(df):
    """Append the bias classifier's ``label`` and ``score`` columns to a frame.

    Every headline in the ``en-title`` column is sent to ``classify`` in one
    batched call and the predictions are attached column-wise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``en-title`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index holding every column of ``df``
        followed by the classifier's ``label`` and ``score`` columns.

    Raises
    ------
    ValueError
        If the classifier does not return exactly one prediction per row.
    """
    # reset_index already returns a new frame, so no separate copy is needed;
    # the positional index lets the predictions (0..n-1) line up row-for-row.
    base = df.reset_index(drop=True)

    # Missing titles become empty strings so the model only ever receives str.
    # NOTE(review): such rows still get a prediction for "" - confirm intended.
    texts = base["en-title"].fillna("").astype(str).tolist()

    if not texts:
        # Nothing to classify: skip the model call but keep the output schema
        # so downstream renames/drops still find their columns.
        empty_bias = pd.DataFrame(
            {
                "label": pd.Series(dtype="object"),
                "score": pd.Series(dtype="float64"),
            }
        )
        return pd.concat([base, empty_bias], axis=1)

    # Batch inference; the output is expected to be a list of dicts.
    outputs = classify(texts)
    bias_df = pd.DataFrame(outputs)

    # A length mismatch would otherwise be NaN-padded silently by concat.
    if len(bias_df) != len(base):
        raise ValueError(
            f"classify returned {len(bias_df)} predictions for {len(base)} rows"
        )

    return pd.concat([base, bias_df], axis=1)

In [7]:
# Smoke-test add_bias_cols on the first 20 rows before running it on whole files.
subset = df0_cleaned.head(20)
subset_w_bias = add_bias_cols(subset)
# Show two rows; the frame should now end with the `label` and `score` columns.
subset_w_bias.head(2)

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


Unnamed: 0,ID,publishedAt,instances,source-id,source-name,en-title,language,location_code,location,category,year,month,neg,neu,pos,compound,sentiment_category,label,score
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",,Albidda.net,A doctor warns of new symptoms of “Corona” tha...,ar,ae,United Arab Emirates,general,2020,8,0.149,0.851,0.0,-0.2023,negative,Non-biased,0.781007
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",,Middle East Online,Foldable phones lead Samsung to climb the top ...,ar,ae,United Arab Emirates,general,2020,8,0.0,0.893,0.107,0.2023,positive,Biased,0.581067


In [None]:
# turn score into signed score 
def add_sign(row):
    """Return the row's classifier score signed by its bias label.

    ``"Biased"`` rows keep ``row.score`` unchanged, ``"Non-biased"`` rows get
    the negated score, and any other label yields ``np.nan``.
    """
    bias_label = row.label

    if bias_label == "Non-biased":
        return -row.score  # negative side of the scale
    if bias_label == "Biased":
        return row.score  # positive side of the scale

    # Label not recognised: no signed score can be derived.
    return np.nan

In [9]:
# Try add_sign on the 20-row subset: one signed value per row, stored as `bias_score`.
subset_w_bias["bias_score"] = subset_w_bias.apply(add_sign, axis=1)
# The sign of `bias_score` should match `label` (negative for "Non-biased").
subset_w_bias.head(2)

Unnamed: 0,ID,publishedAt,instances,source-id,source-name,en-title,language,location_code,location,category,year,month,neg,neu,pos,compound,sentiment_category,label,score,bias_score
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",,Albidda.net,A doctor warns of new symptoms of “Corona” tha...,ar,ae,United Arab Emirates,general,2020,8,0.149,0.851,0.0,-0.2023,negative,Non-biased,0.781007,-0.781007
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",,Middle East Online,Foldable phones lead Samsung to climb the top ...,ar,ae,United Arab Emirates,general,2020,8,0.0,0.893,0.107,0.2023,positive,Biased,0.581067,0.581067


In [13]:
# Give the classifier label its final name and discard the unsigned score,
# which is now redundant with `bias_score`.
subset_w_bias = (
    subset_w_bias
    .rename(columns={"label": "bias_category"})
    .drop(columns=["score"])
)
subset_w_bias.head(2)

Unnamed: 0,ID,publishedAt,instances,source-id,source-name,en-title,language,location_code,location,category,year,month,neg,neu,pos,compound,sentiment_category,bias_category,bias_score
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",,Albidda.net,A doctor warns of new symptoms of “Corona” tha...,ar,ae,United Arab Emirates,general,2020,8,0.149,0.851,0.0,-0.2023,negative,Non-biased,-0.781007
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",,Middle East Online,Foldable phones lead Samsung to climb the top ...,ar,ae,United Arab Emirates,general,2020,8,0.0,0.893,0.107,0.2023,positive,Biased,0.581067


In [14]:
# function to put it all together 

def add_and_clean_bias(df):
    """Classify headlines and return the frame with tidy bias columns.

    Runs ``add_bias_cols`` on ``df``, derives a signed ``bias_score`` from the
    classifier's ``label``/``score`` pair, renames ``label`` to
    ``bias_category`` and drops the raw ``score`` column. Two progress
    messages are printed along the way.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must satisfy the requirements of ``add_bias_cols``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``add_bias_cols`` with ``bias_category`` and
        ``bias_score`` in place of ``label`` and ``score``.
    """
    # add bias cols to df
    new_df = add_bias_cols(df)
    print("Bias columns added!")

    # Vectorised form of applying add_sign row by row: "Biased" keeps the
    # score, "Non-biased" negates it, and any label missing from the mapping
    # becomes NaN, so its product is NaN as well.
    sign = new_df["label"].map({"Biased": 1.0, "Non-biased": -1.0}).astype("float64")
    new_df["bias_score"] = sign * new_df["score"]

    # rename and drop
    new_df = new_df.rename(columns={"label": "bias_category"}).drop(columns=["score"])
    print("Cleaning finished!")

    return new_df

In [15]:
import pyarrow.parquet as pq

def process_parquet_in_chunks(path, batch_size=5000):
    """Add bias columns to a parquet file by processing it batch by batch.

    The file is read in record batches of at most ``batch_size`` rows; each
    batch is converted to pandas, passed through ``add_and_clean_bias`` and
    the processed pieces are concatenated at the end. Progress is printed
    per batch.

    Parameters
    ----------
    path : str or file-like
        Source accepted by ``pyarrow.parquet.ParquetFile``.
    batch_size : int, default 5000
        Maximum number of rows per batch handed to the classifier.

    Returns
    -------
    pandas.DataFrame
        All processed batches stacked in file order with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the file yields no batches, so there is nothing to concatenate.
    """
    result_chunks = []

    # The context manager releases the file handle even if a batch fails.
    with pq.ParquetFile(path) as parquet_file:
        batches = parquet_file.iter_batches(batch_size=batch_size)
        for count, batch in enumerate(batches, start=1):
            print(f"Running batch {count}:")
            processed_chunk = add_and_clean_bias(batch.to_pandas())
            result_chunks.append(processed_chunk)
            print(f"Finished batch {count}!")

    # pd.concat would raise a generic "No objects to concatenate" here; fail
    # with the same exception type but name the offending file.
    if not result_chunks:
        raise ValueError(f"No row batches could be read from {path!r}")

    return pd.concat(result_chunks, ignore_index=True)

In [None]:
# Run the full pipeline on file 0, in batches of 10,000 rows (overriding the
# function's default of 5,000).
df0_bias = process_parquet_in_chunks("data/sentiment_data_0.parquet", batch_size=10000)

# Preview two rows of the processed frame.
df0_bias.head(2)

Running batch 1:
Bias columns added!
Cleaning finished!
Finished batch 1!
Running batch 2:
Bias columns added!
Cleaning finished!
Finished batch 2!
Running batch 3:


In [None]:
# Write the processed frame for file 0 to a new parquet file; the pandas index
# is not written (index=False).
df0_bias.to_parquet("data/final_data_0.parquet", index=False)

In [None]:
# loop through rest of files 

for i in range(1, 5):
    print(i)

    # Input file for this index and the matching output path.
    filename, new_filename = (
        f"data/sentiment_data_{i}.parquet",
        f"data/final_data_{i}.parquet",
    )

    # Classify the whole file batch by batch.
    print(f"Processing {filename}:")
    df_bias = process_parquet_in_chunks(filename, batch_size=5000)

    # Persist the processed frame without writing the pandas index.
    df_bias.to_parquet(new_filename, index=False)
    print(f"Successfully saved {new_filename}!")