In [1]:
import pandas as pd
import numpy as np
import os
import torch 
import torch.nn.functional as F

import plotly
import plotly.graph_objects as go

from config import DATA_FOLDER, DATA_PCL_NAME, DATA_CATEGORIES_NAME
from utils import Utils
from transformers import BertTokenizer

import plotly
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

# The dataset lives at <parent of the working directory>/DATA_FOLDER/DATA_PCL_NAME.
pcl_csv_path = os.path.join(os.path.dirname(os.getcwd()), DATA_FOLDER, DATA_PCL_NAME)

# Load the CSV and discard every row with a missing value in any column.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks like
# a row index that was written into the CSV -- confirm whether it is needed.
df = pd.read_csv(pcl_csv_path).dropna()
df

Unnamed: 0,par_id,art_id,keyword,country_code,text,label,binary_label
0,1,24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0.0,0.0
1,2,21968160,migrant,gh,"In Libya today , there are countless number of...",0.0,0.0
2,3,16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0.0,0.0
3,4,7811231,disabled,nz,Council customers only signs would be displaye...,0.0,0.0
4,5,1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0.0,0.0
...,...,...,...,...,...,...,...
10463,10464,19612634,disabled,ie,"""When Marie O'Donoghue went looking for a spec...",0.0,0.0
10464,10465,14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",1.0,0.0
10465,10466,70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0.0,0.0
10466,10467,20282330,in-need,ng,""""""" She has one huge platform , and informatio...",3.0,1.0


In [2]:
# Distinct values of the "keyword" column.
set(df["keyword"].drop_duplicates())

{'disabled',
 'homeless',
 'hopeless',
 'immigrant',
 'in-need',
 'migrant',
 'poor-families',
 'refugee',
 'vulnerable',
 'women'}

In [112]:
def save_figure(fig, fig_name: str) -> None:
    """Save a figure to ``plot_folder/<fig_name>``.

    Args:
        fig: A plotly figure (anything exposing ``write_image``) or a
            matplotlib figure (anything exposing ``savefig``).
        fig_name: File name, including extension, relative to ``plot_folder``.

    The target directory is created on demand so the first save on a fresh
    checkout does not fail because ``plot_folder`` is missing.
    """
    fig_path = os.path.join("plot_folder", fig_name)
    os.makedirs(os.path.dirname(fig_path), exist_ok=True)

    # Dispatch on the export method instead of plotly's private
    # ``graph_objs._figure`` module, which is not part of its public API.
    if hasattr(fig, "write_image"):  # plotly
        fig.write_image(fig_path)
    else:  # matplotlib
        fig.savefig(fig_path)

def _binary_label_bars(df, group_col, groups):
    """Build the "No PCL" / "PCL" bar traces counting rows for each value of ``group_col``."""
    return [
        go.Bar(
            name=trace_name,
            x=groups,
            y=[len(df[(df[group_col] == g) & (df["binary_label"] == flag)]) for g in groups],
        )
        for trace_name, flag in (("No PCL", 0), ("PCL", 1))
    ]


def _length_boxes_per_group(df, group_col, groups, max_length):
    """Build one box trace per group, splitting text lengths below ``max_length`` by binary label."""
    short = df[df["length_text"] < max_length]
    traces = []
    for g in groups:
        in_group = short[short[group_col] == g]
        no_pcl = in_group.loc[in_group["binary_label"] == 0, "length_text"]
        pcl = in_group.loc[in_group["binary_label"] == 1, "length_text"]
        traces.append(go.Box(
            name=g,
            # The x category decides which side of the grouped plot each sample lands on.
            x=["No PCL"] * len(no_pcl) + ["PCL"] * len(pcl),
            y=np.concatenate([no_pcl, pcl]),
            boxpoints='outliers',  # only draw the points flagged as outliers
            jitter=1,  # spread the drawn points across the box width
        ))
    return traces


def _finalise_figure(fig, file_name, **layout):
    """Apply the shared layout plus ``layout`` overrides, then display and save the figure."""
    fig.update_layout(showlegend=True, margin=dict(l=0, r=0, t=30, b=0), **layout)
    fig.show()
    save_figure(fig, file_name)


def dataset_analysis(df_analysis) -> None:
    """Plot and save the exploratory figures for the PCL dataset.

    Reads the ``binary_label``, ``label``, ``keyword``, ``country_code`` and
    ``text`` columns of ``df_analysis`` and produces eight figures: label
    counts, PCL level counts, label counts per keyword and per country code,
    text-length box plots (overall, per keyword, per country code) and label
    counts per text-length decile. Each figure is displayed and then written
    through ``save_figure``.

    Args:
        df_analysis: Frame holding the columns listed above. It is not
            modified; the function works on a deep copy.
    """
    # make a copy first
    df = df_analysis.copy(deep=True)

    # Distribution of labels
    fig = go.Figure(data=[
        go.Bar(name="No PCL", x=['No PCL'], y=[len(df[df["binary_label"] == 0])]),
        go.Bar(name="PCL", x=['PCL'], y=[len(df[df["binary_label"] == 1])]),
    ])
    _finalise_figure(
        fig, "label_pcl_no_pcl_distribution.png",
        barmode="group",
        title_text="Distribution of PCL and No PCL labels")

    # Distribution of PCL labels
    fig = go.Figure(data=[
        go.Bar(
            name=f"PCL level {level}",
            x=[f"Level {level}"],
            y=[len(df[df["label"] == level])],
        )
        for level in (2, 3, 4)
    ])
    _finalise_figure(
        fig, "label_pcl_distribution.png",
        barmode="group",
        title_text="Distribution of PCL labels")

    # Label per type of article
    article_type = np.unique(df["keyword"])
    fig = go.Figure(data=_binary_label_bars(df, "keyword", article_type))
    _finalise_figure(
        fig, "label_article.png",
        barmode="group",
        title_text="Distribution per article type")

    # Label per country code
    country_codes = np.unique(df["country_code"])
    fig = go.Figure(data=_binary_label_bars(df, "country_code", country_codes))
    _finalise_figure(
        fig, "label_country.png",
        barmode="group",
        title_text="Distribution per country code")

    # Text length in characters, computed once and shared by every figure below.
    df["length_text"] = df["text"].apply(len)

    # Length of text and label (texts of 5000+ characters are left out of the plot)
    fig = go.Figure(data=[
        go.Box(
            name=trace_name,
            x=df.loc[(df["binary_label"] == flag) & (df["length_text"] < 5000), "length_text"],
            boxpoints='outliers',  # only draw the points flagged as outliers
            jitter=1,  # spread the drawn points across the box width
        )
        for trace_name, flag in (("No PCL", 0), ("PCL", 1))
    ])
    _finalise_figure(
        fig, "label_length_text_box.png",
        title_text="Length of text for PCL and No PCL labels")

    # Length of text and label per article type (texts of 1000+ characters left out)
    fig = go.Figure(data=_length_boxes_per_group(df, "keyword", article_type, 1000))
    fig.update_traces(orientation='v')  # v box plots
    _finalise_figure(
        fig, "label_length_text_box_per_article.png",
        boxmode='group',
        title_text="Length of text for PCL and No PCL labels per article type")

    # Length of text and label per country code (texts of 1000+ characters left out)
    fig = go.Figure(data=_length_boxes_per_group(df, "country_code", country_codes, 1000))
    fig.update_traces(orientation='v')  # v box plots
    _finalise_figure(
        fig, "label_length_text_box_per_country.png",
        boxmode='group',
        title_text="Length of text for PCL and No PCL labels per country")

    # Label counts per text-length decile
    lengths = df["length_text"]
    quantiles = np.quantile(lengths, [i / 10 for i in range(11)])
    # Bins are half-open [q_i, q_{i+1}) so a length that equals an interior
    # boundary is counted once; only the last bin is closed, to keep the maximum.
    decile_masks = [
        (lengths >= quantiles[i])
        & ((lengths <= quantiles[i + 1]) if i == 9 else (lengths < quantiles[i + 1]))
        for i in range(10)
    ]
    decile_labels = [f"Quantile {10 * i}%" for i in range(1, 11)]
    fig = go.Figure(data=[
        go.Bar(
            name=trace_name,
            x=decile_labels,
            y=[int((mask & (df["binary_label"] == flag)).sum()) for mask in decile_masks],
        )
        for trace_name, flag in (("No PCL", 0), ("PCL", 1))
    ])
    _finalise_figure(
        fig, "label_length_text_quantile.png",
        barmode="group",
        title_text="Label per length of text quantile",
        xaxis_tickangle=-45)

In [113]:
# Display and save every exploratory figure for the loaded dataset.
dataset_analysis(df)

Identifying patronizing or condescending language can be difficult because it often involves subtle cues and contextual factors that can be hard to discern. Here are some reasons why this can be challenging:

- Degree of patronizing or condescending language: The data shows a spectrum of levels of condescending language, and some are much more subtle than others. As a result, it can be difficult to distinguish between small degrees of patronizing or condescending language.

- Intent vs. impact: Sometimes, people may not intend to be patronizing or condescending, but their words or tone can still come across that way. Other times, people may deliberately use patronizing or condescending language as a way of asserting power or authority over someone else. In either case, it can be hard to distinguish between someone who is genuinely trying to be helpful and someone who is being condescending.

- Cultural and social norms: Different cultures and social groups may have different expectations around communication styles, and what one person perceives as patronizing or condescending may be seen as perfectly normal or even respectful by someone else. This can make it difficult to know whether a particular communication is intended to be patronizing or not.

- Context: The use of patronizing or condescending language is often tied to power dynamics, such as age, gender, race, or social status. For example, a younger person might feel patronized by an older person, or a person of color might feel condescended to by a white person. These power dynamics can make it difficult to challenge patronizing language directly, especially if the person using it has more power in the situation. The context in which patronizing or condescending language is found can also vary a lot, and patronization can look different across different groups:

    - Women: Patronizing behavior towards women often involves assuming that they are less competent or intelligent than men, and treating them as if they need to be protected or guided. For example, a man might interrupt a woman during a conversation or explain something to her in a condescending tone, even if she already understands the topic.

    - People of color: Patronizing behavior towards people of color often involves assuming that they are less educated or knowledgeable than white people, and treating them as if they need to be taught or corrected. For example, a white person might speak slowly or use simplified language when talking to a person of color, even if the person is fluent in English.

    - Older adults: Patronizing behavior towards older adults often involves assuming that they are frail, forgetful, or out of touch with modern technology or culture. For example, a younger person might speak loudly or slowly to an older adult, or assume that they are unable to use a computer or smartphone.

- Nonverbal cues: Tone of voice, facial expressions, and body language can all contribute to the perception of patronizing or condescending language. However, these cues can be subtle and subjective, making it hard to determine whether someone is being intentionally condescending or not.

Overall, identifying patronizing or condescending language requires careful attention to both verbal and nonverbal cues, as well as an understanding of the cultural and social context in which the communication is taking place.

EXAMPLES: 
- Degree of patronizing or condescending language

    Level 2: "She reiterated her ministry's commitment to put in place the necessary legal and policy framework to address all issues that affect women's rights and gave a strong indication of hope for Ghanaian women."  (par_id:6249, art_id:1947926)		

    Level 4: "Women are generally not as competitive as men, and not as motivated by job prestige. We want to actualise but not necessarily in the same ways as men. While men may want to lead conglomerates , corporations and boards , women are generally content to have a good, secure job with an attractive salary. Some women do pursue power positions, but many are not interested. They don't want the hassle." (par_id:1194, art_id:2306623)

- Intent vs. impact

    Level 2: "Mombasa county team manager Anisa Abdala called on the corporate community to sponsor various teams as a way of showing solidarity with the disabled." (par_id:715, art_id:1852135) 	

- Cultural and social norms:

    Level 3: "The AIMMM registers its full sympathy with the hapless female victims of violence and rape crimes , especially the young lady brutalised by beasts on a moving Delhi bus in the night of 16 December which has rightly agitated a large section of Indians. All such crimes against women and weaker sections , especially minorities, Dalits and Adivasis are highly condemnable and show the whole country in extremely bad light. It is time people, police, judiciary and government agencies stood up to discharge their duties to protect the weak and vulnerable sections of society. The AIMMM urges the government to introduce deterrent punishments for such brutal crimes. It should also hasten with the overdue Police reforms to liberate the police force from the British Raj rules and make it capable and accountable to protect all citizens, human rights and honour of ordinary people." (par_id:2803, art_id:1975006, country_code='in')

    Level 4: "Instead of passively paying a sickness benefit for 40 years, for example, we want to take steps to intervene now to help vulnerable New Zealanders get a job, lead a better life, and save the Government money in the long run." (par_id:438, art_id:8782967, country_code='nz')

- Context

    Level 3: "The organizing committee also confirmed that Russian low-income citizens, including families with many children, orphans and disabled people, will be able to visit the Confederations Cup matches for free." (par_id:10371, art_id:19036460, keyword:"disabled")

    Level 4: "Her house now holds the memories and keepsakes of a migrant's life well lived, selflessly offering her home and love to more than 100 British foster children and making history in the process -- along with her late husband --  by being the first black couple in the Lambeth Council in South London to be allowed to foster white children." (par_id:734, art_id:4563003, keyword:"migrant")

In [60]:
import plotly.graph_objects as go

# Category of each sample: the first six belong to day 1, the last six to day 2.
y = ['day 1'] * 6 + ['day 2'] * 6

# One box trace per crop; the traces share the day categories in `y`.
fig = go.Figure(data=[
    go.Box(x=moisture, y=y, name=crop, marker_color=colour)
    for crop, colour, moisture in (
        ('kale', '#3D9970',
         [0.2, 0.2, 0.6, 1.0, 0.5, 0.4, 0.2, 0.7, 0.9, 0.1, 0.5, 0.3]),
        ('radishes', '#FF4136',
         [0.6, 0.7, 0.3, 0.6, 0.0, 0.5, 0.7, 0.9, 0.5, 0.8, 0.7, 0.2]),
        ('carrots', '#FF851B',
         [0.1, 0.3, 0.1, 0.9, 0.6, 0.6, 0.9, 1.0, 0.3, 0.6, 0.8, 0.5]),
    )
])

# Group the boxes of the different crops side by side within each day.
fig.update_layout(
    boxmode='group',
    xaxis=dict(title='normalized moisture', zeroline=False),
)

fig.update_traces(orientation='h')  # horizontal box plots
fig.show()