# Data exploration: Kumar dataset

In [99]:
import os
import sys
import json
import pandas as pd
from tqdm import tqdm
import pickle

sys.path.append('../modules/')

from data_utils import generate_aggregated_labels_dataset

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
RAW_DATA_PATH = '/data1/moscato/personalised-hate-boundaries-data/data/kumar_with_annotator_id.json'
DATA_PATH = '/data1/moscato/personalised-hate-boundaries-data/data/kumar.csv'
COMPLETE_DATA_PATH = '/data1/moscato/personalised-hate-boundaries-data/data/kumar_processed_with_ID_and_full_perspective.csv'

## Compare the raw data and the data from the csv file [SAFELY IGNORE THIS SECTION - NO LONGER NEEDED]

**Results:** the raw data contains two more fields, `gender_other` and `worker_id`.

In [None]:
with open(RAW_DATA_PATH, 'r') as f:
    kumar_raw_data = f.readlines()

kumar_raw_data_df = []

for line in tqdm(kumar_raw_data):
    line_json = json.loads(line)

    line_df = pd.DataFrame(line_json['ratings'])
    line_df['comment'] = line_json['comment']
    line_df['source'] = line_json['source']
    line_df['perspective_score'] = line_json['perspective_score']
    line_df['comment_id'] = line_json['comment_id']

    kumar_raw_data_df.append(line_df)

kumar_raw_data_df = pd.concat(kumar_raw_data_df)

kumar_data = pd.read_csv(DATA_PATH).drop(columns=['Unnamed: 0'])

In [None]:
fields_from_json = set(kumar_raw_data_df.columns)
fields_from_csv = set(kumar_data.drop(columns=['comment', 'source', 'perspective_score', 'comment_id']).columns)

In [None]:
fields_from_json - fields_from_csv

## Explore the complete dataset

In [36]:
complete_df = pd.read_csv(COMPLETE_DATA_PATH).drop(columns=['X.x'])

# Unique identifier for comments (still multiple annotations
# for each of them).
complete_df['instance_id'] = (
    complete_df['comment_id'].astype(str)
    + '_'
    + complete_df['source']
)

# Unique identifier for (comment, annotation) pairs.
complete_df['sample_id'] = (
    complete_df['comment_id'].astype(str)
    + '_'
    + complete_df['source']
    + '_'
    + complete_df['worker_id']
)

  complete_df = pd.read_csv(COMPLETE_DATA_PATH).drop(columns=['X.x'])


There exist comment IDs associated to different sources.

In [37]:
(complete_df.groupby('comment_id')['source'].nunique() == 1).all()

np.False_

In [38]:
comments_multiple_source = complete_df.groupby('comment_id').agg(
    n_sources=pd.NamedAgg('source', 'nunique'),
    n_comments=pd.NamedAgg('comment', 'nunique')
).reset_index().sort_values('n_sources', ascending=False).reset_index(drop=True)

comments_multiple_source

Unnamed: 0,comment_id,n_sources,n_comments
0,30121,2,2
1,62611,2,2
2,66415,2,2
3,66416,2,2
4,66417,2,2
...,...,...,...
104682,72577,1,1
104683,72576,1,1
104684,72575,1,1
104685,72571,1,1


In [39]:
# Comments with same comment ID, different source and different text.
complete_df[
    complete_df['comment_id'].isin(
        comments_multiple_source[comments_multiple_source['n_sources'] > 1]['comment_id']
    )
][['comment_id', 'source', 'comment']].drop_duplicates().sort_values('comment_id')

Unnamed: 0,comment_id,source,comment
35305,715,4chan,black hebrews calls the Holocaust a joke and a...
49155,715,reddit,It's a dog whistle you terrorist enabler nazi ...
49170,718,reddit,I am not this invested in battle rap
35320,718,twitter,shfkfiyfg why are you like that
112900,1011,reddit,"Damn, you sir are sexy as fuck!"
...,...,...,...
458715,77017,reddit,But we don't have any yet
458720,77018,reddit,"I'm not a 2A zealot, I'm a lawyer and a gun ow..."
6670,77018,twitter,"Yeah, her and Paultin, at least, shouldn't be ..."
111885,77121,twitter,Jess gotta call Shani when she needs her nails...


`instance_id` (concatenating `comment_id` and `source`) is not a good identifier for comments: there exist samples with same `instance_id` but different text.

In [40]:
(complete_df.groupby('instance_id')['comment'].nunique() >1).mean()

np.float64(0.01703885009025015)

In [41]:
t = (
    complete_df.groupby('instance_id')['comment']
    .nunique()
    .reset_index()
    .rename(columns={'comment': 'n_comments'})
)

t[t['n_comments'] > 1]

Unnamed: 0,instance_id,n_comments
20,10015_twitter,2
21,10016_twitter,2
202,10121_twitter,2
299,1016_twitter,2
320,1017_twitter,2
...,...,...
105041,9517_twitter,2
105042,9518_twitter,2
105043,9519_twitter,2
105056,9521_twitter,2


In [42]:
(
    complete_df[
        complete_df['instance_id'].isin(t[t['n_comments'] > 1]['instance_id'])
    ]
    [['instance_id', 'comment']]
    .drop_duplicates()
    .sort_values('instance_id')
)

Unnamed: 0,instance_id,comment
130505,10015_twitter,Ariana really fucked around and made one of th...
64455,10015_twitter,Sounds like you're a no one who's gonna die bi...
130510,10016_twitter,I’m pretty shocked at the gayness of my state.
64460,10016_twitter,People all over the world is supporting ‘City ...
112985,10121_twitter,I think we can all agree that Ghislaine Maxwel...
...,...,...
121825,9519_twitter,Ahhhh #Fredo are you still trying to be relevant?
105700,9521_twitter,Disappointed but not suprised
99985,9521_twitter,came back to discord to see this
125585,9621_twitter,"Captain Chrunch, Reese’s Puffs, and Cinnamon l..."


There exist two samples with same `sample_id` (concatenating `comment_id`, `source` and `worker_id`) but with different comments (text).

In [43]:
t2 = complete_df.groupby('sample_id')['comment'].nunique().reset_index()

t2[t2['comment'] > 1]

Unnamed: 0,sample_id,comment
434643,65615_4chan_5221b352f3ef42dbc6cac4fe080d46605f...,2


In [44]:
complete_df[complete_df['sample_id'].isin(t2[t2['comment'] > 1]['sample_id'])]

Unnamed: 0,comment,source,perspective_score,comment_id,personally_been_target,toxic_score,remove_from_online,personally_seen_toxic_content,religion_important,uses_media_video,...,TOXICITY,THREAT,INSULT,PROFANITY,IDENTITY_ATTACK,SEVERE_TOXICITY,instance_id,toxic_score_mean,toxic_score_sd,sample_id
123158,That's not what you're saying right now. Again...,4chan,0.837722,65615,False,1,It depends on the context,True,Not important,True,...,0.811563,0.010395,0.633669,0.709284,0.030691,0.191499,65615_4chan,1.1,1.449138,65615_4chan_5221b352f3ef42dbc6cac4fe080d46605f...
459709,who wants to feel special and as if you have p...,4chan,0.187817,65615,False,0,This comment should be allowed,True,Not important,True,...,0.811563,0.010395,0.633669,0.709284,0.030691,0.191499,65615_4chan,1.1,1.449138,65615_4chan_5221b352f3ef42dbc6cac4fe080d46605f...


Check: is `sample_id` a unique identifier?

In [45]:
t3 = complete_df.groupby('sample_id')['worker_id'].count().reset_index()

t3[t3['worker_id'] > 1]

Unnamed: 0,sample_id,worker_id
29880,134110_twitter_25b4b2033e0e1383bd01bd50040439f...,3
29883,134111_reddit_25b4b2033e0e1383bd01bd50040439f0...,3
29886,134112_twitter_25b4b2033e0e1383bd01bd50040439f...,3
29889,134113_twitter_25b4b2033e0e1383bd01bd50040439f...,3
29892,134114_4chan_25b4b2033e0e1383bd01bd50040439f05...,3
...,...,...
522150,8115_twitter_b02152d26c28b1d5a3836fc9faa75d730...,2
522160,8116_twitter_b02152d26c28b1d5a3836fc9faa75d730...,2
522164,8117_4chan_b02152d26c28b1d5a3836fc9faa75d73053...,2
522173,8118_reddit_b02152d26c28b1d5a3836fc9faa75d7305...,2


In [46]:
complete_df[
    complete_df['sample_id'].isin(t3[t3['worker_id'] > 1]['sample_id'])
][['comment_id', 'source', 'worker_id', 'comment', 'toxic_score']]

Unnamed: 0,comment_id,source,worker_id,comment,toxic_score
16103,16751,twitter,75724671bf72e9cf0bcbfa77d3924740aa0fbafab2e205...,For the culture and plus this man still doing ...,0
16104,16751,twitter,75724671bf72e9cf0bcbfa77d3924740aa0fbafab2e205...,For the culture and plus this man still doing ...,1
16108,16755,twitter,75724671bf72e9cf0bcbfa77d3924740aa0fbafab2e205...,"Easy for you to say, Jonah. Poor Pip!",1
16109,16755,twitter,75724671bf72e9cf0bcbfa77d3924740aa0fbafab2e205...,"Easy for you to say, Jonah. Poor Pip!",0
16113,16756,twitter,75724671bf72e9cf0bcbfa77d3924740aa0fbafab2e205...,if you rly think you’re just gonna catch a can...,1
...,...,...,...,...,...
505689,605921,reddit,3332bb75399ba7845a958d3fa22dbef4f68df7221ce477...,I wouldn't underestimate Tabasco. That shit ca...,0
505690,605922,reddit,3332bb75399ba7845a958d3fa22dbef4f68df7221ce477...,the documentary Who Killed The Electric Car ha...,2
505694,605922,reddit,3332bb75399ba7845a958d3fa22dbef4f68df7221ce477...,the documentary Who Killed The Electric Car ha...,0
505695,605923,reddit,3332bb75399ba7845a958d3fa22dbef4f68df7221ce477...,"I don’t think it’s defending this guy, more so...",0


## Build a clean dataset

Dataset cleaning:
1. Build two new identifiers,
- `text_id`: uniquely identifies a piece of text.
- `annotation_id`: uniquely identifies a (piece of text, annotation) pair (essentially encodes the `text_id` and `worker_id`, the latter of which we just need to trust from the original dataset).
2. Drop all the samples corresponding to pieces of text annotated more than once by the same annotator (i.e. the cases in which we have duplicated `annotation_id`s).

In [3]:
OUTPUT_PATH = '/data1/moscato/personalised-hate-boundaries-data/data/kumar_perspective_clean/'

In [4]:
complete_df_clean = pd.read_csv(COMPLETE_DATA_PATH).drop(columns=['X.x'])

# Define the `text_id` field.
text_ids_df = pd.DataFrame({
    'comment': complete_df_clean['comment'].unique(),
    'text_id': range(complete_df_clean['comment'].unique().shape[0])
})

text_ids_df['text_id'] = text_ids_df['text_id'].astype(str)

# Add the `text_id` field to the dataset.
complete_df_clean = pd.merge(
    left=complete_df_clean,
    right=text_ids_df,
    on='comment',
    how='left'
)

# Define the `annotation_id` field.
complete_df_clean['annotation_id'] = complete_df_clean['text_id'] + '_' + complete_df_clean['worker_id']

# Drop all the rows corresponding to duplicated `annotation_id`s
# (annotators labelling the same piece of text more than once).
rows_to_drop = complete_df_clean[complete_df_clean['annotation_id'].duplicated(keep=False)].reset_index(drop=True)

complete_df_clean = complete_df_clean[~complete_df_clean['annotation_id'].isin(rows_to_drop['annotation_id'])].reset_index(drop=True)

# Save dropped rows.
rows_to_drop.to_csv(os.path.join(OUTPUT_PATH, 'dropped_rows.csv'), index=None)

# Save clean dataset.
complete_df_clean.to_csv(os.path.join(OUTPUT_PATH, 'kumar_processed_with_ID_and_full_perspective_clean.csv'), index=None)

  complete_df_clean = pd.read_csv(COMPLETE_DATA_PATH).drop(columns=['X.x'])


## OLD VERSION: IGNORE!

Dataset cleaning:
1. Drop **all** the samples for which the same `(comment_id, source)` pair corresponds to more than one value in the `comment` field (i.e. the text).
2. Drop the **single** sample for which the same `(comment_id, source, worker_id)` triple (i.e. `sample_id` value) corresponds to more than one value in the `comment` field (i.e. the text).
3. Drop **all** the samples for which the `(comment_id, source, worker_id)` triple is not unique (annotators that annotated the same comment more than once).
4. Keep track of the sample IDs that have been dropped.

In [3]:
complete_df_clean = pd.read_csv(COMPLETE_DATA_PATH).drop(columns=['X.x', 'X.y', 'instance_id'])

complete_df_clean['sample_id'] = (
    complete_df_clean['comment_id'].astype(str)
    + '_'
    + complete_df_clean['source']
    + '_'
    + complete_df_clean['worker_id']
)


# Drop all the samples for which the same (comment_id, source) pair
# corresponds to multiple comments (text).
comment_id_source_duplicated_comments = (
    complete_df_clean.groupby(['comment_id', 'source'])['comment']
    .nunique()
    .rename('n_comments')
    .reset_index()
    .query('n_comments > 1')
)
comment_id_source_duplicated_comments['comment_id_source'] = (
    comment_id_source_duplicated_comments['comment_id'].astype(str)
    + '_'
    + comment_id_source_duplicated_comments['source']
)

complete_df_clean['comment_id_source'] = (
    complete_df_clean['comment_id'].astype(str)
    + '_'
    + complete_df_clean['source']
)

samples_to_drop_step_1 = complete_df_clean[
    complete_df_clean['comment_id_source'].isin(comment_id_source_duplicated_comments['comment_id_source'])
]['sample_id']

print(
    f'Dropping {samples_to_drop_step_1.shape[0]} samples'
    ' corresponding to same (comment_id, source) but different comments'
    ' (with multiple annotators for each)'
)

complete_df_clean = complete_df_clean[
    ~complete_df_clean['sample_id'].isin(samples_to_drop_step_1)
].reset_index(drop=True)


# Drop samples for which the same sample ID corresponds to multiple
# comments (text).
# This samples have already been dropped in the previous step.
sample_id_duplicated_comments = (
    complete_df_clean.groupby('sample_id')['comment']
    .nunique()
    .rename('n_comments')
    .reset_index()
    .query('n_comments > 1')
)

print('Samples to drop:', len(sample_id_duplicated_comments))


# Drop all samples with duplicated sample ID (i.e. annotators that
# annotated the same comment multiple times).
samples_to_drop_step_3 = complete_df_clean[complete_df_clean['sample_id'].duplicated(keep=False)]['sample_id']

print(
    f'Dropping {samples_to_drop_step_3.shape[0]} samples'
    ' corresponding to same duplicated sample IDs'
)

complete_df_clean = complete_df_clean[
    ~complete_df_clean['sample_id'].isin(samples_to_drop_step_3)
].reset_index(drop=True).drop(columns=['comment_id_source'])


# Save to disk.
# output_path = '../data/clean/kumar_perspective_clean/kumar_processed_with_ID_and_full_perspective_clean.csv'
output_path = '/data1/moscato/personalised-hate-boundaries-data/data/kumar_perspective_clean/kumar_processed_with_ID_and_full_perspective_clean.csv'

print('Saving clean data to:', output_path)

complete_df_clean.to_csv(output_path, index=None)

  complete_df_clean = pd.read_csv(COMPLETE_DATA_PATH).drop(columns=['X.x', 'X.y', 'instance_id'])


Dropping 18030 samples corresponding to same (comment_id, source) but different comments (with multiple annotators for each)
Samples to drop: 0
Dropping 272 samples corresponding to same duplicated sample IDs
Saving clean data to: /data1/moscato/personalised-hate-boundaries-data/data/kumar_perspective_clean/kumar_processed_with_ID_and_full_perspective_clean.csv


In [4]:
dropped_sample_ids = pd.concat([samples_to_drop_step_1, samples_to_drop_step_3]).drop_duplicates().reset_index(drop=True).tolist()

# dropped_sample_ids_output_path = '../data/clean/kumar_perspective_clean/dropped_sample_ids.pkl'
dropped_sample_ids_output_path = '/data1/moscato/personalised-hate-boundaries-data/data/kumar_perspective_clean/dropped_sample_ids.pkl'

print('Saving dropped samples ids to:', dropped_sample_ids_output_path)

with open(dropped_sample_ids_output_path, 'wb') as f:
    pickle.dump(dropped_sample_ids, f)

Saving dropped samples ids to: /data1/moscato/personalised-hate-boundaries-data/data/kumar_perspective_clean/dropped_sample_ids.pkl


## Exploration of the clean dataset

### Aggregated labels dataset

Options (from the `generate_aggregated_labels_dataset`):
- Binarize labels via a threshold.
    - Subsample to the minority class.
- Keep all labels.

In [103]:
CLEAN_DATA_PATH = '/data1/moscato/personalised-hate-boundaries-data/data/kumar_perspective_clean/kumar_processed_with_ID_and_full_perspective_clean.csv'

majority_vote_data = generate_aggregated_labels_dataset(
    'kumar',
    CLEAN_DATA_PATH,
    binarize_labels=True,
    subsample_majority_class=False
)

2025-01-17 14:47:19,026 - generate_aggregated_labels_dataset - INFO - Reading kumar dataset from: /data1/moscato/personalised-hate-boundaries-data/data/kumar_perspective_clean/kumar_processed_with_ID_and_full_perspective_clean.csv
  data_df = pd.read_csv(dataset_path)[
2025-01-17 14:47:23,743 - generate_aggregated_labels_dataset - INFO - Binarizing labels


In [104]:
majority_vote_data.groupby('label').agg(
    frac=pd.NamedAgg('text_id', lambda g: g.shape[0] / len(majority_vote_data)),
    count=pd.NamedAgg('text_id', 'count')
).reset_index()

Unnamed: 0,label,frac,count
0,0,0.780261,82735
1,1,0.219739,23300
