<a href="https://colab.research.google.com/github/MatteoFasulo/Sexism-detection/blob/main/assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [5]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

### Constants and Setup

# Task 1: Corpus

1. Download the data
2. Load the JSON files and encode them as a DataFrame
3. Generate hard labels for Task 1 with majority voting
4. Filter the DataFrame for only english tweets
5. Remove unwanted columns
6. Encode the hard labels column as integers

### Download the data

In [4]:
if not Path("data").exists():
    os.mkdir("data")
    print("data directory created")

train_url = 'https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2024-2025/Assignment%201/data/training.json'
test_url = 'https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2024-2025/Assignment%201/data/test.json'
val_url = 'https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2024-2025/Assignment%201/data/validation.json'

os.system(f"wget {train_url} -O data/training.json")
os.system(f"wget {test_url} -O data/test.json")
os.system(f"wget {val_url} -O data/validation.json")

--2024-11-05 18:44:35--  https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2024-2025/Assignment%201/data/training.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6217980 (5.9M) [text/plain]
Saving to: ‘data/training.json’

     0K .......... .......... .......... .......... ..........  0% 3.29M 2s
    50K .......... .......... .......... .......... ..........  1% 1.91M 2s
   100K .......... .......... .......... .......... ..........  2% 3.49M 2s
   150K .......... .......... .......... .......... ..........  3% 3.19M 2s
   200K .......... .......... .......... .......... ..........  4% 3.21M 2s
   250K .......... .......... .......... .......... ..........  4% 2.08M 2s
   300K .......... .......... .......... .......... ..

0

### Load the JSON files and encode them as a DataFrame

In [52]:
train = pd.read_json("data/training.json", orient='index', encoding='utf-8')
val = pd.read_json("data/validation.json", orient='index', encoding='utf-8')
test = pd.read_json("data/test.json", orient='index', encoding='utf-8')

In [53]:
train.head()

Unnamed: 0,id_EXIST,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,labels_task1,labels_task2,labels_task3,split
100001,100001,es,"@TheChiflis Ignora al otro, es un capullo.El p...",6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[YES, YES, NO, YES, YES, YES]","[REPORTED, JUDGEMENTAL, -, REPORTED, JUDGEMENT...","[[OBJECTIFICATION], [OBJECTIFICATION, SEXUAL-V...",TRAIN_ES
100002,100002,es,@ultimonomada_ Si comicsgate se parece en algo...,6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[NO, NO, NO, NO, YES, NO]","[-, -, -, -, DIRECT, -]","[[-], [-], [-], [-], [OBJECTIFICATION], [-]]",TRAIN_ES
100003,100003,es,"@Steven2897 Lee sobre Gamergate, y como eso ha...",6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[NO, NO, NO, NO, NO, NO]","[-, -, -, -, -, -]","[[-], [-], [-], [-], [-], [-]]",TRAIN_ES
100004,100004,es,@Lunariita7 Un retraso social bastante lamenta...,6,"[Annotator_13, Annotator_14, Annotator_15, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[NO, NO, YES, NO, YES, YES]","[-, -, DIRECT, -, REPORTED, REPORTED]","[[-], [-], [IDEOLOGICAL-INEQUALITY], [-], [IDE...",TRAIN_ES
100005,100005,es,@novadragon21 @icep4ck @TvDannyZ Entonces como...,6,"[Annotator_19, Annotator_20, Annotator_21, Ann...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[YES, NO, YES, NO, YES, YES]","[REPORTED, -, JUDGEMENTAL, -, JUDGEMENTAL, DIR...","[[STEREOTYPING-DOMINANCE, OBJECTIFICATION], [-...",TRAIN_ES


In [54]:
train.shape, val.shape, test.shape

((6920, 11), (726, 11), (312, 11))

### Generate hard labels for Task 1 with majority voting

In [55]:
def majority_voting(votes: list[str]) -> str:
    total_num_votes = len(votes)
    yes_votes = votes.count("YES")
    no_votes = total_num_votes - yes_votes

    if yes_votes > no_votes:
        return "YES"
    elif no_votes > yes_votes:
        return "NO"
    else:
        return "NEUTRAL" # This will be the case when there is a tie (removed later)

In [56]:
train['hard_label_task1'] = train['labels_task1'].apply(majority_voting)
val['hard_label_task1'] = val['labels_task1'].apply(majority_voting)
test['hard_label_task1'] = test['labels_task1'].apply(majority_voting)

### Filter the DataFrame for only english tweets and remove unclear tweets

In [57]:
train = train[(train['hard_label_task1'] != "NEUTRAL") & (train['lang'] == "en")]
val = val[(val['hard_label_task1'] != "NEUTRAL") & (val['lang'] == "en")]
test = test[(test['hard_label_task1'] != "NEUTRAL") & (test['lang'] == "en")]

In [58]:
train.shape, val.shape, test.shape

((2870, 12), (158, 12), (286, 12))

### Remove unwanted columns

In [59]:
columns_to_maintain = ['id_EXIST', 'lang', 'tweet', 'hard_label_task1']

train = train[columns_to_maintain]
val = val[columns_to_maintain]
test = test[columns_to_maintain]

In [60]:
train.head()

Unnamed: 0,id_EXIST,lang,tweet,hard_label_task1
200002,200002,en,Writing a uni essay in my local pub with a cof...,YES
200003,200003,en,@UniversalORL it is 2021 not 1921. I dont appr...,YES
200006,200006,en,According to a customer I have plenty of time ...,YES
200007,200007,en,"So only 'blokes' drink beer? Sorry, but if you...",YES
200008,200008,en,New to the shelves this week - looking forward...,NO


### Encode the hard labels column as integers

In [62]:
train['hard_label_task1'] = train['hard_label_task1'].apply(lambda x: 1 if x == "YES" else 0)
val['hard_label_task1'] = val['hard_label_task1'].apply(lambda x: 1 if x == "YES" else 0)
test['hard_label_task1'] = test['hard_label_task1'].apply(lambda x: 1 if x == "YES" else 0)

In [64]:
train.hard_label_task1.value_counts()

hard_label_task1
0    1733
1    1137
Name: count, dtype: int64