In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.3 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 42.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 7.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [2]:
from google.colab import drive
#drive.mount("/content/drive/")
drive._mount("/content/drive/")

Mounted at /content/drive/


In [3]:
# load scripts to run the sexism classifiers
!cp /content/drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/utils/reproduce_sexism_classifier/text_preprocessor.py .
!cp /content/drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/utils/reproduce_sexism_classifier/bert_wrapper_transformers_inference.py .

# load the model checkpoint
!cp -r /content/drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/utils/reproduce_sexism_classifier/checkpoints/final_model .

# load datset
!cp -r "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10_no_duplicates/" .


In [4]:
import numpy as np
import re, glob
import pandas as pd
import os, shutil
from tqdm import tqdm
tqdm.pandas()

from bert_wrapper_transformers_inference import FinetunedBertClassifier

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [29]:
# functions to split the lyrics and generate the input of the classifier
def make_batches_of_line(lines, n_items=4, stride=2):

    l = len(lines)
    line_batches = [" ".join(lines[i:i+n_items]) for i in range(0, l, stride)]
    return line_batches

def make_batch(line_batches, batch_size=64):

    l = len(line_batches)
    for batch in [line_batches[i:i+batch_size] for i in range(0, l, batch_size)]:
        yield batch

def clean_and_split_lyric(lyric):
    lyric = re.sub("\(.*?\)", "", lyric)
    lyric = re.sub("_", "", lyric)

    lines = re.split("(?:\n)+", lyric)
    lines = [l.strip() for l in lines if l.strip()!='']
    lines = [l for l in lines if len(l.split())>3]
    lines

    return lines

def classify(model, lyric):

    lines = clean_and_split_lyric(lyric)
    line_batches = make_batches_of_line(lines)
    sexist_lines = []
    for b in make_batch(line_batches):

        preds = model.predict(b)
        
        sexist_lines_ = [(round(pred, 4), line_batch) for pred, line_batch in zip(preds, b) if pred>=0.5]
        sexist_lines.extend(sexist_lines_)

    return sexist_lines

In [6]:
# call the model
model = FinetunedBertClassifier(from_checkpoint="final_model/checkpoint-153")

In [None]:
decades = [1960, 1970, 1980, 1990, 2000]

for decade in decades:
    lyrics_person_df = pd.read_json(f"dataset_10_no_duplicates/data_lyrics_person_decades/lyrics_{decade}.json.gz",
                                    orient='records', lines=True)
    lyrics_group_df = pd.read_json(f"dataset_10_no_duplicates/data_lyrics_group_decades/lyrics_{decade}.json.gz",
                                    orient='records', lines=True)

    lyrics_df = pd.concat([lyrics_person_df, lyrics_group_df])

    if decade in [1990]: # split dataset in two

        n_lyrics = lyrics_df.shape[0]
        n_lyrics_half = n_lyrics // 2
        lyrics_df1 , lyrics_df2 = lyrics_df[:n_lyrics_half], lyrics_df[n_lyrics_half:]

        lyrics_df1.loc[:, 'sexist_lines'] = lyrics_df1.lyrics.progress_apply(lambda l: classify(model, l))
        lyrics_df1.drop(columns=['lyrics'])
        lyrics_df1.to_json(f"lyrics_sexism_{decade}_first.json",
                                    orient='records', lines=True)
        shutil.copy(f"lyrics_sexism_{decade}_first.json", "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10_no_duplicates/Results_sexism_detection")

        lyrics_df2.loc[:, 'sexist_lines'] = lyrics_df2.lyrics.progress_apply(lambda l: classify(model, l))
        lyrics_df2.drop(columns=['lyrics'])
        lyrics_df2.to_json(f"lyrics_sexism_{decade}_second.json",
                                    orient='records', lines=True)
        shutil.copy(f"lyrics_sexism_{decade}_second.json", "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10_no_duplicates/Results_sexism_detection")

    elif decade in [2000]:  # split dataset in four
        n_lyrics = lyrics_df.shape[0]
        n_lyrics_half = n_lyrics // 2
        lyrics_df1 , lyrics_df2 = lyrics_df[:n_lyrics_half], lyrics_df[n_lyrics_half:]

        n_half_half = n_lyrics_half // 2
        lyrics_df11, lyrics_df12 = lyrics_df1[:n_half_half], lyrics_df1[n_half_half:]
        lyrics_df21, lyrics_df22 = lyrics_df2[:n_half_half], lyrics_df2[n_half_half:]

        
        lyrics_df11.loc[:, 'sexist_lines'] = lyrics_df11.lyrics.progress_apply(lambda l: classify(model, l))
        lyrics_df11.drop(columns=['lyrics'])
        lyrics_df11.to_json(f"lyrics_sexism_{decade}_first_first.json",
                                    orient='records', lines=True)
        shutil.copy(f"lyrics_sexism_{decade}_first_first.json", "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10_no_duplicates/Results_sexism_detection")


        lyrics_df12.loc[:, 'sexist_lines'] = lyrics_df12.lyrics.progress_apply(lambda l: classify(model, l))
        lyrics_df12.drop(columns=['lyrics'])
        lyrics_df12.to_json(f"lyrics_sexism_{decade}_first_second.json",
                                    orient='records', lines=True)
        shutil.copy(f"lyrics_sexism_{decade}_first_second.json", "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10_no_duplicates/Results_sexism_detection")


        lyrics_df21.loc[:, 'sexist_lines'] = lyrics_df21.lyrics.progress_apply(lambda l: classify(model, l))
        lyrics_df21.drop(columns=['lyrics'])
        lyrics_df21.to_json(f"lyrics_sexism_{decade}_second_first.json",
                                    orient='records', lines=True)
        shutil.copy(f"lyrics_sexism_{decade}_second_first.json", "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10_no_duplicates/Results_sexism_detection")
        

        lyrics_df22.loc[:, 'sexist_lines'] = lyrics_df22.lyrics.progress_apply(lambda l: classify(model, l))
        lyrics_df22.drop(columns=['lyrics'])
        lyrics_df22.to_json(f"lyrics_sexism_{decade}_second_second.json",
                                    orient='records', lines=True)
        shutil.copy(f"lyrics_sexism_{decade}_second_second.json", "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10_no_duplicates/Results_sexism_detection")


    else:

        lyrics_df.loc[:, 'sexist_lines'] = lyrics_df.lyrics.progress_apply(lambda l: classify(model, l))

        lyrics_df.drop(columns=['lyrics'])
        lyrics_df.to_json(f"lyrics_sexism_{decade}.json",
                                    orient='records', lines=True)
        shutil.copy(f"lyrics_sexism_{decade}.json", "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10_no_duplicates/Results_sexism_detection")



  0%|          | 0/23378 [00:00<?, ?it/s]

100%|██████████| 23378/23378 [36:26<00:00, 10.69it/s]
100%|██████████| 35033/35033 [1:01:52<00:00,  9.44it/s]
100%|██████████| 41498/41498 [1:19:47<00:00,  8.67it/s]
100%|██████████| 46515/46515 [1:39:35<00:00,  7.78it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
 41%|████      | 19020/46515 [39:06<41:33, 11.03it/s]