In [1]:
# mount GDrive
from google.colab import drive
drive.mount('/content/drive')
#drive._mount('/content/drive')

Mounted at /content/drive


In [2]:
# load dataset
!cp "drive/MyDrive/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10/lyrics_dataset.json.gz" .

# load sexism classifications
!cp -r "drive/MyDrive/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10_no_duplicates/Results_sexism_detection" .

In [1]:
%pylab inline 
import pandas as pd
import glob

Populating the interactive namespace from numpy and matplotlib


In [2]:
def get_artist_gender_and_type(author_info):

    artist_type = author_info['type']

    if artist_type=='Person':
        artist_gender = author_info['gender']
        n_males, n_females = None, None
    else:
        n_males, n_females = author_info['n_male'], author_info['n_female']
        if n_males==author_info['n_members']:
            artist_gender = 'Male'
        elif n_females==author_info['n_members']:
            artist_gender = 'Female'
        elif n_males*n_females!=0 and n_females+n_males==author_info['n_members']:
            artist_gender = 'Mix'
        else:
            print('Problem with group gender assignation..')

    return artist_gender, artist_type, n_males, n_females

### Load and combine data

In [3]:
# load main dataset
song_lyrics = pd.read_json("lyrics_dataset.json.gz", orient='records', lines=True)
song_lyrics = song_lyrics.drop(columns=['lyrics'])

print("Number of song lyrics: ", song_lyrics.shape[0])

Number of song lyrics:  377808


In [4]:
# load sexism classifications
# we ran the classifier on unique lyrics (without duplicates and covers)
sexism_files = glob.glob("Results_sexism_detection/*.json")

lyrics_sexism = []
for file in sexism_files:
    data_chunk = pd.read_json(file, orient='records', lines=True, chunksize=5000 )
    for chunk in data_chunk:   
        chunk = chunk[['song_id', 'sexist_lines']]
        lyrics_sexism.append(chunk)

lyrics_sexism = pd.concat(lyrics_sexism)
#lyrics_sexism.loc[:, 'is_sexist'] = lyrics_sexism.sexist_lines.apply(lambda ls: len(ls)>0)

In [5]:
# merge with main dataset
# covers have not the sexism label, we need to propagate it
song_lyrics = song_lyrics.merge(lyrics_sexism, on='song_id', how='left')

# propagate sexism label to song covers
# this df used to get covered songs
cover_songs_is_sexist = song_lyrics[(song_lyrics.song_id.isin(song_lyrics.cover_of.unique()))][['song_id', 'cover_of', 'sexist_lines']]
cover_songs_is_sexist = cover_songs_is_sexist.set_index('song_id')

'''
song_lyrics.loc[:, 'is_sexist'] = song_lyrics.apply(lambda row: 
            row.is_sexist if not pd.isna(row.is_sexist) else cover_songs_is_sexist.loc[row.cover_of].is_sexist,
            axis=1)
'''
song_lyrics.loc[:, 'sexist_lines'] = song_lyrics.apply(lambda row: 
            row.sexist_lines if type(row.sexist_lines) is list else cover_songs_is_sexist.loc[row.cover_of].sexist_lines,
            axis=1)

print('Number of songs: ', song_lyrics.shape[0])

Number of songs:  377808


In [6]:
# extract info about artist gender and type
song_lyrics['artist_gender'], song_lyrics['artist_type'], song_lyrics['n_males'], song_lyrics['n_females'] =  \
                        zip(*song_lyrics.other_artist_info.apply(get_artist_gender_and_type))

song_lyrics.head()

Unnamed: 0,song_id,song_title,artist_id,artist_name,song_pubdate_combined,song_year_combined,song_decade_combined,other_artist_info,album_genre,genre,cover_of,is_cover,SongID,chart_positions,first_week_in_chart,is_billboard,dbp_genre,genre_combined_reduced,genre_combined_reduced_from,sexist_lines,artist_gender,artist_type,n_males,n_females
0,5714dec325ac0d8aee3863ff,Long Live The King,56d7ef2356847be81b3e8bce,Adrian Snell,1980-01-01,1980,1980,"{'n_albums': 6, 'n_songs': 101, 'languages': {...",,,,False,,,,False,,,,[],Male,Person,,
1,5714dec325ac0d8aee386400,The Last Supper,56d7ef2356847be81b3e8bce,Adrian Snell,1980-01-01,1980,1980,"{'n_albums': 6, 'n_songs': 101, 'languages': {...",,,,False,,,,False,,,,[],Male,Person,,
2,5714dec325ac0d8aee386401,Gethsemane,56d7ef2356847be81b3e8bce,Adrian Snell,1980-01-01,1980,1980,"{'n_albums': 6, 'n_songs': 101, 'languages': {...",,,,False,,,,False,,,,[],Male,Person,,
3,5714dec325ac0d8aee386402,Betrayal,56d7ef2356847be81b3e8bce,Adrian Snell,1980-01-01,1980,1980,"{'n_albums': 6, 'n_songs': 101, 'languages': {...",,,,False,,,,False,,,,"[[0.6468, But your weapons won't be needed You...",Male,Person,,
4,5714dec325ac0d8aee386403,Son Of The World,56d7ef2356847be81b3e8bce,Adrian Snell,1980-01-01,1980,1980,"{'n_albums': 6, 'n_songs': 101, 'languages': {...",,,,False,,,,False,,,,[],Male,Person,,


In [7]:
song_lyrics = song_lyrics.reset_index(drop=True)
song_lyrics.shape[0]

377808

### Split into 3 datasets: WASABI, Billboard, and BillboardTop10

In [8]:
# define lambda functions to return quickly subsets of data
is_topn = lambda chart_ranks, n: any([r<=n for r in chart_ranks]) if type(chart_ranks)==list else False
is_top10 = lambda chart_ranks: is_topn(chart_ranks, 10)

In [9]:
# make the billboard and billboard top10 datasets
song_lyrics_billboard = song_lyrics[song_lyrics.is_billboard]
song_lyrics_billboard_top10 = song_lyrics[(song_lyrics.is_billboard)&
                                          (song_lyrics.chart_positions.apply(is_top10))]

print('Number of songs in Billboard: ', song_lyrics_billboard.shape[0])
print('Number of songs in Billboard top10: ', song_lyrics_billboard_top10.shape[0])

Number of songs in Billboard:  10798
Number of songs in Billboard top10:  2608


### Basic statistics

##### Count sexist songs with different classification thresholds

#### Classification threshold: 0.50

In [10]:
class_threshold = 0.5

In [11]:
song_lyrics.loc[:, 'is_sexist'] = song_lyrics.sexist_lines.apply(lambda lines:
                                                                 any([l[0]>=class_threshold for l in lines]))
song_lyrics_billboard.loc[:, 'is_sexist'] = song_lyrics_billboard.sexist_lines.apply(lambda lines:
                                                                 any([l[0]>=class_threshold for l in lines]))
song_lyrics_billboard_top10.loc[:, 'is_sexist'] = song_lyrics_billboard_top10.sexist_lines.apply(lambda lines:
                                                                 any([l[0]>=class_threshold for l in lines]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [12]:
# count number of sexist songs of each type of artist gender and type, and compute the fraction to the total number of songs in WASABI
song_lyrics.groupby(['artist_type', "artist_gender"]).apply(lambda rows: 
                                                            pd.Series({'n_sexist_songs':int(rows[rows.is_sexist].shape[0]), 
                                                                       'frac_sexist_songs':rows[rows.is_sexist].shape[0]/rows.shape[0]}) )

Unnamed: 0_level_0,Unnamed: 1_level_0,n_sexist_songs,frac_sexist_songs
artist_type,artist_gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Group,Female,1616.0,0.25187
Group,Male,24044.0,0.231796
Group,Mix,5049.0,0.214696
Person,Female,18204.0,0.249979
Person,Male,62877.0,0.367006


In [13]:
# count number of sexist songs of each type of artist gender and type, and compute the fraction to the total number of songs in WASABI and Billboard and Billboard top 10
song_lyrics_billboard.groupby(['artist_type', "artist_gender"]).apply(lambda rows: 
                                                            pd.Series({'n_sexist_songs':int(rows[rows.is_sexist].shape[0]), 
                                                                       'frac_sexist_songs_bill':rows[rows.is_sexist].shape[0]/rows.shape[0],
                                                                       }) )

Unnamed: 0_level_0,Unnamed: 1_level_0,n_sexist_songs,frac_sexist_songs_bill
artist_type,artist_gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Group,Female,90.0,0.424528
Group,Male,975.0,0.391095
Group,Mix,200.0,0.327869
Person,Female,861.0,0.356965
Person,Male,2525.0,0.497929


In [14]:
# count number of sexist songs of each type of artist gender and type, and compute the fraction to the total number of songs in WASABI and Billboard and Billboard top 10
song_lyrics_billboard_top10.groupby(['artist_type', "artist_gender"]).apply(lambda rows: 
                                                            pd.Series({'n_sexist_songs':int(rows[rows.is_sexist].shape[0]), 
                                                                       'frac_sexist_songs_bill':rows[rows.is_sexist].shape[0]/rows.shape[0],
                                                                       }) )

Unnamed: 0_level_0,Unnamed: 1_level_0,n_sexist_songs,frac_sexist_songs_bill
artist_type,artist_gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Group,Female,28.0,0.5
Group,Male,250.0,0.397456
Group,Mix,58.0,0.371795
Person,Female,245.0,0.368976
Person,Male,597.0,0.541251


#### Classification threshold

In [15]:
class_threshold = 0.725

In [16]:
song_lyrics.loc[:, 'is_sexist'] = song_lyrics.sexist_lines.apply(lambda lines:
                                                                 any([l[0]>=class_threshold for l in lines]))
song_lyrics_billboard.loc[:, 'is_sexist'] = song_lyrics_billboard.sexist_lines.apply(lambda lines:
                                                                 any([l[0]>=class_threshold for l in lines]))
song_lyrics_billboard_top10.loc[:, 'is_sexist'] = song_lyrics_billboard_top10.sexist_lines.apply(lambda lines:
                                                                 any([l[0]>=class_threshold for l in lines]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [17]:
# count number of sexist songs of each type of artist gender and type, and compute the fraction to the total number of songs in WASABI
song_lyrics.groupby(['artist_type', "artist_gender"]).apply(lambda rows: 
                                                            pd.Series({'n_sexist_songs':int(rows[rows.is_sexist].shape[0]), 
                                                                       'frac_sexist_songs':rows[rows.is_sexist].shape[0]/rows.shape[0]}) )

Unnamed: 0_level_0,Unnamed: 1_level_0,n_sexist_songs,frac_sexist_songs
artist_type,artist_gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Group,Female,1281.0,0.199657
Group,Male,18651.0,0.179805
Group,Mix,3813.0,0.162138
Person,Female,14305.0,0.196438
Person,Male,51412.0,0.300086


In [18]:
# count number of sexist songs of each type of artist gender and type, and compute the fraction to the total number of songs in WASABI and Billboard and Billboard top 10
song_lyrics_billboard.groupby(['artist_type', "artist_gender"]).apply(lambda rows: 
                                                            pd.Series({'n_sexist_songs':int(rows[rows.is_sexist].shape[0]), 
                                                                       'frac_sexist_songs_bill':rows[rows.is_sexist].shape[0]/rows.shape[0],
                                                                       }) )

Unnamed: 0_level_0,Unnamed: 1_level_0,n_sexist_songs,frac_sexist_songs_bill
artist_type,artist_gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Group,Female,73.0,0.34434
Group,Male,799.0,0.320497
Group,Mix,161.0,0.263934
Person,Female,719.0,0.298093
Person,Male,2199.0,0.433642


In [19]:
# count number of sexist songs of each type of artist gender and type, and compute the fraction to the total number of songs in WASABI and Billboard and Billboard top 10
song_lyrics_billboard_top10.groupby(['artist_type', "artist_gender"]).apply(lambda rows: 
                                                            pd.Series({'n_sexist_songs':int(rows[rows.is_sexist].shape[0]), 
                                                                       'frac_sexist_songs_bill':rows[rows.is_sexist].shape[0]/rows.shape[0],
                                                                       }) )

Unnamed: 0_level_0,Unnamed: 1_level_0,n_sexist_songs,frac_sexist_songs_bill
artist_type,artist_gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Group,Female,24.0,0.428571
Group,Male,209.0,0.332273
Group,Mix,47.0,0.301282
Person,Female,205.0,0.308735
Person,Male,534.0,0.484134


#### Classification threshold: 0.90

In [20]:
class_threshold = 0.9

In [21]:
song_lyrics.loc[:, 'is_sexist'] = song_lyrics.sexist_lines.apply(lambda lines:
                                                                 any([l[0]>=class_threshold for l in lines]))
song_lyrics_billboard.loc[:, 'is_sexist'] = song_lyrics_billboard.sexist_lines.apply(lambda lines:
                                                                 any([l[0]>=class_threshold for l in lines]))
song_lyrics_billboard_top10.loc[:, 'is_sexist'] = song_lyrics_billboard_top10.sexist_lines.apply(lambda lines:
                                                                 any([l[0]>=class_threshold for l in lines]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [24]:
# count number of sexist songs of each type of artist gender and type, and compute the fraction to the total number of songs in WASABI
song_lyrics.groupby(['artist_type', "artist_gender"]).apply(lambda rows: 
                                                            pd.Series({'n_sexist_songs':int(rows[rows.is_sexist].shape[0]), 
                                                                       'frac_sexist_songs':rows[rows.is_sexist].shape[0]/rows.shape[0]}) )

Unnamed: 0_level_0,Unnamed: 1_level_0,n_sexist_songs,frac_sexist_songs
artist_type,artist_gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Group,Female,533.0,0.083074
Group,Male,6505.0,0.062711
Group,Mix,1227.0,0.052175
Person,Female,5650.0,0.077586
Person,Male,20465.0,0.119452


In [25]:
# count number of sexist songs of each type of artist gender and type, and compute the fraction to the total number of songs in WASABI and Billboard and Billboard top 10
song_lyrics_billboard.groupby(['artist_type', "artist_gender"]).apply(lambda rows: 
                                                            pd.Series({'n_sexist_songs':int(rows[rows.is_sexist].shape[0]), 
                                                                       'frac_sexist_songs_bill':rows[rows.is_sexist].shape[0]/rows.shape[0],
                                                                       }) )

Unnamed: 0_level_0,Unnamed: 1_level_0,n_sexist_songs,frac_sexist_songs_bill
artist_type,artist_gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Group,Female,31.0,0.146226
Group,Male,339.0,0.135981
Group,Mix,81.0,0.132787
Person,Female,352.0,0.145937
Person,Male,1117.0,0.220272


In [26]:
# count number of sexist songs of each type of artist gender and type, and compute the fraction to the total number of songs in WASABI and Billboard and Billboard top 10
song_lyrics_billboard_top10.groupby(['artist_type', "artist_gender"]).apply(lambda rows: 
                                                            pd.Series({'n_sexist_songs':int(rows[rows.is_sexist].shape[0]), 
                                                                       'frac_sexist_songs_bill':rows[rows.is_sexist].shape[0]/rows.shape[0],
                                                                       }) )

Unnamed: 0_level_0,Unnamed: 1_level_0,n_sexist_songs,frac_sexist_songs_bill
artist_type,artist_gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Group,Female,12.0,0.214286
Group,Male,93.0,0.147854
Group,Mix,25.0,0.160256
Person,Female,106.0,0.159639
Person,Male,309.0,0.280145
