# Get demographic details from an MCV dataset 

The purpose of this script is to get demographic details from an MCV downloaded dataset. 
This informs decision making around, for example, how much of the data in a particular language, has demographic details, and if so, what they are. 

In [3]:
# imports go here 

# io 
import io

# pandas 
import pandas as pd


In [7]:
# specify the path to the TSV file - this should be `validated.tsv` from the MCV download 
#filePath = '/media/kathyreid/Seagate Backup Plus Drive/cv-datasets/ru/validated.tsv'
#filePath = '/media/kathyreid/Seagate Backup Plus Drive/cv-datasets/fr/validated.tsv'
#filePath = '/media/kathyreid/Elements/de/validated.tsv'
#filePath = '/media/kathyreid/Elements/es/validated.tsv'
#filePath = '/media/kathyreid/Elements/en/validated.tsv'
filePath = '/media/kathyreid/Seagate Backup Plus Drive/cv-datasets/en-v9/validated.tsv'

# put it into a DataFrame 
df = pd.read_csv(filePath, sep='\t')

In [9]:
# summary data 
df.value_counts

<bound method DataFrame.value_counts of                                                  client_id  \
0        000abb3006b78ea4c1144e55d9d158f05a9db011016051...   
1        0013037a1d45cc33460806cc3f8ecee9d536c45639ba4c...   
2        0014c5a3e5715a54855257779b89c2bb498d470b225866...   
3        001509f4624a7dee75247f6a8b642c4a0d09f8be3eeea6...   
4        001519f234e04528a2b36158c205dbe61c8da45ab0242f...   
...                                                    ...   
1556249  372293e65cdab88771e028a4351651ab2eff64438ddafc...   
1556250  372293e65cdab88771e028a4351651ab2eff64438ddafc...   
1556251  372293e65cdab88771e028a4351651ab2eff64438ddafc...   
1556252  372293e65cdab88771e028a4351651ab2eff64438ddafc...   
1556253  372293e65cdab88771e028a4351651ab2eff64438ddafc...   

                                 path  \
0        common_voice_en_27710027.mp3   
1          common_voice_en_699711.mp3   
2        common_voice_en_21953345.mp3   
3        common_voice_en_18132047.mp3   
4        c

In [10]:
# unique contributors 
len(df['client_id'].unique())

69656

In [11]:
# rows that have metadata 
len(df[df['age'].notna()])


999855

In [12]:
# get all the age ranges 
df['age'].unique()

array([nan, 'twenties', 'thirties', 'fourties', 'seventies', 'teens',
       'sixties', 'fifties', 'eighties', 'nineties'], dtype=object)

In [37]:
# age ranges

print('teens: ', len(df.loc[df['age'] == 'teens']))
print('twenties: ', len(df.loc[df['age'] == 'twenties']))
print('thirties: ', len(df.loc[df['age'] == 'thirties']))
print('fourties: ', len(df.loc[df['age'] == 'fourties']))
print('fifties: ', len(df.loc[df['age'] == 'fifties']))
print('sixties: ', len(df.loc[df['age'] == 'sixties']))
print('seventies: ', len(df.loc[df['age'] == 'seventies']))
print('eighties: ', len(df.loc[df['age'] == 'eighties']))
print('nineties: ', len(df.loc[df['age'] == 'nineties']))

print('NaN: ', len(df.loc[df['age'].isna()]))

teens:  83073
twenties:  337944
thirties:  191258
fourties:  139187
fifties:  72327
sixties:  70423
seventies:  12364
eighties:  1120
nineties:  95
NaN:  517993


In [13]:
# get the genders 

df['gender'].unique()

array([nan, 'male', 'female', 'other'], dtype=object)

In [38]:
# genders

print('female: ', len(df.loc[df['gender'] == 'female']))
print('male: ', len(df.loc[df['gender'] == 'male']))
print('other: ', len(df.loc[df['gender'] == 'other']))

print('NaN: ', len(df.loc[df['gender'].isna()]))

female:  216797
male:  666819
other:  27243
NaN:  514925


In [20]:
# get the accents 

# df['accent'].unique()
# in CV9, this key changed to `accents`
df['accents'].unique()


array([nan, 'England English,United States English', 'Hong Kong English',
       'England English', 'United States English',
       'United States English,wolof', 'Australian English',
       'Southern African (South Africa, Zimbabwe, Namibia)',
       'India and South Asia (India, Pakistan, Sri Lanka)',
       'England English,Hong Kong English',
       'India and South Asia (India, Pakistan, Sri Lanka),Basic',
       'East London ', 'Canadian English', 'Eastern European English',
       'German', 'Scottish English', 'Filipino',
       'England English,yorkshire', 'Singaporean English',
       'United States English,England English',
       'United States English,Variable',
       'West Indies and Bermuda (Bahamas, Bermuda, Jamaica, Trinidad)',
       'New Zealand English', 'Malaysian English', 'Slavic',
       'United States English,England English,Hong Kong English',
       'Irish English',
       'India and South Asia (India, Pakistan, Sri Lanka),United States English',
       'Uni

In [19]:
df['accents'].nunique(axis=1)

TypeError: nunique() got an unexpected keyword argument 'axis'

In [15]:
# french accents 

french_accents = df['accent'].unique()
for accent in french_accents:
    accent_total = len(df.loc[df['accent'] == accent])
    print(accent, ': ', accent_total)

print('NaN: ', len(df.loc[df['accent'].isna()]))

nan :  0
canada :  8682
france :  347436
belgium :  9679
cote_d_ivoire :  104
senegal :  41
algeria :  354
burundi :  1
united_kingdom :  259
cameroon :  56
united_states :  751
reunion :  1053
germany :  274
romania :  121
morocco :  59
tunisia :  16
switzerland :  3989
martinique :  68
other :  43
guadeloupe :  133
new_caledonia :  144
benin :  996
congo_brazzaville :  5
monaco :  109
gabon :  5
luxembourg :  14
st_pierre_et_miquelon :  6
mayotte :  6
italy :  57
congo_kinshasa :  10
ireland :  14
haiti :  34
madagascar :  162
portugal :  17
netherlands :  81
french_guiana :  115
NaN:  167394


In [23]:
# german accents 

german_accents = df['accent'].unique()
for accent in german_accents:
    accent_total = len(df.loc[df['accent'] == accent])
    print(accent, ': ', accent_total)

print('NaN: ', len(df.loc[df['accent'].isna()]))

nan :  0
russia :  940
germany :  437712
france :  1405
switzerland :  8602
austria :  20659
bulgaria :  1
netherlands :  75
denmark :  1
poland :  103
turkey :  24
united_kingdom :  148
czechia :  37
united_states :  268
greece :  120
hungary :  151
other :  184
belgium :  9
slovakia :  62
lithuania :  5
luxembourg :  57
canada :  98
liechtenstein :  62
slovenia :  10
brazil :  12
italy :  978
finland :  31
NaN:  213040


In [30]:
# spanish accents 

spanish_accents = df['accent'].unique()
for accent in spanish_accents:
    accent_total = len(df.loc[df['accent'] == accent])
    print(accent, ': ', accent_total)

print('NaN: ', len(df.loc[df['accent'].isna()]))

mexicano :  16924
nan :  0
americacentral :  5532
andino :  13729
caribe :  8235
centrosurpeninsular :  8713
rioplatense :  12102
chileno :  5316
surpeninsular :  32622
nortepeninsular :  35345
canario :  952
filipinas :  342
NaN:  131198


In [39]:
# english accents 

english_accents = df['accent'].unique()
for accent in english_accents:
    accent_total = len(df.loc[df['accent'] == accent])
    print(accent, ': ', accent_total)

print('NaN: ', len(df.loc[df['accent'].isna()]))

nan :  0
hongkong :  2750
us :  351472
england :  118401
african :  8066
indian :  73030
other :  10505
australia :  46951
canada :  48453
scotland :  12676
philippines :  4158
singapore :  2967
bermuda :  643
newzealand :  11281
malaysia :  1685
ireland :  9233
wales :  1550
southatlandtic :  203
NaN:  721760
