# Get demographic details from an MCV dataset 

The purpose of this script is to extract demographic details from a downloaded MCV (Mozilla Common Voice) dataset.
This informs decisions such as how much of the data for a particular language has demographic details attached and, where it does, what those details are.

In [3]:
# imports go here 

# io 
import io

# pandas 
import pandas as pd


In [17]:
# location of the tab-separated `validated.tsv` file from the MCV download
filePath = '../ja/validated.tsv'

# load it into a DataFrame (read_table splits on tabs by default)
df = pd.read_table(filePath)

In [19]:
# summary data
# NOTE(review): `value_counts` is referenced here without being called, so this
# cell displays the bound method's repr (which embeds the DataFrame's own repr)
# rather than any counts. If a real summary is wanted, something like
# `df.info()` or `df.describe(include='all')` may be what was intended - TODO confirm.
df.value_counts

<bound method DataFrame.value_counts of                                                client_id  \
0      033ede7ca4c60dc27cef421b4d33799d38924ed36fa8dd...   
1      087edae49ce1e0f600682ceccc7fc28e81e64ae890e647...   
2      09e6ae463786aae9071baa9044ac8b7466aa7c48dcdaf4...   
3      15b7d87a73d28b37664fdf7fea1ff232f89e80ce954c9b...   
4      1c6e8463b08279962ad37c0946d0b1df78a82a4c907f4b...   
...                                                  ...   
21021  02a8841a00d762472a4797b56ee01643e8d9ece5a225f2...   
21022  02a8841a00d762472a4797b56ee01643e8d9ece5a225f2...   
21023  02a8841a00d762472a4797b56ee01643e8d9ece5a225f2...   
21024  02a8841a00d762472a4797b56ee01643e8d9ece5a225f2...   
21025  02a8841a00d762472a4797b56ee01643e8d9ece5a225f2...   

                               path  \
0      common_voice_ja_21409740.mp3   
1      common_voice_ja_22072759.mp3   
2      common_voice_ja_23677003.mp3   
3      common_voice_ja_19499629.mp3   
4      common_voice_ja_22717324.mp3   
...  

In [23]:
# number of distinct contributors
# (dropna=False so a missing client_id would still count as one value)
df['client_id'].nunique(dropna=False)

368

In [25]:
# number of rows whose `age` field is filled in
# (only `age` is checked here, not the other demographic columns)
int(df['age'].notna().sum())


15728

In [30]:
# distinct values found in the `age` column, in order of first appearance
pd.unique(df['age'])

array([nan, 'teens', 'thirties', 'fourties', 'twenties', 'fifties'],
      dtype=object)

In [48]:
# row count per age bracket, one line each
# The brackets are listed explicitly so that any bracket absent from the data
# is still reported (as 0). 'fourties' is spelled the way it appears in the data.
age_brackets = (
    'teens', 'twenties', 'thirties', 'fourties', 'fifties',
    'sixties', 'seventies', 'eighties', 'nineties',
)
for bracket in age_brackets:
    print(bracket + ': ', len(df.loc[df['age'] == bracket]))

# rows with no age recorded
print('NaN: ', int(df['age'].isna().sum()))

teens:  704
twenties:  7586
thirties:  1980
fourties:  5308
fifties:  150
sixties:  0
seventies:  0
eighties:  0
nineties:  0
NaN:  5298


In [49]:
# distinct values found in the `gender` column, in order of first appearance
pd.unique(df['gender'])

array([nan, 'female', 'male', 'other'], dtype=object)

In [51]:
# row count per gender value, one line each
for gender_label in ('female', 'male', 'other'):
    print(gender_label + ': ', len(df.loc[df['gender'] == gender_label]))

# rows with no gender recorded
print('NaN: ', int(df['gender'].isna().sum()))

female:  5427
male:  11101
other:  30
NaN:  4468


In [52]:
# distinct values found in the `accent` column, in order of first appearance
pd.unique(df['accent'])

array([nan])