In [66]:
import pandas as pd
import string, glob, os

In [67]:
# Load the genderize data
genderize = pd.read_csv("../source_data/genderizer_collected.csv").set_index('name')
genderize.head()
known_names = set(genderize.index.values)

In [128]:
def parse_df(df):
    lets = set(string.ascii_letters+"- .")
    data = []
    for f, names in zip(df.index, df.Names):
        if type(names) == float:
            continue
            
        for name in names.split(';'):
            if not all([a in lets for a in name]):
                continue
            if not name.strip():
                continue
            
            first_name = name.split().pop()
            if first_name not in known_names:
                continue

            x = genderize.loc[first_name]
            item = {"filename":f}
            item["first_name"] = first_name
            item["gcount"] = x["count"]
            item["ggender"] = x["gender"]
            item["gprobability"] = x["probability"]

            data.append(item)

    data = pd.DataFrame(data).dropna()
    return data

In [79]:
# Load the names from PMC ack
df = pd.read_csv("../parsed_data/Extracted Names A-B.csv", nrows=20,usecols=["filename","Names"]).set_index("filename")
df["Names"].fillna("",inplace=True)

data = parse_df(df)
data.head()

Unnamed: 0_level_0,first_name,gcount,ggender,gprobability
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20_Century_Br_Hist/PMC4804230.nxml,Thomson,7.0,male,1.0
20_Century_Br_Hist/PMC5905639.nxml,Thomson,7.0,male,1.0
3D_Print_Med/PMC5954789.nxml,Wong,215.0,female,0.5
3D_Print_Med/PMC5954789.nxml,Lee,2089.0,male,0.75
3D_Print_Med/PMC5954790.nxml,Smith,28.0,male,0.68


In [80]:
F_CSV = glob.glob("../parsed_data/Extracted Names*.csv")
save_dest = "parsed_gender_data"
os.system(f'mkdir -p {save_dest}')

for f in F_CSV:
    df = pd.read_csv("../parsed_data/Extracted Names A-B.csv", 
                     nrows=10**20,
                     usecols=["filename","Names"]).set_index("filename")
    df["Names"].fillna("",inplace=True)
    data = parse_df(df)
    
    f_csv = os.path.join(save_dest, os.path.basename(f))
    print(data.head())
    data.to_csv(f_csv)
    

                                   first_name  gcount ggender  gprobability
filename                                                                   
20_Century_Br_Hist/PMC4804230.nxml    Thomson     7.0    male          1.00
20_Century_Br_Hist/PMC5905639.nxml    Thomson     7.0    male          1.00
3D_Print_Med/PMC5954789.nxml             Wong   215.0  female          0.50
3D_Print_Med/PMC5954789.nxml              Lee  2089.0    male          0.75
3D_Print_Med/PMC5954790.nxml            Smith    28.0    male          0.68
                                   first_name  gcount ggender  gprobability
filename                                                                   
20_Century_Br_Hist/PMC4804230.nxml    Thomson     7.0    male          1.00
20_Century_Br_Hist/PMC5905639.nxml    Thomson     7.0    male          1.00
3D_Print_Med/PMC5954789.nxml             Wong   215.0  female          0.50
3D_Print_Med/PMC5954789.nxml              Lee  2089.0    male          0.75
3D_Print_Med

In [82]:
# Load all the results for a plotting
F_CSV = glob.glob("parsed_gender_data/*.csv")
df = pd.concat([pd.read_csv(f, nrows=10**20) for f in F_CSV]).reset_index()
df.tail()


Unnamed: 0,index,filename,first_name,gcount,ggender,gprobability
331807,110599,Burns_Trauma/PMC6040609.nxml,Lloyd,159.0,male,0.99
331808,110600,Burns_Trauma/PMC6040609.nxml,James,6359.0,male,0.99
331809,110601,Burns_Trauma/PMC6040609.nxml,Nelson,609.0,male,0.99
331810,110602,Bus_Soc/PMC4944588.nxml,Wilson,544.0,male,0.99
331811,110603,Bus_Soc/PMC4944588.nxml,Kennedy,37.0,male,0.57


In [85]:
df["is_female"] = df.ggender=="female"
print("Fraction of people on an ack that are female:", df.is_female.mean())

Fraction of people on an ack that are female: 0.22425047918700952


In [89]:
x = df.groupby("filename")["is_female"].mean()
(x.median(), x.mean())

(0.0, 0.2261571742468239)

In [120]:
info = pd.read_csv("../source_data/PMC_info-iSearch_-_Publications-export_2018-09-04-20-53-47.csv",
                   nrows=10**20, usecols = ["PMCID", "Author First Name"])
info.head()

Unnamed: 0,PMCID,Author First Name
0,PMC5897966,Linda A;Susan;Ravi;John
1,PMC5897967,Guillaume;Carolyn
2,PMC5897968,Jeffrey;Dondrae J;Gregory W;Gregory G
3,PMC5897969,Jeffrey D;Paul J;David C;Faisal;Marta B;Aamir ...
4,PMC5897970,Andrew W;Kofi M;Caspar M;Julie R;Ernest Victor...


In [121]:
info = info.rename(columns={"Author First Name":"Names","PMCID":"filename"}).set_index("filename")

In [None]:
pubgender = parse_df(info)
pubgender.tail()

In [111]:
x.columns

Index([], dtype='object')

In [None]:
a= 2
a