In [1]:
from pathlib import Path

import chardet
import numpy as np
import pandas as pd

In [2]:
# Sniff the text encoding of the raw CSV bytes so the file can be decoded when it is loaded
path = 'appended_data_step1e_fixed_jtosubset_masterfile.csv'

with open(path, mode='rb') as raw_file:
    result = chardet.detect(raw_file.read())

# chardet returns a dict with 'encoding', 'confidence' and 'language' keys
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.7268578897098108, 'language': ''}


In [3]:
# Load the CSV into a pandas DataFrame, decoding it with the encoding detected above
df = pd.read_csv(filepath_or_buffer=path, encoding=result['encoding'])

# Preview the first five rows
df.head(5)

Unnamed: 0.1,Unnamed: 0,caper_ID,date,paper_code,outlet,title,jel,prog_areas,covid_decision,original_name,...,file_name,file_num,sample,check_not_missing_authors,error_seniority,fixed_caper,revision_date,name_flag,Name_corrections,Any_notes
0,1,345,28/12/2018,w25414,nber,The Creation And Evolution Of Entrepreneurial ...,"G24, G38, L26","CF, PR",,Abhishek Dev,...,RAs,1.0,,0.0,0.0,,,1,,
1,2,346,14/5/2021,,vox,The government as an (effective) venture capit...,"G24, G38, L26",,,,...,,,,,,fixed by Khoi,,1,,
2,3,663,10/12/2013,,vox,Fdi In Africa,,,,"Seric, Adnan",...,RAs,1.0,,0.0,0.0,fixed by Axel,,1,,
3,4,664,28/4/2020,,vox,Covid-19 Could Spur Automation And Reverse Glo...,,,,"Seric, Adnan",...,RAs,1.0,,0.0,0.0,fixed by Axel,,1,,
4,5,665,7/1/2021,,vox,"Risk, Resilience, And Recalibration In Global ...",,,,"Seric, Adnan",...,Chryssi,4.0,pooled,0.0,0.0,,,1,,


In [4]:
# The only columns needed for the rest of the notebook
columns = [
    'name',
    'title',
    'corrected_name',
    'yearPhDgraduation',
    'phd_institution',
    'curr_affiliation',
    'curr_position',
    'pub_affiliation',
    'pub_position'
]

# Keep just those columns. The explicit .copy() makes the result an independent
# frame, so the later in-place writes (df.loc[...] = ...) modify it directly
# instead of a tracked slice of the full table (which can raise
# SettingWithCopyWarning).
df = df[columns].copy()

df.head()

Unnamed: 0,name,title,corrected_name,yearPhDgraduation,phd_institution,curr_affiliation,curr_position,pub_affiliation,pub_position
0,Abhishek Dev,The Creation And Evolution Of Entrepreneurial ...,Abhishek Dev,,,harvard business school,research associate,harvard business school,research associate
1,Abhishek Dev,The government as an (effective) venture capit...,Abhishek Dev,,yale university,yale university,phd student,yale university,phd student
2,Adnan Seric,Fdi In Africa,Adnan Seric,2010.0,university of st andrews,united nations industrial development organiz...,Research and Industrial Policy Officer,industrial development officer,united nations industrial development organisa...
3,Adnan Seric,Covid-19 Could Spur Automation And Reverse Glo...,Adnan Seric,2010.0,university of st andrews,united nations industrial development organiz...,Research and Industrial Policy Officer,Research and Industrial Policy Officer,united nations industrial development organisa...
4,Adnan Seric,"Risk, Resilience, And Recalibration In Global ...",Adnan Seric,,university of st andrews,united nations industrial development organiza...,research manager,united tions industrial development organizati...,research manager


In [5]:
# Group the rows by author name.
# A scalar key (rather than the one-element list ['name']) keeps the group labels
# plain strings, so they can be passed straight back to get_group(); newer pandas
# versions treat a one-element list as a multi-key grouping with tuple labels.
df_gb = df.groupby('name')
names = df_gb.groups.keys()
print(len(names))  # number of unique names in the data

296


In [6]:
# Columns that should hold a single value per person; they are made consistent
# across all of a person's rows below.
col_consistency = [
    'yearPhDgraduation',
    'phd_institution',
    'curr_affiliation',
    'curr_position'
]

# For each name (person), overwrite those columns with the person's most common value
for name in names:
    group = df_gb.get_group(name)
    person_rows = df['name'] == name  # same mask for every column, so build it once

    for col in col_consistency:
        # mode() skips missing values, so the result is empty when the person has
        # no recorded value at all for this column; fall back to NaN in that case.
        # On ties, mode() returns several values and the first one is used.
        modes = group[col].mode()
        df.loc[person_rows, col] = modes.iloc[0] if not modes.empty else np.nan

df.head(10)

Unnamed: 0,name,title,corrected_name,yearPhDgraduation,phd_institution,curr_affiliation,curr_position,pub_affiliation,pub_position
0,Abhishek Dev,The Creation And Evolution Of Entrepreneurial ...,Abhishek Dev,,yale university,harvard business school,phd student,harvard business school,research associate
1,Abhishek Dev,The government as an (effective) venture capit...,Abhishek Dev,,yale university,harvard business school,phd student,yale university,phd student
2,Adnan Seric,Fdi In Africa,Adnan Seric,2010.0,university of st andrews,united nations industrial development organiz...,Research and Industrial Policy Officer,industrial development officer,united nations industrial development organisa...
3,Adnan Seric,Covid-19 Could Spur Automation And Reverse Glo...,Adnan Seric,2010.0,university of st andrews,united nations industrial development organiz...,Research and Industrial Policy Officer,Research and Industrial Policy Officer,united nations industrial development organisa...
4,Adnan Seric,"Risk, Resilience, And Recalibration In Global ...",Adnan Seric,2010.0,university of st andrews,united nations industrial development organiz...,Research and Industrial Policy Officer,united tions industrial development organizati...,research manager
5,Ahmed Mushfiq Mobarak,Skilled Immigration And Innovation: Evidence F...,Ahmed Mushfiq Mobarak,2002.0,university of maryland,yale university,professor,yale university,assistant professor
6,Ahmed Mushfiq Mobarak,Development Effects Of Electrification: Eviden...,Ahmed Mushfiq Mobarak,2002.0,university of maryland,yale university,professor,yale university,assistant professor
7,Ahmed Mushfiq Mobarak,Seasonal Migration And Risk Aversion,Ahmed Mushfiq Mobarak,2002.0,university of maryland,yale university,professor,yale university,assistant professor
8,Ahmed Mushfiq Mobarak,"Gender Differences In Preferences, Intra-House...",Ahmed Mushfiq Mobarak,2002.0,university of maryland,yale university,professor,yale university,associate professor
9,Ahmed Mushfiq Mobarak,International Graduate Students Are Critical T...,Ahmed Mushfiq Mobarak,2002.0,university of maryland,yale university,professor,yale university,associate professor


In [7]:
def correct_name(row, max_len=31):
    """Pick the name to keep for one row.

    The corrected name wins when it is at least as long as the original name
    and no longer than ``max_len`` characters; otherwise the original name is kept.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'name'`` and ``'corrected_name'`` (a DataFrame
        row when called through ``df.apply(correct_name, axis=1)``).
    max_len : int, default 31
        Longest corrected name that is accepted. The original author notes that
        a separate downstream script cannot handle longer names.

    Returns
    -------
    The value of ``row['corrected_name']`` or ``row['name']``.
    """
    original = row['name']
    corrected = row['corrected_name']

    # A missing correction (NaN/None) is not a string and has no length:
    # there is nothing to prefer, so keep the original name.
    if not isinstance(corrected, str):
        return original

    # A missing original name counts as length 0, so any valid correction beats it.
    original_len = len(original) if isinstance(original, str) else 0

    if original_len <= len(corrected) <= max_len:
        return corrected
    return original

# Resolve the final name for every row and store it in a new column
df['name_corrections'] = df.apply(correct_name, axis=1)

# The two source name columns are now redundant, so remove them
df = df.drop(columns=['name', 'corrected_name'])

df.head()

Unnamed: 0,title,yearPhDgraduation,phd_institution,curr_affiliation,curr_position,pub_affiliation,pub_position,name_corrections
0,The Creation And Evolution Of Entrepreneurial ...,,yale university,harvard business school,phd student,harvard business school,research associate,Abhishek Dev
1,The government as an (effective) venture capit...,,yale university,harvard business school,phd student,yale university,phd student,Abhishek Dev
2,Fdi In Africa,2010.0,university of st andrews,united nations industrial development organiz...,Research and Industrial Policy Officer,industrial development officer,united nations industrial development organisa...,Adnan Seric
3,Covid-19 Could Spur Automation And Reverse Glo...,2010.0,university of st andrews,united nations industrial development organiz...,Research and Industrial Policy Officer,Research and Industrial Policy Officer,united nations industrial development organisa...,Adnan Seric
4,"Risk, Resilience, And Recalibration In Global ...",2010.0,university of st andrews,united nations industrial development organiz...,Research and Industrial Policy Officer,united tions industrial development organizati...,research manager,Adnan Seric


In [8]:
# Optional: write the whole processed table to a single file instead
# df.to_csv('processed_productivity_data.csv')

# Make sure the output folder exists before writing into it
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)

# Group the data by the resolved names.
# A scalar key keeps the group labels plain strings, which matters here because
# the label is used directly as the file name.
df_gb = df.groupby('name_corrections')
names = df_gb.groups.keys()

# Write each person's rows to a separate CSV: data/<name>.csv
for name in names:
    group = df_gb.get_group(name)
    group.to_csv(output_dir / f'{name}.csv')