**CONTRIBUTOR PROFILE**


This notebook aims to describe the general characteristics of the group of contributors participating in a selection of HOT projects.

In [None]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm.autonotebook import tqdm
from IPython.display import Markdown, display

In [None]:
# Access folder containing project data
# `data_folder` is used as a plain string prefix by the loading cells
# (`data_folder + "<file>.csv"`), so it has to end with a path separator.
# "XXXX" is presumably a placeholder to be replaced with the real folder — TODO confirm.
data_folder = "XXXX"
in_drive = True  # True to mount a drive while working in Google Colab
if in_drive:
    # Only imported when in_drive is True, so google.colab is not needed otherwise.
    from google.colab import drive
    drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Dictionary mapping a country name to its two-letter country code
# (the codes look like ISO 3166-1 alpha-2 — TODO confirm the source list).
# Reference kept from the original comment (HOT priority countries):
# https://wiki.openstreetmap.org/wiki/Humanitarian_OSM_Team/Priority_countries
# The keys must match the project country names exactly as they are after the
# project-loading cell has stripped quote characters from them.
# The mapping is not one-to-one ("Somalia" and "Somaliland" share "SO"), so
# inverting it loses one of the two names.
country_to_abbrev = {
    "Andorra": "AD",
    "United Arab Emirates": "AE",
    "Afghanistan": "AF",
    "Antigua and Barbuda": "AG",
    "Anguilla": "AI",
    "Albania": "AL",
    "Armenia": "AM",
    "Angola": "AO",
    "Antarctica": "AQ",
    "Argentina": "AR",
    "American Samoa": "AS",
    "Austria": "AT",
    "Australia": "AU",
    "Aruba": "AW",
    "Åland Islands": "AX",
    "Azerbaijan": "AZ",
    "Bosnia and Herzegovina": "BA",
    "Barbados": "BB",
    "Bangladesh": "BD",
    "Belgium": "BE",
    "Burkina Faso": "BF",
    "Bulgaria": "BG",
    "Bahrain": "BH",
    "Burundi": "BI",
    "Benin": "BJ",
    "Saint Barthélemy": "BL",
    "Bermuda": "BM",
    "Brunei Darussalam": "BN",
    "Bolivia": "BO",
    "Bonaire, Sint Eustatius and Saba": "BQ",
    "Brazil": "BR",
    "The Bahamas": "BS",
    "Bhutan": "BT",
    "Bouvet Island": "BV",
    "Botswana": "BW",
    "Belarus": "BY",
    "Belize": "BZ",
    "Canada": "CA",
    "Cocos (Keeling) Islands": "CC",
    "Democratic Republic of the Congo": "CD",
    "Central African Republic": "CF",
    "Congo-Brazzaville": "CG",
    "Switzerland": "CH",
    "Côte dIvoire": "CI",  # no apostrophe: matches names after quote characters are stripped on load
    "Cook Islands": "CK",
    "Chile": "CL",
    "Cameroon": "CM",
    "China": "CN",
    "Colombia": "CO",
    "Costa Rica": "CR",
    "Cuba": "CU",
    "Cabo Verde": "CV",
    "Curaçao": "CW",
    "Christmas Island": "CX",
    "Cyprus": "CY",
    "Czechia": "CZ",
    "Germany": "DE",
    "Djibouti": "DJ",
    "Denmark": "DK",
    "Dominica": "DM",
    "Dominican Republic": "DO",
    "Algeria": "DZ",
    "Ecuador": "EC",
    "Estonia": "EE",
    "Egypt": "EG",
    "Western Sahara": "EH",
    "Eritrea": "ER",
    "Spain": "ES",
    "Ethiopia": "ET",
    "Finland": "FI",
    "Fiji": "FJ",
    "Falkland Islands (Malvinas)": "FK",
    "Federated States of Micronesia": "FM",
    "Faroe Islands": "FO",
    "France": "FR",
    "Gabon": "GA",
    "United Kingdom": "GB",
    "Grenada": "GD",
    "Georgia": "GE",
    "French Guiana": "GF",
    "Guernsey": "GG",
    "Ghana": "GH",
    "Gibraltar": "GI",
    "Greenland": "GL",
    "The Gambia": "GM",
    "Guinea": "GN",
    "Guadeloupe": "GP",
    "Equatorial Guinea": "GQ",
    "Greece": "GR",
    "South Georgia and the South Sandwich Islands": "GS",
    "Guatemala": "GT",
    "Guam": "GU",
    "Guinea-Bissau": "GW",
    "Guyana": "GY",
    "Hong Kong": "HK",
    "Heard Island and McDonald Islands": "HM",
    "Honduras": "HN",
    "Croatia": "HR",
    "Haiti": "HT",
    "Hungary": "HU",
    "Indonesia": "ID",
    "Ireland": "IE",
    "Israel": "IL",
    "Isle of Man": "IM",
    "India": "IN",
    "British Indian Ocean Territory": "IO",
    "Iraq": "IQ",
    "Iran (Islamic Republic of)": "IR",
    "Iceland": "IS",
    "Italy": "IT",
    "Jersey": "JE",
    "Jamaica": "JM",
    "Jordan": "JO",
    "Japan": "JP",
    "Kenya": "KE",
    "Kyrgyzstan": "KG",
    "Cambodia": "KH",
    "Kiribati": "KI",
    "Comoros": "KM",
    "Saint Kitts and Nevis": "KN",
    "Korea (Democratic People's Republic of)": "KP",  # NOTE(review): key keeps an apostrophe, but apostrophes are stripped from project country names on load — this key may never match; confirm
    "Korea, Republic of": "KR",
    "Kuwait": "KW",
    "Cayman Islands": "KY",
    "Kazakhstan": "KZ",
    "Laos": "LA",
    "Lebanon": "LB",
    "Saint Lucia": "LC",
    "Liechtenstein": "LI",
    "Sri Lanka": "LK",
    "Liberia": "LR",
    "Lesotho": "LS",
    "Lithuania": "LT",
    "Luxembourg": "LU",
    "Latvia": "LV",
    "Libya": "LY",
    "Morocco": "MA",
    "Monaco": "MC",
    "Moldova": "MD",
    "Montenegro": "ME",
    "Saint Martin (French part)": "MF",
    "Madagascar": "MG",
    "Marshall Islands": "MH",
    "North Macedonia": "MK",
    "Mali": "ML",
    "Myanmar": "MM",
    "Mongolia": "MN",
    "Macao": "MO",
    "Northern Mariana Islands": "MP",
    "Martinique": "MQ",
    "Mauritania": "MR",
    "Montserrat": "MS",
    "Malta": "MT",
    "Mauritius": "MU",
    "Maldives": "MV",
    "Malawi": "MW",
    "Mexico": "MX",
    "Malaysia": "MY",
    "Mozambique": "MZ",
    "Namibia": "NA",
    "New Caledonia": "NC",
    "Niger": "NE",
    "Norfolk Island": "NF",
    "Nigeria": "NG",
    "Nicaragua": "NI",
    "Netherlands": "NL",
    "Netherlands Antilles": "AN",
    "Norway": "NO",
    "Nepal": "NP",
    "Nauru": "NR",
    "Niue": "NU",
    "New Zealand": "NZ",
    "Oman": "OM",
    "Panama": "PA",
    "Peru": "PE",
    "French Polynesia": "PF",
    "Papua New Guinea": "PG",
    "Philippines": "PH",
    "Pakistan": "PK",
    "Poland": "PL",
    "Saint Pierre and Miquelon": "PM",
    "Pitcairn": "PN",
    "Puerto Rico": "PR",
    "Palestine": "PS",
    "Portugal": "PT",
    "Palau": "PW",
    "Paraguay": "PY",
    "Qatar": "QA",
    "Réunion": "RE",
    "Romania": "RO",
    "Serbia": "RS",
    "Russia": "RU",
    "Rwanda": "RW",
    "Saudi Arabia": "SA",
    "Solomon Islands": "SB",
    "Seychelles": "SC",
    "Sudan": "SD",
    "Sweden": "SE",
    "Singapore": "SG",
    "Saint Helena, Ascension and Tristan da Cunha": "SH",
    "Slovenia": "SI",
    "Svalbard and Jan Mayen": "SJ",
    "Slovakia": "SK",
    "Sierra Leone": "SL",
    "San Marino": "SM",
    "Senegal": "SN",
    "Somalia": "SO",
    "Somaliland": "SO",  # same code as "Somalia": a reversed code -> name lookup can keep only one of them
    "Suriname": "SR",
    "South Sudan": "SS",
    "Sao Tome and Principe": "ST",
    "El Salvador": "SV",
    "Sint Maarten (Dutch part)": "SX",
    "Syria": "SY",
    "Eswatini": "SZ",
    "Turks and Caicos Islands": "TC",
    "Chad": "TD",
    "French Southern Territories": "TF",
    "Togo": "TG",
    "Thailand": "TH",
    "Tajikistan": "TJ",
    "Tokelau": "TK",
    "East Timor": "TL",
    "Turkmenistan": "TM",
    "Tunisia": "TN",
    "Tonga": "TO",
    "Turkey": "TR",
    "Trinidad and Tobago": "TT",
    "Tuvalu": "TV",
    "Taiwan, Province of China": "TW",
    "Tanzania": "TZ",
    "Ukraine": "UA",
    "Uganda": "UG",
    "United States Minor Outlying Islands": "UM",
    "United States": "US",
    "Uruguay": "UY",
    "Uzbekistan": "UZ",
    "Holy See": "VA",
    "Saint Vincent and the Grenadines": "VC",
    "Venezuela": "VE",
    "British Virgin Islands": "VG",
    "United States Virgin Islands": "VI",
    "Vietnam": "VN",
    "Vanuatu": "VU",
    "Wallis and Futuna": "WF",
    "Samoa": "WS",
    "Yemen": "YE",
    "Mayotte": "YT",
    "South Africa": "ZA",
    "Zambia": "ZM",
    "Zimbabwe": "ZW",
}

In [None]:
# Dictionary of HOT regions: maps a country name to the region label used in the
# per-region breakdowns ("AP", "ESA", "LAC", "WNA", or "OTHER").
# Presumably AP = Asia-Pacific, ESA = Eastern and Southern Africa,
# LAC = Latin America and the Caribbean, WNA = West and Northern Africa — TODO confirm.
# Keys use the same spelling as the country-name -> country-code dictionary; a country
# name that is missing here maps to NaN when used with Series.map.
country_region = {
    "Andorra": "OTHER",
    "United Arab Emirates": "OTHER",
    "Afghanistan": "AP",
    "Antigua and Barbuda": "LAC",
    "Anguilla": "OTHER",
    "Albania": "OTHER",
    "Armenia": "OTHER",
    "Angola": "ESA",
    "Antarctica": "OTHER",
    "Argentina": "OTHER",
    "American Samoa": "OTHER",
    "Austria": "OTHER",
    "Australia": "OTHER",
    "Aruba": "OTHER",
    "Åland Islands": "OTHER",
    "Azerbaijan": "OTHER",
    "Bosnia and Herzegovina": "OTHER",
    "Barbados": "OTHER",
    "Bangladesh": "AP",
    "Belgium": "OTHER",
    "Burkina Faso": "WNA",
    "Bulgaria": "OTHER",
    "Bahrain": "OTHER",
    "Burundi": "ESA",
    "Benin": "WNA",
    "Saint Barthélemy": "OTHER",
    "Bermuda": "OTHER",
    "Brunei Darussalam": "AP",
    "Bolivia": "LAC",
    "Bonaire, Sint Eustatius and Saba": "OTHER",
    "Brazil": "LAC",
    "The Bahamas": "OTHER",
    "Bhutan": "AP",
    "Bouvet Island": "OTHER",
    "Botswana": "OTHER",
    "Belarus": "OTHER",
    "Belize": "LAC",
    "Canada": "OTHER",
    "Cocos (Keeling) Islands": "OTHER",
    "Democratic Republic of the Congo": "ESA",
    "Central African Republic": "WNA",
    "Congo-Brazzaville": "WNA",
    "Switzerland": "OTHER",
    "Côte dIvoire": "WNA",
    "Cook Islands": "OTHER",
    "Chile": "LAC",
    "Cameroon": "WNA",
    "China": "OTHER",
    "Colombia": "OTHER",
    "Costa Rica": "LAC",
    "Cuba": "OTHER",
    "Cabo Verde": "WNA",
    "Curaçao": "OTHER",
    "Christmas Island": "OTHER",
    "Cyprus": "OTHER",
    "Czechia": "OTHER",
    "Germany": "OTHER",
    "Djibouti": "ESA",
    "Denmark": "OTHER",
    "Dominica": "LAC",
    "Dominican Republic": "LAC",
    "Algeria": "WNA",
    "Ecuador": "LAC",
    "Estonia": "OTHER",
    "Egypt": "ESA",
    "Western Sahara": "OTHER",
    "Eritrea": "OTHER",
    "Spain": "OTHER",
    "Ethiopia": "ESA",
    "Finland": "OTHER",
    "Fiji": "AP",
    "Falkland Islands (Malvinas)": "OTHER",
    "Federated States of Micronesia": "AP",
    "Faroe Islands": "OTHER",
    "France": "OTHER",
    "Gabon": "OTHER",
    "United Kingdom": "OTHER",
    "Grenada": "OTHER",
    "Georgia": "OTHER",
    "French Guiana": "OTHER",
    "Guernsey": "OTHER",
    "Ghana": "WNA",
    "Gibraltar": "OTHER",
    "Greenland": "OTHER",
    "The Gambia": "WNA",
    "Guinea": "WNA",
    "Guadeloupe": "OTHER",
    "Equatorial Guinea": "WNA",
    "Greece": "OTHER",
    "South Georgia and the South Sandwich Islands": "OTHER",
    "Guatemala": "LAC",
    "Guam": "OTHER",
    "Guinea-Bissau": "WNA",
    "Guyana": "LAC",
    "Hong Kong": "OTHER",
    "Heard Island and McDonald Islands": "OTHER",
    "Honduras": "LAC",
    "Croatia": "OTHER",
    "Haiti": "LAC",
    "Hungary": "OTHER",
    "Indonesia": "AP",
    "Ireland": "OTHER",
    "Israel": "OTHER",
    "Isle of Man": "OTHER",
    "India": "AP",
    "British Indian Ocean Territory": "OTHER",
    "Iraq": "OTHER",
    "Iran (Islamic Republic of)": "OTHER",
    "Iceland": "OTHER",
    "Italy": "OTHER",
    "Jersey": "OTHER",
    "Jamaica": "LAC",
    "Jordan": "OTHER",
    "Japan": "OTHER",
    "Kenya": "ESA",
    "Kyrgyzstan": "OTHER",
    "Cambodia": "AP",
    "Kiribati": "AP",
    "Comoros": "ESA",
    "Saint Kitts and Nevis": "OTHER",
    "Korea (Democratic People's Republic of)": "OTHER",
    "Korea, Republic of": "OTHER",
    "Kuwait": "OTHER",
    "Cayman Islands": "OTHER",
    "Kazakhstan": "OTHER",
    "Laos": "AP",
    "Lebanon": "OTHER",
    "Saint Lucia": "OTHER",
    "Liechtenstein": "OTHER",
    "Sri Lanka": "AP",
    "Liberia": "WNA",
    "Lesotho": "ESA",
    "Lithuania": "OTHER",
    "Luxembourg": "OTHER",
    "Latvia": "OTHER",
    "Libya": "OTHER",
    "Morocco": "WNA",
    "Monaco": "OTHER",
    "Moldova": "OTHER",
    "Montenegro": "OTHER",
    "Saint Martin (French part)": "OTHER",
    "Madagascar": "ESA",
    "Marshall Islands": "OTHER",
    "North Macedonia": "OTHER",
    "Mali": "WNA",
    "Myanmar": "AP",
    "Mongolia": "OTHER",
    "Macao": "OTHER",
    "Northern Mariana Islands": "OTHER",
    "Martinique": "OTHER",
    "Mauritania": "WNA",
    "Montserrat": "OTHER",
    "Malta": "OTHER",
    "Mauritius": "ESA",
    "Maldives": "OTHER",
    "Malawi": "ESA",
    "Mexico": "LAC",
    "Malaysia": "AP",
    "Mozambique": "ESA",
    "Namibia": "ESA",
    "New Caledonia": "OTHER",
    "Niger": "WNA",
    "Norfolk Island": "OTHER",
    "Nigeria": "WNA",
    "Nicaragua": "LAC",
    "Netherlands": "OTHER",
    "Netherlands Antilles": "OTHER",
    "Norway": "OTHER",
    "Nepal": "AP",
    "Nauru": "OTHER",
    "Niue": "OTHER",
    "New Zealand": "OTHER",
    "Oman": "OTHER",
    "Panama": "LAC",
    "Peru": "LAC",
    "French Polynesia": "OTHER",
    "Papua New Guinea": "AP",
    "Philippines": "AP",
    "Pakistan": "AP",
    "Poland": "OTHER",
    "Saint Pierre and Miquelon": "OTHER",
    "Pitcairn": "OTHER",
    "Puerto Rico": "OTHER",
    "Palestine": "OTHER",
    "Portugal": "OTHER",
    "Palau": "OTHER",
    "Paraguay": "OTHER",
    "Qatar": "OTHER",
    "Réunion": "OTHER",
    "Romania": "OTHER",
    "Serbia": "OTHER",
    "Russia": "OTHER",
    "Rwanda": "ESA",
    "Saudi Arabia": "OTHER",
    "Solomon Islands": "AP",
    "Seychelles": "OTHER",
    "Sudan": "ESA",
    "Sweden": "OTHER",
    "Singapore": "OTHER",
    "Saint Helena, Ascension and Tristan da Cunha": "OTHER",
    "Slovenia": "OTHER",
    "Svalbard and Jan Mayen": "OTHER",
    "Slovakia": "OTHER",
    "Sierra Leone": "WNA",
    "San Marino": "OTHER",
    "Senegal": "WNA",
    "Somalia": "ESA",
    "Somaliland": "ESA",
    "Suriname": "OTHER",
    "South Sudan": "ESA",
    "Sao Tome and Principe": "WNA",
    "El Salvador": "LAC",
    "Sint Maarten (Dutch part)": "OTHER",
    "Syria": "OTHER",
    "Eswatini": "ESA",
    "Turks and Caicos Islands": "OTHER",
    "Chad": "WNA",
    "French Southern Territories": "OTHER",
    "Togo": "WNA",
    "Thailand": "OTHER",
    "Tajikistan": "OTHER",
    "Tokelau": "OTHER",
    "East Timor": "AP",
    "Turkmenistan": "OTHER",
    "Tunisia": "OTHER",
    "Tonga": "AP",
    "Turkey": "OTHER",
    "Trinidad and Tobago": "LAC",
    "Tuvalu": "OTHER",
    "Taiwan, Province of China": "OTHER",
    "Tanzania": "ESA",
    "Ukraine": "OTHER",
    "Uganda": "ESA",
    "United States Minor Outlying Islands": "OTHER",
    "United States": "OTHER",
    "Uruguay": "LAC",
    "Uzbekistan": "AP",
    "Holy See": "OTHER",
    "Saint Vincent and the Grenadines": "OTHER",
    "Venezuela": "LAC",
    "British Virgin Islands": "OTHER",
    "United States Virgin Islands": "OTHER",
    "Vietnam": "AP",
    "Vanuatu": "AP",
    "Wallis and Futuna": "OTHER",
    "Samoa": "OTHER",
    "Yemen": "AP",
    "Mayotte": "OTHER",
    "South Africa": "OTHER",
    "Zambia": "ESA",
    "Zimbabwe": "ESA",
}

In [None]:
# Read project data
def _first_country(raw):
    """Return the first country of a stringified list such as "['Nepal', 'India']".

    Surrounding brackets and any single/double quote characters are removed.
    Non-string values (e.g. NaN for a missing country) are returned unchanged,
    which is what the previous bare ``except:`` fallback produced for them.
    """
    if not isinstance(raw, str):
        return raw
    return raw.strip('][').split(', ')[0].replace("'", "").replace('"', "")


display(Markdown("Reading selected project ids"))
input_data = pd.read_csv(data_folder + "output_archived_projs_selected_ids.csv")
projects = pd.read_csv(data_folder + "output_archived_projs.csv")

# Keep only the first listed country of every project (kept as a list in
# `countries`, as before, in case later cells use it).
countries = projects['country'].map(_first_country).tolist()
projects['country'] = countries
# Country names missing from country_to_abbrev become NaN here.
projects['country_code'] = projects['country'].map(country_to_abbrev)

# Attach project attributes and task counts to the selected project ids.
input_data = input_data.merge(
    projects[["projectId", "country", "country_code", "difficulty", "priority"]],
    on="projectId",
    how="left",
)
stats = pd.read_csv(data_folder + "output_archived_projs_stats.csv")
input_data = input_data.merge(stats[["projectId", "totalTasks"]], on="projectId", how="left")

# Project size class: quartiles of the number of tasks (Q1 = smallest projects).
input_data['quantile'] = pd.qcut(input_data['totalTasks'], 4, labels=["Q1", "Q2", "Q3", "Q4"])
input_data['region'] = input_data['country'].map(country_region)
input_data

Reading selected project ids

Unnamed: 0,projectId,country,country_code,difficulty,priority,totalTasks,quantile,region
0,11875,Nepal,NP,EASY,MEDIUM,89,Q1,AP
1,11884,Philippines,PH,EASY,MEDIUM,194,Q2,AP
2,11890,Guatemala,GT,EASY,LOW,734,Q4,LAC
3,11891,Guatemala,GT,EASY,LOW,606,Q4,LAC
4,11892,Nigeria,NG,EASY,MEDIUM,600,Q4,WNA
...,...,...,...,...,...,...,...,...
741,15689,Malawi,MW,MODERATE,MEDIUM,102,Q1,ESA
742,15705,Ghana,GH,EASY,HIGH,258,Q2,WNA
743,15744,Ghana,GH,EASY,HIGH,213,Q2,WNA
744,15745,Ghana,GH,MODERATE,URGENT,396,Q3,WNA


In [None]:
# Read contributors data
# One CSV per selected project is read and tagged with its projectId and with the
# contributor's locality relative to the project country:
#   UNKNOWN        - the contributor reported no country
#   NATIONAL       - reported country equals the project's country code
#   INTERNATIONAL  - any other reported country (this includes projects whose own
#                    country code is missing)
# NOTE(review): the banner text says "downloading task grids", but this cell only
# reads the per-project contributor files from data_folder.
display(Markdown("DOWNLOADING TASK GRIDS FOR THE SELECTED PROJECTS"))
user_frames = []
with tqdm(total=len(input_data["projectId"]), unit=" project") as pbar:
    for index, row in input_data.iterrows():
        input_users_filename = data_folder + "output_users_proj_" + str(row["projectId"]) + ".csv"
        users = pd.read_csv(input_users_filename)
        users["projectId"] = row["projectId"]
        users["locality"] = np.where(
            users["country"].isnull(),
            "UNKNOWN",
            np.where(users["country"] == row["country_code"], "NATIONAL", "INTERNATIONAL"),
        )
        # Collect the frames and concatenate once after the loop: concatenating
        # inside the loop copies the accumulated frame on every iteration.
        user_frames.append(users)
        pbar.update(1)
# pd.concat raises on an empty list, so keep the empty-frame result for no projects.
output = pd.concat(user_frames, ignore_index=True) if user_frames else pd.DataFrame()
output

DOWNLOADING TASK GRIDS FOR THE SELECTED PROJECTS

  0%|          | 0/746 [00:00<?, ? project/s]

Unnamed: 0,id,username,role,mappingLevel,projectsMapped,emailAddress,isExpert,twitterId,facebookId,linkedinId,...,questionsAndCommentsNotifications,projectsNotifications,tasksNotifications,taskCommentsNotifications,teamsAnnouncementNotifications,gender,selfDescriptionGender,projectId,locality,isEmailVerified
0,11491881,PraKH,MAPPER,ADVANCED,83.0,,True,,prabin.khatiwada.332,prabin-khatiwada-5109a7202,...,False,True,False,False,False,,,11875,NATIONAL,
1,12284349,PraKH2,MAPPER,ADVANCED,40.0,,False,,,,...,False,True,True,False,False,,,11875,UNKNOWN,
2,13116824,Vivek Dumre,MAPPER,ADVANCED,171.0,,True,,Vivek Dumre,Vivek Dumre,...,False,True,True,False,False,,,11875,NATIONAL,
3,11807790,Aarogya Pandey,MAPPER,ADVANCED,128.0,,False,,Aarogya Pandey,Gorakh Nath Pandey,...,False,True,False,False,True,,,11875,NATIONAL,
4,13255920,samrat02,MAPPER,ADVANCED,66.0,,False,@SamratA94826787,श्री सम्राट आचार्य,samrat acharya,...,False,True,True,False,False,,,11875,NATIONAL,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68067,223047,Bill32,MAPPER,ADVANCED,53.0,,False,,,,...,False,True,True,False,False,,,15797,UNKNOWN,
68068,20386900,DrakeB,MAPPER,BEGINNER,1.0,,False,,,,...,False,True,True,False,True,,,15797,UNKNOWN,
68069,20499240,hahsas,MAPPER,BEGINNER,1.0,,False,,,,...,False,True,True,False,True,,,15797,INTERNATIONAL,
68070,6534559,JohanHoogland,MAPPER,ADVANCED,571.0,,True,,,,...,False,True,True,True,True,,,15797,INTERNATIONAL,


In [None]:
# Figure 2. Completeness of the contributor profile by mapping level -% of contributors-
# For every mapping level: share of unique contributors (one row per username) whose
# profile field is filled in. count() only counts non-null values, so this has to run
# before any missing values in `output` are filled with a placeholder; a filled column
# would read 100%.
profile_fields = ["twitterId", "facebookId", "linkedinId", "slackId", "city", "country", "name", "pictureUrl"]
unique_contributors = output.drop_duplicates("username")
completeness = (
    unique_contributors[["id"] + profile_fields + ["mappingLevel"]]
    .groupby("mappingLevel")
    .count()
    .reset_index()
)
# "id" holds the number of contributors per level and is the denominator.
for field in profile_fields:
    completeness[field] = (completeness[field] / completeness["id"] * 100).round(1)
completeness

Unnamed: 0,mappingLevel,id,twitterId,facebookId,linkedinId,slackId,city,country,name,pictureUrl
0,ADVANCED,2526,23.3,21.2,20.2,24.8,49.1,100.0,65.0,50.5
1,BEGINNER,35387,2.4,2.6,3.2,8.0,27.5,100.0,40.8,5.6
2,INTERMEDIATE,980,17.1,16.6,17.7,23.5,51.0,100.0,70.3,33.5


In [None]:
# Calculate contributors per mapping level
# Table 1. Overview of total contributors by mapping level
# One row per unique username; dropna=False keeps a group for contributors without a
# mapping level.
mappinglevel = (
    output.drop_duplicates("username")
    .groupby("mappingLevel", dropna=False)["username"]
    .count()
    .reset_index()
)
mappinglevel.columns = ["mappinglevel", "n"]
mappinglevel['%'] = 100 * mappinglevel['n'] / mappinglevel['n'].sum()
# A single sort is enough (the original applied the same sort twice).
mappinglevel.sort_values("n", ascending=False)

Unnamed: 0,mappinglevel,n,%
1,BEGINNER,35387,90.985524
0,ADVANCED,2526,6.494742
2,INTERMEDIATE,980,2.519734


In [None]:
# Summary statistics of projectsMapped over all unique contributors
# Table 1. Overview of total contributors by mapping level
# Missing projectsMapped values are stored back into `output` as 0, so the cells
# below also see the filled column.
output["projectsMapped"] = output["projectsMapped"].fillna(0)
contributors_by_id = output.drop_duplicates("id")
contributors_by_id["projectsMapped"].describe()

count    38893.000000
mean         4.390096
std         22.261369
min          0.000000
25%          0.000000
50%          1.000000
75%          3.000000
max       1451.000000
Name: projectsMapped, dtype: float64

In [None]:
# Calculate projectsMapped per experience level
# Table 1. Overview of total contributors by mapping level
# One row per unique username, summarised within each mapping level.
unique_contributors = output.drop_duplicates("username")
unique_contributors.groupby("mappingLevel")[["projectsMapped"]].agg(["mean", "max", "min", "median", "std"])

Unnamed: 0_level_0,projectsMapped,projectsMapped,projectsMapped,projectsMapped,projectsMapped
Unnamed: 0_level_1,mean,max,min,median,std
mappingLevel,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
ADVANCED,35.7308,1451.0,0.0,12.0,79.432245
BEGINNER,1.937237,111.0,0.0,1.0,3.422671
INTERMEDIATE,12.178571,130.0,0.0,8.0,13.338668


In [None]:
# Weighted average of contributors mapping level per project
# Fig. 3. Participation structure in projects -N=38893, average % of contributors per project-
# users_project: one row per project with the number of contributors at each mapping
# level, plus the project attributes. The shares below pool the counts of all projects.
level_cols = ["ADVANCED", "BEGINNER", "INTERMEDIATE"]
project_cols = ["projectId", "difficulty", "country", "country_code", "totalTasks", "quantile", "region", "priority"]
users_project = pd.pivot_table(
    output,
    index="projectId",
    columns=["mappingLevel"],
    values="username",
    aggfunc="count",
    fill_value=0,
)
users_project = users_project.merge(input_data[project_cols], on="projectId", how="left")
mappinglevel = users_project[level_cols].sum().reset_index()
mappinglevel["%"] = (mappinglevel[0] / mappinglevel[0].sum() * 100).round(1)
mappinglevel

Unnamed: 0,index,0,%
0,ADVANCED,15077,22.1
1,BEGINNER,49662,73.0
2,INTERMEDIATE,3333,4.9


In [None]:
# Weighted average of contributors mapping level per project segregated by difficulty
# Fig. 3. Participation structure in projects -N=38893, average % of contributors per project-
# Requires users_project to still hold the mapping-level counts (it is rebuilt with
# locality counts further down).
level_cols = ["ADVANCED", "BEGINNER", "INTERMEDIATE"]
mappinglevelXdifficulty = users_project[level_cols + ["difficulty"]].groupby("difficulty").sum().reset_index()
# Turn the pooled counts of each difficulty into row percentages.
row_totals = mappinglevelXdifficulty[level_cols].sum(axis=1)
mappinglevelXdifficulty[level_cols] = mappinglevelXdifficulty[level_cols].div(row_totals, axis=0).mul(100).round(1)
mappinglevelXdifficulty

Unnamed: 0,difficulty,ADVANCED,BEGINNER,INTERMEDIATE
0,CHALLENGING,91.7,0.9,7.4
1,EASY,16.0,79.8,4.2
2,MODERATE,37.4,56.0,6.5


In [None]:
# Weighted average of contributors mapping level per project segregated by priority
# Fig. 3. Participation structure in projects -N=38893, average % of contributors per project-
# Requires users_project to still hold the mapping-level counts (it is rebuilt with
# locality counts further down).
level_cols = ["ADVANCED", "BEGINNER", "INTERMEDIATE"]
mappinglevelXpriority = users_project[level_cols + ["priority"]].groupby("priority").sum().reset_index()
# Turn the pooled counts of each priority into row percentages.
row_totals = mappinglevelXpriority[level_cols].sum(axis=1)
mappinglevelXpriority[level_cols] = mappinglevelXpriority[level_cols].div(row_totals, axis=0).mul(100).round(1)
mappinglevelXpriority

Unnamed: 0,priority,ADVANCED,BEGINNER,INTERMEDIATE
0,HIGH,21.1,73.7,5.2
1,LOW,21.2,74.2,4.6
2,MEDIUM,24.6,70.8,4.6
3,URGENT,23.0,71.5,5.5


In [None]:
# Weighted average of contributors mapping level per project segregated by quantile
# Fig. 3. Participation structure in projects -N=38893, average % of contributors per project-
# "quantile" is the project size class (quartile of totalTasks). Requires users_project
# to still hold the mapping-level counts (it is rebuilt with locality counts further down).
level_cols = ["ADVANCED", "BEGINNER", "INTERMEDIATE"]
mappinglevelXquantile = users_project[level_cols + ["quantile"]].groupby("quantile").sum().reset_index()
# Turn the pooled counts of each quantile into row percentages.
row_totals = mappinglevelXquantile[level_cols].sum(axis=1)
mappinglevelXquantile[level_cols] = mappinglevelXquantile[level_cols].div(row_totals, axis=0).mul(100).round(1)
mappinglevelXquantile

Unnamed: 0,quantile,ADVANCED,BEGINNER,INTERMEDIATE
0,Q1,42.8,51.0,6.2
1,Q2,32.4,61.7,5.9
2,Q3,20.3,74.9,4.8
3,Q4,19.0,76.4,4.6


In [None]:
# Weighted average of contributors mapping level per project segregated by region
# Fig. 3. Participation structure in projects -N=38893, average % of contributors per project-
# Requires users_project to still hold the mapping-level counts (it is rebuilt with
# locality counts further down).
level_cols = ["ADVANCED", "BEGINNER", "INTERMEDIATE"]
mappinglevelXregion = users_project[level_cols + ["region"]].groupby("region").sum().reset_index()
# Turn the pooled counts of each region into row percentages.
row_totals = mappinglevelXregion[level_cols].sum(axis=1)
mappinglevelXregion[level_cols] = mappinglevelXregion[level_cols].div(row_totals, axis=0).mul(100).round(1)
mappinglevelXregion

Unnamed: 0,region,ADVANCED,BEGINNER,INTERMEDIATE
0,AP,26.4,67.6,6.0
1,ESA,24.8,69.9,5.3
2,LAC,15.5,81.0,3.5
3,OTHER,18.8,76.3,4.9
4,WNA,29.8,64.9,5.3


In [None]:
# Weighted average of contributors country per project
# Fig. 3. Participation structure in projects -N=38893, average % of contributors per project-
# Rebuilds users_project with one column per locality class instead of per mapping
# level; the mapping-level cells above cannot be re-run after this without rebuilding it.
locality_cols = ["INTERNATIONAL", "NATIONAL", "UNKNOWN"]
project_cols = ["projectId", "difficulty", "country", "country_code", "totalTasks", "quantile", "region", "priority"]
users_project = pd.pivot_table(
    output,
    index="projectId",
    columns=["locality"],
    values="username",
    aggfunc="count",
    fill_value=0,
)
users_project = users_project.merge(input_data[project_cols], on="projectId", how="left")
location = users_project[locality_cols].sum().reset_index()
location["%"] = (location[0] / location[0].sum() * 100).round(1)
location

Unnamed: 0,index,0,%
0,INTERNATIONAL,25137,36.9
1,NATIONAL,2567,3.8
2,UNKNOWN,40368,59.3


In [None]:
# Weighted average of contributors country per project segregated by difficulty
# Fig. 3. Participation structure in projects -N=38893, average % of contributors per project-
# Requires users_project to hold the locality counts.
locality_cols = ["INTERNATIONAL", "NATIONAL", "UNKNOWN"]
locationXdifficulty = users_project[locality_cols + ["difficulty"]].groupby("difficulty").sum().reset_index()
# Turn the pooled counts of each difficulty into row percentages.
row_totals = locationXdifficulty[locality_cols].sum(axis=1)
locationXdifficulty[locality_cols] = locationXdifficulty[locality_cols].div(row_totals, axis=0).mul(100).round(1)
locationXdifficulty

Unnamed: 0,difficulty,INTERNATIONAL,NATIONAL,UNKNOWN
0,CHALLENGING,73.1,8.3,18.5
1,EASY,35.3,3.0,61.7
2,MODERATE,40.9,5.7,53.4


In [None]:
# Weighted average of contributors country per project segregated by priority
# Fig. 3. Participation structure in projects -N=38893, average % of contributors per project-
# Requires users_project to hold the locality counts.
locality_cols = ["INTERNATIONAL", "NATIONAL", "UNKNOWN"]
locationXpriority = users_project[locality_cols + ["priority"]].groupby("priority").sum().reset_index()
# Turn the pooled counts of each priority into row percentages.
row_totals = locationXpriority[locality_cols].sum(axis=1)
locationXpriority[locality_cols] = locationXpriority[locality_cols].div(row_totals, axis=0).mul(100).round(1)
locationXpriority

Unnamed: 0,priority,INTERNATIONAL,NATIONAL,UNKNOWN
0,HIGH,40.2,1.2,58.5
1,LOW,34.5,4.0,61.5
2,MEDIUM,36.1,8.2,55.7
3,URGENT,40.2,2.1,57.8


In [None]:
# Weighted average of contributors country per project segregated by quantile
# Fig. 3. Participation structure in projects -N=38893, average % of contributors per project-
# "quantile" is the project size class (quartile of totalTasks). Requires users_project
# to hold the locality counts.
locality_cols = ["INTERNATIONAL", "NATIONAL", "UNKNOWN"]
locationXquantile = users_project[locality_cols + ["quantile"]].groupby("quantile").sum().reset_index()
# Turn the pooled counts of each quantile into row percentages.
row_totals = locationXquantile[locality_cols].sum(axis=1)
locationXquantile[locality_cols] = locationXquantile[locality_cols].div(row_totals, axis=0).mul(100).round(1)
locationXquantile

Unnamed: 0,quantile,INTERNATIONAL,NATIONAL,UNKNOWN
0,Q1,42.8,10.9,46.3
1,Q2,38.4,8.8,52.8
2,Q3,38.0,3.3,58.6
3,Q4,35.6,2.3,62.1


In [None]:
# Weighted average of contributors country per project segregated by region
# Fig. 3. Participation structure in projects -N=38893, average % of contributors per project-
# Requires users_project to hold the locality counts.
locality_cols = ["INTERNATIONAL", "NATIONAL", "UNKNOWN"]
locationXregion = users_project[locality_cols + ["region"]].groupby("region").sum().reset_index()
# Turn the pooled counts of each region into row percentages.
row_totals = locationXregion[locality_cols].sum(axis=1)
locationXregion[locality_cols] = locationXregion[locality_cols].div(row_totals, axis=0).mul(100).round(1)
locationXregion

Unnamed: 0,region,INTERNATIONAL,NATIONAL,UNKNOWN
0,AP,39.6,7.1,53.4
1,ESA,40.8,4.3,54.9
2,LAC,32.9,1.0,66.1
3,OTHER,31.6,3.6,64.8
4,WNA,40.9,4.6,54.5


In [None]:
# Calculate frequency of contributors' countries
# Table 2. Location of contributors by reported country
# Build a new frame instead of aliasing `output`: with `location = output` the
# placeholder below was also written into output["country"], so any cell that counts
# missing countries in `output` (e.g. profile completeness) gave different results
# depending on whether this cell had already been run.
location = output.assign(country=output["country"].replace(np.nan, 'UNKNOWN'))
# After the merge, country_x is the contributor's reported country and country_y the
# project's country.
location = location.merge(input_data[["projectId", "region", "country"]], on="projectId", how="left")
# Number of distinct contributor ids per reported country, largest first.
country = pd.DataFrame(location.groupby("country_x")["id"].nunique()).reset_index().sort_values("id", ascending=False)
country.columns = ["country_x", "n"]
country['%'] = 100 * country['n'] / country['n'].sum()
country.head(20)

Unnamed: 0,country_x,n,%
151,UNKNOWN,27668,71.138765
152,US,2326,5.980511
49,GB,1183,3.041678
66,IN,933,2.398889
113,PH,869,2.234335
107,NL,493,1.26758
24,CA,313,0.804772
36,DE,306,0.786774
63,IE,282,0.725066
47,FR,279,0.717353


In [None]:
# Calculate frequency of contributors' countries per project region
# Table 2. Location of contributors by reported country
# Rows: contributor's reported country code; columns: region of the project they
# contributed to; values: number of distinct contributor ids.
region_cols = ['AP', 'ESA', 'LAC', 'OTHER', 'WNA']
table = pd.pivot_table(location, index=['country_x'], values='id', columns=['region'], aggfunc=lambda x: len(x.unique())).reset_index()
table = table.replace(np.nan, 0)
# A contributor active in several regions is counted once per region in 'total'.
table['total'] = table[region_cols].sum(axis=1)

# Reverse lookup code -> country name. Several names can share one code ("Somalia"
# and "Somaliland" are both "SO"); keep the first name listed for each code instead
# of letting the last one silently overwrite it.
abbrev_to_country = {}
for country_name, code in country_to_abbrev.items():
    abbrev_to_country.setdefault(code, country_name)
table['country_name'] = table['country_x'].map(abbrev_to_country)
# Region of the contributor's own country (NaN for 'UNKNOWN' or unmapped codes).
table['region'] = table['country_name'].map(country_region)

# Express each region column as the percentage of that region's contributors.
for region_col in region_cols:
    table[region_col] = round(table[region_col] / table[region_col].sum() * 100, 1)
table.sort_values("total", ascending=False)

region,country_x,AP,ESA,LAC,OTHER,WNA,total,country_name,region.1
151,UNKNOWN,59.9,66.2,71.1,70.3,63.7,32130.0,,
152,US,7.5,7.5,6.3,5.8,5.1,3133.0,United States,OTHER
49,GB,5.0,4.4,2.9,3.4,3.9,1843.0,United Kingdom,OTHER
66,IN,4.2,0.8,4.4,1.0,0.7,1046.0,India,AP
113,PH,0.9,1.3,5.3,0.8,0.5,955.0,Philippines,AP
...,...,...,...,...,...,...,...,...,...
65,IM,0.0,0.0,0.0,0.0,0.0,1.0,Isle of Man,OTHER
53,GN,0.0,0.0,0.0,0.0,0.0,1.0,Guinea,WNA
60,HT,0.0,0.0,0.0,0.0,0.0,1.0,Haiti,LAC
137,SZ,0.0,0.0,0.0,0.0,0.0,1.0,Eswatini,ESA
