In [1]:
import numpy as np
import os
import pandas as pd
import plotly

In [2]:
# Execute the project's helper scripts with IPython's %run magic.
# NOTE(review): get_data_dir(), called by the data-loading cells, is not defined in this
# notebook, so it presumably comes from one of these scripts — confirm.
%run ../tasking_manager_stats/data_management
%run ../tasking_manager_stats/map_tools

Data file found and the project has not been changed. It won't be downloaded again.


## Load data

In [3]:
# Load the merged contribution statistics from the data directory.
# 'ANSI' is only registered as a codec alias (for 'mbcs') on Windows, so it raises
# LookupError on other platforms; 'cp1252' names the Western-European ANSI code page
# explicitly and works everywhere.
# NOTE(review): assumes the file was written under a cp1252 Windows locale — confirm.
df = pd.read_csv(os.path.join(get_data_dir(), 'merged_stats.csv'), encoding='cp1252')
df.head()

Unnamed: 0,Project,Year,Month,Day,Rel. Day,Hour,Minute,Second,Duration,Author,Type
0,10054,2021,1,14,31,11,39,11,69,naranjasol,VALIDATION
1,10054,2021,1,14,31,10,59,33,2347,JYL45,MAPPING
2,10054,2021,1,14,31,10,46,31,772,JYL45,MAPPING
3,10054,2021,1,14,31,10,11,38,312,Lmercier,VALIDATION
4,10054,2020,12,20,6,17,13,55,2811,Claudia Casey,MAPPING


In [27]:
# Load the per-task statistics (one author kept per task and contribution type, going by
# the file name).
# 'ANSI' is only registered as a codec alias (for 'mbcs') on Windows, so it raises
# LookupError on other platforms; 'cp1252' names the Western-European ANSI code page
# explicitly and works everywhere.
# NOTE(review): assumes the file was written under a cp1252 Windows locale — confirm.
df_author_tasks = pd.read_csv(
    os.path.join(get_data_dir(), 'merged_stats_one_author_by_task_type.csv'),
    encoding='cp1252',
)
df_author_tasks.head()

Unnamed: 0,Project,Task,Year,Month,Day,Rel. Day,Hour,Minute,Second,Author,Type
0,10054.0,1.0,2020.0,12.0,14.0,0.0,21.0,5.0,15.0,jyw160030,MAPPING
1,10054.0,2.0,2020.0,12.0,18.0,4.0,16.0,25.0,23.0,pedr0faria,MAPPING
2,10054.0,3.0,2020.0,12.0,15.0,1.0,2.0,14.0,33.0,Diane Bahati,MAPPING
3,10054.0,4.0,2020.0,12.0,15.0,1.0,13.0,16.0,2.0,max gpain,MAPPING
4,10054.0,10.0,2020.0,12.0,15.0,1.0,22.0,2.0,9.0,jyw160030,MAPPING


## Filter on a year

In [4]:
# Calendar year the analysis is restricted to; str(year) is also used as the output folder.
year = 2021

In [33]:
# Create the output directory named after the selected year; no error if it already exists.
os.makedirs(str(year), exist_ok=True)

In [5]:
# Keep only the contributions whose "Year" column matches the selected year.
df = df.loc[df["Year"].eq(year)]

In [36]:
# Export the year-filtered contributions.
# index=False is the documented (boolean) way to leave the row index out of the file.
df.to_csv(os.path.join(str(year), "raw_data.csv"), index=False)

In [28]:
# Keep only the task rows whose "Year" column matches the selected year.
df_author_tasks = df_author_tasks.loc[df_author_tasks["Year"].eq(year)]

In [37]:
# Export the year-filtered task/author table.
# index=False is the documented (boolean) way to leave the row index out of the file.
df_author_tasks.to_csv(os.path.join(str(year), "task_authors.csv"), index=False)

## Contributors by project

In [6]:
# One row per distinct (project, author) pair, whatever the contribution type.
df_pro_user = df.loc[:, ['Project', 'Author']].drop_duplicates()
df_pro_user.head()

Unnamed: 0,Project,Author
0,10054,naranjasol
1,10054,JYL45
3,10054,Lmercier
6,10054,ManonVi
20,10054,Anaximandre


In [8]:
# Number of distinct projects per author. `g` was not defined anywhere in the notebook
# (it only existed as leftover kernel state), so it is built here the same way as the
# mapping/validation counterparts: group the (project, author) pairs by author and count.
g = df_pro_user.groupby('Author').count()
# Authors who contributed to at least 10 projects, most active first.
g[g['Project'] >= 10].sort_values('Project', ascending=False)

Unnamed: 0_level_0,Project
Author,Unnamed: 1_level_1
naranjasol,25
JYL45,24
nicojp,22
Lmercier,21
Anaximandre,21
Sophie Talbot,20
Jean LUQUET,19
remimage,18
ManonVi,17
eliotsotty,15


In [9]:
# Count the contributors active on at least three projects and report their share.
more_than_three_projects = g['Project'].ge(3).sum()
print(f"{len(g)} contributors in {year}")
print(f"{more_than_three_projects} contributors on 3 projects or more in {year} ({more_than_three_projects/len(g):.0%}) ")

1610 contributors in 2021
131 contributors on 3 projects or more in 2021 (8%) 


In [34]:
# Export the per-contributor project counts, most active first.
# reset_index() turns the Author index into a regular column; index=False is the
# documented (boolean) way to leave the new positional index out of the file.
(
    g.sort_values('Project', ascending=False)
    .reset_index()
    .to_csv(os.path.join(str(year), "Contributor_by_project_number.csv"), index=False)
)

### Restriction to mapping

In [10]:
# Distinct (project, author) pairs restricted to MAPPING contributions.
df_pro_user_map = df.loc[df['Type'].eq('MAPPING'), ['Project', 'Author']].drop_duplicates()
df_pro_user_map.head()

Unnamed: 0,Project,Author
1,10054,JYL45
21,10054,nicojp
147,10054,ayraa674
148,10054,Harej07
165,10054,Co Meijer


In [11]:
# Number of distinct projects each author mapped on.
g_map = df_pro_user_map.groupby('Author').count()
# Ten most active mappers among those present on more than one project.
(
    g_map.loc[g_map['Project'].gt(1)]
    .sort_values(by='Project', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,Project
Author,Unnamed: 1_level_1
JYL45,19
naranjasol,16
remimage,15
Lmercier,14
RCD49,14
eliotsotty,14
ManonVi,14
Anton Timoshenko,13
ARiviere,12
Sophie Talbot,11


In [12]:
# Count the mappers active on at least three projects and report their share.
more_than_three_projects = g_map['Project'].ge(3).sum()
print(f"{len(g_map)} mapping contributors in {year}")
print(f"{more_than_three_projects} mapping contributors on 3 projects or more in {year} ({more_than_three_projects/len(g_map):.0%}) ")

1591 mapping contributors in 2021
127 mapping contributors on 3 projects or more in 2021 (8%) 


### Restriction to validation

In [13]:
# Distinct (project, author) pairs restricted to VALIDATION contributions.
df_pro_user_valid = df.loc[df['Type'].eq('VALIDATION'), ['Project', 'Author']].drop_duplicates()
df_pro_user_valid.head()

Unnamed: 0,Project,Author
0,10054,naranjasol
3,10054,Lmercier
6,10054,ManonVi
20,10054,Anaximandre
38,10054,JYL45


In [14]:
# Number of distinct projects each author validated on.
g_valid = df_pro_user_valid.groupby('Author').count()
# Ten most active validators among those present on more than one project.
(
    g_valid.loc[g_valid['Project'].gt(1)]
    .sort_values(by='Project', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,Project
Author,Unnamed: 1_level_1
nicojp,22
JYL45,21
Anaximandre,20
Sophie Talbot,19
naranjasol,17
Lmercier,16
remimage,14
Michel Bouillot,13
eliotsotty,12
Jean LUQUET,12


In [15]:
# Count the validators active on at least three projects and report their share.
more_than_three_projects = g_valid['Project'].ge(3).sum()
print(f"{len(g_valid)} validators in {year}")
print(f"{more_than_three_projects} validators on 3 projects or more in {year} ({more_than_three_projects/len(g_valid):.0%}) ")

85 validators in 2021
23 validators on 3 projects or more in 2021 (27%) 


In [35]:
# Export the per-validator project counts, most active first.
# reset_index() turns the Author index into a regular column; index=False is the
# documented (boolean) way to leave the new positional index out of the file.
(
    g_valid.sort_values('Project', ascending=False)
    .reset_index()
    .to_csv(os.path.join(str(year), "Validator_by_project_number.csv"), index=False)
)

### Ideas:
- Keep only people who have participated in a mapathon
- Compute the number of mapathons per person
- Compute the number of contributions after a mapathon (time, tasks, projects)

## Number of contribution days

In [49]:
# For every author, count the distinct (year, month, day) dates with a contribution.
df_day = (
    df.loc[:, ["Author", "Year", "Month", "Day"]]
    .drop_duplicates()
    .loc[:, ["Author", "Day"]]
    .groupby("Author")
    .count()
)
# Authors active on more than 30 distinct days, most active first.
df_day.loc[df_day["Day"].gt(30)].sort_values(by="Day", ascending=False)

Unnamed: 0_level_0,Day
Author,Unnamed: 1_level_1
Anaximandre,150
JYL45,82
Aurel3185,73
eliotsotty,70
remimage,61
Lmercier,47
Jean Yves Garinet,45
NicolasGrosjean,44
Sophie Talbot,43
naranjasol,39


In [65]:
# Contributors active on at least five distinct days, with their share of all contributors.
# The label uses `year` instead of a hard-coded 2021 so it stays correct when the year
# filter is changed.
contrib_5_days = (df_day['Day'] >= 5).sum()
print(f"{contrib_5_days} contributors on 5 days or more in {year} ({contrib_5_days/len(df_day):.1%})")

70 contributors on 5 days or more in 2021 (4.3%)


In [71]:
# For every author, count the distinct dates with at least one MAPPING contribution.
df_day_map = (
    df.loc[df["Type"].eq("MAPPING"), ["Author", "Year", "Month", "Day"]]
    .drop_duplicates()
    .loc[:, ["Author", "Day"]]
    .groupby("Author")
    .count()
)
# Mappers active on more than 30 distinct days, most active first.
df_day_map.loc[df_day_map["Day"].gt(30)].sort_values(by="Day", ascending=False)

Unnamed: 0_level_0,Day
Author,Unnamed: 1_level_1
Aurel3185,73
remimage,43
eliotsotty,41
Jean Yves Garinet,38
JYL45,37


In [73]:
# Mappers active on at least five distinct days, with their share of all mappers.
# The label uses `year` instead of a hard-coded 2021 so it stays correct when the year
# filter is changed.
map_5_days = (df_day_map['Day'] >= 5).sum()
print(f"{map_5_days} mapping contributors on 5 days or more in {year} ({map_5_days/len(df_day_map):.1%})")

65 mapping contributors on 5 days or more in 2021 (4.1%)


In [66]:
# For every author, count the distinct dates with at least one VALIDATION contribution.
df_day_valid = (
    df.loc[df["Type"].eq("VALIDATION"), ["Author", "Year", "Month", "Day"]]
    .drop_duplicates()
    .loc[:, ["Author", "Day"]]
    .groupby("Author")
    .count()
)
# Validators active on more than 30 distinct days, most active first.
df_day_valid.loc[df_day_valid["Day"].gt(30)].sort_values(by="Day", ascending=False)

Unnamed: 0_level_0,Day
Author,Unnamed: 1_level_1
Anaximandre,146
JYL45,61
eliotsotty,47
Sophie Talbot,36
remimage,33
nicojp,32
Lmercier,31
NicolasGrosjean,31


In [69]:
# Validators active on at least five distinct days, with their share of all validators.
# The label uses `year` instead of a hard-coded 2021 so it stays correct when the year
# filter is changed.
valid_5_days = (df_day_valid['Day'] >= 5).sum()
print(f"{valid_5_days} validators on 5 days or more in {year} ({valid_5_days/len(df_day_valid):.1%})")

24 validators on 5 days or more in 2021 (28.2%)


In [50]:
# Export the number of contribution days per author, most active first.
# reset_index() turns the Author index into a regular column; index=False is the
# documented (boolean) way to leave the new positional index out of the file.
(
    df_day.sort_values("Day", ascending=False)
    .reset_index()
    .to_csv(os.path.join(str(year), "Contributor_by_contribution_days.csv"), index=False)
)

Index(['Project', 'Year', 'Month', 'Day', 'Rel. Day', 'Hour', 'Minute',
       'Second', 'Duration', 'Author', 'Type'],
      dtype='object')

## Contributions

In [16]:
# Total logged duration divided by 3600 (reported as hours), overall and per type.
# NOTE(review): 7 looks like hours per working day and 229 like working days per year,
# with "ETP" presumably the French abbreviation for full-time equivalent — confirm.
total_dur = df['Duration'].sum() / 3600
total_map = df.loc[df['Type'].eq('MAPPING'), 'Duration'].sum() / 3600
total_val = df.loc[df['Type'].eq('VALIDATION'), 'Duration'].sum() / 3600
print(f"{total_dur:.5} hours of contributions ({total_dur/7:.4} working days, {total_dur/7/229:.3} ETP)")
print(f"{total_map:.5} hours of mapping ({total_map/7:.4} working days, {total_map/7/229:.3} ETP)")
print(f"{total_val:.5} hours of validation ({total_val/7:.4} working days, {total_val/7/229:.3} ETP)")

3002.8 hours of contributions (429.0 working days, 1.87 ETP)
2213.4 hours of mapping (316.2 working days, 1.38 ETP)
789.36 hours of validation (112.8 working days, 0.492 ETP)


In [18]:
# Number of distinct values in the "Project" column of the year-filtered data.
print(f"{len(df['Project'].unique())} projects")

46 projects


## Duration by project

In [23]:
# Total logged duration per project, plus the same value expressed in hours (/3600)
# and in 7-hour working days.
project_dur = df[["Project", "Duration"]].groupby("Project").sum()
project_dur["Duration_Hour"] = project_dur["Duration"] / 3600
project_dur["Duration_Working_Days"] = project_dur["Duration_Hour"] / 7
# Ten projects with the largest total duration.
project_dur.sort_values("Duration", ascending=False).head(10)

Unnamed: 0_level_0,Duration,Duration_Hour,Duration_Working_Days
Project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10645,2630529,730.7025,104.386071
10669,1512394,420.109444,60.015635
10699,963895,267.748611,38.249802
10359,670328,186.202222,26.600317
10668,379926,105.535,15.076429
11041,280232,77.842222,11.120317
10054,253142,70.317222,10.045317
11816,221939,61.649722,8.807103
10299,208887,58.024167,8.289167
10670,202442,56.233889,8.033413


In [24]:
# Ten projects with the smallest total duration.
project_dur.sort_values(by="Duration", ascending=True).head(10)

Unnamed: 0_level_0,Duration,Duration_Hour,Duration_Working_Days
Project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5290,93,0.025833,0.00369
4618,1990,0.552778,0.078968
10663,12800,3.555556,0.507937
11347,25541,7.094722,1.013532
10890,29047,8.068611,1.152659
10752,29565,8.2125,1.173214
10798,33549,9.319167,1.33131
11826,45061,12.516944,1.788135
11088,51446,14.290556,2.041508
10797,52200,14.5,2.071429
