In [None]:
import numpy as np
import os
import pandas as pd
import plotly.express as px

In [None]:
%run ../tasking_manager_stats/data_management
%run ../tasking_manager_stats/map_tools

## Load data

In [None]:
# Load the merged statistics exported by the data-management step.
# 'ANSI' is only a codec alias (for 'mbcs') on Windows, so the original call
# raised LookupError on Linux/macOS. 'cp1252' is the Western-European ANSI
# code page and is available on every platform.
# NOTE(review): assumes the CSV was written under a Western-European Windows
# locale - confirm against the exporter, or switch to the encoding it uses.
df = pd.read_csv(os.path.join(get_data_dir(), 'merged_stats.csv'), encoding='cp1252')
df.head()

## Filter on a year

In [None]:
# Year covered by this report: used below to filter the rows of df and to
# name the output directory.
year = 2021

In [None]:
# Create the per-year output directory; an already existing one is not an error.
os.makedirs(name=str(year), exist_ok=True)

In [None]:
# Keep only the rows recorded for the selected year.
df = df.loc[df["Year"] == year]

In [None]:
# Save the year-filtered rows without the row index. `index` is documented as
# a boolean, so pass False explicitly instead of None.
df.to_csv(os.path.join(str(year), "raw_data.csv"), index=False)

## Remove a week of Nuit de la Géo

In [None]:
# Month and first day of the period to exclude.
ndg_month = 4
ndg_day = 8
# A row is dropped when it is in the excluded month AND on one of the 8
# consecutive days starting at ndg_day (same rows as "other month OR other day").
# NOTE(review): the section title says "a week" but 8 days are removed - confirm.
ndg_days = np.arange(ndg_day, ndg_day + 8)
in_ndg_period = (df['Month'] == ndg_month) & df['Day'].isin(ndg_days)
df = df[~in_ndg_period]

## New contributors by day

In [None]:
date_cols = ["Year", "Month", "Day"]
# pd.to_datetime can assemble dates directly from year/month/day columns, which
# replaces the row-by-row string join + parse. Column names are lower-cased so
# they match the keys to_datetime expects.
# assign() returns a new frame, so no column is written into the filtered
# slice of the original data (avoids SettingWithCopyWarning).
df = df.assign(Date=pd.to_datetime(df[date_cols].rename(columns=str.lower)))
df.head()

In [None]:
# Earliest contribution date of each author (frame indexed by Author).
author_first_date = df.groupby("Author")[["Date"]].min()
author_first_date.head()

In [None]:
# Number of authors whose first contribution falls on each date...
new_authors_per_date = author_first_date.reset_index().groupby('Date').count()
# ...accumulated over time to get the running total of contributors.
contributors_first_date = new_authors_per_date.cumsum()
contributors_first_date.head()

In [None]:
# Running total of contributors, plotted at the date of their first contribution.
# Labels are built from `year` so they stay correct when the report year changes
# (they were hard-coded to 2021).
fig = px.line(contributors_first_date.reset_index(), x="Date", y="Author")
fig.update_xaxes(title_text=f'Date of first contribution in {year}')
fig.update_yaxes(title_text="Contributors number")
fig.update_layout(title={'text': f'Cumulative sum of contributors in {year}'})

## Duration by date

In [None]:
# Duration divided by 3600; plotted below as "Contribution time in hours".
# NOTE(review): presumably Duration is stored in seconds - confirm.
df["Duration_h"] = df["Duration"].div(3600)

In [None]:
# Total contribution hours per month.
monthly_hours = df[["Month", "Duration_h"]].groupby("Month").sum().reset_index()
fig = px.bar(monthly_hours, x="Month", y="Duration_h")
fig.update_xaxes(title_text='Month')
fig.update_yaxes(title_text="Contribution time in hours")
# Title follows `year` (it was hard-coded to 2021).
fig.update_layout(title={'text': f'Contribution time by month in {year}'})

In [None]:
df2 = df[["Month", "Day", "Hour", "Duration_h"]].copy()
# Convert UTC to FR-Paris: +1 h all year, plus 1 more hour from April to
# October (whole-month approximation of daylight saving time).
df2['Hour'] += 1
df2.loc[(df2["Month"] < 11) & (df2["Month"] > 3), "Hour"] += 1
# Wrap around midnight: without this, 22h/23h UTC end up as hour 24/25 instead
# of being merged with local hours 0/1.
df2['Hour'] %= 24
# Offset for better graph
df2['Hour'] -= 0.5
# Total contribution hours per local hour of the day.
hourly_hours = df2.groupby("Hour").sum().reset_index()
fig = px.bar(hourly_hours, x="Hour", y="Duration_h")
fig.update_xaxes(title_text='Hour of the Day')
fig.update_yaxes(title_text="Contribution time in hours")
# Title follows `year` (it was hard-coded to 2021).
fig.update_layout(title={'text': f'Contribution time by hour of the day in {year}'})

In [None]:
# Day of the week as an integer (0 = Monday ... 6 = Sunday), computed with the
# vectorised .dt accessor instead of a Python-level apply on every row.
df["DayOfWeek"] = df["Date"].dt.weekday
df.head()

In [None]:
# Sanity check: list the distinct day-of-week values present in the data.
pd.unique(df["DayOfWeek"])

In [None]:
# Total contribution hours per day of the week.
weekday_hours = df[["DayOfWeek", "Duration_h"]].groupby("DayOfWeek").sum().reset_index()
fig = px.bar(weekday_hours, x="DayOfWeek", y="Duration_h")
fig.update_yaxes(title_text="Contribution time in hours")
# Title follows `year` (it was hard-coded to 2021).
fig.update_layout(title={'text': f'Contribution time by day of week in {year}'})
# Replace the numeric ticks 0..6 with weekday names.
fig.update_xaxes(title_text='DayOfWeek',
                 tickmode="array",
                 tickvals=np.arange(7),
                 ticktext=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])

## Contributors by project

In [None]:
# One row per distinct (Project, Author) pair.
df_pro_user = df.loc[:, ['Project', 'Author']].drop_duplicates()
df_pro_user.head()

In [None]:
# Number of distinct projects each author contributed to.
g = df_pro_user.groupby('Author').count()
# Authors with at least 10 projects, most active first.
ten_or_more = g.loc[g['Project'] >= 10]
ten_or_more.sort_values(by='Project', ascending=False)

In [None]:
# Contributors with 3 or more projects, and their share of all contributors.
n_contributors = len(g)
more_than_three_projects = g['Project'].ge(3).sum()
share = more_than_three_projects / n_contributors
print(f"{n_contributors} contributors in {year}")
print(f"{more_than_three_projects} contributors on 3 projects or more in {year} ({share:.0%}) ")

In [None]:
# Export the per-author project counts, most active first. `index` is
# documented as a boolean, so pass False explicitly instead of None.
(
    g.sort_values('Project', ascending=False)
    .reset_index()
    .to_csv(os.path.join(str(year), "Contributor_by_project_number.csv"), index=False)
)

### Restriction to mapping

In [None]:
# Distinct (Project, Author) pairs among rows of type MAPPING.
is_mapping = df['Type'] == 'MAPPING'
df_pro_user_map = df.loc[is_mapping, ['Project', 'Author']].drop_duplicates()
df_pro_user_map.head()

In [None]:
# Number of distinct mapping projects per author.
g_map = df_pro_user_map.groupby('Author').count()
# Top 10 authors among those who mapped on more than one project.
multi_project_mappers = g_map.loc[g_map['Project'] > 1]
multi_project_mappers.sort_values(by='Project', ascending=False).iloc[:10]

In [None]:
# Mapping contributors with 3 or more projects, and their share of all mappers.
n_mappers = len(g_map)
more_than_three_projects = g_map['Project'].ge(3).sum()
share = more_than_three_projects / n_mappers
print(f"{n_mappers} mapping contributors in {year}")
print(f"{more_than_three_projects} mapping contributors on 3 projects or more in {year} ({share:.0%}) ")

### Restriction to validation

In [None]:
# Distinct (Project, Author) pairs among rows of type VALIDATION.
is_validation = df['Type'] == 'VALIDATION'
df_pro_user_valid = df.loc[is_validation, ['Project', 'Author']].drop_duplicates()
df_pro_user_valid.head()

In [None]:
# Number of distinct validation projects per author.
g_valid = df_pro_user_valid.groupby('Author').count()
# Top 10 authors among those who validated on more than one project.
multi_project_validators = g_valid.loc[g_valid['Project'] > 1]
multi_project_validators.sort_values(by='Project', ascending=False).iloc[:10]

In [None]:
# Validators with 3 or more projects, and their share of all validators.
n_validators = len(g_valid)
more_than_three_projects = g_valid['Project'].ge(3).sum()
share = more_than_three_projects / n_validators
print(f"{n_validators} validators in {year}")
print(f"{more_than_three_projects} validators on 3 projects or more in {year} ({share:.0%}) ")

In [None]:
# Export the per-validator project counts, most active first. `index` is
# documented as a boolean, so pass False explicitly instead of None.
(
    g_valid.sort_values('Project', ascending=False)
    .reset_index()
    .to_csv(os.path.join(str(year), "Validator_by_project_number.csv"), index=False)
)

### Ideas:
- Keep only people who have participated in a mapathon
- Compute the number of mapathons per person
- Compute the number of contributions after a mapathon (time, tasks, projects)

## Number of contribution days

In [None]:
# Distinct (author, calendar day) pairs...
active_days = df[["Author", "Year", "Month", "Day"]].drop_duplicates()
# ...counted per author: number of days on which each author contributed.
df_day = active_days.groupby("Author")[["Day"]].count()
# Authors active on more than 30 days, most active first.
df_day.loc[df_day["Day"] > 30].sort_values(by="Day", ascending=False)

In [None]:
# Contributors active on at least 5 distinct days, and their share of all
# contributors. The message uses `year` (it was hard-coded to 2021).
contrib_5_days = (df_day['Day'] >= 5).sum()
print(f"{contrib_5_days} contributors on 5 days or more in {year} ({contrib_5_days/len(df_day):.1%})")

In [None]:
# Distinct (author, calendar day) pairs among MAPPING rows...
mapping_days = df.loc[df["Type"] == "MAPPING", ["Author", "Year", "Month", "Day"]].drop_duplicates()
# ...counted per author: number of days on which each author mapped.
df_day_map = mapping_days.groupby("Author")[["Day"]].count()
# Authors who mapped on more than 30 days, most active first.
df_day_map.loc[df_day_map["Day"] > 30].sort_values(by="Day", ascending=False)

In [None]:
# Mapping contributors active on at least 5 distinct days, and their share of
# all mappers. The message uses `year` (it was hard-coded to 2021).
map_5_days = (df_day_map['Day'] >= 5).sum()
print(f"{map_5_days} mapping contributors on 5 days or more in {year} ({map_5_days/len(df_day_map):.1%})")

In [None]:
# Distinct (author, calendar day) pairs among VALIDATION rows...
validation_days = df.loc[df["Type"] == "VALIDATION", ["Author", "Year", "Month", "Day"]].drop_duplicates()
# ...counted per author: number of days on which each author validated.
df_day_valid = validation_days.groupby("Author")[["Day"]].count()
# Authors who validated on more than 30 days, most active first.
df_day_valid.loc[df_day_valid["Day"] > 30].sort_values(by="Day", ascending=False)

In [None]:
# Validators active on at least 5 distinct days, and their share of all
# validators. The message uses `year` (it was hard-coded to 2021).
valid_5_days = (df_day_valid['Day'] >= 5).sum()
print(f"{valid_5_days} validators on 5 days or more in {year} ({valid_5_days/len(df_day_valid):.1%})")

In [None]:
# Export the per-author number of contribution days, most active first.
# `index` is documented as a boolean, so pass False explicitly instead of None.
(
    df_day.sort_values("Day", ascending=False)
    .reset_index()
    .to_csv(os.path.join(str(year), "Contributor_by_contribution_days.csv"), index=False)
)

## Contributions

In [None]:
# Total contribution time (Duration / 3600): overall, MAPPING rows only,
# VALIDATION rows only.
mapping_rows = df['Type'] == 'MAPPING'
validation_rows = df['Type'] == 'VALIDATION'
total_dur = df['Duration'].sum() / 3600
total_map = df.loc[mapping_rows, 'Duration'].sum() / 3600
total_val = df.loc[validation_rows, 'Duration'].sum() / 3600
# Each line reports the hours, the hours / 7 as "working days" and the working
# days / 229 as "ETP".
# NOTE(review): presumably 7 h per working day and 229 working days per year
# (ETP = full-time equivalent) - confirm.
days_dur = total_dur / 7
days_map = total_map / 7
days_val = total_val / 7
print(f"{total_dur:.5} hours of contributions ({days_dur:.4} working days, {days_dur/229:.3} ETP)")
print(f"{total_map:.5} hours of mapping ({days_map:.4} working days, {days_map/229:.3} ETP)")
print(f"{total_val:.5} hours of validation ({days_val:.4} working days, {days_val/229:.3} ETP)")

In [None]:
# Number of distinct values in the Project column.
n_projects = df['Project'].unique().size
print(f"{n_projects} projects")

In [None]:
# Distinct authors of VALIDATION rows and of MAPPING rows.
validators = df[df['Type'] == 'VALIDATION']["Author"].unique()
mappers = df[df['Type'] == 'MAPPING']["Author"].unique()
# A set gives constant-time membership tests; `in` on the numpy array scanned
# every mapper for every validator.
mapper_set = set(mappers)
print("Validators who are not mappers")
for validator in validators:
    if validator not in mapper_set:
        print(validator)

## Duration by project

In [None]:
# Total Duration per project, with derived columns: Duration / 3600 as hours
# and hours / 7 as working days.
project_dur = df[["Project", "Duration"]].groupby("Project").sum()
project_dur["Duration_Hour"] = project_dur["Duration"] / 3600
# This assignment was duplicated on two consecutive lines; once is enough.
project_dur["Duration_Working_Days"] = project_dur["Duration_Hour"] / 7
# The 10 projects with the most contribution time.
project_dur.sort_values("Duration", ascending=False).head(10)

In [None]:
# The 10 projects with the least contribution time.
project_dur.sort_values(by="Duration", ascending=True).iloc[:10]