In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
%run ../tasking_manager_stats/data_management

# Get summaries of a lot of projects

In [None]:
# Path of the JSON file used to persist the downloaded project summaries
# between runs (it is read and rewritten further down in this notebook).
# NOTE(review): `os` and `get_data_dir` are not imported in this notebook;
# presumably both are provided by the `%run` of data_management above — confirm.
summaries_file = os.path.join(get_data_dir(), 'summaries.json')

In [None]:
# Start from the summaries saved by a previous run, if there is one.
# The cache file does not exist before the first download; in that case the
# notebook starts with an empty cache instead of stopping on an exception.
# Note: json.load always yields string keys for JSON objects.
try:
    with open(summaries_file) as f:
        summaries = json.load(f)
except FileNotFoundError:
    summaries = {}

In [None]:
# Download the summary of every project in the id range that is not cached
# yet, or whose cached status is not 'ARCHIVED'.
for project_id in tqdm(range(5636, 5650)):
    # The cache is loaded with json.load, whose keys are strings: an int key
    # would never match a cached entry and would end up as a duplicate key
    # once the dict is dumped back to JSON.
    key = str(project_id)
    cached = summaries.get(key)
    # .get('status') rather than ['status']: entries that hold an 'Error'
    # message may have no status; those are simply downloaded again.
    if cached is None or cached.get('status') != 'ARCHIVED':
        summaries[key] = download_summary_data(project_id)
        # Pause 0.5-1.5 s between two requests
        time.sleep(0.5 + random.random())

In [None]:
# Download the summary of every project in the id range that is not cached
# yet, or whose cached status is not 'ARCHIVED'.
for project_id in tqdm(range(5650, 6100)):
    # The cache is loaded with json.load, whose keys are strings: an int key
    # would never match a cached entry and would end up as a duplicate key
    # once the dict is dumped back to JSON.
    key = str(project_id)
    cached = summaries.get(key)
    # .get('status') rather than ['status']: entries that hold an 'Error'
    # message may have no status; those are simply downloaded again.
    if cached is None or cached.get('status') != 'ARCHIVED':
        summaries[key] = download_summary_data(project_id)
        # Pause 0.5-1.5 s between two requests
        time.sleep(0.5 + random.random())

In [None]:
# Persist the summaries for the next run.
# Serialise first: opening the file with 'w' truncates it immediately, so an
# error raised while encoding would otherwise wipe the existing cache.
serialized_summaries = json.dumps(summaries)
with open(summaries_file, 'w') as outfile:
    outfile.write(serialized_summaries)

# Data extraction and cleaning

In [None]:
# Flatten the summaries into one row per project, indexed by project id.
# Rows are collected in a list and converted once: concatenating inside the
# loop copies the whole frame at every iteration.
rows = []
for project_id, summary in summaries.items():
    if 'Error' in summary:
        # Entries holding an error message are reported and left out
        print(str(project_id) + ' : ' + summary['Error'])
        continue
    rows.append((project_id,
                 summary['created'],
                 summary['lastUpdated'],
                 summary['status'],
                 summary['percentValidated'],
                 summary['organisationTag'],
                 summary['projectArea(in sq.km)']))

# Explicit column names keep the frame well-formed (and set_index working)
# even when no project could be loaded.
summary_df = pd.DataFrame(rows, columns=['project_id', 'created', 'lastUpdated', 'status',
                                         'percentValidated', 'organisation', 'area'])
summary_df = summary_df.set_index('project_id')

In [None]:
# Turn the two timestamp columns into datetimes, then show the
# resulting column types and non-null counts.
summary_df = summary_df.assign(
    created=pd.to_datetime(summary_df['created']),
    lastUpdated=pd.to_datetime(summary_df['lastUpdated']),
)
summary_df.info()

In [None]:
# Preview of the first five projects
summary_df.head(5)

### Qualitative analysis

In [None]:
# Distribution of the validation percentage over all projects
summary_df.hist(column='percentValidated')

In [None]:
# Distinct project statuses present in the data
pd.unique(summary_df['status'])

In [None]:
# Distinct organisation tags present in the data
pd.unique(summary_df['organisation'])

In [None]:
# Use the short label 'MSF' for Médecins Sans Frontières; every other
# organisation tag is kept unchanged.
summary_df['organisation'] = summary_df['organisation'].map(
    lambda name: name if name != 'Médecins Sans Frontières' else 'MSF'
)
summary_df['organisation'].unique()

### Restrict to S1 2019

Definition of the restriction (a project is kept when all of these hold):
- lastUpdated after 1st January 2019
- lastUpdated before 1st July 2019
- at least 90% validated (percentValidated >= 90)

In [None]:
# All projects tagged with the organisation 'CartONG'
summary_df.loc[summary_df['organisation'].eq('CartONG')]

In [None]:
# All projects tagged with the organisation 'AIT'
summary_df.loc[summary_df['organisation'].eq('AIT')]

In [None]:
# Projects finished during the first semester (S1) of 2019:
# last updated after 1st January and before 1st July 2019, and validated
# at 90% or more. The upper bound is 1st July, the end of the semester
# (it used to be '2019-07-19', which let July projects into the S1 figures).
df = summary_df[(summary_df['lastUpdated'] > '2019-01-01') &
                (summary_df['lastUpdated'] < '2019-07-01') &
                (summary_df['percentValidated'] >= 90)]
df.head()

In [None]:
# Organisations that have at least one project in the restricted set
pd.unique(df['organisation'])

In [None]:
# Number of projects (non-null 'area' values) per organisation, largest first
(df.groupby('organisation')['area']
   .count()
   .sort_values(ascending=False)
   .to_frame())

In [None]:
# Bar chart of the number of projects per organisation, smallest first.
# The CartONG bar is drawn in black, all the others in light gray.
project_counts = df.groupby('organisation')['area'].count().sort_values()
positions = np.arange(len(project_counts))
bar_colors = ['black' if name == 'CartONG' else 'lightgray'
              for name in project_counts.index]

fig, ax = plt.subplots(figsize=(10, 10), dpi=100)
ax.bar(positions, project_counts.values, color=bar_colors)
ax.set_xticks(positions)
ax.set_xticklabels(project_counts.index, rotation=90)
ax.set_xlabel('Organisations')
ax.set_ylabel('Number of finished projects')
ax.set_title('Number of finished projects in S1 2019')
fig.savefig('finished_project_nb.png', dpi=100)

In [None]:
# Sum of the numeric columns per organisation, largest total area first.
# numeric_only=True keeps the aggregation away from the datetime and text
# columns: since pandas 2.0 they are no longer dropped silently and summing
# datetimes raises a TypeError.
df.groupby('organisation').sum(numeric_only=True).sort_values('area', ascending=False)

In [None]:
# Bar chart of the total project area per organisation, smallest first.
# The CartONG bar is drawn in black, all the others in light gray.
# 'area' is selected before aggregating: summing the whole frame would also
# try the datetime columns, which raises a TypeError since pandas 2.0.
total_area = df.groupby('organisation')['area'].sum().sort_values()
positions = np.arange(len(total_area))
bar_colors = ['black' if name == 'CartONG' else 'lightgray'
              for name in total_area.index]

fig, ax = plt.subplots(figsize=(10, 10), dpi=100)
ax.bar(positions, total_area.values, color=bar_colors)
ax.set_xticks(positions)
ax.set_xticklabels(total_area.index, rotation=90)
ax.set_xlabel('Organisations')
ax.set_ylabel('Area in km²')
ax.set_title('Total area of finished projects in S1 2019')
fig.savefig('finished_project_total_area.png', dpi=100)

In [None]:
# Same chart as the total-area one, restricted to the six organisations with
# the smallest total area so that their bars become readable.
# 'area' is selected before aggregating: summing the whole frame would also
# try the datetime columns, which raises a TypeError since pandas 2.0.
smallest_total_area = df.groupby('organisation')['area'].sum().sort_values().head(6)
positions = np.arange(len(smallest_total_area))
bar_colors = ['black' if name == 'CartONG' else 'lightgray'
              for name in smallest_total_area.index]

fig, ax = plt.subplots(figsize=(10, 10), dpi=100)
ax.bar(positions, smallest_total_area.values, color=bar_colors)
ax.set_xticks(positions)
ax.set_xticklabels(smallest_total_area.index, rotation=90)
ax.set_xlabel('Organisations')
ax.set_ylabel('Area in km²')
ax.set_title('[ZOOM IN] Total area of finished projects in S1 2019')
fig.savefig('finished_project_total_area_bis.png', dpi=100)

In [None]:
# Mean of the numeric columns per organisation, largest mean area first.
# numeric_only=True keeps the aggregation away from the text columns: since
# pandas 2.0 they are no longer dropped silently and averaging them raises
# a TypeError.
df.groupby('organisation').mean(numeric_only=True).sort_values('area', ascending=False)

In [None]:
# Bar chart of the mean project area per organisation, smallest first.
# The CartONG bar is drawn in black, all the others in light gray.
# 'area' is selected before aggregating: averaging the whole frame would also
# try the text columns, which raises a TypeError since pandas 2.0.
mean_area = df.groupby('organisation')['area'].mean().sort_values()
positions = np.arange(len(mean_area))
bar_colors = ['black' if name == 'CartONG' else 'lightgray'
              for name in mean_area.index]

fig, ax = plt.subplots(figsize=(10, 10), dpi=100)
ax.bar(positions, mean_area.values, color=bar_colors)
ax.set_xticks(positions)
ax.set_xticklabels(mean_area.index, rotation=90)
ax.set_xlabel('Organisations')
ax.set_ylabel('Area in km²')
ax.set_title('Mean area of finished projects in S1 2019')
fig.savefig('finished_project_mean_area.png', dpi=100)

In [None]:
# Same chart as the mean-area one, restricted to the four organisations with
# the smallest mean area so that their bars become readable.
# 'area' is selected before aggregating: averaging the whole frame would also
# try the text columns, which raises a TypeError since pandas 2.0.
smallest_mean_area = df.groupby('organisation')['area'].mean().sort_values().head(4)
positions = np.arange(len(smallest_mean_area))
bar_colors = ['black' if name == 'CartONG' else 'lightgray'
              for name in smallest_mean_area.index]

fig, ax = plt.subplots(figsize=(10, 10), dpi=100)
ax.bar(positions, smallest_mean_area.values, color=bar_colors)
ax.set_xticks(positions)
ax.set_xticklabels(smallest_mean_area.index, rotation=90)
ax.set_xlabel('Organisations')
ax.set_ylabel('Area in km²')
ax.set_title('[ZOOM IN] Mean area of finished projects in S1 2019')
fig.savefig('finished_project_mean_area_bis.png', dpi=100)