# Grantee Data Analysis

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as mdates
import seaborn as sns
import pandas as pd
import numpy as np

%load_ext sql_magic
%matplotlib inline
plt.rcParams.update({'figure.max_open_warning': 0})

In [None]:
# Load the 2019 and 2018 participant tally CSVs.
participants_tally_2019 = pd.read_csv(
    "../../data/processed/participants_list_received_clean/2019/participants_tally_2019.csv",
    sep=',',
    encoding='utf-8',
)
participants_tally_2018 = pd.read_csv(
    "../../data/processed/participants_list_received_clean/2018/PTP_participants_grantees_article_counts_clean.csv",
    sep=',',
    encoding='utf-8',
)

In [None]:
# Pre-computed inner joins of the 2019 and 2018 participant lists.
# This year's GLOW participants who:
#   a) were also grantees last year
#   b) were also participants last year
participants_last_grantees = pd.read_csv(
    "../../data/processed/participants_list_received_clean/2019/participants_last_grantees.csv",
    sep=',',
    encoding='utf-8',
)
participants_last_participants = pd.read_csv(
    "../../data/processed/participants_list_received_clean/2019/participants_last_participants.csv",
    sep=',',
    encoding='utf-8',
)

In [None]:
# 2019 editor registration query result; `reg_date` is parsed as a datetime.
editor_reg_contest_2019 = pd.read_csv(
    "../../data/processed/query_results/editors/editor_reg_contest.csv",
    sep=',',
    encoding='utf-8',
    parse_dates=['reg_date'],
    infer_datetime_format=True,
)

In [None]:
# 2018 editor registration query result; `reg_date` is parsed as a datetime.
editor_reg_contest_2018 = pd.read_csv(
    "../../data/processed/query_results/editors/editor_reg_contest_18.csv",
    sep=',',
    encoding='utf-8',
    parse_dates=['reg_date'],
    infer_datetime_format=True,
)

In [None]:
# 2018 weekly edits by PTP Chromebook grantees.
weekly_edits_ptp_grantees = pd.read_csv(
    "../../data/processed/editors/weekly_edits_ptp_grantees.csv",
    sep=',',
    encoding='utf-8',
    infer_datetime_format=True,
)

In [None]:
# 2017-2020 edits by PTP Chromebook grantees.
#
# `editor_edits_per_month` is an edit count (it is cast to int and compared
# against thresholds such as 400/800/2000 further down), so it must not be
# listed in `parse_dates`: asking pandas to date-parse a count column either
# leaves it as raw objects or, worse, turns plausible values into timestamps.
# `infer_datetime_format` is dropped as well because it only affects columns
# named in `parse_dates`.
edits_ptp_grantees_edits_summed = pd.read_csv(
    "../../data/processed/editors/edits_ptp_grantees_edits_summed.csv",
    sep=',',
    encoding='utf-8',
)

In [None]:
# Coerce the monthly edit counts to integers, going through `str` first so
# that values loaded as generic objects are converted too.
edits_ptp_grantees_edits_summed['editor_edits_per_month'] = (
    edits_ptp_grantees_edits_summed['editor_edits_per_month']
    .astype(str)
    .astype(int)
)

In [None]:
# GLOW editor edits; `event_timestamp` is parsed as a datetime.
GLOW_editor_edits = pd.read_csv(
    "../../data/processed/editors/GLOW_editor_edits.csv",
    sep=',',
    encoding='utf-8',
    parse_dates=['event_timestamp'],
    infer_datetime_format=True,
)

In [None]:
# Load the summed GLOW editor edits CSV.
GLOW_editor_edits_summed = pd.read_csv(
    "../../data/processed/editors/GLOW_editor_edits_summed.csv",
    sep=',',
    encoding='utf-8',
    infer_datetime_format=True,
)

In [None]:
# Attach each year's registration data to that year's tally. A left join keeps
# every tallied participant even when no registration row matches.
participants_tr_2019 = participants_tally_2019.merge(
    editor_reg_contest_2019, how='left', on='username'
)
participants_tr_2018 = participants_tally_2018.merge(
    editor_reg_contest_2018, how='left', on='username'
)

In [None]:
# Tag every column with its contest year, then strip the tag from the two
# join keys so both frames still share `username` and `reg_date`.
# NOTE: not idempotent - re-running this cell appends the suffix again.
participants_tr_2018 = (
    participants_tr_2018
    .add_suffix('_2018')
    .rename(columns={'username_2018': 'username', 'reg_date_2018': 'reg_date'})
)
participants_tr_2019 = (
    participants_tr_2019
    .add_suffix('_2019')
    .rename(columns={'username_2019': 'username', 'reg_date_2019': 'reg_date'})
)

In [None]:
# Union of both years' participants: the outer join keeps editors who appear
# in only one of the two contests.
all_participants = participants_tr_2018.merge(
    participants_tr_2019,
    how='outer',
    on=['username', 'reg_date'],
)

### GLOW signup

In [None]:
participants_tally_2019['glow_editor_signup'].value_counts()

In [None]:
participants_tally_2018['glow_editor_signup'].value_counts()

In [None]:
participants_tally_2019.groupby(['incentive_recipient','glow_editor_signup'])['total_articles'].describe()

In [None]:
# Summarise the 2019 rows that are both incentive recipients and GLOW sign-ups.
(
    participants_tally_2019
    .loc[
        participants_tally_2019['incentive_recipient'].eq(True)
        & participants_tally_2019['glow_editor_signup'].eq(True)
    ]
    .info()
)

### incentive_recipient

In [None]:
participants_tally_2019['incentive_recipient'].value_counts()

In [None]:
participants_tally_2018['incentive_recipient'].value_counts()

### incentive_type count

In [None]:
participants_tally_2019['incentive_type'].value_counts()

In [None]:
participants_tally_2018['incentive_type'].value_counts()

### Create a mask for GLOW editors

In [None]:
# Boolean masks selecting the rows whose `glow_editor_signup` flag is True.
# `.eq(True)` is used instead of a truthiness cast so missing values stay False.
glow_2019_m = participants_tally_2019['glow_editor_signup'].eq(True)
glow_2018_m = participants_tally_2018['glow_editor_signup'].eq(True)

In [None]:
# Keep only the GLOW sign-ups for each year.
g18 = participants_tally_2018[glow_2018_m]
g19 = participants_tally_2019[glow_2019_m]

# Work on independent copies so that columns added to the GLOW frames later
# (e.g. `article_cnt_range`) never write back into the tally frames.
glow_2018 = g18.copy()
glow_2019 = g19.copy()

In [None]:
glow_2019['incentive_recipient'].value_counts()

### output by incentive - broad

In [None]:
glow_2019.groupby('incentive_recipient')['total_articles'].sum()

In [None]:
glow_2018.groupby('incentive_recipient')['article_count'].sum()

### output by incentive - detailed

In [None]:
glow_2019.groupby('incentive_type')['total_articles'].sum().sort_values()

In [None]:
glow_2018.groupby('incentive_type')['article_count'].sum().sort_values()

### Descriptive statistics

In [None]:
glow_2019.groupby('incentive_type')['total_articles'].describe()

In [None]:
glow_2018.groupby('incentive_type')['article_count'].describe()

In [None]:
# 2019: 8,863 articles in total; excluding the 2,880 from the single
# highest-output editor leaves 5,983.
8863-2880

In [None]:
# Bin edges for per-editor article output. `pd.cut` intervals are
# right-inclusive, e.g. (0, 1], (1, 5], ..., so a value of exactly 0 (or
# anything above 3000) falls outside every bin and is not counted.
bins = [0, 1, 5, 10, 25, 50, 100, 300, 600, 1000, 2000, 3000]

# Number of GLOW editors per output bin, for each year.
participants_output_binned_2019 = (
    pd.cut(glow_2019['total_articles'], bins=bins)
    .value_counts()
)
participants_output_binned_2018 = (
    pd.cut(glow_2018['article_count'], bins=bins)
    .value_counts()
)

In [None]:
participants_output_binned_2019

In [None]:
participants_output_binned_2019.plot();

In [None]:
participants_output_binned_2018

In [None]:
participants_output_binned_2018.plot();

In [None]:
glow_2019['total_articles'].hist();

In [None]:
glow_2018['article_count'].hist();

### Review

In [None]:
# 2019 summary per (incentive_type, incentive_recipient) pair, in order of
# first appearance: count of non-null article totals and their sum.
P2019 = (
    glow_2019
    .groupby(['incentive_type', 'incentive_recipient'], sort=False)['total_articles']
    .agg([('incentive_count', 'count'), ('articles_total', 'sum')])
    .reset_index()
)

# Each pair's share of all 2019 articles, in percent.
P2019['articles_pct'] = (
    P2019['articles_total'].mul(100).div(P2019['articles_total'].sum())
)

In [None]:
P2019

In [None]:
P2019.sort_values(by='articles_pct', ascending=False)

In [None]:
# 2018 summary per (incentive_type, incentive_recipient) pair, in order of
# first appearance: count of non-null article counts and their sum.
P2018 = (
    glow_2018
    .groupby(['incentive_type', 'incentive_recipient'], sort=False)['article_count']
    .agg([('incentive_count', 'count'), ('articles_total', 'sum')])
    .reset_index()
)

# Each pair's share of all 2018 articles, in percent.
P2018['articles_pct'] = (
    P2018['articles_total'].mul(100).div(P2018['articles_total'].sum())
)

In [None]:
P2018

In [None]:
P2018.sort_values(by='articles_pct', ascending=False)

### Questions

How many of this year's participants participated last year? 
> 82

How many of this year's participants were grantees last year?
> 30

How many of this year's editor grantees were editor grantees last year?
> 11 (9 of these received internet in GLOW)

*Does one incentive outperform another?*
Incentive performance, measured by articles submitted, roughly tracks how often each incentive was awarded:
the fewer times an incentive was awarded, the fewer total articles are associated with it.

How did returning editors perform?
> 38 wrote less

> 3 wrote the same amount

> 41 wrote more

In [None]:
# Coarser output ranges with human-readable labels. The intervals are
# right-inclusive: (0, 1], (1, 5], (5, 99], (99, 300], (300, 3000].
bins2 = [0, 1, 5, 99, 300, 3000]
names = ['<2', '2-5', '6-99', '100-300', '301+']

# Number of GLOW editors in each labelled range, per year.
z = (
    pd.cut(glow_2019['total_articles'], bins=bins2, labels=names)
    .value_counts()
)
z18 = (
    pd.cut(glow_2018['article_count'], bins=bins2, labels=names)
    .value_counts()
)

In [None]:
z

In [None]:
z.plot(kind='bar');

In [None]:
z18

In [None]:
z18.plot(kind='bar');

In [None]:
# Label each GLOW editor with the coarse output range their article total
# falls into (NaN when the total lies outside every bin).
glow_2019['article_cnt_range'] = pd.cut(
    glow_2019['total_articles'], bins=bins2, labels=names
)
glow_2018['article_cnt_range'] = pd.cut(
    glow_2018['article_count'], bins=bins2, labels=names
)

In [None]:
# Per-editor article output for each year, restricted to GLOW sign-ups.
glow_18s = glow_2018[['username', 'article_count']]
glow_19s = glow_2019[['username', 'total_articles']]

# Common participants: returning editors with their 2018 and 2019 output
# attached. Left joins keep every returning editor even when they have no
# row in one of the GLOW frames.
cp = participants_last_participants.merge(glow_18s, how='left', on='username')
cp = cp.merge(glow_19s, how='left', on='username')

# Treat a missing output value as "no articles". `fillna` returns a new
# object, so the result has to be assigned (a bare `cp.fillna(0)` is a no-op).
# Only the two output columns are filled so unrelated columns are untouched.
output_cols = ['article_count', 'total_articles']
cp[output_cols] = cp[output_cols].fillna(0)

# Percent change from 2018 to 2019. A 2018 count of 0 yields inf (or NaN when
# both years are 0) because the change is relative to the 2018 value.
cp['pct_change_output'] = 100 * cp[output_cols].pct_change(axis=1)['total_articles']

# Coarse output range per year (NaN for values outside the bins, including 0).
cp['article_cnt_range18'] = pd.cut(cp['article_count'], bins2, labels=names)
cp['article_cnt_range19'] = pd.cut(cp['total_articles'], bins2, labels=names)

In [None]:
def _direction_counts(pct):
    """Count the decreases, unchanged values and increases in *pct*."""
    return pd.Series([(pct < 0).sum(), (pct == 0).sum(), (pct > 0).sum()])


# One row per `pp_inctv` group with the number of returning editors whose
# output went down ('-'), stayed the same ('=%') or went up ('+%').
change_matrix = (
    cp.groupby('pp_inctv')['pct_change_output']
    .apply(_direction_counts)
    .unstack()
    .reset_index()
    .rename(columns={0: '-', 1: '=%', 2: '+%'})
)


In [None]:
change_matrix

In [None]:
change_matrix.sum(numeric_only=True)

In [None]:
def _large_increase_counts(pct):
    """Count values in *pct* that are at least 20, 50 and 100 (percent)."""
    return pd.Series([(pct >= 20).sum(), (pct >= 50).sum(), (pct >= 100).sum()])


# One row per `pp_inctv` group with the number of returning editors whose
# output grew by at least 20%, 50% and 100%. The counts are cumulative: an
# editor at +120% is included in all three columns.
# The column labels now state the thresholds that are actually computed; the
# previous labels ('>25%', '>75%', '>100%') did not match the comparisons.
sig_change_matrix = (
    cp.groupby('pp_inctv')['pct_change_output']
    .apply(_large_increase_counts)
    .unstack()
    .reset_index()
    .rename(columns={0: '>=20%', 1: '>=50%', 2: '>=100%'})
)

In [None]:
sig_change_matrix

In [None]:
sig_change_matrix.sum(numeric_only=True)

Did Project Tiger Pilot editors increase their GLOW article count after receiving Chromebook grants? 
>Seven 2018 Chromebook grantees submitted more articles during the GLOW contest. 

Did returning grantees perform similarly this year?
>2018 laptop recipients - 16 awarded in 2018 (12 returned to edit):
>- 7 increased output; 5 decreased output
>- 7 received internet grants in 2019

>2018 laptop_internet recipients - 14 awarded in 2018 (6 returned to edit):
>- 1 increased output; 5 decreased output
>- 1 received an internet grant in 2019

>2018 internet recipients - 9 awarded in 2018 (6 returned to edit):
>- 2 increased output; 1 maintained output; 3 decreased output
>- 2 received laptop_internet grants and 1 received an internet grant in 2019


Are any returning grantees included in this year's non-grantee cohort?

In [None]:
# Editors who received an incentive in 2018 but are flagged as non-recipients
# in 2019. Rows missing either flag (editors present in only one year) are
# excluded because a missing value equals neither True nor False.
last_years_recipients_this_years_nongrantees = all_participants.loc[
    all_participants['incentive_recipient_2018'].eq(True)
    & all_participants['incentive_recipient_2019'].eq(False)
]

In [None]:
last_years_recipients_this_years_nongrantees.sort_values(by='total_articles_2019', ascending=False).info()

In [None]:
last_years_recipients_this_years_nongrantees.loc[last_years_recipients_this_years_nongrantees['total_articles_2019']>= 100].info()

In [None]:
last_years_recipients_this_years_nongrantees.groupby('incentive_type_2018')['total_articles_2019'].describe()

How many articles does this group account for?

In [None]:
last_years_recipients_this_years_nongrantees['total_articles_2019'].sum()

Did the contest borrow productivity from a future period?
>more research is needed

Did the contest change future behavior? 
>maybe for subsets of editors

Did we change the edit trajectory of editors via inclusion in glow generally?
>more research is needed where we look at groups of editors based on their edit count bins

In [None]:
# Index the edit log by timestamp.
# NOTE: re-running this cell fails with a KeyError, because the column has
# already been moved into the index.
GLOW_editor_edits = GLOW_editor_edits.set_index('event_timestamp')

# Count edit rows per calendar date.
total_edits_by_date_r = GLOW_editor_edits[['event_user_text']]
total_edits_by_date = (
    total_edits_by_date_r
    .groupby(total_edits_by_date_r.index.date)
    .count()
    .rename(columns={'event_user_text': 'occurrences_by_date'})
)

In [None]:

# Line plot of the edit counts in `total_edits_by_date`. That frame is built
# by grouping on the calendar date of each edit, so every point is one day;
# the title says "per day" accordingly (it previously said "per month").
total_edits_by_date['occurrences_by_date'].plot(
    style='.-',
    figsize=(18, 16),
    title='Total edits by GLOW editors per day',
);

### Edits by PTP Chromebook grantees

In [None]:
# 2018 incentive recipients whose incentive type was 'laptop', ordered by
# registration date.
laptop_grantees_PTP18 = (
    all_participants
    .loc[
        all_participants['incentive_recipient_2018'].eq(True)
        & all_participants['incentive_type_2018'].eq('laptop')
    ]
    .sort_values(by='reg_date')
)

In [None]:
laptop_grantees_PTP18.info()

In [None]:
# Percent change from the 2018 article count to the 2019 article total for
# each 2018 laptop grantee (row-wise change between the two columns).
laptop_grantees_PTP18['article_count_pct_change'] = (
    laptop_grantees_PTP18[['article_count_2018', 'total_articles_2019']]
    .pct_change(axis=1)['total_articles_2019']
    .mul(100)
)

In [None]:
# Percent changes ranked from largest increase to largest decrease. The
# original row labels are kept in an `index` column by `reset_index`.
laptop_grantees_PTP18_change = (
    laptop_grantees_PTP18['article_count_pct_change']
    .reset_index()
    .sort_values(by='article_count_pct_change', ascending=False)
)

In [None]:
# Drop the leftover `index` column (the original row labels), then display
# the ranking.
laptop_grantees_PTP18_change = laptop_grantees_PTP18_change.drop(columns='index')
laptop_grantees_PTP18_change

Did Project Tiger Pilot Chromebook grantee editors increase their 2018 non-minor edit count peaks after receiving Chromebook grants? 
> Yes: 16 of 20 editors that received Chromebook grants reached higher 2018 edit counts post Project Tiger contest. However, all Chromebook grantees continued to edit in spurts and the peaks were not largely maintained. 

In [None]:
weekly_edits_ptp_grantees['event_user_text'].nunique()

In [None]:
# Order the weekly edit rows by editor, then by week number within each editor.
weekly_edits_ptp_grantees = weekly_edits_ptp_grantees.sort_values(
    ['event_user_text', 'week_n']
)

Note: PTP Contest weeks = 10-22

In [None]:
# Overlay every grantee's weekly edits on one shared set of axes, then save
# that single figure.
fig, ax = plt.subplots(nrows=1, ncols=1)
(
    weekly_edits_ptp_grantees
    .groupby("event_user_text")
    .plot.scatter(x="week_n", y="edits", ax=ax)
)
plt.savefig('../../results/figs/editors/2018_all_laptop_grantees_major_edits.png');

In [None]:
# One scatter panel per grantee, all drawn in a single figure.
# Calling `groupby(...).plot.scatter(...)` without `ax` opens a separate
# figure for every editor, and `plt.savefig` only writes the current (last)
# figure - so the saved PNG used to contain just one editor. Drawing every
# editor into its own panel of one figure makes the saved file complete.
grantee_groups = weekly_edits_ptp_grantees.groupby("event_user_text")
n_grantees = grantee_groups.ngroups

# squeeze=False keeps `axes` two-dimensional even when there is one grantee.
fig, axes = plt.subplots(
    n_grantees, 1, figsize=(8, 4 * n_grantees), squeeze=False
)
for panel, (editor, editor_weeks) in zip(axes.ravel(), grantee_groups):
    editor_weeks.plot.scatter(x="week_n", y="edits", ax=panel, title=str(editor))

fig.tight_layout()
fig.savefig('../../results/figs/editors/2018_laptop_grantee_major_edits.png');

Did Project Tiger Pilot Chromebook grantee editors continue editing through March 2020? 
> Yes, 16 of 20 Chromebook grantee editors were still editing as of April 1st, 2020.

Did we change the edit trajectory of editors by giving them laptop grants? 
> Yes, for some editors. Those editors that had reached a max total of 600-2k edits per month saw positive increases in their edit trajectories. Very prolific editors that had already seen months where they made more than 2,000 edits did not see a change in their edit trajectory. Those editors that had not yet reached 600 edits per month prior to the contest did not see a change in their edit trajectory. 

In [None]:
# 2018: weekly edits, one subplot per grantee. `pivot_table` averages any
# duplicate (week, editor) rows, which is its default aggregation.
(
    weekly_edits_ptp_grantees
    .reset_index()
    .pivot_table(index='week_n', columns='event_user_text', values='edits')
    .plot(subplots=True, figsize=(15, 21))
);

In [None]:
# 2017-2020: monthly edits, one subplot per grantee. `pivot_table` averages
# any duplicate (month, editor) rows, which is its default aggregation.
(
    edits_ptp_grantees_edits_summed
    .reset_index()
    .pivot_table(
        index='month/year',
        columns='event_user_text',
        values='editor_edits_per_month',
    )
    .plot(subplots=True, figsize=(15, 21))
);

In [None]:
# Rows from before the contest cut-off.
# NOTE(review): `month/year` is compared with 2018.3, so it is presumably a
# year.month float and the cut-off means "before March 2018" - confirm.
pre_contest_rows = edits_ptp_grantees_edits_summed['month/year'] < 2018.3
monthly_edit_counts = edits_ptp_grantees_edits_summed['editor_edits_per_month']

# Pre-contest months in which an editor reached each activity threshold.
over_400_mask = pre_contest_rows & (monthly_edit_counts >= 400)
over_800_mask = pre_contest_rows & (monthly_edit_counts >= 800)
over_2k_mask = pre_contest_rows & (monthly_edit_counts >= 2000)

mask_selects400 = edits_ptp_grantees_edits_summed.loc[over_400_mask]
mask_selects_800 = edits_ptp_grantees_edits_summed.loc[over_800_mask]
mask_selects_2k = edits_ptp_grantees_edits_summed.loc[over_2k_mask]

# Editors with at least one pre-contest month at or above each threshold,
# in order of first appearance.
grantees_w_over_400_pm_pre = list(mask_selects400['event_user_text'].unique())
grantees_w_over_800_pm_pre = list(mask_selects_800['event_user_text'].unique())
grantees_w_over_2k_pm_pre = list(mask_selects_2k['event_user_text'].unique())

# Editors who reached 800 but never 2000 in a pre-contest month.
reached_2k = set(grantees_w_over_2k_pm_pre)
grantees_w_800_2k_pm_pre = [
    editor for editor in grantees_w_over_800_pm_pre if editor not in reached_2k
]


In [None]:
# Candidate monthly edit-count segments (notes only). Left as bare text these
# lines are a SyntaxError when the cell runs, so they are kept as comments.
#   2001+
#   801-2000
#   100-800


2001+
1001-2000
100-1000

#### Segment into editors that had not reached 400+ edits per month prior to the contest

In [None]:
# All monthly rows for editors who never reached 400 edits in a pre-contest month.
reached_400 = edits_ptp_grantees_edits_summed['event_user_text'].isin(grantees_w_over_400_pm_pre)
under_400_df = edits_ptp_grantees_edits_summed.loc[~reached_400]

In [None]:
# Monthly edit trajectories, one line per editor, for the grantees who had
# not reached 400 edits in any pre-contest month.
sns.set()
fig, ax = plt.subplots(figsize=(15, 11))

# Draw on the axes created above rather than relying on the implicit
# "current axes".
sns.lineplot(
    data=under_400_df,
    x="month/year",
    y="editor_edits_per_month",
    hue="event_user_text",
    ax=ax,
)

plt.xticks(rotation=45)
# Descriptive title in place of the '---' placeholder.
ax.set_title('Monthly edits: grantees below 400 edits/month before the contest')

plt.tight_layout()
plt.show()

#### Segment into editors that had <800 edits per month prior to the contest

In [None]:
# All monthly rows for editors who never reached 800 edits in a pre-contest month.
reached_800 = edits_ptp_grantees_edits_summed['event_user_text'].isin(grantees_w_over_800_pm_pre)
small_df = edits_ptp_grantees_edits_summed.loc[~reached_800]

In [None]:
# Monthly edit trajectories, one line per editor, for the grantees who had
# not reached 800 edits in any pre-contest month.
sns.set()
fig, ax = plt.subplots(figsize=(15, 11))

# Draw on the axes created above rather than relying on the implicit
# "current axes".
sns.lineplot(
    data=small_df,
    x="month/year",
    y="editor_edits_per_month",
    hue="event_user_text",
    ax=ax,
)

plt.xticks(rotation=45)
# Descriptive title in place of the '---' placeholder.
ax.set_title('Monthly edits: grantees below 800 edits/month before the contest')

plt.tight_layout()
plt.show()

#### Segment into editors that had reached 800-2000 edits per month prior to the contest

In [None]:
# All monthly rows for editors who reached 800, but never 2000, edits in a
# pre-contest month.
in_800_2k_segment = edits_ptp_grantees_edits_summed['event_user_text'].isin(grantees_w_800_2k_pm_pre)
mid_df = edits_ptp_grantees_edits_summed.loc[in_800_2k_segment]

In [None]:
# Monthly edit trajectories, one line per editor, for the grantees who had
# reached 800 but not 2000 edits in a pre-contest month.
sns.set()
fig, ax = plt.subplots(figsize=(15, 11))

# Draw on the axes created above rather than relying on the implicit
# "current axes".
sns.lineplot(
    data=mid_df,
    x="month/year",
    y="editor_edits_per_month",
    hue="event_user_text",
    ax=ax,
)

plt.xticks(rotation=45)
# Descriptive title in place of the '---' placeholder.
ax.set_title('Monthly edits: grantees at 800-2000 edits/month before the contest')

plt.tight_layout()
plt.show()

#### Segment into editors that had reached 2000 edits per month prior to the contest

In [None]:
# All monthly rows for editors who reached 2000 edits in a pre-contest month.
reached_2k_rows = edits_ptp_grantees_edits_summed['event_user_text'].isin(grantees_w_over_2k_pm_pre)
over2k_df = edits_ptp_grantees_edits_summed.loc[reached_2k_rows]

In [None]:
# Monthly edit trajectories, one line per editor, for the grantees who had
# reached 2000 edits in a pre-contest month.
sns.set()
fig, ax = plt.subplots(figsize=(15, 11))

# Draw on the axes created above rather than relying on the implicit
# "current axes".
sns.lineplot(
    data=over2k_df,
    x="month/year",
    y="editor_edits_per_month",
    hue="event_user_text",
    ax=ax,
)

plt.xticks(rotation=45)
# Descriptive title in place of the '---' placeholder.
ax.set_title('Monthly edits: grantees at 2000+ edits/month before the contest')

plt.tight_layout()
plt.show()

In [None]:
plt.close('all')