In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as mdates
from matplotlib import pylab
import seaborn as sns
#sns.set_style("whitegrid")
#sns.set(style="darkgrid")
from  matplotlib.ticker import FuncFormatter
import matplotlib.ticker as mtick

import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu

In [None]:
# Load the compiled Indonesian contributor survey (path is relative to this notebook).
raw_editor_data = pd.read_csv(
    "../../data/processed/survey_data/Indonesian_contributor_survey_compiled.csv"
)

In [None]:
# Clean the raw survey frame and derive the analysis subsets used by later cells.

# Coerce the flag columns to real booleans. Missing event/grant answers are
# treated as False.
# NOTE(review): astype('bool') maps a missing saraswati_participant to True —
# confirm that column has no nulls.
raw_editor_data['saraswati_participant'] = raw_editor_data['saraswati_participant'].astype('bool')
for flag_column in ['prior_events_online', 'previous_events_offline', 'saraswati_grant']:
    raw_editor_data[flag_column] = raw_editor_data[flag_column].fillna(False).astype('bool')

# A blocker code of 0 is recoded as missing so it is dropped with the nulls below.
raw_editor_data['blockers'] = raw_editor_data['blockers'].replace(0, np.nan)

# Respondents who gave a blocker answer. The explicit .copy() makes this an
# independent frame, so adding 'duration_binned' below does not write to a slice
# of raw_editor_data (which triggers SettingWithCopyWarning). Zeros were already
# recoded to NaN above, so no separate `!= 0` filter is needed.
filtered_red = raw_editor_data[raw_editor_data['blockers'].notnull()].copy()

# Bucket Time_on_wiki_asof_2020 into (1, 5], (5, 10], (10, 15], (15, 20].
# NOTE(review): pd.cut bins are right-closed, so a value of exactly 1 falls
# outside every bin and becomes NaN — confirm that is intended.
duration_bins = [1, 5, 10, 15, 20]
filtered_red['duration_binned'] = pd.cut(filtered_red['Time_on_wiki_asof_2020'], duration_bins)

# Grouping columns plus the three motivation questions.
motivation_df = filtered_red[['saraswati_participant', 'saraswati_grant', 'duration_binned',
                              'contrib_enjoyable', 'contrib_benefits_me', 'contrib_benefits_world']].copy()
data_df = raw_editor_data.dropna(subset=['blockers'])

In [None]:
# Split the motivation answers by the boolean saraswati_participant flag.
saraswati_participants_df = motivation_df.loc[motivation_df['saraswati_participant']]
non_saraswati_df = motivation_df.loc[~motivation_df['saraswati_participant']]

In [None]:
# Bar chart of how many respondents did / did not take part in GLOW Saraswati.
# Each bar carries its raw count (inside the bar) and its share of all
# respondents (just above the bar).

# value_counts() orders by frequency, so the bar order is pinned to
# [False, True]; otherwise the fixed 'No'/'Yes' tick labels below could end up
# on the wrong bars whenever participants outnumber non-participants.
participation_counts = (
    raw_editor_data['saraswati_participant']
    .value_counts()
    .reindex([False, True], fill_value=0)
)
ax = participation_counts.plot(kind='bar', title='Saraswati Participation', color=["#FF8C00", "#1f77b4"])
ax.set_xlabel("Did you participate in GLOW Saraswati?")
ax.set_ylabel("Number of editors")

# Guard the percentage denominator against an empty frame.
n_respondents = max(len(raw_editor_data), 1)
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    # count, centred inside the bar
    ax.text(x + width / 2,
            y + height / 2,
            '{:.0f}'.format(height),
            horizontalalignment='center',
            verticalalignment='center')
    # share of all respondents, just above the bar
    ax.text(x + width / 2,
            y + height * 1.02,
            '{:.0f}%'.format(height * 100 / n_respondents),
            horizontalalignment='center')

ax.set_xticklabels(['No', 'Yes'], rotation=45);

In [None]:
# Blockers to participation, split by GLOW Saraswati participation.
# Responses coded 5 ('other') are left out of this chart.
filtered_red_blocker_other = filtered_red.loc[filtered_red['blockers'] != 5]

# Bars run from the most to the least frequently reported blocker.
blocker_order = filtered_red_blocker_other['blockers'].value_counts().index
ax = sns.countplot(data=filtered_red_blocker_other,
                   x="blockers",
                   hue="saraswati_participant",
                   order=blocker_order)
plt.legend(labels=['No', 'Yes'], loc='upper right', title='GLOW Saraswati editor')

# Whole-number y ticks.
plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda tick, _pos: int(tick)))

# NOTE(review): these tick labels are applied by position, while the bar order
# comes from the data (frequency order) — confirm they still match the codes.
xlabels = ['lack of time','tech access','skills gap','newcomer integration']
ax.set_xlabel('Factors preventing wiki participation')
ax.set_ylabel('Number of editors')
ax.set_xticklabels(xlabels)
ax.set_ylim([0,30])

# Write each bar's count at its centre.
for bar in ax.patches:
    bar_x, bar_y = bar.get_xy()
    bar_w, bar_h = bar.get_width(), bar.get_height()
    ax.text(bar_x + bar_w/2,
            bar_y + bar_h/2,
            '{:.0f}'.format(bar_h),
            ha='center',
            va='center')

# Reference for percentage annotations:
# https://stackoverflow.com/questions/31749448/how-to-add-percentages-on-top-of-bars-in-seaborn

In [None]:
# Number of distinct blocker codes left after dropping 'other'.
len(filtered_red_blocker_other['blockers'].unique())

In [None]:
# Prior online-event attendance, split by GLOW Saraswati participation.
# Every bar is annotated with its count, its share of all survey respondents
# and its share of its own participation group.

# Fixed hue order so the bar containers, the 'No'/'Yes' legend labels and the
# per-group totals all line up.
hue_levels = [False, True]
totals = filtered_red['saraswati_participant'].value_counts()

ax = sns.countplot(x="prior_events_online",
                   hue="saraswati_participant",
                   hue_order=hue_levels,
                   data=filtered_red)
plt.legend(title='GLOW Saraswati editor', loc='upper right', labels=['No', 'Yes'])

# NOTE(review): '% of all' divides by the unfiltered raw_editor_data although
# the bars count rows of filtered_red — confirm which denominator is intended.
n_respondents = len(raw_editor_data)

# Bars are drawn as one container per hue level, in hue_order. Walking the
# containers ties each bar to the total of its own participation group; the
# earlier zip over ax.patches paired bars with totals in alternating,
# frequency-sorted order and therefore divided by the wrong group size.
for hue_level, bars in zip(hue_levels, ax.containers):
    group_total = totals.get(hue_level, 0)
    for bar in bars:
        height = bar.get_height()
        if not np.isfinite(height) or height == 0 or group_total == 0:
            continue  # empty bar: nothing to annotate
        x_center = bar.get_x() + bar.get_width() / 2
        # count, centred inside the bar
        ax.text(x_center,
                bar.get_y() + height / 2,
                '{:.0f}'.format(height),
                horizontalalignment='center',
                verticalalignment='center')
        # share of all respondents, just above the bar
        ax.text(x_center,
                bar.get_y() + height * 1.02,
                '{:.0f}% of all'.format(height * 100 / n_respondents),
                horizontalalignment='center')
        # share of the bar's own participation group, further above
        ax.text(x_center,
                height + 3,
                '{:.1%} of group'.format(height / group_total),
                ha="center")

# Whole-number y ticks; no tick marks along the top edge.
plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda x, _: int(x)))
plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    top=False,         # ticks along the top edge are off
)
ax.set_ylabel('Number of editors')
ax.set_xlabel('prior_events_online');

# Reference: https://stackoverflow.com/questions/31749448/how-to-add-percentages-on-top-of-bars-in-seaborn

In [None]:
# Count and overall share (percent of all grouped rows) of respondents in each
# (saraswati_participant, prior_events_online) combination.
online = filtered_red.groupby(['saraswati_participant','prior_events_online'])

# Percentages are derived from the group sizes themselves, so every value stays
# attached to its own group key. The earlier approach relabelled the output of
# value_counts(sort=False) with online.groups.keys(), which is only correct when
# value_counts happens to return the groups in key order.
online_counts = online.size()
s = online_counts / online_counts.sum()

online_rates = online_counts.to_frame('online_count').assign(online_pct=s.mul(100).round(2)).reset_index()
online_rates

In [None]:
# Count and overall share (percent of all grouped rows) of respondents in each
# (saraswati_participant, previous_events_offline) combination.
offline = filtered_red.groupby(['saraswati_participant','previous_events_offline'])

# Percentages are derived from the group sizes themselves, so every value stays
# attached to its own group key. The earlier approach relabelled the output of
# value_counts(sort=False) with offline.groups.keys(), which is only correct
# when value_counts happens to return the groups in key order.
offline_counts = offline.size()
s = offline_counts / offline_counts.sum()

offline_rates = offline_counts.to_frame('offline_count').assign(offline_pct=s.mul(100).round(2)).reset_index()
offline_rates

In [None]:
# Non-null answer count per column, for each participation group.
motivation = motivation_df.groupby('saraswati_participant').count()

In [None]:
# Display the per-group answer counts held in `motivation`.
motivation

## Motivation 

In [None]:
# Stacked horizontal bars: one bar per row of motivation_df, with the three
# motivation scores stacked end to end.
# The former per-patch loop only unpacked each bar's geometry and drew nothing
# (its ax.text call was commented out), so it has been dropped.
ax = motivation_df[['contrib_enjoyable','contrib_benefits_me','contrib_benefits_world']].plot.barh(stacked=True, figsize=(10,12))
ax.set_xlabel('Stacked motivation scores')
ax.set_ylabel('Respondent (row label)');

#### Motivation percentages

In [None]:
# Percentage of answers at each contrib_enjoyable score.
motivation_df['contrib_enjoyable'].value_counts(normalize=True).mul(100)

In [None]:
# Percentage of answers at each contrib_benefits_me score.
motivation_df['contrib_benefits_me'].value_counts(normalize=True).mul(100)

In [None]:
# Percentage of answers at each contrib_benefits_world score.
motivation_df['contrib_benefits_world'].value_counts(normalize=True).mul(100)

#### Motivation by Saraswati participation

In [None]:
# Within each participation group: percentage of answers at each contrib_enjoyable score.
(
    motivation_df
    .groupby('saraswati_participant')['contrib_enjoyable']
    .value_counts(normalize=True)
    .mul(100)
)

In [None]:
# Within each participation group: percentage of answers at each contrib_benefits_me score.
(
    motivation_df
    .groupby('saraswati_participant')['contrib_benefits_me']
    .value_counts(normalize=True)
    .mul(100)
)

In [None]:
# Within each participation group: percentage of answers at each contrib_benefits_world score.
(
    motivation_df
    .groupby('saraswati_participant')['contrib_benefits_world']
    .value_counts(normalize=True)
    .mul(100)
)

In [None]:
# Transposed view: one row per motivation_df column, one column per respondent.
mdt = motivation_df.T

In [None]:
# Add five columns holding, for each motivation question, the number of
# respondents who chose each point on the 1-5 agreement scale.
#
# The comparisons run against a snapshot limited to the respondent columns
# (mdt's original columns are the labels of motivation_df's index). Comparing
# against `mdt.iloc[3:6, :]` after each assignment also scanned the tally
# columns that had just been added, so a tally equal to the next scale value was
# counted as a response. It also made re-running the cell change the results.
likert_rows = ['contrib_enjoyable', 'contrib_benefits_me', 'contrib_benefits_world']
responses = mdt.loc[likert_rows, motivation_df.index]

mdt['strongly_disagree'] = (responses == 1).sum(axis=1)
mdt['disagree'] = (responses == 2).sum(axis=1)
mdt['mid'] = (responses == 3).sum(axis=1)
mdt['agree'] = (responses == 4).sum(axis=1)
mdt['strongly_agree'] = (responses == 5).sum(axis=1)

# SMV ("sum of values"): total number of scale answers per question.
mdt['SMV'] = mdt['strongly_disagree'] + mdt['disagree'] + mdt['mid'] + mdt['agree'] + mdt['strongly_agree']

# define a function to calculate the mean weight
def mwv(a,b,c,d,e):
    """Return the mean weighted value of a 1-5 scale tally.

    Parameters
    ----------
    a, b, c, d, e : number
        Number of answers at scale points 1, 2, 3, 4 and 5 respectively.

    Returns
    -------
    float
        (1*a + 2*b + 3*c + 4*d + 5*e) / (a + b + c + d + e), or NaN when the
        tally holds no answers at all (instead of dividing by zero).
    """
    total = a + b + c + d + e
    if total == 0:
        # No answers recorded: the mean is undefined.
        return float('nan')
    weight = ((a * 1) + (b * 2) + (c * 3) + (d * 4) + (e * 5)) / total
    return weight
# Store the mean weighted value of every row's five tallies in an 'MWV' column.
mdt['MWV'] = mdt.apply(
    lambda tally_row: mwv(
        *(tally_row[level] for level in
          ('strongly_disagree', 'disagree', 'mid', 'agree', 'strongly_agree'))
    ),
    axis=1,
)


In [None]:
# Keep rows 3-5 of mdt and only its last seven columns.
MWV_df = mdt.iloc[3:6].iloc[:, -7:]

In [None]:
# Display MWV_df.
MWV_df

##### Motivation percentages by Saraswati participation

In [None]:
# Share of each participation group that scored contrib_enjoyable above 3.
color = ["#1f77b4","#FF8C00"]
agree_share = (
    motivation_df
    .groupby('saraswati_participant')['contrib_enjoyable']
    .apply(lambda c: (c > 3).mean())
)
ax = agree_share.plot(kind='bar', color=color)

# groupby sorts its keys, so the bars come out as False, True.
xlabels = ['non GLOW Saraswati editor','GLOW Saraswati editor']
ax.set_xticklabels(xlabels, rotation=0)
ax.set_ylabel('Share scoring contrib_enjoyable above 3')

# The plotted values are fractions in [0, 1]. PercentFormatter assumes a
# 0-100 scale unless told otherwise, which rendered e.g. 0.8 as "0.8%".
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1));

In [None]:
# Saraswati participants: percentage of answers at each contrib_enjoyable score.
saraswati_participants_df['contrib_enjoyable'].value_counts(normalize=True).mul(100)

In [None]:
# Non-participants: percentage of answers at each contrib_enjoyable score.
non_saraswati_df['contrib_enjoyable'].value_counts(normalize=True).mul(100)

In [None]:
# Mann-Whitney U test on contrib_enjoyable between the two participation groups.
# H0: the two groups report the same enjoyment
# H1: the two groups report different enjoyment

# Drop missing answers first: a NaN in either sample can turn the p-value into
# NaN, and `NaN > alpha` is False, which would print "reject H0" by accident.
enjoy_saraswati = saraswati_participants_df['contrib_enjoyable'].dropna()
enjoy_other = non_saraswati_df['contrib_enjoyable'].dropna()

# H1 is non-directional, so request the two-sided test explicitly instead of
# relying on the library default, which has differed between scipy versions.
stat, p = mannwhitneyu(enjoy_saraswati, enjoy_other, alternative='two-sided')
print('Statistics=%.3f, p=%.3f' % (stat, p))

# interpret
alpha = 0.05
if p > alpha:
    print('Same distribution (fail to reject H0)')
else:
    print('Different distribution (reject H0)')

# Conclusion recorded by the author: we fail to reject the null hypothesis;
# there is no significant evidence that the two groups report different
# enjoyment. Re-check this against the printed p-value after any re-run.

In [None]:
# Overlaid histograms of contrib_enjoyable for the two participation groups.
# Both groups share one set of bin edges centred on the scores 1..5, so the
# bars are directly comparable; letting each call pick its own bins from its
# own data range made the overlay misleading.
# NOTE(review): assumes answers lie on the 1-5 scale used by the tallies above;
# values outside that range would not be drawn — confirm.
likert_bins = np.arange(0.5, 6.5, 1)
plt.hist(saraswati_participants_df['contrib_enjoyable'].dropna(),
         bins=likert_bins, alpha=.5, label='GLOW Saraswati editor')
plt.hist(non_saraswati_df['contrib_enjoyable'].dropna(),
         bins=likert_bins, alpha=.5, label='non GLOW Saraswati editor')
plt.xlabel('contrib_enjoyable score')
plt.ylabel('Number of editors')
plt.legend();

In [None]:
# Distribution of contrib_enjoyable across all respondents, on a fresh figure.
plt.figure()
hist = motivation_df['contrib_enjoyable'].plot(kind='hist', bins=10);

In [None]:
# Distribution of contrib_benefits_world across all respondents, on a fresh figure.
plt.figure()
hist = motivation_df['contrib_benefits_world'].plot(kind='hist', bins=10);

In [None]:
# Distribution of contrib_benefits_me across all respondents, on a fresh figure.
plt.figure()
hist = motivation_df['contrib_benefits_me'].plot(kind='hist', bins=10);

In [None]:
# For every entry of `all_benefits`, draw a 1x2 grid of subplots (shared x and
# y axes) and call plot_benefit_question once per subplot.
#
# NOTE(review): `saraswati_participant`, `all_benefits`, `buckets_except_under30`,
# `benefits` and `plot_benefit_question` are not defined or imported anywhere in
# the visible notebook (`saraswati_participant` only appears as a column name).
# On a fresh kernel this cell looks like it raises NameError on its first
# statement — presumably it was carried over from another analysis; confirm and
# either supply the missing definitions or remove the cell.
# NOTE(review): `all_groups` is assigned but the loop iterates `all_benefits`.

# Exclude age_group from the list of benefits
all_groups = list(saraswati_participant.columns[:-1])

for benefit in all_benefits:
    
    # One figure per benefit: two side-by-side panels sharing both axes.
    fig, ax = plt.subplots(nrows=1, 
                           ncols=2, 
                           figsize=(8, 6), 
                           sharey=True, 
                           sharex=True)

    # Pair each panel with one entry of buckets_except_under30.
    for a, age_group in zip(ax.flatten(), buckets_except_under30):
        plot_benefit_question(benefits, 
                              benefit, 
                              age_group=age_group, 
                              ax=a
                             )
        
        # Keeps x-axis tick labels for each group of plots
        a.xaxis.set_tick_params(which='both', labelbottom=True)
        
        # Suppresses displaying the question along the y-axis
        a.yaxis.label.set_visible(False)

    plt.tight_layout()

In [None]:


# Histogram of blocker codes, one facet per participation group.

# NOTE(review): the code -> label pairing (1=time ... 5=other) follows the
# order of the label list as originally written — confirm against the survey
# codebook.
blocker_codes = [1, 2, 3, 4, 5]
labels = ['time','skills','newcomers','tech','other']

g = sns.FacetGrid(filtered_red, col="saraswati_participant", height=4, aspect=1)
# Bin edges centred on the integer codes so every code gets exactly one bar.
g = g.map(plt.hist, "blockers", bins=np.arange(0.5, 6.5, 1))

for ax, title in zip(g.axes.flat, ['Non-Saraswati participants', 'Saraswati participants']):
    ax.set_title(title)
    # Pin tick positions before naming them, and do it on every facet: labels
    # were previously set on the last facet only, without fixed tick positions.
    ax.set_xticks(blocker_codes)
    ax.set_xticklabels(labels)
g.axes[0,0].set_ylabel('Number of editors')

# Start the count axis at 0 so bar heights are not truncated. The stray
# stand-alone `;` line that ended this cell was a syntax error and is removed.
g.set(xlim=(0.5, 5.5), ylim=(0, 25));

In [None]:
# Bar-type catplot of saraswati_participant against blocker code, with the
# x-axis label blanked.
fg = sns.catplot(
    data=filtered_red,
    kind='bar',
    x='blockers',
    y='saraswati_participant',
)
fg.set_xlabels('')

In [None]:
# Horizontal bars of the blockers column, plotted once per participation group.
filtered_red.groupby('saraswati_participant')['blockers'].plot(kind='barh');

In [None]:
# Horizontal bars of how often each blocker code occurs within each participation group.
(
    filtered_red
    .groupby('saraswati_participant')['blockers']
    .value_counts()
    .plot(kind='barh')
);