In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as mdates
from matplotlib import pylab
import pandas as pd

%matplotlib inline

In [None]:
import seaborn as sns
sns.set_style("whitegrid")
import numpy as np

In [None]:
%run ../.././data_wrangling/2b_data_handling.ipynb

In [None]:
import datetime

# Reference date; a later cell subtracts it from each article's first-edit date.
d = datetime.date(year=2019, month=10, day=10)
isinstance(d, datetime.date)

In [None]:
final_frame_updated_pv_topics_suggs = pd.read_csv("../../../data/processed/query_results/content_quality/b5_final_frame_updated_pv_topics_g_suggs.csv", sep=',', encoding = 'utf-8') 

In [None]:
interim_frame_updated_pv_topics_suggs = pd.read_csv("../../../data/processed/query_results/content_quality/5_interim_frame_updated_w_g_suggs.csv", sep=',', encoding = 'utf-8') 

In [None]:
feb_global_views_referrer = pd.read_csv("../../../data/processed/query_results/content_quality/feb_global_views_referrer.csv", sep=',', encoding = 'utf-8') 

In [None]:
indonesia = pd.read_csv("../../../data/raw/articles/2019/query_results/content_quality/per_wiki_full/Indonesia/articles_w_july_incountry_pageviews_recs.csv", sep=',')

## Handle Data

In [None]:
# Independent copy of the identifier / length / type / suggestion columns;
# this is the left-hand side of the referrer merge below.
ifupvts_lim = interim_frame_updated_pv_topics_suggs.loc[
    :,
    ['page_id', 'database_code', 'page_title', 'page_len', 'article_type', 'g_suggestion'],
].copy()

In [None]:
feb_global_views_referrer = feb_global_views_referrer.rename(columns = {'wikicode':'database_code',
                                                                      'referer_class':'referrer_class'})

In [None]:
fgvr = feb_global_views_referrer.copy()
fgvr.replace({'referrer_class' : { 'none' : 'none_unknown', 
                                                       'unknown' : 'none_unknown',
                                                       'external (search engine)': 'external_comb',
                                                       'external': 'external_comb'
                                                      }}, inplace=True)

In [None]:
rdf = pd.merge(ifupvts_lim, feb_global_views_referrer, how='left', on=['page_id', 'page_title','database_code'])

In [None]:
# Working copy restricted to the columns analysed in this notebook.
interim_df = interim_frame_updated_pv_topics_suggs.loc[
    :,
    [
        'database_code',
        'page_len',
        'relative_page_len',
        'first_edited',
        'article_type',
        'translation_tool',
        'views_1M',
        'views_1M_all',
        'predicted_label_1',
        'g_suggestion',
    ],
].copy()

#### page_len

In [None]:
#Creating the page_len_bins
bins = [0, 1000, 5000, 10000, 20000, 30000, 40000, 50000, 100000, 200000]
interim_df['page_len_bins'] = pd.cut(interim_df['page_len'], bins)

In [None]:
b = pd.cut(interim_df['page_len'], bins=bins).value_counts()
b.plot()
plt.xticks(rotation=45);

#### page_len_relative

In [None]:
#Creating the relative page length bins
plr_bins = [0, .2, .5, .7, .9, 1]
names = ['1', '2', '3', '4', '5']

#interval-labelled bin column
interim_df['rel_page_len_bins'] = pd.cut(interim_df['relative_page_len'], bins=plr_bins)

#same bins labelled '1'..'5', converted to numbers for charting
interim_df['rplb_charting'] = pd.cut(interim_df['relative_page_len'], bins=plr_bins, labels=names)
interim_df['rplb_charting'] = pd.to_numeric(interim_df['rplb_charting'], errors='coerce')

In [None]:
#attention: PA wiki had 1 featured article
#attention: tcywiki and satwiki had zero featured articles, so these use the median article length as their anchor
plr_att_list = ['tcywiki', 'satwiki', 'pawiki']
plr_limited_list = ['pawiki']

# solid: all three flagged wikis removed; mid: only pawiki removed.
solid_plr_wikis_df = interim_df.loc[~interim_df['database_code'].isin(plr_att_list)]
mid_plr_wikis_df = interim_df.loc[~interim_df['database_code'].isin(plr_limited_list)]

#### stub_lifetime_M

In [None]:
#handling first_edited: parse, then keep only the calendar date (datetime.date objects)
#interim_df.first_edited = interim_df.first_edited.apply(np.datetime64) #creates datetime index column
interim_df['first_edited'] = pd.to_datetime(interim_df['first_edited']).dt.date

#row masks by article type
expanded_articles = interim_df['article_type'].eq('expanded')
new_articles = interim_df['article_type'].eq('new')

In [None]:
#create a time delta column in whole months, negative if the article existed as a stub prior to the start of the contest
# `.astype('timedelta64[M]')` is rejected by pandas >= 2.0 (only s/ms/us/ns
# resolutions are supported), so the month count is computed explicitly.
# 30.436875 days is the mean Gregorian month, the length NumPy uses for its 'M'
# unit, and flooring reproduces the rounding of the old cast.
# Missing dates propagate as NaN.
_first_edit_delta = pd.to_datetime(interim_df['first_edited']) - pd.Timestamp(d)
interim_df['stub_lifetime_M'] = np.floor(_first_edit_delta / pd.Timedelta(days=30.436875))

In [None]:
#check dispersion in stub_lifetime_M
sns.stripplot(data = interim_df, y = 'stub_lifetime_M', jitter = True);
#sns.plt.ylim(0, 500)
#sns.plt.show()

In [None]:
# Review 'page_len' distribution 
plt.figure();
age_hist = interim_df['stub_lifetime_M'].plot.hist(bins=10)
age_hist.set_xlabel("stub_lifetime_M")
age_hist.set_ylabel("Articles")
age_hist.set_title('Distribution of stub_lifetime_M');


In [None]:
#Creating the stub_lifetime_bins
slb_bins = [-200, -150, -100, -50, 0, 5]
slb_names = ['4','3','2','1','0']

interim_df['stub_lifetime_bins'] = pd.cut(interim_df['stub_lifetime_M'], slb_bins)

interim_df['stub_stlb_charting'] = pd.cut(interim_df['stub_lifetime_M'], slb_bins, labels=slb_names)
interim_df.stub_stlb_charting = pd.to_numeric(interim_df.stub_stlb_charting, errors='coerce')

In [None]:
t = pd.cut(interim_df['stub_lifetime_M'], bins=slb_bins).value_counts()
t.plot();

#### Topics

In [None]:
#row masks by article type, for the interim frame ...
expanded_articles = interim_df['article_type'].eq('expanded')
new_articles = interim_df['article_type'].eq('new')

# ... and for the final frame
expanded_final_frame = final_frame_updated_pv_topics_suggs['article_type'].eq('expanded')
new_final_frame = final_frame_updated_pv_topics_suggs['article_type'].eq('new')

### General Review

In [None]:
#unique value counts per column
for column_name in interim_df.columns:
    print("---- %s ---" % column_name)
    print(interim_df[column_name].value_counts())

In [None]:
#all accepted and surviving articles, core metrics
plt.figure(figsize = (20,8))
sns.heatmap(indonesia.corr(), annot=True, cmap='coolwarm');

In [None]:
c = indonesia.corr().abs()

s = c.unstack()
so = s.sort_values(kind="quicksort")

In [None]:
so

In [None]:
plt.figure(figsize = (20,8))
sns.heatmap(indonesia[['july_view_count_internal', 'g_suggestion', 
                       'editors_nm', 'iwsitelinks', 'talk_page_edits',
                      'QID', 'micro_editors', 'recs_given_order', 'relative_page_len', 'page_len','editors_nm',
                      ]].corr(), annot=True, cmap='coolwarm');

In [None]:
#heatmap excluding PAwiki articles which only had 1 'featured article' (generally...not specific to GLOW)
sns.heatmap(mid_plr_wikis_df[['views_1M_all', 'g_suggestion', 'rplb_charting', 'page_len', 'relative_page_len']].corr(), annot=True, cmap='coolwarm');


In [None]:
#heatmap using only articles from wikis with 'featured articles' (generally...not specific to GLOW)
sns.heatmap(solid_plr_wikis_df[['views_1M_all', 'g_suggestion', 'rplb_charting', 'page_len', 'relative_page_len']].corr(), annot=True, cmap='coolwarm');


In [None]:
interim_df.describe()

In [None]:
def describe(df, stats):
    """Return ``df.describe()`` with extra aggregate rows appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    stats : list
        Aggregations accepted by ``DataFrame.agg`` (e.g. ``['skew', 'kurt']``);
        each one becomes an additional row.

    Returns
    -------
    pandas.DataFrame
        The ``describe()`` rows followed by one row per entry in ``stats``,
        limited to the columns that ``describe()`` reports.
    """
    summary = df.describe()
    extra_rows = df.reindex(summary.columns, axis=1).agg(stats)
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
    return pd.concat([summary, extra_rows])


#describe(interim_df, ['skew', 'mad', 'kurt'])

#### Select columns

In [None]:
list(interim_df)

In [None]:
select_columns = [
 'g_suggestion',
 'article_type',
 'translation_tool',
 ]

columns = [
 'g_suggestion',
 'article_type',
 'stub_lifetime_bins',
 'translation_tool',
 'page_len_bins',
 'database_code',
 'predicted_label_1',
 ]

In [None]:
for r in select_columns :
    print(interim_df.groupby(r)['views_1M_all'].describe().sort_values(by=['count','mean'],ascending=False))
    print('*********************************')

In [None]:
#avg views for each unique value by columns
for r in columns :
    print(interim_df.groupby(r)['views_1M_all'].mean().sort_values(ascending=False))
    print('*********************************')

In [None]:
#all pages average pageviews
interim_df["views_1M_all"].mean()

In [None]:
#all pages average pageviews, from within India
interim_df["views_1M"].mean()

In [None]:
#new pages average pageviews
final_frame_updated_pv_topics_suggs["views_1M_all"].mean()

In [None]:
#new pages average pageviews, from within India
final_frame_updated_pv_topics_suggs["views_1M"].mean()

## Article type

In [None]:
interim_df.groupby('article_type', sort=True)['views_1M_all'].agg(['mean','count','sum']).sort_values(by=['mean'], ascending=False)

In [None]:
interim_df.groupby('article_type')['views_1M_all'].agg(['mean'])

In [None]:
#calculate how many more views expanded articles received on average over new articles
e = interim_df.groupby('article_type')['views_1M_all'].mean()
e_impact = e[0]/e[1]
e_impact

In [None]:
interim_df.groupby('article_type').views_1M_all.mean().plot(kind='bar');

In [None]:
#Total Views by Wiki: Expanded vs New
indonesia.groupby(["database_code", "article_type"])["july_view_count_internal"].sum().unstack().plot(kind="bar", color=['b', 'g'],stacked=True);

In [None]:
#Total Views by Wiki: Expanded vs New
interim_df.groupby(["database_code", "article_type"])["views_1M_all"].sum().unstack().plot(kind="bar", color=['b', 'g'],stacked=True);

In [None]:
#Avg Views by Wiki: Expanded vs New
interim_df.groupby(["database_code", "article_type"])["views_1M_all"].mean().unstack().plot(kind="bar", color=['b', 'g'], stacked=True);

## G_suggestions

In [None]:
interim_df.groupby('g_suggestion', sort=True)['views_1M_all'].agg(['mean','count','sum']).sort_values(by=['mean'], ascending=False)

In [None]:
#calculate how many more views g_suggested articles received on average over non g_suggested articles
g = interim_df.groupby('g_suggestion')['views_1M_all'].mean()
g_impact = g[1]/g[0]
g_impact

In [None]:
interim_df.groupby('g_suggestion').views_1M_all.mean().plot(kind='bar')


plt.title('Average Pageviews, Google Suggestions',fontsize=16)
plt.xlabel('Google Suggestion',fontsize=13)
plt.ylabel('Average Views',fontsize=13)
plt.xticks(rotation=360)
plt.tick_params(labelsize=12)
#plt.savefig('../../results/figs/pageviews/g_suggestion.png');

In [None]:
interim_df.groupby('g_suggestion').views_1M_all.mean()

In [None]:
interim_df.groupby('g_suggestion').views_1M.mean().plot(kind='bar', color=["#FF8C00", "#1f77b4"]);

In [None]:
# Mean july_view_count_internal per g_suggestion value.
# Reads `indonesia` directly: `subset` is only created in a later cell, so this cell
# raised NameError on Restart & Run All. `subset` differs from `indonesia` only in its
# article_type labels, which this grouping does not use, so the chart is unchanged.
indonesia.groupby('g_suggestion').july_view_count_internal.mean().plot(kind='bar', color=["#FF8C00", "#1f77b4"]);

## G_suggestions & Article Type

In [None]:
interim_df.groupby(['g_suggestion', 'article_type'], sort=True)['views_1M_all'].describe()

In [None]:
g = sns.catplot(x="article_type", y="views_1M_all", hue="g_suggestion", data=interim_df,
                height=6, kind="bar", palette=["#FF8C00", "#1f77b4"],legend_out = False) #palette="muted", 
g.despine(left=True)

# check axes and find which have legend
leg = g.axes.flat[0].get_legend()
new_title = 'List Type'
leg.set_title(new_title)
new_labels = ['Non-Google List', 'Google List']
for t, l in zip(leg.texts, new_labels): t.set_text(l)

plt.title("Pageviews by Article Type & List Type",fontsize=16)
plt.xlabel('Article Type: new or expanded',fontsize=13)
plt.ylabel('Average Views',fontsize=13)

plt.savefig('../../../results/figs/pageviews/avg_pv_type_suggestion.svg')
plt.show(g)

In [None]:
f = sns.catplot(x="article_type", y="views_1M", hue="g_suggestion", data=interim_df,
                height=6, kind="bar", palette=["#FF8C00", "#1f77b4"],legend_out = False) #palette="muted", 
f.despine(left=True)

# check axes and find which have legend
leg = f.axes.flat[0].get_legend()
new_title = 'List Type'
leg.set_title(new_title)
new_labels = ['Non-Google List', 'Google List']
for t, l in zip(leg.texts, new_labels): t.set_text(l)

plt.title("Pageviews by Article Type & List Type",fontsize=16)
plt.xlabel('Article Type: new or expanded',fontsize=13)
plt.ylabel('Average Views',fontsize=13)

#plt.savefig('../../../results/figs/pageviews/avg_pv_type_suggestion.svg')
plt.show(f)

In [None]:
interim_df.groupby(['g_suggestion', 'article_type'], sort=True)['views_1M'].describe()

In [None]:
subset.groupby(['g_suggestion', 'article_type'], sort=True)['july_view_count_internal'].describe()

In [None]:
subset = indonesia.copy(deep=True)
subset['article_type'] = subset['article_type'].str.replace('post', 'new')

e = sns.catplot(x="article_type", y="july_view_count_internal", hue="g_suggestion", data=subset,
                height=6, kind="bar", palette=["#FF8C00", "#1f77b4"],legend_out = False) #palette="muted", 
e.despine(left=True)

# check axes and find which have legend
leg = e.axes.flat[0].get_legend()
new_title = 'List Type'
leg.set_title(new_title)
new_labels = ['Non-Google List', 'Google List']
for t, l in zip(leg.texts, new_labels): t.set_text(l)

plt.title("Pageviews by Article Type & List Type",fontsize=16)
plt.xlabel('Article Type: new or expanded',fontsize=13)
plt.ylabel('Average Views',fontsize=13)

#plt.savefig('../../../results/figs/pageviews/avg_pv_type_suggestion.svg')
plt.show(e)

In [None]:
interim_df.groupby(['database_code','g_suggestion'], sort=True)['views_1M_all'].describe()

In [None]:
pd.pivot_table(interim_df, values = 'views_1M_all', index = 'database_code',
               columns = 'g_suggestion',aggfunc ='mean').plot.bar().set_title("Pageviews per Wiki by Suggestion");

In [None]:
interim_df.groupby(['database_code','g_suggestion', 'article_type'], sort=True)['views_1M_all'].describe()

In [None]:
#Avg Views by Wiki: g_suggestions vs. internal lists
interim_df.groupby(["database_code", "g_suggestion", "article_type"], sort=False)["views_1M_all"].mean().unstack().plot(kind="bar", figsize=(15,9), color=['g', 'b'],stacked=True);

In [None]:
#Avg Views by Wiki: Expanded vs New
n = interim_df.groupby(["database_code", "g_suggestion", "article_type"], sort=False)["views_1M_all"].mean().unstack().sort_values(['expanded'],ascending=False).plot(kind="bar", figsize=(15,9), color=['g', 'b'], stacked=True);

In [None]:
#Avg Views by Wiki: Expanded vs New
interim_df.groupby(["database_code", "article_type", "g_suggestion"], sort=False)["views_1M_all"].mean().unstack().plot(kind="bar", figsize=(15,9),stacked=True);

In [None]:
#Avg Views by Wiki: Expanded vs New - ordered
interim_df.groupby(["database_code", "article_type", "g_suggestion"], sort=False)["views_1M_all"].mean().unstack().sort_values([True],ascending=False).plot(kind="bar", figsize=(15,9), stacked=True);

### Stub lifetime

In [None]:
interim_df.groupby('stub_lifetime_bins')['views_1M_all'].describe()

In [None]:
interim_df.groupby('stub_lifetime_bins')['views_1M_all'].agg(['mean','count','sum']).sort_values(by=['sum'], ascending=False)

In [None]:
interim_df.groupby(['database_code','g_suggestion', 'stub_lifetime_bins'], sort=True)['views_1M_all'].describe()[['count','mean']]


In [None]:
sns.lmplot(data = interim_df, x = 'stub_lifetime_M', y = 'views_1M_all', col='g_suggestion', fit_reg = True);

In [None]:
f = sns.lmplot(x = 'stub_lifetime_M', y = 'views_1M_all', hue='g_suggestion', col='database_code',
               data=interim_df, height=6, col_wrap=5, aspect=.4, x_jitter=.5)

f = (f.set_axis_labels("Stub Lifetime", "Pageviews")
      .set(xlim=(5, -200), ylim=(0, 3000),)
           #xticks=[---], yticks=[---])
      .fig.subplots_adjust(wspace=.1))

## Page_length & Relative page length

In [None]:
# View statistics per page_len bin.
interim_df.groupby('page_len_bins')['views_1M_all'].describe()

In [None]:
# Views against raw page length, with a fitted regression line.
sns.lmplot(data = interim_df, x = 'page_len', y = 'views_1M_all', fit_reg = True);

In [None]:
#Higher values here denote proximity to the median length of a wiki's featured articles;
#for example a '5' is the top bin (relative_page_len in (0.9, 1]), i.e. close to a full match with that median length.
#Uses only the wikis that are not on the plr attention list.
solid_plr_wikis_df.groupby('rplb_charting').views_1M_all.mean().plot(kind='bar');

In [None]:
#pageviews by rel_page_len_bins per wiki
#attention: PA wiki had 1 featured article
#attention: tcywiki and satwiki had zero featured articles, so these use the median article length as their anchor
pv_given_rplb_by_wiki = interim_df.groupby(['database_code', 'rel_page_len_bins'], sort=True)['views_1M_all'].describe()

In [None]:
pv_given_rplb_by_wiki

In [None]:
# Count and mean views per wiki, suggestion flag and relative-length bin (groups in order of first appearance).
interim_df.groupby(['database_code','g_suggestion', 'rel_page_len_bins'], sort=False)['views_1M_all'].describe()[['count','mean']]

In [None]:
# Same table, largest groups first (ties broken by mean).
interim_df.groupby(['database_code','g_suggestion', 'rel_page_len_bins'], sort=False)['views_1M_all'].describe()[['count','mean']].sort_values(by=['count', 'mean'], ascending=False)

In [None]:
# Mean / count / total views for the same grouping, highest mean first.
interim_df.groupby(['database_code','g_suggestion', 'rel_page_len_bins'], sort=False)['views_1M_all'].agg(['mean','count','sum']).sort_values(by=['mean'], ascending=False)

In [None]:
# Distinct relative-length intervals present in the data.
interim_df['rel_page_len_bins'].unique()

In [None]:
# Views against the numeric relative-length bin, all articles.
sns.lmplot(data = interim_df, x = 'rplb_charting', y = 'views_1M_all', fit_reg = True);
#sns.plt.xlim(0, 6)
#sns.plt.title('---')
#sns.plt.show()

In [None]:
#Expanded articles, article length and views
sns.lmplot(data = interim_df[expanded_articles], x = 'rplb_charting', y = 'views_1M_all', fit_reg = True);

In [None]:
#New articles, article length and views
sns.lmplot(data = interim_df[new_articles], x = 'rplb_charting', y = 'views_1M_all', fit_reg = True);

## Topics

In [None]:
indonesia.groupby('predicted_label_1')['july_view_count_internal'].describe().sort_values(by=['count', 'mean'], ascending=False)

In [None]:
interim_df.groupby('predicted_label_1')['views_1M_all'].describe().sort_values(by=['count', 'mean'], ascending=False)

In [None]:
#get only topics with more than 25 associated articles
topics_grouped = interim_df.groupby('predicted_label_1')
topics_over_25 = interim_df[interim_df.groupby('predicted_label_1')['views_1M_all'].transform('count') > 25]

In [None]:
#top topics by number of associated articles
topics_over_25.groupby('predicted_label_1').size().sort_values( ascending=False)

In [None]:
topics_over_25.groupby('predicted_label_1')['views_1M_all'].describe().sort_values(by=['mean'], ascending=False)

In [None]:
topics_over_25.groupby('predicted_label_1')['views_1M_all'].agg(['mean','count','sum']).sort_values(by=['mean'], ascending=False)

In [None]:
topics_over_25.groupby('predicted_label_1')['views_1M_all'].agg(['mean','count','sum']).sort_values(by=['sum'], ascending=False)

In [None]:
#get top ~10 topics

top_topics_df = interim_df[interim_df.groupby('predicted_label_1')['views_1M_all'].transform('count') > 100]

#topics_grouped = interim_df.groupby('predicted_label_1')
#df1 = interim_df[interim_df.groupby('predicted_label_1')['views_1M_all'].transform('count') > 25]

In [None]:
top_topics_df.groupby(["article_type", 'predicted_label_1'])['views_1M_all'].describe().sort_values(by=['predicted_label_1'], ascending=False)

In [None]:
top_topics_df.groupby(['predicted_label_1',"g_suggestion", ])['views_1M_all'].describe().sort_values(by=['predicted_label_1'], ascending=False)

In [None]:
top_topics_df.groupby(["article_type", 'predicted_label_1', "g_suggestion"])['views_1M_all'].describe().sort_values(by=['predicted_label_1', "g_suggestion"], ascending=False)

## Referral Source

In [None]:
feb_global_views_referrer.groupby('referrer_class')['feb_view_count_global'].agg(['mean','count','sum']).sort_values(by=['mean'], ascending=False)

In [None]:
rdf.groupby('referrer_class')['feb_view_count_global'].agg(['mean','count','sum']).sort_values(by=['mean'], ascending=False)

In [None]:
t = feb_global_views_referrer.groupby('referrer_class')['feb_view_count_global'].agg(['mean','count','sum']).sort_values(by=['mean'], ascending=False).reset_index()
entries = t['count'].sum()
_pvsum = t['sum'].sum()
print('raw entries', entries)
print('raw pgview sum', _pvsum)

In [None]:
n = rdf.groupby('referrer_class')['feb_view_count_global'].agg(['mean','count','sum']).sort_values(by=['mean'], ascending=False).reset_index()
entries = n['count'].sum()
_pvsum = n['sum'].sum()
print('merged entries', entries)
print('merged pgview sum', _pvsum)

In [None]:
rdf.groupby(['referrer_class', 'g_suggestion'], sort=False)["feb_view_count_global"].mean().unstack().sort_values([True],ascending=False).plot(kind="bar", figsize=(15,9), stacked=True);

In [None]:
pd.pivot_table(rdf.sort_values('feb_view_count_global'), values = 'feb_view_count_global', index = 'referrer_class',
               columns = 'g_suggestion',aggfunc ='mean').plot.bar().set_title("Avg pageviews by referrer and suggestion");

In [None]:
#define rows: drop the 'unknown' and 'external' referrer classes
mask = rdf['referrer_class'].isin(['unknown', 'external'])
rdf_select = rdf[~mask]

#define order of the referrer classes along the x-axis
referral_types = ['external (search engine)', 'none', 'internal']

# The earlier ordering step mapped these referrer names onto 'article_type', which
# produced an all-NaN sort key. On pandas versions where Series.argsort() marks NaN
# with -1, `.iloc[key.argsort()]` then returned the last row repeated, so the pivot
# averaged a single row. pivot_table also sorts its index, so the row order of its
# input never controlled the bar order. Pivot the filtered rows directly and put the
# index in the intended order with reindex.
gapv = pd.pivot_table(rdf_select,
                      values = 'feb_view_count_global',
                      index = 'referrer_class',
                      columns = ['g_suggestion', 'article_type'],
                      aggfunc ='mean'
                     ).reindex(referral_types)

gapv.plot(kind = 'bar', figsize=(20,10))

plt.title('Avg pageviews by referrer, type, suggestion',fontsize=24)
plt.xlabel('Referral Source Category',fontsize=28)
plt.ylabel('Average Views',fontsize=28)
plt.xticks(rotation=360)
plt.tick_params(labelsize=22)
#plt.savefig('../../results/figs/pageviews/Avg_pv_referrer_suggestion_type.png')
plt.show();

# TODO: color scheme by sugg type
# TODO: label bars

In [None]:
gapv

In [None]:
fgvr.groupby('referrer_class')["feb_view_count_global"].mean().sort_values(ascending=False).plot(kind="bar", 
                                                                                                 figsize=(15,9), 
                                                                                                 stacked=True);