# process/get contest editors region

In [67]:
from pyspark.sql.types import ArrayType, StringType

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

import datetime as dt

import pandas as pd
import numpy as np

In [130]:
# Execute the two companion notebooks in this kernel's namespace.
# NOTE(review): the *_r query templates, `query_vars` and `spark` used by the
# cells below are not defined in this notebook — presumably they come from
# these runs; confirm.
%run 2b_data_handling.ipynb
%run 8a_collect_contest_editors_region.ipynb

Stored 'query_vars' (dict)
Stored 'quality_vars' (dict)


#### Editors<a class="anchor" id="editors"></a>
[Back to Table of Contents](#toc)

In [3]:
# Fill the mce_contest_r template from query_vars, run it through Spark SQL,
# and collect the result as a pandas DataFrame.
mce_contest = (
    spark
    .sql(mce_contest_r.format(**query_vars))
    .toPandas()
)

In [4]:
# Render mce_pre_r with query_vars, execute it on Spark, collect to pandas.
mce_pre = spark.sql(
    mce_pre_r.format(**query_vars)
).toPandas()

In [5]:
# Render mnae_contest_r with query_vars, execute it on Spark, collect to pandas.
mnae_contest = (
    spark
    .sql(mnae_contest_r.format(**query_vars))
    .toPandas()
)

In [6]:
# Render mnae_pre_r with query_vars, execute it on Spark, collect to pandas.
mnae_pre = spark.sql(
    mnae_pre_r.format(**query_vars)
).toPandas()

In [None]:
# Render mae_contest_r with query_vars, execute it on Spark, collect to pandas.
mae_contest = (
    spark
    .sql(mae_contest_r.format(**query_vars))
    .toPandas()
)

In [8]:
# Render mae_pre_r with query_vars, execute it on Spark, collect to pandas.
mae_pre = spark.sql(
    mae_pre_r.format(**query_vars)
).toPandas()

In [9]:
# Render ner_contest_r with query_vars, execute it on Spark, collect to pandas.
ner_contest = (
    spark
    .sql(ner_contest_r.format(**query_vars))
    .toPandas()
)

In [10]:
# Render ner_pre_r with query_vars, execute it on Spark, collect to pandas.
ner_pre = spark.sql(
    ner_pre_r.format(**query_vars)
).toPandas()

In [11]:
# Gather the per-metric frames for each period. Both lists use the same
# metric order: mce, mnae, mae, ner.
contest_dfs = [mce_contest, mnae_contest, mae_contest, ner_contest]
# Fixed: this list previously ended with ner_contest, which put contest-period
# data into the pre-contest output instead of ner_pre.
pre_dfs = [mce_pre, mnae_pre, mae_pre, ner_pre]

# axis=1 places the frames side by side, aligned on their row index.
# NOTE(review): this assumes every query returns its rows in the same order
# (or with a comparable index) — confirm against the query definitions.
contest_editors_region = pd.concat(contest_dfs, axis=1)
pre_contest_editors_region = pd.concat(pre_dfs, axis=1)

## Save to CSV

In [14]:
# Write the combined contest frame to disk; the row index is not written.
contest_editors_region.to_csv(
    "../../data/processed/query_results/regional_counts/baseline/contest_editors_region.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

In [15]:
# Write the combined pre-contest frame to disk; the row index is not written.
pre_contest_editors_region.to_csv(
    "../../data/processed/query_results/regional_counts/baseline/pre_contest_editors_region.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

### Editor registrations

In [71]:
# Render editor_reg_contest_2019_r with query_vars, run it on Spark and
# collect the rows into pandas.
editor_reg_contest_2019_raw = (
    spark
    .sql(editor_reg_contest_2019_r.format(**query_vars))
    .toPandas()
)

In [None]:
# Show how many distinct usernames the raw 2019 result contains.
editor_reg_contest_2019_raw.username.nunique()

In [72]:
# Convert reg_date to datetime in place so later cells can compare and
# aggregate on it.
editor_reg_contest_2019_raw['reg_date'] = (
    editor_reg_contest_2019_raw['reg_date'].pipe(pd.to_datetime)
)

In [None]:
# Collect every row whose username occurs more than once in the raw 2019
# result, ordered by username, then show how many usernames are affected.
ud = editor_reg_contest_2019_raw['username']
raw_dupes = (
    editor_reg_contest_2019_raw
    .loc[ud.isin(ud.loc[ud.duplicated()])]
    .sort_values(by="username")
)
raw_dupes.username.nunique()

In [74]:
# Collapse to one row per username, keeping the earliest reg_date.
editor_reg_contest_2019 = (
    editor_reg_contest_2019_raw
    .groupby('username')['reg_date']
    .agg('min')
    .reset_index()
)

In [None]:
# Print a summary of the de-duplicated 2019 frame as a quick sanity check.
editor_reg_contest_2019.info()

In [84]:
# Write the de-duplicated 2019 registrations to disk without the row index.
editor_reg_contest_2019.to_csv(
    "../../data/processed/query_results/editors/editor_reg_contest.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

#### 2018

In [79]:
# Render editor_reg_contest_2018_r with query_vars, run it on Spark and
# collect the rows into pandas.
editor_reg_contest_2018_raw = (
    spark
    .sql(editor_reg_contest_2018_r.format(**query_vars))
    .toPandas()
)

In [80]:
# Convert reg_date to datetime in place so later cells can aggregate on it.
editor_reg_contest_2018_raw['reg_date'] = (
    editor_reg_contest_2018_raw['reg_date'].pipe(pd.to_datetime)
)

In [None]:
# Collect every row whose username occurs more than once in the raw 2018
# result, ordered by username, then show how many usernames are affected.
# Note that this rebinds raw_dupes.
ud18 = editor_reg_contest_2018_raw['username']
raw_dupes = (
    editor_reg_contest_2018_raw
    .loc[ud18.isin(ud18.loc[ud18.duplicated()])]
    .sort_values(by="username")
)
raw_dupes.username.nunique()

In [82]:
# Collapse to one row per username, keeping the earliest reg_date.
editor_reg_contest_2018 = (
    editor_reg_contest_2018_raw
    .groupby('username')['reg_date']
    .agg('min')
    .reset_index()
)

In [83]:
# Write the de-duplicated 2018 registrations to disk without the row index.
editor_reg_contest_2018.to_csv(
    "../../data/processed/query_results/editors/editor_reg_contest_18.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

### New Editors

In [15]:
# Keep only the editors whose reg_date is on or after 2019-10-01.
nrdf = editor_reg_contest_2019.loc[
    editor_reg_contest_2019['reg_date'].ge('2019-10-01')
]

In [None]:
# Show the number of rows in nrdf.
nrdf.shape[0]

In [None]:
# Find rows in nrdf whose username occurs more than once, ordered by
# username, then show how many usernames are affected.
usernames = nrdf['username']
nrdf_user_dupes = (
    nrdf
    .loc[usernames.isin(usernames.loc[usernames.duplicated()])]
    .sort_values(by="username")
)
nrdf_user_dupes.username.nunique()

In [None]:
# Count new editors per registration month.
#
# nrdf only carries 'username' and 'reg_date', so grouping by the SQL-style
# column name 'trunc(CAST(user_registration AS DATE), MM)' raised KeyError.
# The month bucket is rebuilt here from reg_date (truncated to the first day
# of the month). It is kept as a regular column rather than the index, so it
# survives an export that uses index=False.
# NOTE(review): the output columns 'reg_month' / 'new_editors' are a best
# guess at the intended shape — confirm against downstream consumers.
pvt_editor_reg = (
    nrdf
    .groupby(
        nrdf['reg_date'].dt.to_period('M').dt.to_timestamp().rename('reg_month')
    )['username']
    .count()
    .reset_index(name='new_editors')
)

In [22]:
# Write the per-month registration counts to disk without the row index.
pvt_editor_reg.to_csv(
    "../../data/processed/query_results/editors/new_editors_per_month.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

### Edits

In [3]:
# Run the GLOW_editor_edits_r query as-is (no template substitution) and
# collect the result into pandas.
GLOW_editor_edits = (
    spark
    .sql(GLOW_editor_edits_r)
    .toPandas()
)

In [13]:
# Parse the raw timestamps so the .dt accessors below are available.
GLOW_editor_edits['event_timestamp'] = pd.to_datetime(GLOW_editor_edits['event_timestamp'])

# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week is the documented replacement (ISO 8601 week number,
# returned as a nullable unsigned integer column).
GLOW_editor_edits['week'] = GLOW_editor_edits.event_timestamp.dt.isocalendar().week
GLOW_editor_edits['month'] = GLOW_editor_edits.event_timestamp.dt.month
# Text label in "month/year" form without zero padding, e.g. "10/2019".
GLOW_editor_edits['month/year'] = GLOW_editor_edits['event_timestamp'].apply(lambda x: "%d/%d" % (x.month, x.year))

In [None]:
# Write the per-edit frame, including the derived date columns, to disk
# without the row index.
GLOW_editor_edits.to_csv(
    "../../data/processed/editors/GLOW_editor_edits.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

In [54]:
# Count rows per ('month/year', 'event_user_text') pair and expose the count
# as the 'editor_edits_per_month' column.
GLOW_editor_edits_summed = (
    GLOW_editor_edits
    .groupby(['month/year', 'event_user_text'])
    .size()
    .reset_index(name='editor_edits_per_month')
)

In [None]:
# Write the per-editor monthly edit counts to disk without the row index.
GLOW_editor_edits_summed.to_csv(
    "../../data/processed/editors/GLOW_editor_edits_summed.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

### Edits by PTP Chromebook grantees

In [91]:
# Run the weekly_edits_query_ptp_grantees_r query as-is and collect the
# result into pandas.
weekly_edits_query_ptp_grantees = (
    spark
    .sql(weekly_edits_query_ptp_grantees_r)
    .toPandas()
)

In [93]:
# Write the weekly edits result to disk without the row index.
weekly_edits_query_ptp_grantees.to_csv(
    "../../data/processed/editors/weekly_edits_ptp_grantees.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

In [131]:
# Run the edits_query_ptp_grantees_r query as-is and collect the raw result
# into pandas.
edits_ptp_grantees_r = (
    spark
    .sql(edits_query_ptp_grantees_r)
    .toPandas()
)

In [132]:
# Work on a deep copy so the raw query result stays untouched.
edits_ptp_grantees = edits_ptp_grantees_r.copy(deep=True)

In [133]:
# Parse edit_date so the .dt accessors below are available.
edits_ptp_grantees['edit_date'] = pd.to_datetime(edits_ptp_grantees['edit_date'])

# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week is the documented replacement (ISO 8601 week number,
# returned as a nullable unsigned integer column).
edits_ptp_grantees['week'] = edits_ptp_grantees.edit_date.dt.isocalendar().week
edits_ptp_grantees['month'] = edits_ptp_grantees.edit_date.dt.month
edits_ptp_grantees['year'] = edits_ptp_grantees.edit_date.dt.year
# NOTE(review): despite the column name, this label is "year.month" without
# zero padding (e.g. "2019.10"). If the exported CSV is later parsed as a
# float, "2019.1" and "2019.10" become the same value — confirm how
# downstream readers treat this column before relying on it.
edits_ptp_grantees['month/year'] = edits_ptp_grantees['edit_date'].apply(lambda x: "%d.%d" % (x.year, x.month))


In [134]:
# Count rows per ('month/year', 'event_user_text') pair and expose the count
# as the 'editor_edits_per_month' column.
edits_ptp_grantees_edits_summed = (
    edits_ptp_grantees
    .groupby(['month/year', 'event_user_text'])
    .size()
    .reset_index(name='editor_edits_per_month')
)

In [135]:
# Write the per-editor monthly edit counts to disk without the row index.
edits_ptp_grantees_edits_summed.to_csv(
    "../../data/processed/editors/edits_ptp_grantees_edits_summed.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)