## preprocessing avec Dataflow

This notebook must be run in a Python 2 environment (the `py2env` conda environment activated in the next cell).

In [None]:
%%bash
# Bring conda itself up to date before modifying the environment.
conda update -y -n base -c defaults conda
# Everything below is installed into the Python 2 environment used by this notebook.
source activate py2env
# Remove the google-cloud-dataflow package before installing apache-beam
# (presumably the two conflict -- TODO confirm).
pip uninstall -y google-cloud-dataflow
conda install -y pytz
# Quote the requirement so the shell can never interpret "[gcp]" as a glob pattern.
pip install 'apache-beam[gcp]==2.9.0'

After doing a pip install, click **"Reset Session"** on the notebook so that the Python environment picks up the new packages.

In [1]:
# Notebook configuration -- change these to try this notebook out.
# BUCKET:  Cloud Storage bucket (name only, no gs:// prefix) that receives the output files.
# PROJECT: Google Cloud project passed to the Beam pipeline options.
# REGION:  location used when creating the bucket and passed to the pipeline options.
BUCKET = 'test_envouthe'
PROJECT = 'envouthe-datalake'
REGION = 'europe-west1'

In [2]:
import os
# Expose the notebook configuration to the bash cells as environment variables.
os.environ.update({
    'BUCKET': BUCKET,
    'PROJECT': PROJECT,
    'REGION': REGION,
})

In [3]:
%%bash
# Create the bucket only when it is not already listed.
# -F matches the URL literally (dots in a bucket name are not regex wildcards);
# quoting protects against word splitting and globbing of the expanded value.
if ! gsutil ls | grep -qF "gs://${BUCKET}/"; then
  gsutil mb -l "${REGION}" "gs://${BUCKET}"
fi

D'après le notebook *envouthe_base_algo_test*, on retient Random Forest comme algorithme ; les features sont 'id_subscriptionMain', 'abonnement_consec', 'end_year' et 'total_sub'.

In [4]:
# Call BigQuery and examine the result in a dataframe.
# The query selects the candidate features plus the churn label and is
# limited to 100 rows, so this cell is only a quick preview of the table.
import google.datalab.bigquery as bq

query = """
 SELECT
  id_subscriptionMain,
  abonnement_consec,
  EXTRACT(year FROM ending_month) AS end_year,
  total_sub,
  churn___ as churn
 FROM
  renew_abonnement.merged_ds
 LIMIT 100
"""

# Run the query and load the rows into a dataframe.
dataset = bq.Query(query).execute().result().to_dataframe()

# Last expression of the cell: display the first rows.
dataset.head()


Unnamed: 0,id_subscriptionMain,abonnement_consec,end_year,total_sub,churn
0,16422,1,2014,1,No
1,16422,2,2014,2,No
2,16422,3,2014,3,No
3,16422,4,2014,4,No
4,16396,1,2014,1,No


In [50]:
# Split the dataset directly into train, validation and test sets
# (disabled: every line of this cell is commented out).

# shuffled = dataset.sample(frac=1)
# trainsize = int(len(shuffled) * 0.70)
# validsize = int(len(shuffled) * 0.15)
# # df_local = shuffled.iloc[:100]
# df_train = shuffled.iloc[:trainsize, :]
# df_valid = shuffled.iloc[trainsize:(trainsize+validsize), :]
# df_test = shuffled.iloc[(trainsize+validsize):, :]

# print(df_train.shape)
# print(df_valid.shape)
# print(df_test.shape)

In [51]:
# Save the train, validation and test datasets to Google Cloud Storage
# (disabled: every line of this cell is commented out).

# import google.datalab.storage as storage

# df_train.to_csv('train.csv',encoding='utf-8') # save to the local disk first
# df_valid.to_csv('valid.csv',encoding='utf-8')
# df_test.to_csv('test.csv',encoding='utf-8')

In [52]:
# %bash
# gsutil cp 'train.csv' "gs://$BUCKET/data/"
# gsutil cp 'valid.csv' "gs://$BUCKET/data/"
# gsutil cp 'test.csv' "gs://$BUCKET/data/"

**preprocessing**

On va lire les données via la requête BigQuery, puis faire un peu de preprocessing ici : convertir les valeurs catégoriques en valeurs numériques, puis échantillonner les données en train set et eval set.

In [5]:
import apache_beam as beam
import datetime, os

def to_csv(rowdict):
    """Turn one BigQuery row into a single CSV line followed by a hash key.

    Args:
        rowdict: mapping of column name to value for one row. Columns listed
            in CSV_COLUMNS that are absent from the mapping are written as
            the literal text 'None'.

    Yields:
        One string: the CSV_COLUMNS values joined by commas, then a comma and
        the SHA-224 hex digest of that joined text. The input mapping is only
        read, never modified.
    """
    # Imported inside the function so it carries its own dependency
    # (presumably needed when the function is shipped to remote workers --
    # TODO confirm).
    import hashlib

    CSV_COLUMNS = "churn,abonnement_consec,end_year,total_sub".split(',')

    data = ','.join([str(rowdict[k]) if k in rowdict else 'None' for k in CSV_COLUMNS])

    # hashlib only accepts bytes. On Python 2 `data` already is a byte string
    # and is hashed unchanged; on Python 3 it is text and must be encoded first.
    payload = data if isinstance(data, bytes) else data.encode('utf-8')
    key = hashlib.sha224(payload).hexdigest()  # hash the columns to form a key

    yield str('{},{}'.format(data, key))

format of variables:
- rowdict = {u'churn': u'No', u'abonnement_consec': 1, u'end_year': 2014, u'total_sub': 1}
- csv_columns = [u'churn', u'abonnement_consec', u'end_year', u'total_sub']
- data = No,1,2014,1

In [17]:
def preprocess(in_test_mode):
    """Read the subscription data from BigQuery and write train/eval CSV files.

    Builds one Beam pipeline with two branches ('train' and 'eval'). Each
    branch reads its split with a BigQuery query, formats every row with
    `to_csv` and writes the lines to '<OUTPUT_DIR>/<step>.csv' shards.

    The split is made on `row_id` (column `int64_field_0`): rows below 70% of
    the source table's row count go to 'train', the rest to 'eval'.

    Args:
        in_test_mode: when true, run locally with the DirectRunner, write to
            './preproc', limit each split to 100 rows and block until the
            pipeline has finished. When false, submit the job to Dataflow,
            write under gs://<BUCKET>/envouthe_base/preproc/ and return right
            after submission without waiting for the job.

    Relies on the notebook globals BUCKET, PROJECT, REGION, `beam`, `datetime`
    and `to_csv`.
    """
    import shutil, os, subprocess

    job_name = 'preprocess-envouthe-base' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')

    if in_test_mode:
        print('Launching local job ... hang on')
        OUTPUT_DIR = './preproc'
        # Start from an empty local output directory.
        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
        os.makedirs(OUTPUT_DIR)
    else:
        print('Launching Dataflow job {} ... hang on'.format(job_name))
        OUTPUT_DIR = 'gs://{0}/envouthe_base/preproc/'.format(BUCKET)
        try:
            subprocess.check_call('gsutil -m rm -r {}'.format(OUTPUT_DIR).split())
        except (subprocess.CalledProcessError, OSError) as err:
            # Best-effort cleanup: a failure here (e.g. nothing to delete yet)
            # must not prevent the job from being launched, but it is reported
            # instead of being swallowed silently.
            print('Could not clean {} ({}); continuing'.format(OUTPUT_DIR, err))

    options = {
        'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
        'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
        'job_name': job_name,
        'project': PROJECT,
        'region': REGION,
        'teardown_policy': 'TEARDOWN_ALWAYS',
        'no_save_main_session': True,
        'max_num_workers': 6
    }

    opts = beam.pipeline.PipelineOptions(flags = [], **options)

    if in_test_mode:
        RUNNER = 'DirectRunner'
    else:
        RUNNER = 'DataflowRunner'

    p = beam.Pipeline(RUNNER, options = opts)

    # Single source of truth for the table: the split thresholds below must be
    # computed from the same table the rows are read from. (They were
    # previously counted from a different table, `renew_abonnement.telecom_churn`.)
    source_table = 'renew_abonnement.merged_ds'

    # The churn label is mapped from text to an integer class in SQL.
    query = """
    SELECT
      int64_field_0 AS row_id,
      id_subscriptionMain,
      abonnement_consec,
      EXTRACT(year FROM ending_month) AS end_year,
      total_sub,
      (CASE
        WHEN churn___ = 'No' THEN 0
        WHEN churn___ = 'Yes' THEN 1
        WHEN churn___ = 'Renew' THEN 2
        ELSE 3 END) AS churn
     FROM
      {table}
    """.format(table = source_table)

    row_count = '(SELECT count(*) FROM `{}`)'.format(source_table)

    train_query = 'SELECT * EXCEPT(row_id) FROM ({}) WHERE row_id < {}*0.70'.format(query, row_count)

    eval_query = ('SELECT * EXCEPT(row_id) FROM ({}) WHERE row_id >= {}*0.70 '
                  'and row_id <= {}').format(query, row_count, row_count)

    if in_test_mode:
        # Keep the local run small.
        train_query = train_query + ' LIMIT 100'
        eval_query = eval_query + ' LIMIT 100'

    # One read -> format -> write branch per split, all in the same pipeline.
    for step, selquery in [('train', train_query), ('eval', eval_query)]:
        (p
         | '{}_read'.format(step) >> beam.io.Read(beam.io.BigQuerySource(query = selquery, use_standard_sql = True))
         | '{}_csv'.format(step) >> beam.FlatMap(to_csv)
         | '{}_out'.format(step) >> beam.io.Write(beam.io.WriteToText(os.path.join(OUTPUT_DIR, '{}.csv'.format(step))))
        )

    job = p.run()
    if in_test_mode:
        # Only the local run is awaited; the Dataflow job keeps running after
        # this function returns.
        job.wait_until_finish()
        print("Done!")
    
# Submit the full preprocessing job to Dataflow; the call returns without
# waiting for the job to finish.
preprocess(in_test_mode = False)
# Alternative: local run limited to 100 rows per split.
# preprocess(in_test_mode = True)

Launching Dataflow job preprocess-envouthe-base-190823-132641 ... hang on


  options = pbegin.pipeline.options.view_as(DebugOptions)


**visualiser les fichiers train set et eval set**

In [19]:
%%bash
# List the first output shard of each split. The URL is quoted so the wildcard
# is expanded by gsutil against the bucket, never by the local shell.
gsutil ls "gs://${BUCKET}/envouthe_base/preproc/*-00000*"

gs://test_envouthe/envouthe_base/preproc/eval.csv-00000-of-00001
gs://test_envouthe/envouthe_base/preproc/train.csv-00000-of-00001
