In [1]:
import tensorflow as tf
import pandas as pd
import os
import json
import shutil
from tensorflow.python.lib.io.file_io import FileIO as open_file

  from ._conv import register_converters as _register_converters


In [2]:
# --- Storage locations (all Google Cloud Storage URIs) ----------------------
# Input CSVs read by the loading cells, and the directory every artifact of
# this run is written under.
DATASET_CSV = "gs://ml-research-injenia/estimators/datasets/kickstarter-set/2016_trainset.csv"
EVALSET_CSV = "gs://ml-research-injenia/estimators/datasets/kickstarter-set/2016_evalset.csv"
MODEL_DIR = "gs://ml-research-injenia/estimators/trainings-kickstarter-v2/dnn-classifier/test03"

# --- Optimisation settings ---------------------------------------------------
# Exported to the environment in the "Train" section and forwarded unchanged
# to trainer/task.py as the command-line flags of the same name.
BATCH_SIZE = 15369
TRAIN_STEPS = 2000000
LEARNING_RATE = 5.2897066388156818e-06
L1_NORM = 0.0
L2_NORM = 0.0

# --- Network shape -----------------------------------------------------------
# HIDDEN_UNITS is joined with commas before being exported, so the trainer
# receives it as the single string "512,128,32,8".
HIDDEN_UNITS = [512, 128, 32, 8]
EMBEDDING_COLUMNS_SIZE = 37

In [None]:
# Load the training set. The file is opened through TensorFlow's FileIO
# (imported as open_file) because DATASET_CSV is a gs:// URI, and the open
# handle is passed to pandas for parsing.
with open_file(DATASET_CSV, "r") as trainset_file:
    df = pd.read_csv(trainset_file)
# Last expression of the cell: render the loaded frame as the cell output.
df

In [None]:
# Load the evaluation set the same way as the training set: open the gs://
# URI through TensorFlow's FileIO and let pandas parse the open handle.
with open_file(EVALSET_CSV, "r") as evalset_file:
    df_eval = pd.read_csv(evalset_file)
# Last expression of the cell: render the loaded frame as the cell output.
df_eval

In [None]:
# Keep a copy of both input sets directly under MODEL_DIR so the data used
# for this run is stored next to its artifacts. The frames are re-serialised
# by pandas (all columns, no index column).
trainset_copy_path = os.path.join(MODEL_DIR, "trainset.csv")
with open_file(trainset_copy_path, "w") as trainset_copy:
    df.to_csv(trainset_copy, index=False)

evalset_copy_path = os.path.join(MODEL_DIR, "evalset.csv")
with open_file(evalset_copy_path, "w") as evalset_copy:
    df_eval.to_csv(evalset_copy, index=False)

In [None]:
# Column holding the record identifier.
# NOTE(review): INDEX is not referenced by any other cell visible in this
# notebook -- confirm whether it is still needed.
INDEX = "ID"

# Column the classifier is trained to predict. It is deliberately also listed
# in COLUMNS below, so it is exported together with the features.
LABEL_FIELD = "state"

# Columns kept for training, in the order they are written to the exported
# CSVs. FIELD_DEFAULTS is built by iterating this list, so the order matters.
COLUMNS = [
    "category",
    "main_category",
    "state",
    "country",
    "timespan_days_scaled",
    "goal_USD_scaled",
    "goal_USD_log_scaled",
]

In [None]:
# Derive the per-column schema from the training frame's dtypes:
#   FIELD_DEFAULTS   - one single-element list per entry of COLUMNS, in the
#                      same order ([0] for bool, ["NA"] for string, [0.0]
#                      for everything else)
#   FIELD_TYPES      - column name -> "bool" | "string" | "number"
#   FIELD_CATEGORIES - column name -> sorted list of distinct values, only
#                      for string (object dtype) columns
FIELD_DEFAULTS = []
FIELD_TYPES = {}
FIELD_CATEGORIES = {}
dtypes = dict(df.dtypes)
for c in COLUMNS:
    dtype_name = str(dtypes[c])
    if dtype_name == "bool":
        FIELD_DEFAULTS.append([0])
        FIELD_TYPES[c] = "bool"
    elif dtype_name == "object":
        FIELD_DEFAULTS.append(["NA"])
        FIELD_TYPES[c] = "string"
        # Missing cells surface from unique() as float NaN. Sorting NaN
        # together with strings raises TypeError on Python 3 (and would put
        # NaN into the vocabulary on Python 2), so drop them first; the
        # explicit "NA" entry below is the placeholder for missing values.
        observed = df[c].dropna().unique()
        FIELD_CATEGORIES[c] = sorted(set(observed) | {"NA"})
    else:
        FIELD_DEFAULTS.append([0.0])
        FIELD_TYPES[c] = "number"
# The label vocabulary must not contain the "NA" placeholder.
# NOTE(review): this lookup raises KeyError unless LABEL_FIELD is an
# object-dtype column listed in COLUMNS.
FIELD_CATEGORIES[LABEL_FIELD] = [x for x in FIELD_CATEGORIES[LABEL_FIELD] if x != "NA"]

In [None]:
# Persist the schema derived above as JSON under MODEL_DIR/data.
# NOTE(review): the "fields" section intentionally still contains the label
# column; an earlier variant filtered LABEL_FIELD out of columns, types and
# categories -- confirm which form the consumer of this file expects.
dataset_fields_path = os.path.join(MODEL_DIR, "data", "dataset_fields.json")
with open_file(dataset_fields_path, "w") as dataset_fields_file:
    dataset_fields = {
        "fields": {
            "columns": COLUMNS,
            "types": FIELD_TYPES,
            "categories": FIELD_CATEGORIES,
        },
        "label": {
            "column": LABEL_FIELD,
            "type": FIELD_TYPES[LABEL_FIELD],
            "categories": FIELD_CATEGORIES[LABEL_FIELD],
        },
    }
    json.dump(dataset_fields, dataset_fields_file)

In [None]:
# Export the train and eval frames restricted to COLUMNS (in that order,
# header row included, no index column) to MODEL_DIR/data.
tf_trainset_path = os.path.join(MODEL_DIR, "data", "tf_trainset.csv")
with open_file(tf_trainset_path, "w") as tf_trainset_file:
    df[COLUMNS].to_csv(tf_trainset_file, index=False)

tf_evalset_path = os.path.join(MODEL_DIR, "data", "tf_evalset.csv")
with open_file(tf_evalset_path, "w") as tf_evalset_file:
    df_eval[COLUMNS].to_csv(tf_evalset_file, index=False)

# Train

In [3]:
# Publish the run configuration as environment variables so the %%bash cells
# below can read it. Every value is stringified; HIDDEN_UNITS is flattened to
# a comma-separated string first.
_exported_settings = [
    ("MODEL_DIR", MODEL_DIR),
    ("TRAIN_STEPS", TRAIN_STEPS),
    ("BATCH_SIZE", BATCH_SIZE),
    ("LEARNING_RATE", LEARNING_RATE),
    ("L1_NORM", L1_NORM),
    ("L2_NORM", L2_NORM),
    ("HIDDEN_UNITS", ",".join([str(x) for x in HIDDEN_UNITS])),
    ("EMBEDDING_COLUMNS_SIZE", EMBEDDING_COLUMNS_SIZE),
    # Only used by the cloud submission cell.
    ("BUCKET", "ml-research-injenia"),
    ("REGION", "europe-west1"),
]
for _env_name, _env_value in _exported_settings:
    os.environ[_env_name] = str(_env_value)

## Local

In [None]:
%%bash

# Run the trainer locally with the settings exported from the Python cell.
# -u keeps Python's output unbuffered so progress shows up while the cell is
# still running. Every expansion is double-quoted so a value containing
# whitespace or glob characters is passed as exactly one argument.
python -u trainer/task.py \
    --MODEL_DIR "$MODEL_DIR" \
    --TRAIN_STEPS "$TRAIN_STEPS" \
    --BATCH_SIZE "$BATCH_SIZE" \
    --LEARNING_RATE "$LEARNING_RATE" \
    --L1_NORM "$L1_NORM" \
    --L2_NORM "$L2_NORM" \
    --HIDDEN_UNITS "$HIDDEN_UNITS" \
    --EMBEDDING_COLUMNS_SIZE "$EMBEDDING_COLUMNS_SIZE"

In [None]:
%%bash
# Sanity check: show the value the bash cells see for this exported setting.
echo ${EMBEDDING_COLUMNS_SIZE}

## Cloud

In [None]:
%%bash

# Job id: fixed prefix plus a UTC timestamp, so each submission gets a new name.
JOBNAME="kickstarter_dnn_$(date -u +%y%m%d_%H%M%S)"

echo "Launching training job ... trained model will be in $MODEL_DIR"
# Disabled clean-up step, kept for reference.
# NOTE(review): OUTPUT_DIR is not among the variables exported by the Python
# cell of the "Train" section -- define it before re-enabling this line.
#gsutil -m rm -rf $OUTPUT_DIR

# The redirection at the end of the gcloud command fails before gcloud even
# starts if the log directory is missing, so make sure it exists.
mkdir -p ../logs

# Submit the trainer package to ML Engine. Flags after the bare "--" are
# forwarded to trainer.task. All expansions are double-quoted so paths or
# values containing whitespace stay a single argument.
# NOTE(review): this cell resolves the package at ../trainer while the local
# cell runs trainer/task.py relative to the current directory -- confirm the
# working directory each cell is meant to run from.
gcloud ml-engine jobs submit training "$JOBNAME" \
  --region="$REGION" \
  --module-name=trainer.task \
  --package-path="$(pwd)/../trainer" \
  --job-dir="$MODEL_DIR" \
  --staging-bucket="gs://${BUCKET}-staging" \
  --runtime-version="1.6" \
  --scale-tier=STANDARD_1 \
  -- \
  --MODEL_DIR="$MODEL_DIR" \
  --TRAIN_STEPS "$TRAIN_STEPS" \
  --BATCH_SIZE "$BATCH_SIZE" \
  --LEARNING_RATE "$LEARNING_RATE" \
  --L1_NORM "$L1_NORM" \
  --L2_NORM "$L2_NORM" \
  --HIDDEN_UNITS "$HIDDEN_UNITS" \
  --EMBEDDING_COLUMNS_SIZE "$EMBEDDING_COLUMNS_SIZE" > ../logs/launch_dnn.txt