In [1]:
import tensorflow as tf
import pandas as pd
import os
import json
import shutil
from tensorflow.python.lib.io.file_io import FileIO as open_file

  from ._conv import register_converters as _register_converters


In [2]:
# Google Cloud Storage locations: input splits and the directory that
# receives every artifact written by this notebook.
DATASET_CSV = "gs://ml-research-injenia/estimators/datasets/kickstarter-set/2016_trainset.csv"
EVALSET_CSV = "gs://ml-research-injenia/estimators/datasets/kickstarter-set/2016_evalset.csv"
MODEL_DIR = "gs://ml-research-injenia/estimators/trainings-kickstarter-v2/linear-classifier/test01"

# Training settings; these are exported to the environment further down so
# the %%bash training cells can forward them to the trainer.
BATCH_SIZE = 100
TRAIN_STEPS = 100000
LEARNING_RATE = 0.01
# presumably L1/L2 regularisation strengths -- confirm against trainer/task.py
L1_NORM = 0.0
L2_NORM = 0.0

# Not exported by the environment cell; the same name appears as a tunable
# parameter in the hyperparam.yaml spec.
FEATURE_COLUMN_NUM_BUCKETS = 8

In [3]:
# Read the training split through TensorFlow's FileIO (imported as open_file)
# so the gs:// path can be opened, then show the resulting frame.
with open_file(DATASET_CSV, "r") as train_csv:
    df = pd.read_csv(train_csv)
df

Unnamed: 0,ID,category,main_category,currency,state,country,timespan_days,goal_USD,goal_USD_log,timespan_days_scaled,goal_USD_scaled,goal_USD_log_scaled
0,1892252564,Comic Books,Comics,USD,successful,US,30.0,2500.000000,7.824046,-0.308155,-0.036034,-0.453981
1,1569410360,Documentary,Film & Video,USD,failed,US,60.0,2000.000000,7.600902,2.019002,-0.036501,-0.587792
2,703720557,Illustration,Art,USD,failed,US,30.0,2700.000000,7.901007,-0.308155,-0.035847,-0.407830
3,296713470,Photography,Photography,USD,successful,US,30.0,5000.000000,8.517193,-0.308155,-0.033698,-0.038326
4,156448762,Product Design,Design,USD,failed,US,30.0,15000.000000,9.615805,-0.308155,-0.024352,0.620471
5,1255814383,Music,Music,USD,successful,US,30.0,100000.000000,11.512925,-0.308155,0.055088,1.758104
6,697900694,Video Games,Games,CAD,failed,CA,30.0,1199.461089,7.089628,-0.308155,-0.037249,-0.894384
7,1026864342,Civic Design,Design,EUR,failed,IT,60.0,24928.000000,10.123747,2.019002,-0.015073,0.925065
8,1923292016,Farmer's Markets,Food,GBP,failed,GB,25.0,140.355619,4.944179,-0.696015,-0.038239,-2.180931
9,317811128,Publishing,Publishing,USD,failed,US,29.0,15000.000000,9.615805,-0.385727,-0.024352,0.620471


In [4]:
# Read the evaluation split the same way as the training split and show it.
with open_file(EVALSET_CSV, "r") as eval_csv:
    df_eval = pd.read_csv(eval_csv)
df_eval

Unnamed: 0,ID,category,main_category,currency,state,country,timespan_days,goal_USD,goal_USD_log,timespan_days_scaled,goal_USD_scaled,goal_USD_log_scaled
0,153415837,Flight,Technology,USD,failed,US,30.0,1.500000e+04,9.615805,-0.308155,-0.024352,0.620471
1,1828026144,Fine Art,Photography,USD,failed,US,31.0,6.500001e+06,15.687313,-0.230583,6.036428,4.261331
2,919600393,Children's Books,Publishing,USD,failed,US,32.0,2.000000e+03,7.600902,-0.153011,-0.036501,-0.587792
3,1621230227,Comic Books,Comics,USD,failed,US,30.0,4.000000e+03,8.294050,-0.308155,-0.034632,-0.172137
4,954788005,Theater,Theater,USD,successful,US,14.0,3.500000e+03,8.160518,-1.549306,-0.035099,-0.252211
5,1094453155,Flight,Technology,USD,failed,US,53.0,1.000000e+04,9.210340,1.475999,-0.029025,0.377329
6,536236122,Experimental,Film & Video,USD,successful,US,17.0,2.500000e+01,3.218876,-1.316590,-0.038347,-3.215532
7,2076494684,Quilts,Crafts,USD,failed,US,28.0,1.800000e+03,7.495542,-0.463299,-0.036688,-0.650972
8,70609801,Young Adult,Publishing,EUR,failed,DE,60.0,1.246400e+04,9.430600,2.019002,-0.026722,0.509410
9,1269516548,Product Design,Design,USD,successful,US,30.0,7.500000e+03,8.922658,-0.308155,-0.031361,0.204817


In [5]:
# Keep a full copy of both splits (all columns, no index column) next to the
# model artifacts under MODEL_DIR.
for frame, filename in ((df, "trainset.csv"), (df_eval, "evalset.csv")):
    with open_file(os.path.join(MODEL_DIR, filename), "w") as out_csv:
        frame.to_csv(out_csv, index=False)

In [6]:
# Name of the identifier column in the raw CSVs.
INDEX = "ID"

# Columns kept for the TensorFlow datasets; this list includes the label.
COLUMNS = [
    "category",
    "main_category",
    "state",
    "country",
    "timespan_days_scaled",
    "goal_USD_scaled",
    "goal_USD_log_scaled",
]

# Column the classifier is trained to predict.
LABEL_FIELD = "state"

In [7]:
# For every selected column derive: a default value (one single-element list
# per column, in COLUMNS order), a coarse type name, and -- for string
# columns -- the sorted list of distinct values found in the training frame.
FIELD_DEFAULTS=[]
FIELD_TYPES={}
FIELD_CATEGORIES={}
dtypes=dict(df.dtypes)
for c in COLUMNS:
    dtype_name=str(dtypes[c])
    if dtype_name=="bool":
        FIELD_DEFAULTS.append([0])
        FIELD_TYPES[c]="bool"
    elif dtype_name=="object":
        FIELD_DEFAULTS.append(["NA"])
        FIELD_TYPES[c]="string"
        # "NA" (the default for string columns) is always part of the vocabulary.
        FIELD_CATEGORIES[c]=sorted(set(df[c].unique()) | {"NA"})
    else:
        FIELD_DEFAULTS.append([0.0])
        FIELD_TYPES[c]="number"
# Drop the "NA" placeholder from the label's categories.
FIELD_CATEGORIES[LABEL_FIELD]=[label for label in FIELD_CATEGORIES[LABEL_FIELD] if label != "NA"]

In [8]:
# Record the observed training-set range of every numeric feature column;
# the label column is skipped.
MINS={}
MAXS={}
for c in COLUMNS:
    if c != LABEL_FIELD and FIELD_TYPES[c]=="number":
        MINS[c]=df[c].min()
        MAXS[c]=df[c].max()

In [9]:
# Schema description stored alongside the datasets. Note that the "fields"
# section still contains the label column; the label is additionally
# described on its own under "label".
dataset_fields = {
    "fields": {
        "columns": COLUMNS,
        "types": FIELD_TYPES,
        "categories": FIELD_CATEGORIES,
    },
    "label": {
        "column": LABEL_FIELD,
        "type": FIELD_TYPES[LABEL_FIELD],
        "categories": FIELD_CATEGORIES[LABEL_FIELD],
    },
    "mins": MINS,
    "maxs": MAXS,
}

with open_file(os.path.join(MODEL_DIR, "data", "dataset_fields.json"), "w") as f:
    json.dump(dataset_fields, f)

In [10]:
# Write both splits restricted to COLUMNS (no index column) under
# MODEL_DIR/data.
for frame, filename in ((df, "tf_trainset.csv"), (df_eval, "tf_evalset.csv")):
    with open_file(os.path.join(MODEL_DIR, "data", filename), "w") as out_csv:
        frame[COLUMNS].to_csv(out_csv, index=False)

## hyperparam.yaml

In [11]:
# Hyperparameter-tuning spec for the Cloud ML Engine job: maximise the
# "accuracy" metric over 50 trials (10 in parallel) while searching
# LEARNING_RATE, BATCH_SIZE and FEATURE_COLUMN_NUM_BUCKETS.
config="""trainingInput:
  scaleTier: STANDARD_1
  hyperparameters:
    goal: MAXIMIZE
    hyperparameterMetricTag: accuracy
    maxTrials: 50
    maxParallelTrials: 10
    params:
    - parameterName: LEARNING_RATE
      type: DOUBLE
      minValue: 0.00001
      maxValue: 0.01
      scaleType: UNIT_LOG_SCALE
    - parameterName: BATCH_SIZE
      type: INTEGER
      minValue: 16
      maxValue: 1024
      scaleType: UNIT_LOG_SCALE
    - parameterName: FEATURE_COLUMN_NUM_BUCKETS
      type: INTEGER
      minValue: 4
      maxValue: 16
      scaleType: UNIT_LINEAR_SCALE
"""

# Archive the spec next to the model artifacts. This must write the YAML text
# itself: the previous df.to_csv(f, index=False) call stored the training CSV
# under the name hyperparam.yaml.
with open_file(os.path.join(MODEL_DIR,"config","hyperparam.yaml"), "w") as f:
    f.write(config)

print(config)
# Local copy in the working directory; the cloud submission cell passes it
# to gcloud via --config=hyperparam.yaml.
with open("hyperparam.yaml", "w") as f:
    f.write(config)

# Read the local file back to confirm what was written.
with open("hyperparam.yaml", "r") as f:
    print(f.read())

trainingInput:
  scaleTier: STANDARD_1
  hyperparameters:
    goal: MAXIMIZE
    hyperparameterMetricTag: accuracy
    maxTrials: 50
    maxParallelTrials: 10
    params:
    - parameterName: LEARNING_RATE
      type: DOUBLE
      minValue: 0.00001
      maxValue: 0.01
      scaleType: UNIT_LOG_SCALE
    - parameterName: BATCH_SIZE
      type: INTEGER
      minValue: 16
      maxValue: 1024
      scaleType: UNIT_LOG_SCALE
    - parameterName: FEATURE_COLUMN_NUM_BUCKETS
      type: INTEGER
      minValue: 4
      maxValue: 16
      scaleType: UNIT_LINEAR_SCALE

trainingInput:
  scaleTier: STANDARD_1
  hyperparameters:
    goal: MAXIMIZE
    hyperparameterMetricTag: accuracy
    maxTrials: 50
    maxParallelTrials: 10
    params:
    - parameterName: LEARNING_RATE
      type: DOUBLE
      minValue: 0.00001
      maxValue: 0.01
      scaleType: UNIT_LOG_SCALE
    - parameterName: BATCH_SIZE
      type: INTEGER
      minValue: 16
      maxValue: 1024
      scaleType: UNIT_LOG_SCALE
    - p

# Train

In [12]:
# Publish the run settings as environment variables so the %%bash training
# cells can read them as shell variables. Every value is stringified first.
exported_settings = (
    ('MODEL_DIR', MODEL_DIR),
    ('TRAIN_STEPS', TRAIN_STEPS),
    ('BATCH_SIZE', BATCH_SIZE),
    ('LEARNING_RATE', LEARNING_RATE),
    ('L1_NORM', L1_NORM),
    ('L2_NORM', L2_NORM),
)
for env_name, env_value in exported_settings:
    os.environ[env_name] = str(env_value)

# Cloud project settings used by the cloud submission cell.
os.environ['BUCKET'] = "ml-research-injenia"
os.environ['REGION'] = 'europe-west1'

## Local

In [None]:
%%bash

# Run the trainer script directly on this machine. All values come from the
# environment variables exported by the Python cell above; -u keeps Python's
# stdout/stderr unbuffered so output appears as it is produced.
# NOTE(review): the script path here is relative to the current directory
# (trainer/task.py) while the cloud cell packages $(pwd)/../trainer -- confirm
# which working directory this notebook is expected to run from.
# NOTE(review): FEATURE_COLUMN_NUM_BUCKETS is not passed here; presumably the
# trainer supplies a default -- confirm.
python -u trainer/task.py \
    --MODEL_DIR=$MODEL_DIR \
    --TRAIN_STEPS $TRAIN_STEPS \
    --BATCH_SIZE $BATCH_SIZE   \
    --LEARNING_RATE $LEARNING_RATE \
    --L1_NORM $L1_NORM \
    --L2_NORM $L2_NORM

## Cloud

In [13]:
%%bash

# Job name: fixed prefix plus the current UTC timestamp (yymmdd_HHMMSS), so
# each submission gets a distinct name.
JOBNAME=kickstarter_linear_$(date -u +%y%m%d_%H%M%S)

echo "Launching training job ... trained model will be in $MODEL_DIR"
# Disabled cleanup step. NOTE(review): OUTPUT_DIR is not set anywhere visible
# in this notebook, so the line would need updating before being re-enabled.
#gsutil -m rm -rf $OUTPUT_DIR
# Submit the trainer package as a Cloud ML Engine training job, using the
# local hyperparam.yaml as the job configuration. Arguments after the bare
# "--" are forwarded to trainer.task. BATCH_SIZE and LEARNING_RATE are not
# forwarded here; they are listed as tunable parameters in hyperparam.yaml
# (presumably supplied per trial by the tuning service -- confirm).
gcloud ml-engine jobs submit training $JOBNAME \
  --region=$REGION \
  --module-name=trainer.task \
  --package-path=$(pwd)/../trainer \
  --job-dir=$MODEL_DIR \
  --staging-bucket=gs://$BUCKET-staging \
  --runtime-version="1.6" \
  --config=hyperparam.yaml \
  -- \
  --MODEL_DIR=$MODEL_DIR \
  --TRAIN_STEPS $TRAIN_STEPS \
  --L1_NORM $L1_NORM \
  --L2_NORM $L2_NORM

Launching training job ... trained model will be in gs://ml-research-injenia/estimators/trainings-kickstarter-v2/linear-classifier/test01
jobId: kickstarter_linear_180911_100810
state: QUEUED


Job [kickstarter_linear_180911_100810] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ml-engine jobs describe kickstarter_linear_180911_100810

or continue streaming the logs with the command

  $ gcloud ml-engine jobs stream-logs kickstarter_linear_180911_100810


In [14]:
import subprocess
import json


def merge_two_dicts(x, y):
    """Return a new dict holding the entries of ``x`` overlaid by those of ``y``.

    Neither argument is modified. When a key is present in both, the value
    from ``y`` wins.
    """
    merged = x.copy()
    merged.update(y)
    return merged


# Ask gcloud for the JSON description of the submitted tuning job and pull
# out the per-trial results.
describe_cmd = "gcloud ml-engine jobs describe kickstarter_linear_180911_100810 --format json".split(" ")
res_str = subprocess.check_output(describe_cmd)
res = json.loads(res_str)
trials = res["trainingOutput"]["trials"]

# One row per trial: trial id, final metric fields and the hyperparameters
# used. Later dicts win on key clashes: hyperparameters over finalMetric,
# and both over the trialId entry.
trial_rows = []
for t in trials:
    metric_and_params = merge_two_dicts(t["finalMetric"], t["hyperparameters"])
    trial_rows.append(merge_two_dicts({"trialId": t["trialId"]}, metric_and_params))

# NOTE(review): this rebinds `df`, which held the training DataFrame until
# now; re-running earlier cells after this one will see the trials table.
df = pd.DataFrame(trial_rows)

df

Unnamed: 0,BATCH_SIZE,FEATURE_COLUMN_NUM_BUCKETS,LEARNING_RATE,objectiveValue,trainingStep,trialId
0,62,15,0.0099071441400619,0.681774,100007,24
1,17,16,0.0099998899358784,0.678824,100004,32
2,389,16,0.0098972248557665,0.675784,100005,41
3,1004,16,0.0038221435610367,0.675657,100005,47
4,47,16,0.0099965609450405,0.675319,100007,36
5,419,16,0.0075520379512873,0.675084,100007,48
6,846,16,0.0044878040492531,0.675071,100006,34
7,513,16,0.0086478088984302,0.674152,100008,27
8,953,16,0.0099455391322428,0.67405,100007,38
9,436,16,0.0099904847415147,0.673968,100005,31
