# Train from data in Cloud Storage: data.py


In [0]:
from google.cloud import storage


def get_data_using_pandas(line_count):
    """Load the first rows of the training CSV directly from Cloud Storage.

    The ``gs://`` URL is handed straight to pandas, so no explicit storage
    client is created here. (pandas needs a GCS filesystem backend such as
    ``gcsfs`` installed to resolve ``gs://`` URLs.)

    Args:
        line_count: Maximum number of data rows to read; forwarded to
            ``pd.read_csv`` as ``nrows``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    # Imported locally: the module-level imports only bring in ``storage``,
    # so without this the function would fail with NameError on ``pd``.
    import pandas as pd

    return pd.read_csv("gs://le-wagon-data/data/train_1k.csv", nrows=line_count)


def get_data_using_blob(line_count):
    """Download the training CSV from the bucket, then load its first rows.

    The blob ``data/train_1k.csv`` of bucket ``le-wagon-data`` is written to
    ``train_1k.csv`` in the current working directory and is left there
    after the call; the DataFrame is read from that local copy.

    Args:
        line_count: Maximum number of data rows to read; forwarded to
            ``pd.read_csv`` as ``nrows``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    # Imported locally: the module-level imports only bring in ``storage``,
    # so without this the function would fail with NameError on ``pd``.
    import pandas as pd

    # location of the data in the Google Cloud Storage bucket
    BUCKET_NAME = "le-wagon-data"
    BUCKET_TRAIN_DATA_PATH = "data/train_1k.csv"

    # local path the blob is downloaded to
    data_file = "train_1k.csv"

    # credentials come from the environment ($GOOGLE_APPLICATION_CREDENTIALS)
    client = storage.Client()
    bucket = client.bucket(BUCKET_NAME)
    blob = bucket.blob(BUCKET_TRAIN_DATA_PATH)
    blob.download_to_filename(data_file)

    # load the downloaded data into a dataframe
    return pd.read_csv(data_file, nrows=line_count)

# Save trained model to Cloud Storage: data.py

In [0]:
def save_model_to_gcp():
    """Upload the local ``model.joblib`` file to Cloud Storage.

    The file is stored in bucket ``le-wagon-data`` under the object name
    ``models/random_forest_model.joblib``. Nothing is returned.
    """
    source_path = "model.joblib"
    destination = "models/random_forest_model.joblib"

    # client -> bucket handle -> blob handle, then push the local file
    target_bucket = storage.Client().bucket("le-wagon-data")
    target_bucket.blob(destination).upload_from_filename(source_path)

# Train on AI Platform

## Makefile

In [0]:
# bucket that receives the training job output
BUCKET_NAME=le-wagon-data

# folder inside the bucket used as the job directory
BUCKET_TRAINING_FOLDER=trainings

# region the training job is submitted to
REGION=europe-west1

# app environment
PYTHON_VERSION=3.7
FRAMEWORK=scikit-learn
RUNTIME_VERSION=2.2

# package params: the job runs the module ${PACKAGE_NAME}.${FILENAME}
PACKAGE_NAME=taxifare
FILENAME=trainer

##### Job - - - - - - - - - - - - - - - - - - - - - - - - -

# ":=" evaluates the timestamp once, when the Makefile is read, so every
# reference to JOB_NAME within one make invocation expands to the same name.
JOB_NAME:=taxi_fare_training_$(shell date +'%Y%m%d_%H%M%S')

# The recipe does not create a file named gcp_submit_training; declaring the
# target phony keeps make from skipping it if such a file ever exists.
.PHONY: gcp_submit_training
gcp_submit_training:
	gcloud ai-platform jobs submit training ${JOB_NAME} \
		--job-dir gs://${BUCKET_NAME}/${BUCKET_TRAINING_FOLDER} \
		--package-path ${PACKAGE_NAME} \
		--module-name ${PACKAGE_NAME}.${FILENAME} \
		--python-version=${PYTHON_VERSION} \
		--runtime-version=${RUNTIME_VERSION} \
		--region ${REGION} \
		--stream-logs

## MANIFEST.in

In [0]:
# ship the dependency list with the source distribution
include requirements.txt
# include every file under the taxifare package directory
graft taxifare
# drop bytecode/compiled artifacts and notebooks wherever they appear
global-exclude *.py[cod] __pycache__ *.so *.ipynb