In [1]:
import os
import time
from datetime import datetime

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

from google.cloud import storage
from google.cloud import automl_v1beta1 as automl

from automlwrapper import AutoMLWrapper

This notebook utilizes a utility script that wraps much of the AutoML Python client library, to make the code in this notebook easier to read. Feel free to check out the utility for all the details on how we are calling the underlying AutoML Client Library!

In [2]:
# Set your own value for PROJECT_ID. By convention the bucket is named
# project_id + '-lcm'; it is derived here so the two values cannot drift apart.
# Assign bucket_name explicitly instead if your bucket is named differently.
PROJECT_ID = 'cloudml-demo'
bucket_name = PROJECT_ID + '-lcm'

region = 'us-central1' # Region must be us-central1
dataset_display_name = 'kaggle_tweets'
model_display_name = 'kaggle_starter_model1'

# Service clients shared by the rest of the notebook.
storage_client = storage.Client(project=PROJECT_ID)
client = automl.AutoMlClient()

In [3]:
# Input data files are available in the "../input/" directory.
# Running this cell (click run or press Shift+Enter) prints the full path of
# every file found under /kaggle/input.

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/test.csv


In [4]:
# Load the competition's training tweets (with a `target` column) and test tweets.
nlp_train_df, nlp_test_df = (
    pd.read_csv('/kaggle/input/nlp-getting-started/train.csv'),
    pd.read_csv('/kaggle/input/nlp-getting-started/test.csv'),
)
def callback(operation_future):
    """Block until an operation finishes and hand back its result.

    Args:
        operation_future: Any object exposing a ``result()`` method.

    Returns:
        Whatever ``operation_future.result()`` returns. Callers that ignore the
        return value (e.g. when this is registered as a done-callback) are
        unaffected.

    Raises:
        Whatever ``operation_future.result()`` raises.
    """
    return operation_future.result()

In [5]:
# Peek at the last five rows of the training data.
nlp_train_df.tail(n=5)

Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


### Data spelunking
How often does 'fire' come up in this dataset?

In [6]:
# Case-insensitive substring match on the tweet text; rows with missing text
# are treated as non-matches.
fire_mask = nlp_train_df['text'].str.contains('fire', na=False, case=False)
nlp_train_df.loc[fire_mask]

Unnamed: 0,id,keyword,location,text,target
1,4,,,Forest fire near La Ronge Sask. Canada,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
...,...,...,...,...,...
7427,10625,wounded,,Officer wounded suspect killed in exchange of ...,1
7433,10631,wounded,Yogya Berhati Nyaman,@wocowae Officer Wounded Suspect Killed in Exc...,1
7604,10863,,,#WorldNews Fallen powerlines on G:link tram: U...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1


Does the presence of the word 'fire' help determine whether the tweets here are real or false?

In [7]:
# Count the target labels among tweets whose text contains 'fire' (case-insensitive).
fire_mask = nlp_train_df['text'].str.contains('fire', na=False, case=False)
nlp_train_df.loc[fire_mask, 'target'].value_counts()

1    344
0    129
Name: target, dtype: int64

### GCS upload/download utilities
These functions make it easier to upload and download files between the kernel and Google Cloud Storage (GCS). This is needed for AutoML, which imports its training data from GCS.

In [8]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Copy a local file into a Google Cloud Storage bucket and print where it went.

    See https://cloud.google.com/storage/docs/ for the underlying API.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to give the file inside the bucket.
    """
    target_blob = storage_client.get_bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)
    # Build the gs:// URI only after the upload, purely for the confirmation message.
    destination_uri = 'gs://' + bucket_name + '/' + destination_blob_name
    print('File {} uploaded to {}'.format(source_file_name, destination_uri))
    
def download_to_kaggle(bucket_name,destination_directory,file_name,prefix=None):
    """Download blobs from a GCS bucket into a local directory of the Kaggle notebook.

    Args:
        bucket_name: Name of the bucket to read from.
        destination_directory: Local directory to write into; created if missing.
        file_name: Name of the local file, placed inside ``destination_directory``.
        prefix: Optional blob-name prefix passed to ``list_blobs`` to narrow the listing.
    """
    os.makedirs(destination_directory, exist_ok=True)
    local_path = os.path.join(destination_directory, file_name)
    # NOTE(review): every blob returned by the listing is written to the same
    # local path, so when several blobs match only the last one survives.
    # Presumably callers pass a prefix that matches a single object - confirm.
    for remote_blob in storage_client.list_blobs(bucket_name, prefix=prefix):
        remote_blob.download_to_filename(local_path)

In [9]:
# Reference the staging bucket by name and create it in `region` only when it
# does not exist yet, so re-running this cell does not attempt a second create.
bucket = storage.Bucket(storage_client, name=bucket_name)
if not bucket.exists():
    bucket.create(location=region)

### Export to CSV and upload to GCS

In [10]:
# Write only the tweet text and its label to train.csv, with no index column
# and no header row, for sending to AutoML NL.
nlp_train_df.to_csv('train.csv', columns=['text', 'target'], index=False, header=False)

In [11]:
# First five rows, restricted to the id, text and target columns.
nlp_train_df.head()[['id', 'text', 'target']]

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [12]:
# Object path inside the bucket where the exported training CSV is stored.
training_gcs_path = 'uploads/kaggle_getstarted/full_train.csv'
upload_blob(
    bucket_name=bucket_name,
    source_file_name='train.csv',
    destination_blob_name=training_gcs_path,
)

File train.csv uploaded to gs://cloudml-demo-lcm/uploads/kaggle_getstarted/full_train.csv


## Create our class instance

In [13]:
# Bundle the AutoML client and this notebook's settings into one wrapper object.
# `region` is taken from the configuration cell rather than repeated as a
# literal, so the location is defined in exactly one place.
amw = AutoMLWrapper(client=client,
                    project_id=PROJECT_ID,
                    bucket_name=bucket_name,
                    region=region,
                    dataset_display_name=dataset_display_name,
                    model_display_name=model_display_name)
       

## Create (or retrieve) dataset
Check to see if this dataset already exists. If not, create it.

In [14]:
# Look the dataset up by its display name; only when the lookup comes back
# falsy is a new dataset created and the training CSV imported from GCS.
if not amw.get_dataset_by_display_name(dataset_display_name):
    print('dataset not found')
    amw.create_dataset()
    amw.import_gcs_data(training_gcs_path)

# Display the wrapper's dataset attribute.
# NOTE(review): presumably populated by the lookup/create calls above - confirm in automlwrapper.
amw.dataset

searching for dataset named: kaggle_tweets
found 1 matching datasets


name: "projects/421576851279/locations/us-central1/datasets/TCN8969336472326897664"
display_name: "kaggle_tweets"
create_time {
  seconds: 1576777085
  nanos: 231076000
}
etag: "AB3BwFqQVqUUzKga3b4kmo87Ie8TDtw2IC0LlhPy41jN7ULSJstZ-Ro36H1SjueOLXM="
example_count: 7503
text_classification_dataset_metadata {
  classification_type: MULTICLASS
}

## Kick off the training for the model
And retrieve the training info after completion. 
Start model deployment.

In [15]:
# Train only when the model lookup comes back falsy.
# NOTE(review): get_model_by_display_name() is called without arguments, unlike
# the dataset lookup; presumably it uses the wrapper's model_display_name - confirm.
if not amw.get_model_by_display_name():
    amw.train_model()
# Deployment is requested on every run, whether or not training just happened.
amw.deploy_model()
amw.model

searching for model named: kaggle_starter_model1
found 1 matching models
Deploying model: kaggle_starter_model1 at 2019-12-23, 01:39:36 UTC
searching for model named: kaggle_starter_model1
found 1 matching models
searching for model named: kaggle_starter_model1
found 1 matching models
searching for model named: kaggle_starter_model1
found 1 matching models
searching for model named: kaggle_starter_model1
found 1 matching models
searching for model named: kaggle_starter_model1
found 1 matching models
searching for model named: kaggle_starter_model1
found 1 matching models
searching for model named: kaggle_starter_model1
found 1 matching models
Finished deploying model: kaggle_starter_model1 around 2019-12-23, 01:53:39 UTC


name: "projects/421576851279/locations/us-central1/models/TCN4777347236361142272"
display_name: "kaggle_starter_model1"
dataset_id: "TCN8969336472326897664"
create_time {
  seconds: 1576788886
  nanos: 136762000
}
deployment_state: DEPLOYED
update_time {
  seconds: 1577065962
  nanos: 886196000
}
text_classification_model_metadata {
  classification_type: MULTICLASS
}

In [16]:
# Display the model's full resource path as exposed by the wrapper.
amw.model_full_path

'projects/421576851279/locations/us-central1/models/TCN4777347236361142272'

## Prediction
Note that prediction will not run until deployment finishes, which takes a bit of time.
However, once you have your model deployed, this notebook won't re-train the model, thanks to the various safeguards put in place. Instead, it will take the existing (trained) model and make predictions and generate the submission file.

In [17]:
# Peek at the first five rows of the test data.
nlp_test_df.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [18]:
# Build the prediction-service client and register it with the wrapper.
prediction_client = automl.PredictionServiceClient()
amw.set_prediction_client(prediction_client)

# Request predictions for the 'text' column of the test set. The test set has
# no ground truth, so ground_truth_col_name='target' is deliberately not passed.
predictions_df = amw.get_predictions(
    nlp_test_df,
    input_col_name='text',
    limit=None,
    threshold=0.5,
    verbose=False,
)


## (optional) Undeploy model
Undeploy the model to stop charges

In [19]:
# Ask the wrapper to undeploy the model.
amw.undeploy_model()

Undeploying model: kaggle_starter_model1 at 2019-12-23, 02:16:01 UTC
searching for model named: kaggle_starter_model1
found 1 matching models
searching for model named: kaggle_starter_model1
found 1 matching models
searching for model named: kaggle_starter_model1
found 1 matching models
searching for model named: kaggle_starter_model1
found 1 matching models
searching for model named: kaggle_starter_model1
found 1 matching models
searching for model named: kaggle_starter_model1
found 1 matching models
Finished undeploying model: kaggle_starter_model1 around 2019-12-23, 02:28:04 UTC


## Create submission output

In [20]:
# Peek at the first five predictions.
predictions_df.head(n=5)

Unnamed: 0,score,class,text
0,0.738599,1,Just happened a terrible car crash
1,0.959434,1,"Heard about #earthquake is different cities, s..."
2,0.902981,1,"there is a forest fire at spot pond, geese are..."
3,0.92324,1,Apocalypse lighting. #Spokane #wildfires
4,0.985142,1,Typhoon Soudelor kills 28 in China and Taiwan


In [21]:
# pd.concat(axis=1) aligns rows on the index, so a missing or re-indexed
# prediction would silently yield NaN-padded rows. Fail loudly instead of
# producing a corrupt submission.
assert predictions_df.index.equals(nlp_test_df.index), (
    'predictions and test rows are not aligned: {} predictions for {} test rows'.format(
        len(predictions_df), len(nlp_test_df)))

# Pair each test id with its predicted class, side by side.
submission_df = pd.concat([nlp_test_df['id'], predictions_df['class']], axis=1)
submission_df.head()

Unnamed: 0,id,class
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [22]:
# predictions_df['class'].iloc[:10]
# nlp_test_df['id']

In [23]:
# Rename the predicted-class column to 'target' for the submission file.
submission_df = submission_df.rename({'class': 'target'}, axis='columns')
submission_df.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


## Submit predictions to the competition!

In [24]:
# Write the submission with a header row and without the index column.
submission_df.to_csv(path_or_buf="submission.csv", header=True, index=False)

In [25]:
# Shell escape: list submission.csv to confirm it exists and show its size.
! ls -l submission.csv

-rw-r--r-- 1 root root 22746 Dec 23 02:28 submission.csv
