In [118]:
#Library
import numpy as np 
import pandas as pd
# For visualizations
import matplotlib.pyplot as plt
# For regular expressions
import re
# For handling string
import string
# For performing mathematical operations
import math
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [74]:
import sklearn

In [75]:
## Take a look at the example + drop unwanted columns
dataset = pd.read_csv("train.csv")
dataset = dataset[["id", "text", "target"]]
dataset.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [76]:
from sklearn.model_selection import train_test_split
X = dataset["text"]
y = dataset["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [77]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(5709,) (1904,) (5709,) (1904,)


In [78]:
train = pd.DataFrame(data = {"text": X_train, "target": y_train})
train

Unnamed: 0,text,target
6594,Truth...\nhttps://t.co/h6amECX5K7\n#News\n#BBC...,0
7136,USGS EQ: M 1.9 - 5km S of Volcano Hawaii: Time...,1
6685,Super loud thunder woke me up from my very nic...,0
2559,@Petchary but I can't say that either of us sh...,0
2888,.@karijobe and her band killed it tonight. It...,1
...,...,...
5935,I JUST SCREAMED IN 57 LANGUAGES THIS IS SO GOO...,0
4296,@JYHeffect my good you stay in NY??? ?,0
6638,Udhampur terror attack: NIA takes over probe P...,1
6840,Hollywood Movie About Trapped Miners Released ...,1


In [79]:
test = pd.DataFrame(data = {"text": X_test, "target": y_test})
test

Unnamed: 0,text,target
4816,@samanthaturne19 IIt may logically have been t...,1
3456,@ItsNasB now I have to go replace my sarcasm m...,0
165,I had a airplane accident.,1
2526,'cause right now I can read too good don't sen...,0
479,Attack on Titan game on PS Vita yay! Can't wai...,0
...,...,...
5115,Err:509,0
2302,#charminar demolish if it in falling state any...,0
1313,the bar method ÛÓ integrates the fat burning ...,0
6791,#ModiMinistry Rly tragedy in MP: Some live to ...,1


In [80]:
train.to_csv("blazingtext_csv/train.csv", header = False, index = False)
test.to_csv("blazingtext_csv/test.csv", header = False, index = False)

In [81]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

sess = sagemaker.Session()

role = get_execution_role()
print(role) # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf

bucket = sess.default_bucket() # Replace with your own bucket name if needed
print(bucket)
prefix = 'blazingtext/supervised' #Replace with the prefix under which you want to store the data if needed

arn:aws:iam::824782811147:role/service-role/AmazonSageMaker-ExecutionRole-20200920T164563
sagemaker-us-west-2-824782811147


In [82]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [83]:
def transform_instance(row):
    cur_row = []
    label = "__label__" + index_to_label[row[1]]  #Prefix the index-ed label with __label__
    cur_row.append(label)
    cur_row.extend(nltk.word_tokenize(row[0].lower()))
    #cur_row.extend(nltk.word_tokenize(row[2].lower()))
    return cur_row

In [84]:
def preprocess(input_file, output_file, keep=1):
    all_rows = []
    with open(input_file, 'r') as csvinfile:
        csv_reader = csv.reader(csvinfile, delimiter=',')
        for row in csv_reader:
            all_rows.append(row)
    shuffle(all_rows)
    all_rows = all_rows[:int(keep*len(all_rows))]
    pool = Pool(processes=multiprocessing.cpu_count())
    transformed_rows = pool.map(transform_instance, all_rows)
    pool.close() 
    pool.join()
    
    with open(output_file, 'w') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        csv_writer.writerows(transformed_rows)

In [85]:
index_to_label = {"1": "TRUE ", "0": "FALSE"}

In [86]:
%%time

# Preparing the training dataset

# Since preprocessing the whole dataset might take a couple of mintutes,
# we keep 20% of the training dataset for this demo.
# Set keep to 1 if you want to use the complete dataset
preprocess('blazingtext_csv/train.csv', 'blazingtext.train', keep=1)
        
# Preparing the validation dataset        
preprocess('blazingtext_csv/test.csv', 'blazingtext.validation')

CPU times: user 46.2 ms, sys: 70.2 ms, total: 116 ms
Wall time: 1.19 s


In [87]:
%%time

train_channel = prefix + '/train'
validation_channel = prefix + '/validation'

sess.upload_data(path='blazingtext.train', bucket=bucket, key_prefix=train_channel)
sess.upload_data(path='blazingtext.validation', bucket=bucket, key_prefix=validation_channel)

s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)

CPU times: user 43.1 ms, sys: 16.2 ms, total: 59.2 ms
Wall time: 389 ms


In [88]:
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

In [89]:
region_name = boto3.Session().region_name
region_name

'us-west-2'

In [90]:
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


Using SageMaker BlazingText container: 433757028032.dkr.ecr.us-west-2.amazonaws.com/blazingtext:latest (us-west-2)


In [91]:
bt_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         train_instance_count=1, 
                                         train_instance_type='ml.c4.4xlarge',
                                         train_volume_size = 30,
                                         train_max_run = 360000,
                                         input_mode= 'File',
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [92]:
bt_model.set_hyperparameters(mode="supervised",
                            epochs=10,
                            min_count=2,
                            learning_rate=0.05,
                            vector_dim=10,
                            early_stopping=True,
                            patience=4,
                            min_epochs=5,
                            word_ngrams=2)

In [93]:
train_data = sagemaker.session.s3_input(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
validation_data = sagemaker.session.s3_input(s3_validation_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')
data_channels = {'train': train_data, 'validation': validation_data}

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [94]:
bt_model.fit(inputs=data_channels, logs=True)

2020-10-01 07:18:13 Starting - Starting the training job...
2020-10-01 07:18:15 Starting - Launching requested ML instances......
2020-10-01 07:19:33 Starting - Preparing the instances for training......
2020-10-01 07:20:41 Downloading - Downloading input data
2020-10-01 07:20:41 Training - Downloading the training image..[34mArguments: train[0m
[34m[10/01/2020 07:20:57 INFO 140365394396992] nvidia-smi took: 0.0252010822296 secs to identify 0 gpus[0m
[34m[10/01/2020 07:20:57 INFO 140365394396992] Running single machine CPU BlazingText training using supervised mode.[0m
[34m[10/01/2020 07:20:57 INFO 140365394396992] 2 files found in train channel. Using /opt/ml/input/data/train/blazingtext.train for training...[0m
[34m[10/01/2020 07:20:57 INFO 140365394396992] Processing /opt/ml/input/data/train/blazingtext.train . File size: 0 MB[0m
[34m[10/01/2020 07:20:57 INFO 140365394396992] 2 files found in validation channel. Using /opt/ml/input/data/validation/blazingtext.validation f

In [97]:
text_classifier = bt_model.deploy(initial_instance_count = 1,instance_type = 'ml.t2.medium')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Using already existing model: blazingtext-2020-10-01-07-18-13-469


-------------!

In [124]:
sentences = ["Convair was an american aircraft manufacturing company which later expanded into rockets and spacecraft.",
            "Berwick secondary college is situated in the outer melbourne metropolitan suburb of berwick ."]

# using the same nltk tokenizer that we used during data preparation for training
#tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]

payload = {"instances" : sentences}

response = text_classifier.predict(json.dumps(payload))

predictions = json.loads(response)
predictions[0]["label"] 

True

In [176]:
kaggle_test = pd.read_csv("test.csv")
kaggle_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [177]:
kaggle_test.iloc[0, 3]

'Just happened a terrible car crash'

In [178]:
def api_call(text):  
    payload = {"instances" : text}
    response = text_classifier.predict(json.dumps(payload))
    predictions = json.loads(response)
    if predictions[0]["label"] == ['__label__FALSE']:
        return 0
    else:
        return 1

In [179]:
api_call("Convair was an american aircraft manufacturing company which later expanded into rockets and spacecraft.")

0

In [180]:
def kaggle():
    target = []
    for text in kaggle_test["text"]:
        print(text)
        payload = {"instances" : text}
        response = text_classifier.predict(json.dumps(payload))
        predictions = json.loads(response)
        if predictions[0]["label"] == ['__label__FALSE']:
            result = 0
        else:
            result = 1
    target = target.append(result)
    return target


In [None]:
kaggle_test["text"]

In [None]:
kaggle_test[kaggle["id"] == 69, "text"]

In [181]:
target = kaggle()

Just happened a terrible car crash
Heard about #earthquake is different cities, stay safe everyone.
there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all
Apocalypse lighting. #Spokane #wildfires
Typhoon Soudelor kills 28 in China and Taiwan
We're shaking...It's an earthquake
They'd probably still show more life than Arsenal did yesterday, eh? EH?
Hey! How are you?
What a nice hat?
Fuck off!
No I don't like cold!
NOOOOOOOOO! Don't do that!
No don't tell me that!
What if?!
Awesome!
Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market http://t.co/irWqCEZWEU
@sunkxssedharry will you wear shorts for race ablaze ?
#PreviouslyOnDoyinTv: Toke MakinwaÛªs marriage crisis sets Nigerian Twitter ablaze... http://t.co/CMghxBa2XI
Check these out: http://t.co/rOI2NSmEJJ http://t.co/3Tj8ZjiN21 http://t.co/YDUiXEfIpE http://t.co/LxTjc87KLS #nsfw
PSA: IÛªm splitting my personalities.

?? techies follow @ablaze_co
??

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from model with message "unable to evaluate payload provided". See https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logEventViewer:group=/aws/sagemaker/Endpoints/blazingtext-2020-10-01-07-18-13-469 in account 824782811147 for more information.

In [None]:
kaggle_test["target"]

In [None]:
kaggle_test