Source:
https://github.com/aws-samples/aws-machine-learning-university-accelerated-nlp/blob/master/notebooks/MLA-NLP-Lecture2-Sagemaker.ipynb

In [1]:
# Install the packages listed in requirements.txt; -q keeps pip's output quiet.
%pip install -q -r requirements.txt

You should consider upgrading via the '/home/studio-lab-user/.conda/envs/default/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import re

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sagemaker.session import Session
import sagemaker
import boto3

# Fetch the NLTK data packages 'punkt' and 'stopwords'.
# The return value of the last call is what the cell displays.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Credentials are read from the environment instead of being hard-coded in the
# notebook, so they are never saved or committed with it. When a variable is
# unset the value is None, and boto3 then falls back to its default credential
# chain (shared credentials file, instance/role credentials, ...).
AWS_ACCESS_KEY = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET = os.environ.get('AWS_SECRET_ACCESS_KEY')

# Region in which the boto3 session (and everything built on it) operates.
region_name = 'us-east-2'

boto_session = boto3.session.Session(
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET,
    region_name=region_name
)

# SageMaker session bound to the boto3 session configured above.
sagemaker_session = Session(boto_session=boto_session)

# Plain string: the original f-string had no placeholders.
model_package_group_name = "AmazonModelPackageGroupName"

# IAM role ARN handed to the SageMaker estimator.
role = 'arn:aws:iam::013747046745:role/sagemaker-role-amazon'

In [4]:
# Load the training sample (first CSV column becomes the index) and discard
# every row that contains at least one missing value.
df = pd.read_csv('train_10k.csv', index_col=0).dropna(axis=0, how='any')
df.head()

Unnamed: 0,label,text
0,1,Stuning even for the non-gamerThis sound track...
1,1,The best soundtrack ever to anything.I'm readi...
2,1,Amazing!This soundtrack is my favorite music o...
3,1,Excellent SoundtrackI truly like this soundtra...
4,1,"Remember, Pull Your Jaw Off The Floor After He..."


In [5]:
# Class-balance check: mean of the 'label' column
# (the previewed labels are 0/1, so this reads as the share of 1s -- TODO confirm
# that no other label values occur).
df.loc[:, 'label'].mean()

0.4903

In [6]:
# Load the held-out test sample (first CSV column becomes the index) and
# discard every row that contains at least one missing value.
df_test = pd.read_csv('test_5k.csv', index_col=0).dropna(axis=0, how='any')
df_test.head()

Unnamed: 0,label,text
0,1,Great CDMy lovely Pat has one of the GREAT voi...
1,1,One of the best game music soundtracks - for a...
2,0,Batteries died within a year ...I bought this ...
3,1,"works fine, but Maha Energy is betterCheck out..."
4,1,Great for the non-audiophileReviewed quite a b...


#### Train-Validation-Split on Dummy 10k

In [7]:
# Hold out 20% of the labelled rows for validation. Shuffling uses a fixed
# seed so the split is the same on every run.
split_options = dict(test_size=0.20, shuffle=True, random_state=324)
X_train, X_val, y_train, y_val = train_test_split(
    df[["text"]],
    df["label"],
    **split_options
)

# The test set comes from its own file. Note that X_test is a Series, whereas
# X_train / X_val are single-column DataFrames (selected with double brackets).
X_test = df_test['text']
y_test = df_test['label']

#### Data processing with Pipeline and ColumnTransform

In [8]:
# Names of the input feature column(s) and of the target column.
model_features, model_target = ['text'], 'label'

In [19]:
# TF-IDF vectorizer over word tokens of two or more word characters.
# Text is lower-cased, accents are stripped to ASCII, English stop words are
# removed, and terms appearing in more than 95% of documents or in fewer than
# 5 documents are dropped.
tfidf = TfidfVectorizer(
    strip_accents='ascii',
    lowercase=True,
    analyzer='word',
    stop_words='english',
    token_pattern=r'(?u)\b\w\w+\b',
    max_df=0.95,
    min_df=5,
)

# Learn vocabulary and IDF weights from the training text only.
tfidf.fit(X_train['text'])


TfidfVectorizer(max_df=0.95, min_df=5, stop_words='english',
                strip_accents='ascii')

In [20]:
# Replace each split's raw text with its dense TF-IDF matrix.
# NOTE: this cell overwrites its own inputs, so it cannot be re-run on its own;
# X_train / X_val would no longer be DataFrames with a 'text' column. Re-run
# the train/validation split cell first.
X_train, X_val, X_test = [
    tfidf.transform(texts).toarray()
    for texts in (X_train['text'], X_val['text'], X_test)
]

#### Train a classifier with SageMaker built-in algorithm

In [21]:
# Configure SageMaker's built-in LinearLearner as a binary classifier that
# trains on a single ml.m4.xlarge instance.
# sagemaker_session is passed explicitly so the estimator uses the session
# built from the notebook's boto3 session (credentials and region). Without it
# the SDK creates its own default session and the configured one is never used.
linear_classifier = sagemaker.LinearLearner(role=role,
                                            instance_count=1,
                                            instance_type='ml.m4.xlarge',
                                            predictor_type='binary_classifier',
                                            sagemaker_session=sagemaker_session)

In [22]:
# Build one record set per channel. Features and labels are both cast to
# float32 before being handed to the estimator's record_set().
train_records, val_records, test_records = [
    linear_classifier.record_set(features.astype("float32"),
                                 labels.values.astype("float32"),
                                 channel=channel)
    for features, labels, channel in (
        (X_train, y_train, 'train'),
        (X_val, y_val, 'validation'),
        (X_test, y_test, 'test'),
    )
]

In [23]:
%%time
linear_classifier.fit([train_records,
                       val_records,
                       test_records],
                      logs=False)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.



2022-06-01 04:59:35 Starting - Starting the training job.....
2022-06-01 05:00:00 Starting - Preparing the instances for training............
2022-06-01 05:01:10 Downloading - Downloading input data.......
2022-06-01 05:01:49 Training - Downloading the training image.........
2022-06-01 05:02:36 Training - Training image download completed. Training in progress........
2022-06-01 05:03:21 Uploading - Uploading generated training model..
2022-06-01 05:03:31 Completed - Training job completed
CPU times: user 339 ms, sys: 34 ms, total: 373 ms
Wall time: 4min 1s


#### Model Evaluation

In [24]:
# Fetch the test-channel accuracy reported by the training job as a DataFrame.
# The job name comes from the estimator's public latest_training_job handle
# rather than the private _current_job_name attribute, and the estimator's own
# session is reused so the lookup runs with the same credentials and region as
# the training job instead of a freshly created default session.
sagemaker.analytics.TrainingJobAnalytics(
    linear_classifier.latest_training_job.name,
    metric_names=['test:binary_classification_accuracy'],
    sagemaker_session=linear_classifier.sagemaker_session,
).dataframe()

Unnamed: 0,timestamp,metric_name,value
0,0.0,test:binary_classification_accuracy,0.819382


#### Deploy Model to Endpoint

In [25]:
%%time
linear_classifier_predictor = linear_classifier.deploy(initial_instance_count = 1,
                                                       instance_type = 'ml.t2.medium',
                                                       endpoint_name = 'NLPLinearLearnerEndpoint'
                                                      )

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


-------!CPU times: user 536 ms, sys: 8.11 ms, total: 544 ms
Wall time: 3min 31s


#### Test the Endpoint

In [27]:
# Send only the first 50 test rows, split into 25 batches, to keep each
# request small and avoid exceeding limits.
n_rows, n_batches = 50, 25
endpoint_input = X_test[:n_rows].astype("float32")
prediction_batches = [
    linear_classifier_predictor.predict(batch)
    for batch in np.array_split(endpoint_input, n_batches)
]

# Print the predicted score of every record in the first batch.
first_batch_scores = [
    record.label['score'].float32_tensor.values[0]
    for record in prediction_batches[0]
]
print(first_batch_scores)

[0.713131308555603, 0.8066731095314026]


#### Kill the Endpoint

In [28]:
# Delete the endpoint created by the deploy cell once testing is finished.
linear_classifier_predictor.delete_endpoint()