In [3]:
import sagemaker as sage
import pandas as pd
from time import gmtime, strftime
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch
import os
import numpy as np
import shutil

# data preparation

In [1]:
import pandas as pd   
import numpy as np
import os
import shutil

#preprocess data
def write_txt(df,path):
    '''
    Append one "<text> #### <label>" line per row of ``df`` to ``path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``text`` (string values; surrounding
        whitespace is stripped) and ``label`` (rendered with ``str.format``).
    path : str
        Destination file. It is opened in append mode, so calling the
        function again on the same path keeps adding lines.
    '''
    # Encoding is pinned to UTF-8 so non-ASCII review text (e.g. curly
    # apostrophes) is written the same way regardless of the platform locale.
    with open(path, 'a', encoding='utf-8') as f:
        # Walk the two columns in row order; the frame's index is irrelevant,
        # so no reset_index()/label-based lookup is needed.
        for text, label in zip(df['text'], df['label']):
            f.write("{} #### {}\n".format(text.strip(), label))
            
            
def mkdir_rm(folder):
    '''
    Recreate ``folder`` as an empty directory.

    An existing directory at ``folder`` is deleted together with its
    contents, then the directory is created again. Missing parent
    directories are created as well.
    '''
    if os.path.exists(folder):
        shutil.rmtree(folder)
    # os.makedirs (rather than os.mkdir) so nested targets such as
    # './data/tasd/haofangReview' work when the parents do not exist yet.
    os.makedirs(folder)
    print ("<< path valid!")
    

def preprocess_data(input_file,output_path,over_sample=True,random_state=None):
    '''
    Read the labelled CSV, shuffle it and write train/dev/test text files.

    Steps: drop rows whose ``label`` is the literal string ``'[]'``, recreate
    ``output_path`` via ``mkdir_rm``, write the tag list to ``tag.txt`` in the
    current working directory, shuffle the rows and split them 80/10/10 into
    ``train.txt``, ``dev.txt`` and ``test.txt`` under ``output_path``.

    Parameters
    ----------
    input_file : str
        CSV file with at least the columns ``text`` and ``label``.
    output_path : str
        Folder that receives the three split files. It is removed and
        recreated on every call.
    over_sample : bool
        Currently unused; kept so existing callers keep working.
    random_state : int or None
        Seed forwarded to ``DataFrame.sample`` for a reproducible shuffle.
        ``None`` (the default) keeps the previous unseeded behaviour.
    '''
    reviews = pd.read_csv(input_file)
    # Rows whose label column is the string '[]' carry no annotation.
    reviews = reviews[reviews['label']!='[]']
    print (reviews.head())

    #remove & remake the output folder
    mkdir_rm(output_path)

    #generate tag.txt
    # NOTE(review): tag.txt is written to the current working directory, not
    # to output_path - confirm that is where the training script expects it.
    a_list = ['consumer','zone','target','consequence','product','product_spec']
    with open('tag.txt', 'w') as filehandle:
        filehandle.writelines("%s\n" % tag for tag in a_list)

    #train/test/val split
    # Positional slices give the same 80/10/10 partition as np.split did,
    # without going through the deprecated DataFrame.swapaxes path.
    shuffled = reviews.sample(frac=1, random_state=random_state)
    n_rows = len(shuffled)
    train_end = int(.8*n_rows)
    validate_end = int(.9*n_rows)
    train = shuffled.iloc[:train_end]
    validate = shuffled.iloc[train_end:validate_end]
    test = shuffled.iloc[validate_end:]

    print ("training size: ",train.shape)
    print ("test size: ",test.shape)
    print ("validate size: ",validate.shape)

    # write train/test/dev
    write_txt(train,os.path.join(output_path,'train.txt'))
    write_txt(test,os.path.join(output_path,'test.txt'))
    write_txt(validate,os.path.join(output_path,'dev.txt'))
    print ("<<<finish data preparing!")
    
# Source CSV and destination folder for the generated split files.
input_file = './aspect_category.csv'
output_path = './data/tasd/haofangReview'

# Run the preprocessing step.
preprocess_data(
    input_file=input_file,
    output_path=output_path,
    over_sample=False,
)

   Unnamed: 0  sent_num                                               text  \
0           0         0  We are new to the sport and have not used othe...   
1           1         1  Bought for my parents retirement. They are lov...   
2           2         2  Good set.  Paddles and balls are both good qua...   
3           3         3  Got into Pickleball this year and researched a...   
4           4         4  I love these paddles and the case but didn’t r...   

   sent_start  sent_end  sent_len  \
0           0        85        85   
1          85       140        55   
2         140       191        51   
3         191       590       399   
4         590       768       178   

                                               label  
0                           [('paddles', 'product')]  
1                          [('parents', 'consumer')]  
2                 [('Paddles and balls', 'product')]  
3  [('paddle', 'product'), ('me and my wife', 'co...  
4  [('paddles', 'product'), ('did

# train

In [4]:
# SageMaker session used for the S3 upload below.
sess = sage.Session()

# Local folder that is uploaded to S3.
WORK_DIRECTORY = "./data"

# S3 prefix
prefix = "haofangReview"

# NOTE(review): `role` is assigned again in the estimator-config and deploy cells.
role = get_execution_role()

# Upload WORK_DIRECTORY under `prefix`; the returned location is the base of
# the training input built in a later cell.
data_location = sess.upload_data(WORK_DIRECTORY, key_prefix=prefix)

In [5]:
# Hyperparameters handed to the PyTorch estimator; every value is a string.
hyperparameters = dict(
    task="tasd",
    dataset="haofangReview",
    model_name_or_path="t5-base",
    paradigm="extraction",
    eval_batch_size="16",
    train_batch_size="2",
    learning_rate="3e-4",
    num_train_epochs="1",
    n_gpu="1",
)

In [6]:
# Settings passed to the PyTorch estimator constructed in the next cell.
entry_point = 'finetune.py'
source_dir = './'
git_config = None
# NOTE(review): repeats the `role` assignment made in the upload cell.
role = get_execution_role()
framework_version = '1.7.1'
py_version='py36'
instance_type='ml.p3.2xlarge'
# Alternative instance type kept for reference (presumably SageMaker local mode - confirm before use).
#instance_type='local_gpu'
instance_count=1

In [7]:
# Build the PyTorch estimator from the settings defined in the cells above.
estimator = PyTorch(
    # training code and its arguments
    entry_point=entry_point,
    source_dir=source_dir,
    git_config=git_config,
    hyperparameters=hyperparameters,
    # container versions
    framework_version=framework_version,
    py_version=py_version,
    # permissions and hardware
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    debugger_hook_config=False,
)

In [8]:
# Single input channel named 'tasd', pointing at the uploaded tasd/ folder.
inputs = dict(tasd=data_location + '/tasd/')

In [10]:
# Start training with the channel mapping defined above.
# NOTE(review): the recorded output of this cell is
# "OSError: [Errno 28] No space left on device" - investigate before re-running.
response = estimator.fit(inputs)

OSError: [Errno 28] No space left on device

# deploy 

In [14]:
import sagemaker

# NOTE(review): rebinds `instance_type` (the training config set it to
# 'ml.p3.2xlarge'); from here on it is the instance type passed to deploy().
instance_type = 'ml.m5.4xlarge'
# NOTE(review): third assignment of `role` in this notebook.
role = sagemaker.get_execution_role()

In [16]:
# Alternative: take the artifact location from the estimator trained in this session.
#s3_model = estimator.model_data 
# NOTE(review): hard-coded artifact URI tied to one bucket and one training
# job; consider moving it to a config constant before sharing the notebook.
s3_model = "s3://sagemaker-us-east-1-726335585155/pytorch-training-2022-06-16-07-12-35-530/output/model.tar.gz"

In [23]:
# List the model artifact at the same URI that is assigned to `s3_model`.
!aws s3 ls s3://sagemaker-us-east-1-726335585155/pytorch-training-2022-06-16-07-12-35-530/output/model.tar.gz

2022-06-16 07:55:55 3452559113 model.tar.gz


In [18]:
from sagemaker.pytorch.model import PyTorchModel

# Model definition: the artifact at `s3_model` with inference.py as entry point.
# TODO set model_server_workers=1 to avoid torchhub bug
pytorch_model = PyTorchModel(
    model_data=s3_model,
    role=role,
    entry_point='inference.py',
    source_dir='./',
    framework_version='1.7.1',
    py_version='py36',
)

# Deploy to one instance of `instance_type`.
# NOTE(review): the recorded output of this cell is a 404 ClientError from
# a HeadObject call - investigate before relying on `predictor`.
predictor = pytorch_model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
)

ClientError: An error occurred (404) when calling the HeadObject operation: Not Found

In [None]:
from boto3.session import Session
import json

# Request payload: one review sentence under the "inputs" key.
body = {"inputs": "I am pretty new to pickleball and finally decided to try out some different paddles."}

session = Session()
# NOTE(review): "runtime.sagemaker" looks like the legacy client name;
# "sagemaker-runtime" is the documented one - confirm before changing.
runtime = session.client("runtime.sagemaker")
# `predictor` comes from the deploy cell, so that cell must have succeeded.
# NOTE(review): this rebinds `response`, which the training cell also assigns.
response = runtime.invoke_endpoint(
    EndpointName=predictor.endpoint_name,
    ContentType="application/json",
    Body=json.dumps(body),
)
# Read the response body and parse it as JSON.
result = json.loads(response["Body"].read())
print (result)