In [2]:
import sagemaker
from sagemaker import get_execution_role
import boto3
import pandas as pd
import awswrangler as wr

In [1]:
pip install awswrangler

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting awswrangler
  Downloading awswrangler-2.18.0-py3-none-any.whl (255 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.3/255.3 KB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pg8000<2.0.0,>=1.20.0
  Downloading pg8000-1.29.4-py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.4/51.4 KB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting opensearch-py<3,>=1
  Downloading opensearch_py-2.0.1-py2.py3-none-any.whl (214 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.7/214.7 KB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting requests-aws4auth<2.0.0,>=1.1.1
  Downloading requests_aws4auth-1.1.2-py2.py3-none-any.whl (24 kB)
Collecting pymysql<2.0.0,>=1.0.0
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4

In [3]:
# Read the gzip-compressed train and test CSV files from S3.
train = wr.s3.read_csv(
    path='s3://bucket-to-snowflake/linearRegression_train.csv.gz',
    compression='gzip',
)
test = wr.s3.read_csv(
    path='s3://bucket-to-snowflake/linearRegression_test.csv.gz',
    compression='gzip',
)

In [4]:
# Display the training frame (the rendered output shows its first and last rows).
train

Unnamed: 0,ORDER_ID,USER_ID,EVAL_SET,ORDER_NUMBER,PRODUCT_ID,REORDERED,USER_PREDICTOR,PRODUCT_PREDICTOR,UP_PREDICTOR
0,809799,155035,train,4,13944,1,4,248,1
1,809799,155035,train,4,22389,0,4,5,1
2,809799,155035,train,4,29576,0,4,2,1
3,809799,155035,train,4,26372,0,4,37,1
4,2089846,155038,train,12,29928,1,5,18,1
...,...,...,...,...,...,...,...,...,...
1084611,2546866,155029,train,17,3048,1,12,8,1
1084612,2546866,155029,train,17,34702,1,12,23,1
1084613,2546866,155029,train,17,13920,0,12,46,1
1084614,2546866,155029,train,17,7326,0,12,26,1


In [5]:
# Display the test frame (the rendered output shows its first and last rows).
test

Unnamed: 0,ORDER_ID,USER_ID,ORDER_NUMBER,PRODUCT_ID,REORDERED,USER_PREDICTOR,PRODUCT_PREDICTOR,UP_PREDICTOR,ROWNO
0,1,112108,4,49302,1,8,8,1,1
1,1,112108,4,11109,1,8,144,1,2
2,1,112108,4,10246,0,8,1062,1,3
3,1,112108,4,49683,0,8,2413,1,4
4,1,112108,4,43633,1,8,24,1,5
...,...,...,...,...,...,...,...,...,...
299995,730552,47163,43,34308,1,9,30,1,299996
299996,730552,47163,43,14705,1,9,85,1,299997
299997,730552,47163,43,27243,1,9,329,1,299998
299998,730552,47163,43,36086,1,9,368,1,299999


# Model

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
# Features: everything in `train` except the identifier columns, the split
# marker and the label itself.
x_train = train.drop(columns=['ORDER_ID', 'USER_ID', 'EVAL_SET', 'PRODUCT_ID', 'REORDERED'])
# Label: the REORDERED column.
y_train = train['REORDERED']

In [8]:
# Logistic-regression classifier with a fixed random_state; every other
# hyperparameter is left at the scikit-learn default.
log = LogisticRegression(random_state=42)

In [9]:
# Train on the prepared features/labels. scikit-learn's fit() returns the
# estimator itself, so `model` refers to the same fitted object as `log`.
model = log.fit(x_train, y_train)

In [10]:
# Display the feature matrix that was passed to fit().
x_train

Unnamed: 0,ORDER_NUMBER,USER_PREDICTOR,PRODUCT_PREDICTOR,UP_PREDICTOR
0,4,4,248,1
1,4,4,5,1
2,4,4,2,1
3,4,4,37,1
4,12,5,18,1
...,...,...,...,...
1084611,17,12,8,1
1084612,17,12,23,1
1084613,17,12,46,1
1084614,17,12,26,1


# Prediction

In [11]:
# Build the model inputs for the test set by removing the identifier columns,
# the label and ROWNO.
test_simulation = test.drop(
    columns=['ORDER_ID', 'USER_ID', 'PRODUCT_ID', 'REORDERED', 'ROWNO'],
)

In [12]:
# Predict a class label for every test row and cast the result to int.
test_pred = model.predict(test_simulation).astype(int)

In [13]:
# Attach the predictions as a `prediction` column; assign() returns a new
# frame, so `test` itself is not modified.
test_w_pred = test.assign(prediction = test_pred)

In [14]:
# Display the test frame together with the new `prediction` column.
test_w_pred

Unnamed: 0,ORDER_ID,USER_ID,ORDER_NUMBER,PRODUCT_ID,REORDERED,USER_PREDICTOR,PRODUCT_PREDICTOR,UP_PREDICTOR,ROWNO,prediction
0,1,112108,4,49302,1,8,8,1,1,0
1,1,112108,4,11109,1,8,144,1,2,0
2,1,112108,4,10246,0,8,1062,1,3,0
3,1,112108,4,49683,0,8,2413,1,4,1
4,1,112108,4,43633,1,8,24,1,5,0
...,...,...,...,...,...,...,...,...,...,...
299995,730552,47163,43,34308,1,9,30,1,299996,1
299996,730552,47163,43,14705,1,9,85,1,299997,1
299997,730552,47163,43,27243,1,9,329,1,299998,1
299998,730552,47163,43,36086,1,9,368,1,299999,1


# Evaluation

In [15]:
from sklearn.metrics import f1_score

In [16]:
# Ground-truth labels (cast to int) and predicted labels, both read back from
# the combined frame.
y_true = test_w_pred['REORDERED'].astype(int)
y_pred = test_w_pred['prediction']

In [17]:
# F1 on the test set, averaged across classes with average='weighted'.
f1_score(y_true=y_true, y_pred=y_pred, average='weighted')

0.6293961731610169

# Load data to S3 for DynamoDB

In [18]:
# Select the columns to export, in output order.
load_data = test_w_pred.loc[:, [
    'USER_ID', 'PRODUCT_ID',
    'USER_PREDICTOR', 'PRODUCT_PREDICTOR', 'UP_PREDICTOR',
    'prediction', 'REORDERED',
]]

In [19]:
# Display the export frame.
load_data

Unnamed: 0,USER_ID,PRODUCT_ID,USER_PREDICTOR,PRODUCT_PREDICTOR,UP_PREDICTOR,prediction,REORDERED
0,112108,49302,8,8,1,0,1
1,112108,11109,8,144,1,0,1
2,112108,10246,8,1062,1,0,0
3,112108,49683,8,2413,1,1,0
4,112108,43633,8,24,1,0,1
...,...,...,...,...,...,...,...
299995,47163,34308,9,30,1,1,1
299996,47163,14705,9,85,1,1,1
299997,47163,27243,9,329,1,1,1
299998,47163,36086,9,368,1,1,1


In [21]:
# Write the export frame to a CSV file in the notebook's working directory.
# to_csv() is called with its defaults, so the row index is written as the
# first column of the file.
file_name = "load_data.csv"
load_data.to_csv(file_name)

In [22]:
# Upload the local CSV to the destination bucket under the chosen object key.
s3 = boto3.resource('s3')
s3.meta.client.upload_file(
    Filename=file_name,
    Bucket='sagemaker-pred-data',
    Key='test_w_pred0',
)