In [3]:
import sagemaker
from sagemaker import get_execution_role
import boto3
import pandas as pd
import awswrangler as wr

In [4]:
# pip install awswrangler

# 1. load data from S3

In [5]:
order = wr.s3.read_csv(path="s3://bucket-to-snowflake/orders.csv")
op_train = wr.s3.read_csv(path='s3://bucket-to-snowflake/order_products__train.csv.gz', compression='gzip')
op_prior = wr.s3.read_csv(path='s3://bucket-to-snowflake/order_products__prior.csv.gz', compression='gzip')
product = wr.s3.read_csv(path='s3://bucket-to-snowflake/products.csv')

# 2. reshape data

In [6]:
# change data type
order['eval_set'] = order['eval_set'].astype('category')
product['product_name'] = product['product_name'].astype('category')

In [7]:
op = order.merge(op_prior, on = 'order_id', how = 'inner')

In [8]:
# user predictor 
user = op.groupby('user_id')['order_number'].count().to_frame('user_total_orders')

In [9]:
# product predictor
products = op.groupby('product_id')['order_number'].count().to_frame('product_total_purchase')

In [10]:
# user product predictor
user_product = op.groupby(['user_id','product_id'])['order_id'].count().to_frame('user_total_on_product')

In [11]:
user = user.reset_index()
user_product = user_product.reset_index()
products = products.reset_index()

## merge predictors

In [12]:
predictor0 = user_product.merge(user, on = 'user_id', how = 'left')

In [13]:
predictor = predictor0.merge(products, on = 'product_id', how = 'left')

In [14]:
predictor

Unnamed: 0,user_id,product_id,user_total_on_product,user_total_orders,product_total_purchase
0,1,196,10,59,35791
1,1,10258,9,59,1946
2,1,10326,1,59,5526
3,1,12427,10,59,6476
4,1,13032,3,59,3751
...,...,...,...,...,...
13307948,206209,43961,3,129,55371
13307949,206209,44325,1,129,3485
13307950,206209,48370,1,129,3934
13307951,206209,48697,1,129,9783


# 3. training dataset

In [15]:
order_s = order[['user_id','eval_set','order_id']]

In [None]:
order_predictor = predictor.merge(order_s, on = 'user_id', how = 'left')

In [None]:
order_predictor.isnull().sum

In [None]:
order_predictor.dropna()

In [None]:
# define test & train dataset
train = order_predictor[order_predictor.eval_set == 'train'][:2000]
test = order_predictor[order_predictor.eval_set == 'train'][2000:]

In [None]:
train_merge = train.merge(op_train[['product_id','order_id','reordered']], on = ['product_id'], how='left')
train_merge = train_merge.dropna()
train_merge = train_merge.set_index(['user_id', 'product_id'])
train_merge = train_merge.drop(['eval_set','order_id_x','order_id_y'], axis=1)

## testing dataset

In [None]:
test_merge = test.merge(op_train[['product_id','order_id','reordered']], on = ['product_id'], how='left')
test_merge = test_merge.dropna()
test_merge = test_merge.drop(['eval_set','order_id_x','order_id_y'], axis=1)
test_merge = test_merge.drop_duplicates(keep='first')

In [None]:
# store the true value
test_true = test_merge

In [None]:
test_simultion = test_merge.drop(['reordered','user_id','product_id'], axis=1)

# 3. model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
x_train, y_train = train_merge.drop('reordered', axis=1), train_merge.reordered

In [None]:
log = LogisticRegression(random_state=42)

In [None]:
model = log.fit(x_train, y_train)

# 4. prediction

In [None]:
test_pred = model.predict(test_simulation).astype(int)

In [None]:
test_simulation['prediction'] = test_pred

In [None]:
final = test_simulation.reset_index()

# 5. evaluation

In [None]:
from sklearn.metrics import f1_score

In [None]:
y_pred = final['prediction']

In [None]:
y_true = test_true['reordered'].astype(int)

In [None]:
f1_score(y_true,y_pred,average = 'weighted')

22:15 NOV 30 2022