In [1]:
import pandas as  pd
import re
import ast
import torch
import torch.nn.functional as func
import numpy as np
import torch.utils.data as data
import torch.nn as nn
import torch.optim as optim

In [20]:
def get_data(path):
    with open(path, 'r') as f:
        lines = f.readlines()
    json_formatted = '['
    for line in lines:
        json_formatted += line + ','
    json_formatted = json_formatted[:-1] + ']'
    json_formatted = re.sub('null', 'None', json_formatted)
    return ast.literal_eval(json_formatted)

In [21]:
df_sessions = pd.DataFrame(get_data('sessions.jsonl'))
df_sessions['timestamp'] = pd.to_datetime(df_sessions['timestamp'])
df_sessions

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id
0,124,2021-02-16 14:51:39,102,1222,VIEW_PRODUCT,10,
1,124,2021-02-16 14:52:31,102,1072,VIEW_PRODUCT,10,
2,124,2021-02-16 14:54:35,102,1073,VIEW_PRODUCT,10,
3,124,2021-02-16 14:59:05,102,1201,VIEW_PRODUCT,10,
4,125,2020-06-10 19:14:06,102,1067,VIEW_PRODUCT,0,
...,...,...,...,...,...,...,...
51548,14658,2021-05-04 01:39:24,401,1010,VIEW_PRODUCT,0,
51549,14658,2021-05-04 01:42:16,401,1006,VIEW_PRODUCT,0,
51550,14658,2021-05-04 01:42:40,401,1011,VIEW_PRODUCT,0,
51551,14658,2021-05-04 01:44:30,401,1013,VIEW_PRODUCT,0,


In [2]:
def user_info(users):
    df_info = copy.deepcopy(users)
    df_info.drop(columns=['street'], inplace=True)
    df_info['name'] = df_info['name'].str.split(' ').str.get(0)
    df_info['name'] = df_info['name'].str.endswith('a')
    df_info.rename(columns={'name':'sex'}, inplace=True)
    df_info['sex'] = df_info['sex'].astype(int)

    return df_info

In [4]:
def categories_to_tensor(categorical, categories):
    tensor = torch.zeros(categories.shape[0])
    for category in categorical:
        tensor += func.one_hot(torch.from_numpy(np.where(categories == category)[0])[0], num_classes=categories.shape[0])
    return tensor

In [5]:
def merge_with_products(session, products):
    session['purchase_id'] = session.loc[:,('purchase_id')].notna().astype(int)
    session = pd.merge(session, products, on='product_id', how="left")
    session.rename(columns={'purchase_id':'purchase'}, inplace=True)
    return session

In [6]:
def set_spent_time(session):
    session_end = session.iloc[-1]['timestamp']
    session_start = session.iloc[0]['timestamp']
    session['spent_time'] = session_end - session_start
    return session

In [7]:
def merge_session_to_one_row(session):
    merged_session = pd.DataFrame(columns=['spent_time', 'offered_discount', 'sex', 'n_of_products_seen', 'sum_of_products_price', 'cheapest_prod', 'most_exp_prod'])
    session_last = session.iloc[-1]
    merged_session.loc[0] = [session_last['spent_time'].total_seconds(), session_last['offered_discount'], session_last['sex'], session['product_id'].unique().shape[0], session['price'].sum(), session['price'].min(), session['price'].max()]
    return merged_session


In [8]:
def set_root_category(session):
    session['category_path'] = session['category_path'].str.split(';').str.get(0)
    session.rename(columns={'category_path':'root_category'}, inplace=True)

    return session

In [9]:
def get_cities(users):
    cities = users['city'].unique()
    return cities

In [10]:
def get_categories(products):
    categories = products['category_path'].str.split(';').str.get(0).unique()
    return categories

In [11]:
def session_to_single_rows(session, products, users):
    user_inf = user_info(users)
    session = merge_with_products(session, products)

    target = torch.tensor(session.iloc[-1]['purchase'], dtype=torch.float32)

    session = set_root_category(session)
    session.drop(columns=['event_type', 'product_name'], inplace=True)
    session = session.merge(user_inf, on='user_id', how='left')
    session.drop(columns='user_id', inplace=True)
    session = set_spent_time(session)
    session['day'] = session.iloc[0]['timestamp'].dayofweek
    session['month'] = session.iloc[0]['timestamp'].month
    categorical_columns_one = session.iloc[0][['city', 'day', 'month']]

    categorical_columns_many = session['root_category'].unique()

    merged_session = merge_session_to_one_row(session)

    return merged_session, categorical_columns_one, categorical_columns_many, target

In [12]:
def session_to_tensor(session):
    products = pd.DataFrame(get_data('products.jsonl'))
    users = pd.DataFrame(get_data('users.jsonl'))
    merged_session, categorical_columns_one, categorical_columns_many, target = session_to_single_rows(session, products, users)
    numerical_values = torch.from_numpy(merged_session.values)[0].float()

    cities = get_cities(users)
    categories = get_categories(products)

    day = func.one_hot(torch.tensor(categorical_columns_one['day']), num_classes=7)
    month = func.one_hot(torch.tensor(categorical_columns_one['month']-1), num_classes=12)
    city = func.one_hot(torch.from_numpy(np.where(cities == categorical_columns_one['city'])[0])[0], num_classes=cities.shape[0])
    cat_many = categories_to_tensor(categorical_columns_many, categories)
    
    categorical_values = torch.cat([cat_many, day, month, city])



    return numerical_values, categorical_values, target

In [13]:
def get_sessions_tensors(df_sessions):
    sessions = df_sessions.groupby('session_id')
    numerical_data = []
    categorical_data = []
    targets = []
    sessions_id = df_sessions['session_id'].unique()
    for i ,session_id in enumerate(sessions_id):
        session = sessions.get_group(session_id)
        session_tensor = session_to_tensor(session)
        numerical_data.append(session_tensor[0])
        categorical_data.append(session_tensor[1])
        targets.append(session_tensor[2])

    return torch.stack(numerical_data), torch.stack(categorical_data), torch.stack(targets)

In [14]:
df_sessions_small = df_sessions.iloc[:50]
df_sessions_small

NameError: name 'df_sessions' is not defined

In [57]:
numerical_data, categorical_data, targets = get_sessions_tensors(df_sessions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  session['purchase_id'] = session.loc[:,('purchase_id')].notna().astype(int)


In [58]:
categorical_data

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 1., 0.,  ..., 0., 1., 0.],
        [0., 1., 0.,  ..., 0., 1., 0.],
        [0., 0., 1.,  ..., 0., 1., 0.]])

In [59]:
targets

tensor([0., 1., 1.,  ..., 0., 1., 0.])

In [60]:
train_indices = np.random.rand(len(numerical_data))>0.3

In [61]:
train_numerical = numerical_data[train_indices]
train_categorical = categorical_data[train_indices]
train_targets = targets[train_indices]

test_numerical = numerical_data[~train_indices]
test_categorical = categorical_data[~train_indices]
test_targets = targets[~train_indices]

In [62]:
train_dataset = data.TensorDataset(train_numerical,train_categorical,train_targets)
test_dataset = data.TensorDataset(test_numerical,test_categorical,test_targets)

# Model naiwny - bazujący na czasie trwania sesji

In [63]:
class NaiveClassifier:
    def __init__(self):
        self.time_mean = 0

    def train(self, train_numerical):
        self.time_mean = train_numerical[:, 0].mean()

    def forward(self, numerical):
        return (numerical[:, 0]>self.time_mean).int()

In [64]:
naive = NaiveClassifier()
naive.train(train_numerical)
out = naive.forward(test_numerical)
correct = out.eq(test_targets.view_as(out)).sum().item()
total = test_numerical.shape[0]

In [65]:
import pickle
with open('naive.pkl', 'wb') as f:
    pickle.dump(naive, f)

In [66]:
correct/total

0.4638985005767013

# Model bardziej zaawansowany - sieć neuronowa

In [67]:
class SessionClassifier(nn.Module):
    def __init__(self):
        super(SessionClassifier, self).__init__()
        self.emb_layer = nn.Linear(categorical_data.shape[1], categorical_data.shape[1])
        self.act_emb = nn.Tanh()
        self.layer1 = nn.Linear(numerical_data.shape[1] + categorical_data.shape[1], 40)
        self.act_1 =  nn.LeakyReLU()
        self.d1 = nn.Dropout(0.4)
        self.layer2 = nn.Linear(40, 20)
        self.act_2 =  nn.LeakyReLU()
        self.d2 = nn.Dropout(0.4)
        self.layer3 = nn.Linear(20, 1)

        self.f = nn.Sigmoid()
    def forward(self, x, cat_x):
        cat_x_embedded = self.emb_layer(cat_x)
        cat_x_embedded = self.act_emb(cat_x_embedded)
        x = torch.cat([x,cat_x_embedded],dim=1)
        activation1 = self.act_1(self.layer1(x))
        activation1 = self.d1(activation1)
        activation2 = self.act_2(self.layer2(activation1))
        activation2 = self.d2(activation2)
        output = self.layer3(activation2)

        output = self.f(output)
        return output

In [68]:
categorical_data.shape[1]

30

In [69]:
numerical_data.shape[1]

7

In [70]:
device = torch.device('cuda')
device

device(type='cuda')

In [71]:
def get_accuracy(model, data_loader):
    correct = 0
    total = 0
    model.eval()
    for x, cat_x, labels in data_loader:
        x, cat_x, labels = x.to(device), cat_x.to(device), labels.to(device)
        output = model(x, cat_x)
        pred = output>0.5
        correct += pred.eq(labels.view_as(pred)).sum().item()
        total += x.shape[0]
    return correct / total

In [72]:
model = SessionClassifier().to(device)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-4)

iters = []
losses = []
train_acc = []
val_acc = []
for n in range(101):
    epoch_losses = []
    for x, cat_x, labels in iter(train_loader):
        x, cat_x, labels = x.to(device), cat_x.to(device), labels.to(device)

        model.train()
        out = model(x, cat_x).squeeze()

        loss = criterion(out, labels)
        loss.backward()
        epoch_losses.append(loss.item())
        optimizer.step()
        optimizer.zero_grad()

    loss_mean = np.array(epoch_losses).mean()
    iters.append(n)
    losses.append(loss_mean)
    test_acc = get_accuracy(model, test_loader)
    if n % 5 == 0:
        print(f"Epoch {n} loss {loss_mean:.3} test_acc: {test_acc:.3}")
    train_acc.append(get_accuracy(model, train_loader)) # compute training accuracy
    val_acc.append(test_acc)  # compute validation accuracy


print("Final Training Accuracy: {}".format(train_acc[-1]))
print("Final Validation Accuracy: {}".format(val_acc[-1]))

Epoch 0 loss 0.703 test_acc: 0.524
Epoch 5 loss 0.695 test_acc: 0.524
Epoch 10 loss 0.691 test_acc: 0.524
Epoch 15 loss 0.669 test_acc: 0.632
Epoch 20 loss 0.63 test_acc: 0.782
Epoch 25 loss 0.612 test_acc: 0.798
Epoch 30 loss 0.604 test_acc: 0.809
Epoch 35 loss 0.601 test_acc: 0.825
Epoch 40 loss 0.595 test_acc: 0.826
Epoch 45 loss 0.593 test_acc: 0.826
Epoch 50 loss 0.59 test_acc: 0.828
Epoch 55 loss 0.589 test_acc: 0.827
Epoch 60 loss 0.59 test_acc: 0.828
Epoch 65 loss 0.587 test_acc: 0.828
Epoch 70 loss 0.586 test_acc: 0.827
Epoch 75 loss 0.585 test_acc: 0.828
Epoch 80 loss 0.587 test_acc: 0.827
Epoch 85 loss 0.587 test_acc: 0.823
Epoch 90 loss 0.585 test_acc: 0.827
Epoch 95 loss 0.585 test_acc: 0.827
Epoch 100 loss 0.585 test_acc: 0.827
Final Training Accuracy: 0.8207843137254902
Final Validation Accuracy: 0.8272202998846597


In [73]:
torch.save(model.state_dict(), 'model.pth')

# Wyznaczmy współczynnik alpha dla tego modelu (SessionClassifier)

In [74]:
test_data = test_dataset[test_dataset[:][2]==1]
test_data = data.TensorDataset(test_data[0], test_data[1], test_data[2])
test_data = torch.utils.data.DataLoader(test_data, batch_size=128, shuffle=False)
get_accuracy(model, test_data)

0.6542372881355932

In [75]:
df_sessions = pd.DataFrame(get_data('sessions.jsonl'))
df_sessions['timestamp'] = pd.to_datetime(df_sessions['timestamp'])

# Test mikroserwisu - wysyłanie requestów

In [15]:
import numpy as np
import copy

pd.set_option('expand_frame_repr', False)

def create_tensor_by_id(session_id, df):
    df = df.where(df['session_id'] == session_id)
    df["purchase_id"] = df["purchase_id"].notna().astype(int)
    df = df.dropna()
    print(df.head())
    try:
        numerical_data, categorical_data, targets = get_sessions_tensors(df)
        return numerical_data, categorical_data, targets
    except Exception as e:
        print('Wrong id')

In [16]:
import json
import requests

def send_request_by_id(session_id, df, model='nn_model'):
    numerical_data, categorical_data, targets = create_tensor_by_id(session_id, df)
    data_set = {
        'x_cat': categorical_data.tolist()[0],
        'x_num': numerical_data.tolist()[0],
        'model': model
    }
    json_request = json.dumps(data_set)
    r = requests.post('http://localhost:5000/predict', json=json_request)
    return r.json()

In [30]:
r = send_request_by_id(1685, df_sessions, 'nn_model')
r

      session_id           timestamp  user_id  product_id    event_type  offered_discount  purchase_id
5618      1685.0 2020-04-14 13:14:49    137.0      1047.0  VIEW_PRODUCT               5.0            0
5619      1685.0 2020-04-14 13:18:55    137.0      1046.0  VIEW_PRODUCT               5.0            0
5620      1685.0 2020-04-14 13:19:24    137.0      1041.0  VIEW_PRODUCT               5.0            0
5621      1685.0 2020-04-14 13:21:06    137.0      1042.0  VIEW_PRODUCT               5.0            0
5622      1685.0 2020-04-14 13:25:21    137.0      1084.0  VIEW_PRODUCT               5.0            0


{'certainty': 100.0, 'result': 0}