# Обучение

# LightFM для Москвы, Питера и остальных городов

# Top Recommender для холодного старта

In [1]:
from lightfm import LightFM
from top_recommender import TopRecommender

In [2]:
import pandas as pd
import numpy as np
import scipy
from tqdm import tqdm
from interaction_table import InteractionTable
from h3_index import H3Index

In [3]:
from process_data import preprocess_orders_and_clicks, additional_filtration_orders_and_clicks
from user_features import generate_user_features

In [4]:
# Read the raw order and click logs from parquet.
orders_path = '../data/orders.parquet'
clicks_path = '../data/clicks.parquet'
orders = pd.read_parquet(orders_path)
clicks = pd.read_parquet(clicks_path)

# Disabled: on-the-fly feature generation (the next cell reads the user
# features from a parquet file instead).
# user_features = generate_user_features(orders, clicks)

# Disabled: region filtering and column rename; every training cell below
# repeats these two steps for its own set of regions.
# orders, clicks = additional_filtration_orders_and_clicks(orders, clicks, 0, regs_to_filter=[1])
# orders = orders.rename(columns={"customer_id": "user_id"})

In [5]:
# Per-user feature table read from parquet; the training cells look rows up
# in it with .loc on the user ids of each interaction table.
# Disabled alternative input:
# orders = pd.read_parquet("../data/orders_filtered.parquet")
user_features_path = "../data/user_features.parquet"
user_features = pd.read_parquet(user_features_path)

## Обучим модель для Москвы

In [6]:
# Moscow pipeline: load the raw logs, filter them to the Moscow region, build
# the interaction table and binary user features, train LightFM, and fit the
# cold-start TopRecommender on the same filtered orders.
orders = pd.read_parquet('../data/orders.parquet')
clicks = pd.read_parquet('../data/clicks.parquet')

# NOTE(review): regs_to_filter=[1] presumably selects region/city 1 (Moscow);
# the meaning of the positional 0 is defined in process_data - confirm there.
orders, clicks = additional_filtration_orders_and_clicks(orders, clicks, 0, regs_to_filter=[1])
orders = orders.rename(columns={"customer_id": "user_id"})

interactions_moscow = InteractionTable(orders, None, alpha=0, test_slice=100000)

# Take every feature and flag (1/0) whether the user is above the mean of that
# feature among the users of this interaction table. The feature rows are
# gathered once and reused for both the values and the mean.
# NOTE(review): rows follow the iteration order of user_to_index; this assumes
# that order matches the user indices of the interaction matrix - confirm.
moscow_user_ids = list(interactions_moscow.user_to_index.keys())
moscow_user_features = user_features.loc[moscow_user_ids]
user_features_sparse_moscow = scipy.sparse.csr_matrix(
    (moscow_user_features - moscow_user_features.mean() > 0).astype(int)
)
# The dense slice is only an intermediate; release it before training.
del moscow_user_ids, moscow_user_features

model_moscow = LightFM(no_components=10, loss='warp', user_alpha=0.1, random_state=42)
model_moscow.fit(
    interactions_moscow.sparse_interaction_matrix.T,
    user_features=user_features_sparse_moscow,
    epochs=600,
    num_threads=4
)

top_rec_moscow = TopRecommender().fit(orders)

# Free the raw frames. interactions_moscow and user_features_sparse_moscow are
# kept on purpose: a later cell pickles them together with the model.
del orders
del clicks

Orders weighter: use user avg orders per chain as weight
            user_id      chain_id        weight
count  3.106486e+06  3.106486e+06  3.106486e+06
mean   3.666636e+07  3.212015e+04  1.755490e+00
std    2.148159e+07  1.517362e+04  8.203714e+01
min    0.000000e+00  9.000000e+00  1.000000e+00
25%    1.143635e+07  2.714700e+04  1.000000e+00
50%    3.991074e+07  3.007500e+04  1.000000e+00
75%    5.175972e+07  4.451900e+04  2.000000e+00
max    7.213893e+07  7.332400e+04  1.444470e+05
Orders df weighted: size=3106486, uniq_users=1394062, uniq_chains=7792
Interaction df len for test:  222705


## Обучим модель для Питера

In [7]:
# St Petersburg pipeline: load the raw logs, filter them to the St Petersburg
# region, build the interaction table and binary user features, train LightFM,
# and fit the cold-start TopRecommender on the same filtered orders.
orders = pd.read_parquet('../data/orders.parquet')
clicks = pd.read_parquet('../data/clicks.parquet')

# NOTE(review): regs_to_filter=[2] presumably selects region/city 2
# (St Petersburg); the meaning of the positional 0 is defined in process_data
# - confirm there.
orders, clicks = additional_filtration_orders_and_clicks(orders, clicks, 0, regs_to_filter=[2])
orders = orders.rename(columns={"customer_id": "user_id"})

interactions_piter = InteractionTable(orders, None, alpha=0, test_slice=100000)

# Take every feature and flag (1/0) whether the user is above the mean of that
# feature among the users of this interaction table. The feature rows are
# gathered once and reused for both the values and the mean.
# NOTE(review): rows follow the iteration order of user_to_index; this assumes
# that order matches the user indices of the interaction matrix - confirm.
piter_user_ids = list(interactions_piter.user_to_index.keys())
piter_user_features = user_features.loc[piter_user_ids]
user_features_sparse_piter = scipy.sparse.csr_matrix(
    (piter_user_features - piter_user_features.mean() > 0).astype(int)
)
# The dense slice is only an intermediate; release it before training.
del piter_user_ids, piter_user_features

model_piter = LightFM(no_components=10, loss='warp', user_alpha=0.1, random_state=42)
model_piter.fit(
    interactions_piter.sparse_interaction_matrix.T,
    user_features=user_features_sparse_piter,
    epochs=600,
    num_threads=4
)

top_rec_piter = TopRecommender().fit(orders)

# Free the raw frames. interactions_piter and user_features_sparse_piter are
# kept on purpose: a later cell pickles them together with the model.
del orders
del clicks

Orders weighter: use user avg orders per chain as weight
            user_id       chain_id         weight
count  9.556730e+05  955673.000000  955673.000000
mean   3.854621e+07   28260.356502       1.631581
std    2.157295e+07   14201.733151      43.371718
min    0.000000e+00     499.000000       1.000000
25%    2.789286e+07   20432.000000       1.000000
50%    4.260935e+07   29276.000000       1.000000
75%    5.382427e+07   33164.000000       2.000000
max    7.213901e+07   73274.000000   42375.000000
Orders df weighted: size=955673, uniq_users=479535, uniq_chains=2043
Interaction df len for test:  199238


## Остальные города

In [8]:
# Region/city codes 3..222 inclusive, i.e. everything except codes 1 and 2,
# which the Moscow and St Petersburg cells pass to the filter on their own.
cities_other = [*range(3, 223)]

In [9]:
# "Other cities" pipeline: load the raw logs, filter them to the regions in
# cities_other, build the interaction table and binary user features, train
# LightFM, and fit the cold-start TopRecommender on the same filtered orders.
orders = pd.read_parquet('../data/orders.parquet')
clicks = pd.read_parquet('../data/clicks.parquet')

# NOTE(review): regs_to_filter=cities_other presumably selects every region
# code in that list; the meaning of the positional 0 is defined in
# process_data - confirm there.
orders, clicks = additional_filtration_orders_and_clicks(orders, clicks, 0, regs_to_filter=cities_other)
orders = orders.rename(columns={"customer_id": "user_id"})

interactions_other = InteractionTable(orders, None, alpha=0, test_slice=100000)

# Take every feature and flag (1/0) whether the user is above the mean of that
# feature among the users of this interaction table. The feature rows are
# gathered once and reused for both the values and the mean.
# NOTE(review): rows follow the iteration order of user_to_index; this assumes
# that order matches the user indices of the interaction matrix - confirm.
other_user_ids = list(interactions_other.user_to_index.keys())
other_user_features = user_features.loc[other_user_ids]
user_features_sparse_other = scipy.sparse.csr_matrix(
    (other_user_features - other_user_features.mean() > 0).astype(int)
)
# The dense slice is only an intermediate; release it before training.
del other_user_ids, other_user_features

model_other = LightFM(no_components=10, loss='warp', user_alpha=0.1, random_state=42)
model_other.fit(
    interactions_other.sparse_interaction_matrix.T,
    user_features=user_features_sparse_other,
    epochs=600,
    num_threads=4
)

top_rec_other = TopRecommender().fit(orders)

# Free the raw frames. interactions_other and user_features_sparse_other are
# kept on purpose: a later cell pickles them together with the model.
del orders
del clicks

Orders weighter: use user avg orders per chain as weight
            user_id      chain_id        weight
count  3.386497e+06  3.386497e+06  3.386497e+06
mean   4.351366e+07  3.670508e+04  1.508013e+00
std    2.047605e+07  1.567719e+04  9.663449e+00
min    0.000000e+00  1.150000e+03  1.000000e+00
25%    3.183724e+07  2.942800e+04  1.000000e+00
50%    4.825052e+07  3.349400e+04  1.000000e+00
75%    5.854460e+07  4.613400e+04  1.000000e+00
max    7.213902e+07  7.329100e+04  7.953000e+03
Orders df weighted: size=3386497, uniq_users=1725297, uniq_chains=13508
Interaction df len for test:  195687


In [10]:
import pickle

# Every trained object, keyed by its output file name. Per region: the LightFM
# model, its InteractionTable and the sparse user-feature matrix; after those,
# the three regional TopRecommender instances.
artifacts = {
    'lightfm_moscow.pkl': model_moscow,
    'interactions_moscow.pkl': interactions_moscow,
    'user_features_sparse_moscow.pkl': user_features_sparse_moscow,
    'lightfm_piter.pkl': model_piter,
    'interactions_piter.pkl': interactions_piter,
    'user_features_sparse_piter.pkl': user_features_sparse_piter,
    'lightfm_other.pkl': model_other,
    'interactions_other.pkl': interactions_other,
    'user_features_sparse_other.pkl': user_features_sparse_other,
    'top_rec_moscow.pkl': top_rec_moscow,
    'top_rec_piter.pkl': top_rec_piter,
    'top_rec_other.pkl': top_rec_other,
}

# Files are written to the current working directory, in the order listed.
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

## Top Recommender для всех городов

In [11]:
from top_recommender import TopRecommender

In [12]:
# Fit one TopRecommender on the orders of region codes 0..222 together
# (presumably every region in the data).
orders_path, clicks_path = '../data/orders.parquet', '../data/clicks.parquet'
orders = pd.read_parquet(orders_path)
clicks = pd.read_parquet(clicks_path)

all_regions = list(range(223))
orders, clicks = additional_filtration_orders_and_clicks(orders, clicks, 0, regs_to_filter=all_regions)
orders = orders.rename(columns={"customer_id": "user_id"})

top_rec = TopRecommender().fit(orders)

In [13]:
# Drop the raw frames; only the fitted top_rec is needed from here on.
del clicks, orders

In [14]:
# Persist the TopRecommender fitted on all regions to the working directory.
with open('top_rec.pkl', mode='wb') as top_rec_file:
    pickle.dump(top_rec, top_rec_file)

In [4]:
# Reload the raw logs and filter them with region codes 0..222 so the
# per-user chain sets below can be built from the filtered orders.
orders_path, clicks_path = '../data/orders.parquet', '../data/clicks.parquet'
orders = pd.read_parquet(orders_path)
clicks = pd.read_parquet(clicks_path)

all_regions = list(range(223))
orders, clicks = additional_filtration_orders_and_clicks(orders, clicks, 0, regs_to_filter=all_regions)
orders = orders.rename(columns={"customer_id": "user_id"})

In [5]:
# Preview the first rows of the filtered, renamed orders frame.
orders.head()

Unnamed: 0,order_id,city_id,user_id,order_date,status_id,vendor_id,chain_id,expected_delivery_min,products_count,latitude,...,delivery_fee,service_fee,initial_product_sum,rider_id,backend_expected_delivery_time,delivery_time,delivery_distance,star_rating,discount_percent,fee_percent
0,210802978,5,69429960,2021-08-01 00:07:07,11,20936,14011,39.0,1,56,...,0,14,530,,,,,5.0,0.0,0.025735
1,211027641,34,39604616,2021-08-01 18:26:54,11,37994,27841,47.0,1,54,...,0,14,1349,,,,,3.0,0.0,0.010271
3,210931644,47,28610448,2021-08-01 14:48:35,11,31845,19783,60.0,1,56,...,0,14,599,,,,,5.0,0.0,0.022838
4,210958294,2,47252176,2021-08-01 15:49:32,11,46965,30098,41.0,5,60,...,179,14,347,79471.0,40.0,25.200001,0.4793,5.0,0.0,0.357407
8,210842484,132,9241705,2021-08-01 10:58:20,11,88278,53800,70.0,9,56,...,0,14,1036,,,,,5.0,0.0,0.013333


In [38]:
# For each (user_id, city_id) pair, in order of first appearance
# (sort=False), collect the set of distinct chain_id values ordered from.
tmp = (
    orders
    .groupby(["user_id", "city_id"], sort=False)["chain_id"]
    .apply(set)
)

In [39]:
# Turn the (user_id, city_id) group keys back into ordinary columns.
# NOTE(review): tmp is overwritten with its own transform, so re-running only
# this cell changes the result again - re-run the groupby cell first.
tmp = tmp.reset_index()

In [41]:
# Preview the per-user chain sets (columns: user_id, city_id, chain_id).
tmp.head()

Unnamed: 0,user_id,city_id,chain_id
0,69429960,5,{14011}
1,39604616,34,"{27841, 16228, 19441}"
2,28610448,47,"{14911, 19783}"
3,47252176,2,{30098}
4,9241705,132,"{53800, 39179}"


In [42]:
# Keep only the (user, city) rows whose chain set has at least 5 elements.
has_enough_chains = tmp["chain_id"].apply(len) >= 5
tmp = tmp.loc[has_enough_chains]

In [43]:
# Save the filtered user -> chain-set table to the working directory.
tmp.to_pickle("user_chains.pkl")

In [15]:
# Preview the rows that passed the ">= 5 distinct chains" filter.
tmp.head()

Unnamed: 0,user_id,city_id,chain_id
123,56682184,1,"{30112, 44256, 13698, 19523, 777, 63242, 22333..."
185,45625476,1,"{14049, 15275, 29454, 28720, 32049, 48274, 155..."
225,28798526,34,"{27841, 16228, 32652, 53424, 32145, 44402, 556..."
233,46935164,1,"{32577, 41090, 17894, 57574, 15275, 41067, 294..."
270,53881984,1,"{2305, 13698, 29992, 3369, 42, 15275, 29036, 2..."
