# Обучение

# LightFM для Москвы, Питера и остальных

# Top Recommender для холодного старта

In [1]:
from lightfm import LightFM
from top_recommender import TopRecommender

In [2]:
import pandas as pd
import numpy as np
import scipy
from tqdm import tqdm
from interaction_table import InteractionTable
from h3_index import H3Index

In [3]:
from process_data import preprocess_orders_and_clicks, additional_filtration_orders_and_clicks
from user_features import generate_user_features

In [4]:
# Load the raw order and click logs from parquet.
orders = pd.read_parquet('../data/orders.parquet')
clicks = pd.read_parquet('../data/clicks.parquet')

# NOTE: the steps below are intentionally disabled and kept for reference only.
# Presumably they show how the cached feature table read in the next cell was
# produced -- confirm before re-enabling.
# user_features = generate_user_features(orders, clicks)

# orders, clicks = additional_filtration_orders_and_clicks(orders, clicks, 0, regs_to_filter=[1])
# orders = orders.rename(columns={"customer_id": "user_id"})

In [5]:
# Read the precomputed per-user feature table; it is indexed with `.loc` by
# user id in the training cells below.
# (Disabled alternative: load already-filtered orders instead of raw ones.)
# orders = pd.read_parquet("../data/orders_filtered.parquet")
user_features = pd.read_parquet(path="../data/user_features.parquet")

## Обучим модель для Москвы

In [6]:
# --- Moscow: LightFM model + top-popular fallback -------------------------
# Reload the raw logs so the filtering below starts from the full data set.
orders = pd.read_parquet('../data/orders.parquet')
clicks = pd.read_parquet('../data/clicks.parquet')

# regs_to_filter=[1] selects the Moscow region (presumably the ids to keep --
# verify against additional_filtration_orders_and_clicks).
orders, clicks = additional_filtration_orders_and_clicks(
    orders, clicks, 0, regs_to_filter=[1]
)
orders = orders.rename(columns={"customer_id": "user_id"})

interactions_moscow = InteractionTable(orders, None, alpha=0, test_slice=None)

# Take every user feature and binarize it against the column mean computed
# over this city's users: 1 where the value is strictly above the mean, else 0.
# Rows follow the iteration order of `user_to_index` (assumed to match the
# user axis of the interaction matrix -- TODO confirm).
moscow_user_features = user_features.loc[interactions_moscow.user_to_index.keys()]
moscow_above_mean = (moscow_user_features - moscow_user_features.mean()) > 0
user_features_sparse_moscow = scipy.sparse.csr_matrix(moscow_above_mean.astype(int))
# Drop the dense temporaries before the long training run.
del moscow_user_features, moscow_above_mean

model_moscow = LightFM(
    no_components=10,
    loss='warp',
    user_alpha=0.1,
    random_state=42,
)
model_moscow.fit(
    interactions_moscow.sparse_interaction_matrix.T,
    user_features=user_features_sparse_moscow,
    epochs=600,
    num_threads=4,
)

# Fit the top recommender on the same filtered orders.
top_rec_moscow = TopRecommender().fit(orders)

# Free the large frames; the interaction table and the sparse features stay
# alive because they are pickled later.
del orders, clicks

Orders weighter: use user avg orders per chain as weight
            user_id      chain_id        weight
count  3.106486e+06  3.106486e+06  3.106486e+06
mean   3.666636e+07  3.212015e+04  1.755490e+00
std    2.148159e+07  1.517362e+04  8.203714e+01
min    0.000000e+00  9.000000e+00  1.000000e+00
25%    1.143635e+07  2.714700e+04  1.000000e+00
50%    3.991074e+07  3.007500e+04  1.000000e+00
75%    5.175972e+07  4.451900e+04  2.000000e+00
max    7.213893e+07  7.332400e+04  1.444470e+05
Orders df weighted: size=3106486, uniq_users=1394062, uniq_chains=7792


## Обучим модель для Питера

In [7]:
# --- St. Petersburg: LightFM model + top-popular fallback -----------------
# Reload the raw logs so the filtering below starts from the full data set.
orders = pd.read_parquet('../data/orders.parquet')
clicks = pd.read_parquet('../data/clicks.parquet')

# regs_to_filter=[2] selects the St. Petersburg region (presumably the ids to
# keep -- verify against additional_filtration_orders_and_clicks).
orders, clicks = additional_filtration_orders_and_clicks(
    orders, clicks, 0, regs_to_filter=[2]
)
orders = orders.rename(columns={"customer_id": "user_id"})

interactions_piter = InteractionTable(orders, None, alpha=0, test_slice=None)

# Take every user feature and binarize it against the column mean computed
# over this city's users: 1 where the value is strictly above the mean, else 0.
# Rows follow the iteration order of `user_to_index` (assumed to match the
# user axis of the interaction matrix -- TODO confirm).
piter_user_features = user_features.loc[interactions_piter.user_to_index.keys()]
piter_above_mean = (piter_user_features - piter_user_features.mean()) > 0
user_features_sparse_piter = scipy.sparse.csr_matrix(piter_above_mean.astype(int))
# Drop the dense temporaries before the long training run.
del piter_user_features, piter_above_mean

model_piter = LightFM(
    no_components=10,
    loss='warp',
    user_alpha=0.1,
    random_state=42,
)
model_piter.fit(
    interactions_piter.sparse_interaction_matrix.T,
    user_features=user_features_sparse_piter,
    epochs=600,
    num_threads=4,
)

# Fit the top recommender on the same filtered orders.
top_rec_piter = TopRecommender().fit(orders)

# Free the large frames; the interaction table and the sparse features stay
# alive because they are pickled later.
del orders, clicks

Orders weighter: use user avg orders per chain as weight
            user_id       chain_id         weight
count  9.556730e+05  955673.000000  955673.000000
mean   3.854621e+07   28260.356502       1.631581
std    2.157295e+07   14201.733151      43.371718
min    0.000000e+00     499.000000       1.000000
25%    2.789286e+07   20432.000000       1.000000
50%    4.260935e+07   29276.000000       1.000000
75%    5.382427e+07   33164.000000       2.000000
max    7.213901e+07   73274.000000   42375.000000
Orders df weighted: size=955673, uniq_users=479535, uniq_chains=2043


## Остальные города

In [8]:
# Region ids 3..222 inclusive: every region other than 1 (Moscow) and
# 2 (St. Petersburg), which get their own models.
cities_other = [*range(3, 223)]

In [9]:
# --- All remaining cities: LightFM model + top-popular fallback -----------
# Reload the raw logs so the filtering below starts from the full data set.
orders = pd.read_parquet('../data/orders.parquet')
clicks = pd.read_parquet('../data/clicks.parquet')

# regs_to_filter=cities_other selects the non-capital regions (presumably the
# ids to keep -- verify against additional_filtration_orders_and_clicks).
orders, clicks = additional_filtration_orders_and_clicks(
    orders, clicks, 0, regs_to_filter=cities_other
)
orders = orders.rename(columns={"customer_id": "user_id"})

interactions_other = InteractionTable(orders, None, alpha=0, test_slice=None)

# Take every user feature and binarize it against the column mean computed
# over these users: 1 where the value is strictly above the mean, else 0.
# Rows follow the iteration order of `user_to_index` (assumed to match the
# user axis of the interaction matrix -- TODO confirm).
other_user_features = user_features.loc[interactions_other.user_to_index.keys()]
other_above_mean = (other_user_features - other_user_features.mean()) > 0
user_features_sparse_other = scipy.sparse.csr_matrix(other_above_mean.astype(int))
# Drop the dense temporaries before the long training run.
del other_user_features, other_above_mean

model_other = LightFM(
    no_components=10,
    loss='warp',
    user_alpha=0.1,
    random_state=42,
)
model_other.fit(
    interactions_other.sparse_interaction_matrix.T,
    user_features=user_features_sparse_other,
    epochs=600,
    num_threads=4,
)

# Fit the top recommender on the same filtered orders.
top_rec_other = TopRecommender().fit(orders)

# Free the large frames; the interaction table and the sparse features stay
# alive because they are pickled later.
del orders, clicks

Orders weighter: use user avg orders per chain as weight
            user_id      chain_id        weight
count  3.386497e+06  3.386497e+06  3.386497e+06
mean   4.351366e+07  3.670508e+04  1.508013e+00
std    2.047605e+07  1.567719e+04  9.663449e+00
min    0.000000e+00  1.150000e+03  1.000000e+00
25%    3.183724e+07  2.942800e+04  1.000000e+00
50%    4.825052e+07  3.349400e+04  1.000000e+00
75%    5.854460e+07  4.613400e+04  1.000000e+00
max    7.213902e+07  7.329100e+04  7.953000e+03
Orders df weighted: size=3386497, uniq_users=1725297, uniq_chains=13508


In [10]:
import pickle

# Persist every trained artifact to the current working directory.
# The mapping preserves insertion order, so files are written in the order
# listed: per-city LightFM bundles first, then the top recommenders.
artifacts = {
    # Moscow
    'lightfm_moscow.pkl': model_moscow,
    'interactions_moscow.pkl': interactions_moscow,
    'user_features_sparse_moscow.pkl': user_features_sparse_moscow,
    # St. Petersburg
    'lightfm_piter.pkl': model_piter,
    'interactions_piter.pkl': interactions_piter,
    'user_features_sparse_piter.pkl': user_features_sparse_piter,
    # Other cities
    'lightfm_other.pkl': model_other,
    'interactions_other.pkl': interactions_other,
    'user_features_sparse_other.pkl': user_features_sparse_other,
    # Top recommenders
    'top_rec_moscow.pkl': top_rec_moscow,
    'top_rec_piter.pkl': top_rec_piter,
    'top_rec_other.pkl': top_rec_other,
}

for file_name, artifact in artifacts.items():
    with open(file_name, 'wb') as f:
        pickle.dump(artifact, f)

## Top Rec

In [11]:
from top_recommender import TopRecommender

In [12]:
# --- Global top recommender over every region ------------------------------
# Reload the raw logs so the filtering below starts from the full data set.
orders = pd.read_parquet('../data/orders.parquet')
clicks = pd.read_parquet('../data/clicks.parquet')

# Pass all region ids 0..222 to the same filtration routine used for the
# per-city models.
orders, clicks = additional_filtration_orders_and_clicks(
    orders, clicks, 0, regs_to_filter=[*range(223)]
)
orders = orders.rename(columns={"customer_id": "user_id"})

top_rec = TopRecommender().fit(orders)

In [13]:
# Release the large data frames now that the recommender is fitted.
del clicks, orders

In [14]:
# Persist the global top recommender to the current working directory.
with open('top_rec.pkl', 'wb') as out_file:
    pickle.dump(top_rec, out_file)