In [1]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
tqdm.pandas()

## Общая предобработка

In [2]:
data1 = pd.read_csv('internship_clickstream_data_1.gzip', compression='gzip')
data2 = pd.read_csv('internship_clickstream_data_2.gzip', compression='gzip')
data3 = pd.read_csv('internship_clickstream_data_3.gzip', compression='gzip')
print(data1.shape, data2.shape, data3.shape)

(30000000, 8) (30000000, 8) (22463346, 8)


Необходимо составить датасет, в котором фичи будут - эмбеддинги юзеров с первой недели, а таргеты - инфа об их склонностях к аренде/продаже со второй недели (исключаем data leak). Соответственно, с первой и второй недель нужно взять пересечение юзеров + важно, чтобы объявки с первой недели были также все те же, что и со второй.

In [3]:
# drop duplicates
data1.drop_duplicates(subset=['uid', 'offer_id'], inplace=True)
data2.drop_duplicates(subset=['uid', 'offer_id'], inplace=True)
data3.drop_duplicates(subset=['uid', 'offer_id'], inplace=True)
print(data1.shape, data2.shape, data3.shape)

# work with date
data1['timestamp'] = pd.to_datetime(data1['timestamp'])
data2['timestamp'] = pd.to_datetime(data2['timestamp'])
data3['timestamp'] = pd.to_datetime(data3['timestamp'])

# save users from 1 week
data1_s = data1[data1['timestamp'] < pd.to_datetime('2022-07-10 00:00:00.020000')]
data2_s = data2[data2['timestamp'] < pd.to_datetime('2022-07-10 00:00:00.020000')]
data3_s = data3[data3['timestamp'] < pd.to_datetime('2022-07-10 00:00:00.020000')]
users_1_week = pd.concat([data1_s, data2_s, data3_s])['uid'].unique()
offers_1_week = pd.concat([data1_s, data2_s, data3_s])['offer_id'].unique()

# filter by date, second week
data1 = data1[(data1['timestamp'] >= pd.to_datetime('2022-07-10 00:00:00.020000')) & \
              (data1['timestamp'] < pd.to_datetime('2022-07-17 00:00:00.020000'))]
data2 = data2[(data2['timestamp'] >= pd.to_datetime('2022-07-10 00:00:00.020000')) & \
              (data2['timestamp'] < pd.to_datetime('2022-07-17 00:00:00.020000'))]
data3 = data3[(data3['timestamp'] >= pd.to_datetime('2022-07-10 00:00:00.020000')) & \
              (data3['timestamp'] < pd.to_datetime('2022-07-17 00:00:00.020000'))]
print(data1.shape, data2.shape, data3.shape)

# take users and offers from 2 week who is in 1 week
data1 = data1[(data1['uid'].isin(users_1_week)) & (data1['offer_id'].isin(offers_1_week))]
data2 = data2[(data2['uid'].isin(users_1_week)) & (data2['offer_id'].isin(offers_1_week))]
data3 = data3[(data3['uid'].isin(users_1_week)) & (data3['offer_id'].isin(offers_1_week))]

data = pd.concat([data1, data2, data3])
print('after concating:', data.shape)
data.drop_duplicates(subset=['uid', 'offer_id'], inplace=True)
print('after drop duplicates:', data.shape)

(22585345, 8) (22559623, 8) (17427910, 8)
(4999240, 8) (5104941, 8) (4111483, 8)
after concating: (7842763, 8)
after drop duplicates: (6897045, 8)


In [4]:
# take users that have > 5 clicks
while not (data['offer_id'].value_counts()[data['offer_id'].value_counts() <= 5].empty) or \
      not (data['uid'].value_counts()[data['uid'].value_counts() <= 5].empty):
    offer_ids = data['offer_id'].value_counts()[data['offer_id'].value_counts() > 5].index
    data = data[data['offer_id'].isin(offer_ids)]
    uids = data['uid'].value_counts()[data['uid'].value_counts() > 5].index
    data = data[data['uid'].isin(uids)]
print(data.shape)

(4366021, 8)


## Добавляем lst-features
Чтобы выделить объявления для аренды и продажи

In [5]:
data11 = pd.read_csv('lst_announcement_data_1.gzip', compression='gzip', usecols=['id', 'category'])
data22 = pd.read_csv('lst_announcement_data_2.gzip', compression='gzip', usecols=['id', 'category'])
data33 = pd.read_csv('lst_announcement_data_3.gzip', compression='gzip', usecols=['id', 'category'])

In [6]:
print(data11.shape, data22.shape, data33.shape)
data11.drop_duplicates(subset=['id'], inplace=True)
data22.drop_duplicates(subset=['id'], inplace=True)
data33.drop_duplicates(subset=['id'], inplace=True)
print(data11.shape, data22.shape, data33.shape)

lst_features = pd.concat([data11, data22, data33])
print('after concating:', lst_features.shape)
lst_features.drop_duplicates(subset=['id'], inplace=True)
print('after drop duplicates:', lst_features.shape)

(1000000, 2) (1000000, 2) (778005, 2)
(1000000, 2) (1000000, 2) (778005, 2)
after concating: (2778005, 2)
after drop duplicates: (2778005, 2)


In [7]:
print(lst_features['category'].unique())

['townhouseSale' 'commercialLandRent' 'landSale' 'flatRent' 'flatSale'
 'houseSale' 'dailyHouseRent' 'newBuildingFlatSale' 'dailyFlatRent'
 'shoppingAreaSale' 'cottageRent' 'roomRent' 'freeAppointmentObjectRent'
 'businessSale' 'cottageSale' 'officeRent' 'freeAppointmentObjectSale'
 'townhouseRent' 'garageSale' 'houseShareSale' 'industryRent' 'officeSale'
 'roomSale' 'industrySale' 'warehouseRent' 'commercialLandSale'
 'houseRent' 'houseShareRent' 'garageRent' 'shoppingAreaRent'
 'buildingSale' 'dailyRoomRent' 'buildingRent' 'warehouseSale' 'bedRent'
 'flatShareSale' 'dailyBedRent' 'businessRent']


In [8]:
def sale_rent(arg):
    """
        кодируем объявления продажи 1
        аренды - 0
    """
    if 'flatSale' in arg or 'newBuildingFlatSale' in arg:
        return 1
    if 'flatRent' in arg:
        return 0
    else:
        return np.nan
lst_features['category'] = lst_features['category'].progress_apply(sale_rent)
lst_features['category'].unique()

100%|█████████████████████████████| 2778005/2778005 [00:03<00:00, 833290.06it/s]


array([nan,  0.,  1.])

In [9]:
merged = data.merge(lst_features, left_on='offer_id', right_on='id', how='left')
print(merged.shape)
merged.head()

(4366021, 10)


Unnamed: 0,timestamp,hit_id,uid,platform,event_name,screen,offer_id,ptn_dadd,id,category
0,2022-07-16 00:00:12.928,cd1a21ecfd2f47f7,21404791,android,OpenOfferScreen,SearchResultsList,271617821,2022-07-16,271617821.0,1.0
1,2022-07-13 22:38:12.355,c326f40e13654821,92254503,ios,OpenOfferScreen,MapScreen,275559266,2022-07-13,275559266.0,0.0
2,2022-07-13 22:38:32.488,b75ee3fac5964206,33791824,ios,OpenOfferScreen,MapScreen,274687903,2022-07-13,274687903.0,1.0
3,2022-07-13 22:38:52.902,6ba2fd0f39b844af,12536475,android,OpenOfferScreen,BookmarksScreen,275136292,2022-07-13,275136292.0,1.0
4,2022-07-13 22:39:38.156,29ee5e2791af466a,15764141,ios,OpenOfferScreen,MapScreen,271908694,2022-07-13,271908694.0,


In [10]:
grouped = merged.groupby(['uid'])['category'].agg(['count', 'sum']).reset_index()
grouped

Unnamed: 0,uid,count,sum
0,0001d81a-d960-4556-ac84-bdcfa1bb5bcd,17,17.0
1,000490b4-9978-408d-b133-85a58146af23,9,8.0
2,000695d8-e5c2-4d64-a570-2d5432575dc7,1,1.0
3,00083E71-627E-4922-9714-CFB9CB05016F,1,1.0
4,00098EAF-21C2-41E6-AE6C-9F96DD5408CC,13,0.0
...,...,...,...
217708,ffe558c1-8cf0-4373-984c-6757c9aeedc3,5,0.0
217709,fff12f36-aafc-4563-af81-e8a5155a7b60,6,6.0
217710,fff3cbe3-da73-438c-a16c-020b5219d4e1,16,0.0
217711,fffb9859-ba42-4920-ab3c-80779491c582,7,7.0


In [11]:
users = data['uid'].unique()
grouped['per'] = grouped['sum'] / grouped['count']

def enc_pref(arg):
    """
        кто больше интересуется продажей - 1
        кто больше интересуется арендой - 0
    """
    if arg > 0.8:
        return 1
    elif arg < 0.2:
        return 0
    else:
        return -1
grouped['pref'] = grouped['per'].progress_apply(enc_pref)
print(grouped.shape)
grouped = grouped[grouped['pref'] >= 0]
print(grouped.shape)

100%|███████████████████████████████| 217713/217713 [00:00<00:00, 729042.19it/s]


(217713, 5)
(170419, 5)


In [12]:
with open('mapping_out_seq.json') as f:
    uid_encoder = json.load(f)
grouped['uid_enc'] = grouped['uid'].map(uid_encoder) + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grouped['uid_enc'] = grouped['uid'].map(uid_encoder) + 1


In [13]:
grouped.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [14]:
grouped['uid_enc'] = grouped['uid_enc'].astype(int)
grouped[['uid_enc', 'pref']].to_csv('uid_12_week_preferences.csv')
grouped

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grouped['uid_enc'] = grouped['uid_enc'].astype(int)


Unnamed: 0,uid,count,sum,per,pref,uid_enc
2,000695d8-e5c2-4d64-a570-2d5432575dc7,1,1.0,1.0,1,14908
3,00083E71-627E-4922-9714-CFB9CB05016F,1,1.0,1.0,1,77272
4,00098EAF-21C2-41E6-AE6C-9F96DD5408CC,13,0.0,0.0,0,153632
6,000AB396-667C-45B3-A4B5-9FC6CCDAF604,14,14.0,1.0,1,172584
10,000ce71d-e87f-4e76-901f-decef1606cea,17,17.0,1.0,1,114597
...,...,...,...,...,...,...
217702,ffddc5bf-6ee4-4c87-84b7-17d558349072,18,18.0,1.0,1,234623
217703,ffe09535-50e0-4d85-ac59-f35d27012fd6,7,7.0,1.0,1,115362
217704,ffe2cecd-a242-4cbf-909c-7a5cb6a8aea1,9,0.0,0.0,0,352851
217705,ffe31461-e6d7-479b-adfb-d5003a79c968,21,21.0,1.0,1,120610


=======================================================================

## Составляем итоговый датасет
Эмбеды пользователей из модели - признаки (с первой недели)

Предпочтения пользователя, которые только что получили - таргет (со второй недели)

In [15]:
user_embeds = np.load('user_embeddings_final.npy')
user_embeds = pd.DataFrame(user_embeds, columns = range(user_embeds.shape[1]))
user_embeds['num'] = range(user_embeds.shape[0])
user_embeds.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,num
0,0.876574,0.080785,8.623681,3.21042,2.515167,-1.65925,3.187721,4.355467,0.035435,-4.659596,...,-1.258561,1.693343,-1.452698,-2.223263,-0.626605,2.716868,3.236983,-0.680937,-1.195604,0
1,4.796618,1.073819,2.882599,-0.895706,3.312656,-3.419885,2.493178,5.870429,3.396194,-5.100319,...,-3.245037,1.35775,-2.459414,-2.612297,0.266907,2.706663,6.533426,3.658595,-1.058491,1
2,1.678901,-0.870985,8.215559,-1.460267,6.422165,-2.986025,0.550298,6.478464,2.385857,-4.60558,...,-3.400242,-2.355995,-0.090732,-0.994794,0.028755,5.719481,4.812598,1.217735,-4.290208,2
3,-3.687217,2.89496,10.01008,-1.33111,6.923524,-1.010847,5.124724,4.497645,1.368332,-1.995275,...,-1.811968,-1.017258,-2.530054,-2.727057,-0.828689,4.212907,3.111273,2.260993,-0.037053,3
4,0.335711,-2.066123,9.004489,-0.921573,4.573396,-4.751493,2.778687,4.257411,3.657955,-5.171963,...,-3.287347,1.613672,0.527825,-0.785314,-0.859856,2.080764,2.854068,-0.190898,-0.833137,4


In [17]:
final_data = user_embeds.merge(grouped[['uid_enc', 'pref']], left_on='num', right_on='uid_enc', how='inner')
final_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,93,94,95,96,97,98,99,num,uid_enc,pref
0,2.922725,0.830718,9.080918,-5.07421,7.456001,-0.255483,4.67861,7.503748,1.478974,-4.936927,...,0.887057,-2.739411,-4.122108,4.036154,1.399453,-3.438827,-1.1531,5,5,0
1,4.08051,0.59317,8.199186,-0.681509,2.790491,-3.139963,4.288406,2.628913,2.409329,-5.595078,...,-1.061931,-3.821832,-4.377748,0.712455,3.847312,0.833625,1.383679,7,7,1
2,6.448274,-0.432458,2.441519,-0.210807,3.417932,-1.393718,5.005304,-0.657679,4.510461,-2.432598,...,-0.518479,-3.6213,-5.126737,6.550021,3.019962,-0.780975,-1.335271,9,9,0
3,0.916382,-5.241655,8.070838,1.787305,5.989114,-5.491125,0.908961,3.160624,4.824673,-4.569901,...,-2.129529,-1.504479,0.143295,-0.473785,4.95516,1.703063,-3.454467,10,10,1
4,5.000651,0.733252,9.829646,0.338261,3.341683,-4.020606,1.996263,5.889336,1.988187,-5.366116,...,-2.91006,-3.205703,-4.104513,8.268032,7.33062,3.302877,-0.838978,11,11,1


In [18]:
final_data.drop(['uid_enc', 'num'], axis=1, inplace=True)

## Подстановка в модель

In [19]:
! pip install xgboost

Collecting xgboost
  Downloading xgboost-1.6.2-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.1.2[0m[39;49m -> [0m[32;49m22.2.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [20]:
! pip install --upgrade pip

Collecting pip
  Downloading pip-22.2.2-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.1.2
    Uninstalling pip-22.1.2:
      Successfully uninstalled pip-22.1.2
Successfully installed pip-22.2.2


In [21]:
from sklearn import model_selection
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [23]:
X = final_data.drop(['pref'], axis=1)
y = final_data['pref']

In [25]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, \
                                                                    random_state=42)

In [26]:
# Init classifier
xgb_cl = xgb.XGBClassifier()

# Fit
xgb_cl.fit(X_train, y_train)

# Predict
preds = xgb_cl.predict(X_test)

# Score
accuracy_score(y_test, preds)

0.6308082698947544

In [27]:
precision_score(y_test, preds)

0.6432904211215361

In [28]:
recall_score(y_test, preds)

0.9546043258645254

In [31]:
print(y_train.value_counts(normalize=True))
y_test.value_counts(normalize=True)

1    0.6463
0    0.3537
Name: pref, dtype: float64


1    0.642373
0    0.357627
Name: pref, dtype: float64

In [32]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, preds)

0.5019037681433828

In [None]:
# Создание матриц наблюдений в формате DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

In [None]:
# Гиперпараметры модели
xgb_pars = {'min_child_weight': 20, 'eta': 0.1, 'colsample_bytree': 0.9, 
            'max_depth': 6, 'subsample': 0.9, 'lambda': 1, 'nthread': -1, 
            'booster' : 'gbtree', 'eval_metric': ['precision', 'recall']#, 'objective': 'reg:squarederror'
           }
# Тренировочная и валидационная выборка
watchlist = [(dtrain, 'train'), (dtest, 'test')]
# Обучаем модель XGBoost
model = xgb.train(
    params=xgb_pars, #гиперпараметры модели
    dtrain=dtrain, #обучающая выборка
    num_boost_round=300, #количество моделей в ансамбле
    evals=watchlist, #выборки, на которых считается матрица
    early_stopping_rounds=20, #раняя остановка
    maximize=False, #смена поиска максимума на минимум
    verbose_eval=10 #шаг, через который происходит отображение метрик
)

In [None]:
#Делаем предсказание на тестовом наборе данных
y_test_predict = np.exp(model.predict(dtest)) - 1
print('Modeling RMSLE %.5f' % model.best_score)