In [1]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
tqdm.pandas()

## Общая предобработка

In [2]:
data1 = pd.read_csv('internship_clickstream_data_1.gzip', compression='gzip')
data2 = pd.read_csv('internship_clickstream_data_2.gzip', compression='gzip')
data3 = pd.read_csv('internship_clickstream_data_3.gzip', compression='gzip')
print(data1.shape, data2.shape, data3.shape)

(30000000, 8) (30000000, 8) (22463346, 8)


Необходимо составить датасет, в котором фичи будут - эмбеддинги юзеров с первой недели, а таргеты - инфа об их склонностях к аренде/продаже со второй недели (исключаем data leak). Соответственно, с первой и второй недель нужно взять пересечение юзеров + важно, чтобы объявки с первой недели были также все те же, что и со второй.

In [3]:
# drop duplicates
data1.drop_duplicates(subset=['uid', 'offer_id'], inplace=True)
data2.drop_duplicates(subset=['uid', 'offer_id'], inplace=True)
data3.drop_duplicates(subset=['uid', 'offer_id'], inplace=True)
print(data1.shape, data2.shape, data3.shape)

# work with date
data1['timestamp'] = pd.to_datetime(data1['timestamp'])
data2['timestamp'] = pd.to_datetime(data2['timestamp'])
data3['timestamp'] = pd.to_datetime(data3['timestamp'])

# save users from 1 week
data1_s = data1[data1['timestamp'] < pd.to_datetime('2022-07-10 00:00:00.020000')]
data2_s = data2[data2['timestamp'] < pd.to_datetime('2022-07-10 00:00:00.020000')]
data3_s = data3[data3['timestamp'] < pd.to_datetime('2022-07-10 00:00:00.020000')]
users_1_week = pd.concat([data1_s, data2_s, data3_s])['uid'].unique()
offers_1_week = pd.concat([data1_s, data2_s, data3_s])['offer_id'].unique()

# filter by date, second week
data1 = data1[(data1['timestamp'] >= pd.to_datetime('2022-07-10 00:00:00.020000')) & \
              (data1['timestamp'] < pd.to_datetime('2022-07-17 00:00:00.020000'))]
data2 = data2[(data2['timestamp'] >= pd.to_datetime('2022-07-10 00:00:00.020000')) & \
              (data2['timestamp'] < pd.to_datetime('2022-07-17 00:00:00.020000'))]
data3 = data3[(data3['timestamp'] >= pd.to_datetime('2022-07-10 00:00:00.020000')) & \
              (data3['timestamp'] < pd.to_datetime('2022-07-17 00:00:00.020000'))]
print(data1.shape, data2.shape, data3.shape)

# take users and offers from 2 week who is in 1 week
data1 = data1[(data1['uid'].isin(users_1_week)) & (data1['offer_id'].isin(offers_1_week))]
data2 = data2[(data2['uid'].isin(users_1_week)) & (data2['offer_id'].isin(offers_1_week))]
data3 = data3[(data3['uid'].isin(users_1_week)) & (data3['offer_id'].isin(offers_1_week))]

data = pd.concat([data1, data2, data3])
print('after concating:', data.shape)
data.drop_duplicates(subset=['uid', 'offer_id'], inplace=True)
print('after drop duplicates:', data.shape)

(22585345, 8) (22559623, 8) (17427910, 8)
(4999240, 8) (5104941, 8) (4111483, 8)
after concating: (7842763, 8)
after drop duplicates: (6897045, 8)


In [4]:
# take users that have > 5 clicks
while not (data['offer_id'].value_counts()[data['offer_id'].value_counts() <= 5].empty) or \
      not (data['uid'].value_counts()[data['uid'].value_counts() <= 5].empty):
    offer_ids = data['offer_id'].value_counts()[data['offer_id'].value_counts() > 5].index
    data = data[data['offer_id'].isin(offer_ids)]
    uids = data['uid'].value_counts()[data['uid'].value_counts() > 5].index
    data = data[data['uid'].isin(uids)]
print(data.shape)

(4366021, 8)


## Добавляем lst-features
Чтобы выделить объявления для аренды и продажи

In [5]:
data11 = pd.read_csv('lst_announcement_data_1.gzip', compression='gzip', usecols=['id', 'category'])
data22 = pd.read_csv('lst_announcement_data_2.gzip', compression='gzip', usecols=['id', 'category'])
data33 = pd.read_csv('lst_announcement_data_3.gzip', compression='gzip', usecols=['id', 'category'])

In [6]:
print(data11.shape, data22.shape, data33.shape)
data11.drop_duplicates(subset=['id'], inplace=True)
data22.drop_duplicates(subset=['id'], inplace=True)
data33.drop_duplicates(subset=['id'], inplace=True)
print(data11.shape, data22.shape, data33.shape)

lst_features = pd.concat([data11, data22, data33])
print('after concating:', lst_features.shape)
lst_features.drop_duplicates(subset=['id'], inplace=True)
print('after drop duplicates:', lst_features.shape)

(1000000, 2) (1000000, 2) (778005, 2)
(1000000, 2) (1000000, 2) (778005, 2)
after concating: (2778005, 2)
after drop duplicates: (2778005, 2)


In [8]:
print(lst_features['category'].unique())

['townhouseSale' 'commercialLandRent' 'landSale' 'flatRent' 'flatSale'
 'houseSale' 'dailyHouseRent' 'newBuildingFlatSale' 'dailyFlatRent'
 'shoppingAreaSale' 'cottageRent' 'roomRent' 'freeAppointmentObjectRent'
 'businessSale' 'cottageSale' 'officeRent' 'freeAppointmentObjectSale'
 'townhouseRent' 'garageSale' 'houseShareSale' 'industryRent' 'officeSale'
 'roomSale' 'industrySale' 'warehouseRent' 'commercialLandSale'
 'houseRent' 'houseShareRent' 'garageRent' 'shoppingAreaRent'
 'buildingSale' 'dailyRoomRent' 'buildingRent' 'warehouseSale' 'bedRent'
 'flatShareSale' 'dailyBedRent' 'businessRent']


In [7]:
def sale_rent(arg):
    """
        кодируем объявления продажи 1
        аренды - 0
    """
    if 'Rent' in arg:
        return 0
    if 'Sale' in arg:
        return 1
    else:
        return np.nan
lst_features['category'] = lst_features['category'].progress_apply(sale_rent)
lst_features['category'].unique()

100%|█████████████████████████████| 2778005/2778005 [00:03<00:00, 824080.24it/s]


array([1, 0])

In [8]:
merged = data.merge(lst_features, left_on='offer_id', right_on='id', how='left')
print(merged.shape)
merged.head()

(4366021, 10)


Unnamed: 0,timestamp,hit_id,uid,platform,event_name,screen,offer_id,ptn_dadd,id,category
0,2022-07-16 00:00:12.928,cd1a21ecfd2f47f7,21404791,android,OpenOfferScreen,SearchResultsList,271617821,2022-07-16,271617821.0,1.0
1,2022-07-13 22:38:12.355,c326f40e13654821,92254503,ios,OpenOfferScreen,MapScreen,275559266,2022-07-13,275559266.0,0.0
2,2022-07-13 22:38:32.488,b75ee3fac5964206,33791824,ios,OpenOfferScreen,MapScreen,274687903,2022-07-13,274687903.0,1.0
3,2022-07-13 22:38:52.902,6ba2fd0f39b844af,12536475,android,OpenOfferScreen,BookmarksScreen,275136292,2022-07-13,275136292.0,1.0
4,2022-07-13 22:39:38.156,29ee5e2791af466a,15764141,ios,OpenOfferScreen,MapScreen,271908694,2022-07-13,271908694.0,0.0


In [11]:
grouped = merged.groupby(['uid'])['category'].agg(['count', 'sum']).reset_index()
grouped

Unnamed: 0,uid,count,sum
0,0001d81a-d960-4556-ac84-bdcfa1bb5bcd,17,17.0
1,000490b4-9978-408d-b133-85a58146af23,9,8.0
2,000695d8-e5c2-4d64-a570-2d5432575dc7,9,9.0
3,00083E71-627E-4922-9714-CFB9CB05016F,10,1.0
4,00098EAF-21C2-41E6-AE6C-9F96DD5408CC,13,0.0
...,...,...,...
217708,ffe558c1-8cf0-4373-984c-6757c9aeedc3,10,0.0
217709,fff12f36-aafc-4563-af81-e8a5155a7b60,6,6.0
217710,fff3cbe3-da73-438c-a16c-020b5219d4e1,25,0.0
217711,fffb9859-ba42-4920-ab3c-80779491c582,7,7.0


In [12]:
users = data['uid'].unique()
grouped['per'] = grouped['sum'] / grouped['count']

def enc_pref(arg):
    """
        кто больше интересуется продажей - 1
        кто больше интересуется арендой - 0
    """
    if arg > 0.8:
        return 1
    elif arg < 0.2:
        return 0
    else:
        return -1
grouped['pref'] = grouped['per'].progress_apply(enc_pref)
print(grouped.shape)
grouped = grouped[grouped['pref'] >= 0]
print(grouped.shape)

100%|███████████████████████████████| 217713/217713 [00:00<00:00, 696449.40it/s]

(217713, 5)
(195293, 5)





In [14]:
with open('mapping_out_seq.json') as f:
    uid_encoder = json.load(f)
grouped['uid_enc'] = grouped['uid'].map(uid_encoder) + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grouped['uid_enc'] = grouped['uid'].map(uid_encoder) + 1


In [20]:
grouped.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [22]:
grouped['uid_enc'] = grouped['uid_enc'].astype(int)
grouped[['uid_enc', 'pref']].to_csv('uid_12_week_preferences.csv')
grouped

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grouped['uid_enc'] = grouped['uid_enc'].astype(int)


Unnamed: 0,uid,count,sum,per,pref,uid_enc
2,000695d8-e5c2-4d64-a570-2d5432575dc7,9,9.0,1.000000,1,14908
3,00083E71-627E-4922-9714-CFB9CB05016F,10,1.0,0.100000,0,77272
4,00098EAF-21C2-41E6-AE6C-9F96DD5408CC,13,0.0,0.000000,0,153632
6,000AB396-667C-45B3-A4B5-9FC6CCDAF604,15,14.0,0.933333,1,172584
10,000ce71d-e87f-4e76-901f-decef1606cea,17,17.0,1.000000,1,114597
...,...,...,...,...,...,...
217704,ffe2cecd-a242-4cbf-909c-7a5cb6a8aea1,9,0.0,0.000000,0,352851
217705,ffe31461-e6d7-479b-adfb-d5003a79c968,21,21.0,1.000000,1,120610
217706,ffe31b84-655e-47f5-aca4-d27da38f9700,7,0.0,0.000000,0,411440
217707,ffe449af-e79a-42af-bc0c-2f01a63ad701,44,44.0,1.000000,1,26456


=======================================================================

## Составляем итоговый датасет
Эмбеды пользователей из модели - признаки (с первой недели)

Предпочтения пользователя, которые только что получили - таргет (со второй недели)

In [31]:
user_embeds = np.load('user_embeddings_final.npy')
user_embeds = pd.DataFrame(user_embeds, columns = range(user_embeds.shape[1]))
user_embeds['num'] = range(user_embeds.shape[0])
user_embeds.head()

Unnamed: 0,uid_enc,pref
2,14908,1
3,77272,0
4,153632,0
6,172584,1
10,114597,1
...,...,...
217704,352851,0
217705,120610,1
217706,411440,0
217707,26456,1


In [None]:
final_data = user_embeds.merge(grouped[['uid_enc', 'pref']], right_on='num', left_on='uid_enc', how='inner')
final_data.head()

## Подстановка в модель

In [None]:
X = final_data.drop(['pref'])
y = final_data['pref']

In [None]:
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X) # ??
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_scaled, y, test_size=0.2, \
                                                                    random_state=42)

In [None]:
# Создание матриц наблюдений в формате DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=best_features)
#dvalid ??
dtest = xgb.DMatrix(X_test, feature_names=best_features)

In [None]:
# Гиперпараметры модели
xgb_pars = {'min_child_weight': 20, 'eta': 0.1, 'colsample_bytree': 0.9, 
            'max_depth': 6, 'subsample': 0.9, 'lambda': 1, 'nthread': -1, 
            'booster' : 'gbtree', 'eval_metric': 'rmse', 'objective': 'reg:squarederror'
           }
# Тренировочная и валидационная выборка
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
# Обучаем модель XGBoost
model = xgb.train(
    params=xgb_pars, #гиперпараметры модели
    dtrain=dtrain, #обучающая выборка
    num_boost_round=300, #количество моделей в ансамбле
    evals=watchlist, #выборки, на которых считается матрица
    early_stopping_rounds=20, #раняя остановка
    maximize=False, #смена поиска максимума на минимум
    verbose_eval=10 #шаг, через который происходит отображение метрик
)

In [None]:
#Делаем предсказание на тестовом наборе данных
y_test_predict = np.exp(model.predict(dtest)) - 1
print('Modeling RMSLE %.5f' % model.best_score)