In [58]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import json
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
tqdm.pandas()

In [2]:
#help(pd.read_csv)

In [3]:
data1 = pd.read_csv('lst_announcement_data_1.gzip', compression='gzip')
print('data1 has been read')
data2 = pd.read_csv('lst_announcement_data_2.gzip', compression='gzip')
print('data2 has been read')
data3 = pd.read_csv('lst_announcement_data_3.gzip', compression='gzip')
print('data3 has been read')

data1 has been read
data2 has been read
data3 has been read


In [4]:
%%time
data = pd.concat([data1, data2, data3])

CPU times: user 7.65 s, sys: 26.9 s, total: 34.5 s
Wall time: 54.8 s


In [5]:
del data1
del data2
del data3

In [6]:
data.head()

Unnamed: 0,id,floornumber,category,bargainterms,description,totalarea,geo,building,land,hasfurniture,roomscount,hasconditioner,haskitchenfurniture,hastv,haswasher,hasbathtub,hasgarage,hasshower,bedroomscount,hasdishwasher,repairtype,petsallowed,hasbathhouse,hasfridge,wclocationtype,kitchenarea,roomarea,isapartments,loggiascount,balconiescount,allroomsarea,livingarea,flattype,combinedwcscount,ptn_dadd
0,191401467,,townhouseSale,"{""price"":5.5E7,""currency"":""rur"",""priceType"":""a...",Все предложения в Барвихе! ОКП Барвиха Клаб. В...,400.0,"{""countryId"":138,""undergrounds"":[],""calculated...","{""materialType"":""monolith"",""parking"":{},""floor...","{""status"":""individualHousingConstruction"",""are...",,,,,,,,True,,5.0,,,,True,,indoors,,,,,,,,,,2022-08-01
1,212158376,,commercialLandRent,"{""clientFee"":0,""leaseTermType"":""longTerm"",""pri...",Сдается от собственника в аренду площадка 560 ...,,"{""countryId"":138,""undergrounds"":[],""calculated...","{""parking"":{},""cranageTypes"":[],""extinguishing...","{""status"":""settlements"",""possibleToChangeStatu...",,,,,,,,,,,,,,,,,,,,,,,,,,2022-08-01
2,227815832,,landSale,"{""price"":240000.0,""currency"":""rur"",""priceType""...",Продам участок в жилом районе с. ХОМУТОВО,,"{""countryId"":138,""undergrounds"":[],""calculated...","{""parking"":{},""cranageTypes"":[],""extinguishing...","{""status"":""individualHousingConstruction"",""are...",,,,,,,,,,,,,,,,,,,,,,,,,,2022-08-01
3,238017518,1.0,flatRent,"{""clientFee"":0,""leaseTermType"":""longTerm"",""pri...","Привет, наконец ты попал сюда!\n\nУютная студи...",27.0,"{""countryId"":138,""undergrounds"":[],""calculated...","{""materialType"":""brick"",""parking"":{},""floorsCo...",{},True,,True,True,,True,True,,,,,euro,False,,True,,,,False,,1.0,,,studio,,2022-08-01
4,241597454,20.0,flatSale,"{""price"":1.6E7,""currency"":""rur"",""priceType"":""a...",Видовой 20 этаж. Корпус Д. Срочная перепродажа...,60.65,"{""countryId"":138,""undergrounds"":[],""calculated...","{""name"":""ЖК «CASABLANKA (Касабланка)»"",""materi...",{},,2.0,,,,,,,,,,no,,,,,22.0,,False,1.0,1.0,18+20,,rooms,,2022-08-01


### Предыдущие действия по преобразованию данных

In [7]:
def get_nan_cols(df, nan_percent=0.8):
    """
        функция для определения колонок, в которых много NaN
    """
    threshold = len(df.index) * nan_percent
    return [c for c in df.columns if df[c].isna().sum() >= threshold]

colss = get_nan_cols(data, 0.9)
data = data.drop(colss, axis=1)
data.drop(['ptn_dadd'], axis=1, inplace=True) 

In [8]:
data['loc_id'] = data['geo'].progress_apply(lambda x: json.loads(x)['address'][0]['id'])
data['lat'] = data['geo'].progress_apply(lambda x: json.loads(x)['coordinates']['lat'])
data['lng'] = data['geo'].progress_apply(lambda x: json.loads(x)['coordinates']['lng'])
data.drop(['geo'], axis=1, inplace=True)

100%|██████████████████████████████| 2778005/2778005 [01:52<00:00, 24635.05it/s]
100%|██████████████████████████████| 2778005/2778005 [01:51<00:00, 24936.67it/s]
100%|██████████████████████████████| 2778005/2778005 [01:51<00:00, 24896.16it/s]


In [9]:
data['bargainterms'] = data['bargainterms'].progress_apply(json.loads)
keys_bargainterms = data['bargainterms'].apply(lambda x: x.keys()).explode().unique()
bargainterms_df = pd.DataFrame(data['bargainterms'].values.tolist())

data['building'] = data['building'].progress_apply(json.loads)
keys_building = data['building'].apply(lambda x: x.keys()).explode().unique()
building_df = pd.DataFrame(data['building'].values.tolist())

data['land'] = data['land'].progress_apply(json.loads)
keys_land = data['land'].apply(lambda x: x.keys()).explode().unique()
land_df = pd.DataFrame(data['land'].values.tolist())

land_df.rename(columns={'type': 'land_type'}, inplace=True)

100%|█████████████████████████████| 2778005/2778005 [00:26<00:00, 105505.13it/s]
100%|██████████████████████████████| 2778005/2778005 [02:15<00:00, 20502.41it/s]
100%|█████████████████████████████| 2778005/2778005 [00:09<00:00, 288661.00it/s]


In [10]:
# удаление строковых колонок
def del_str_cols(df):
    columns = df.columns
    for col in columns:
        if type(df[col].iloc[1]) == str:
            df.drop(col, inplace=True, axis=1)
    return df

print(data.shape)
print(bargainterms_df.shape)
print(building_df.shape)
print(land_df.shape)

data = del_str_cols(data)
bargainterms_df = del_str_cols(bargainterms_df)
building_df = del_str_cols(building_df)
land_df = del_str_cols(land_df)

print(data.shape)
print(bargainterms_df.shape)
print(building_df.shape)
print(land_df.shape)

(2778005, 30)
(2778005, 28)
(2778005, 33)
(2778005, 5)
(2778005, 28)
(2778005, 22)
(2778005, 33)
(2778005, 3)


In [11]:
# почистим колонки с большим кол-вом пропусков и удалим ненужные словарные
data.drop(['bargainterms', 'building', 'land'], axis=1, inplace=True)
colss = get_nan_cols(data, 0.9)
data.drop(colss, axis=1, inplace=True)

colss = get_nan_cols(bargainterms_df, 0.9)
bargainterms_df.drop(colss, axis=1, inplace=True)

colss = get_nan_cols(building_df, 0.9)
building_df.drop(colss, axis=1, inplace=True)

colss = get_nan_cols(land_df, 0.9)
land_df.drop(colss, axis=1, inplace=True)

print(data.shape)
print(bargainterms_df.shape)
print(building_df.shape)
print(land_df.shape)

(2778005, 25)
(2778005, 9)
(2778005, 17)
(2778005, 1)


In [12]:
display(data.head())
display(bargainterms_df.head())
display(building_df.head())
land_df.head()

Unnamed: 0,id,floornumber,totalarea,hasfurniture,roomscount,haskitchenfurniture,hastv,haswasher,hasbathtub,bedroomscount,repairtype,petsallowed,hasfridge,wclocationtype,kitchenarea,isapartments,loggiascount,balconiescount,allroomsarea,livingarea,flattype,combinedwcscount,loc_id,lat,lng
0,191401467,,400.0,,,,,,,5.0,,,,indoors,,,,,,,,,4593,55.718725,37.272026
1,212158376,,,,,,,,,,,,,,,,,,,,,,4579,61.809605,34.296402
2,227815832,,,,,,,,,,,,,,,,,,,,,,4572,52.485696,104.341456
3,238017518,1.0,27.0,True,,True,,True,True,,euro,False,True,,,False,,1.0,,,studio,,4553,45.021564,38.925492
4,241597454,20.0,60.65,,2.0,,,,,,no,,,,22.0,False,1.0,1.0,18+20,,rooms,,4584,43.485547,39.893921


Unnamed: 0,price,includedOptions,mortgageAllowed,clientFee,agentFee,prepayMonths,deposit,utilitiesTerms,saleType
0,55000000.0,[],True,,,,,,
1,6000.0,[],,0.0,0.0,1.0,,,
2,240000.0,[],,,,,,,
3,12990.0,[],,0.0,0.0,1.0,8000.0,"{'includedInPrice': False, 'price': 1.0, 'flow...",
4,16000000.0,[],True,,,,,,free


Unnamed: 0,materialType,parking,floorsCount,heatingType,cranageTypes,extinguishingSystemTypes,liftTypes,infrastructure,totalArea,openingHours,deadline,name,hasGarbageChute,passengerLiftsCount,buildYear,cargoLiftsCount,ceilingHeight
0,monolith,{},4.0,centralGas,[],[],[],{},400.0,{},{},,,,,,
1,,{},,,[],[],[],{},,{},{},,,,,,
2,,{},,,[],[],[],{},,{},{},,,,,,
3,brick,{},3.0,,[],[],[],{},27.0,{},{},,,,,,
4,monolith,{'type': 'multilevel'},20.0,,[],[],[],{},60.7,{},{},ЖК «CASABLANKA (Касабланка)»,True,3.0,2014.0,,


Unnamed: 0,area
0,2.0
1,5.0
2,10.0
3,
4,


### Очистка от выбросов


In [14]:
data = data.join(bargainterms_df).join(building_df).join(land_df)

In [13]:
# here is testing
col_name = data['ceilingHeight']
print(col_name.min())
print(col_name.max())

0.0
4500.0


In [15]:
# rows to delete
data.drop(data[data['kitchenarea'] == -1].index, inplace=True)
data.drop(data[data['balconiescount'] == -1].index, inplace=True)
data.drop(data[data['combinedwcscount'] < 0].index, inplace=True) 
data.drop(data[data['passengerLiftsCount'] < 0].index, inplace=True)
data.drop(data[data['cargoLiftsCount'] < 0].index, inplace=True)

### Разбиение на бины

In [16]:
### test for defining parameter q
col_name = data['buildYear']
q =10
pd.qcut(col_name, q=q)

0         NaN
0         NaN
0         NaN
1         NaN
1         NaN
         ... 
999997    NaN
999998    NaN
999998    NaN
999999    NaN
999999    NaN
Name: buildYear, Length: 2777998, dtype: category
Categories (10, interval[float64, right]): [(999.999, 1962.0] < (1962.0, 1971.0] < (1971.0, 1980.0] < (1980.0, 1991.0] ... (2011.0, 2015.0] < (2015.0, 2018.0] < (2018.0, 2021.0] < (2021.0, 2029.0]]

In [17]:
def my_qcut(col, q):
    
    tmp = pd.qcut(col, q=q)
    encoder = dict(zip(tmp.unique(), list(range(q + 1))))
    return col.map(encoder)

##### data:

floornumber 9
totalarea 10
hasfurniture BOOL
roomscount 4
haskitchenfurniture	hastv	haswasher	hasbathtub BOOL
bedroomscount 4
repairtype CAT
petsallowed	hasfridge BOOL
wclocationtype CAT
kitchenarea	10
isapartments BOOL
loggiascount 1
balconiescount 3
##allroomsarea DELETE
livingarea 10
flattype CAT
combinedwcscount 2
loc_id - возможно, ранжирование не имеет смысла
lat	10
lng	10

##### bargainterms_df:

price 10
##includedOptions DELETE
mortgageAllowed	BOOL
clientFee 2
agentFee 1
prepayMonths 1
deposit 4	
utilitiesTerms DICT
saleType CAT

##### building_df:

materialType CAT
parking DICT
floorsCount 6
heatingType CAT
##cranageTypes DELETE
##extinguishingSystemTypes DELETE
##liftTypes DELETE
infrastructure DICT
totalArea 10
openingHours DICT
deadline DICT	
##name DELETE
hasGarbageChute BOOL
passengerLiftsCount 2
buildYear 10
cargoLiftsCount 3
ceilingHeight 10

##### land_df:

area 10

In [20]:
data['floornumber'] = my_qcut(data['floornumber'], 9)
data['totalarea'] = my_qcut(data['totalarea'], 10)
data['roomscount'] = my_qcut(data['roomscount'], 4)
data['bedroomscount'] = my_qcut(data['bedroomscount'], 4)
data['kitchenarea'] = my_qcut(data['kitchenarea'], 10)
data['loggiascount'] = my_qcut(data['loggiascount'], 1)
data['balconiescount'] = my_qcut(data['balconiescount'], 2)
data['livingarea'] = my_qcut(data['livingarea'], 10)
data['combinedwcscount'] = my_qcut(data['combinedwcscount'], 2)
data['lat'] = my_qcut(data['lat'], 10)
data['lng'] = my_qcut(data['lng'], 10)

# из bargainterms_df
data['price'] = my_qcut(data['price'], 10)
data['clientFee'] = my_qcut(data['clientFee'], 1)
data['agentFee'] = my_qcut(data['agentFee'], 1)
data['prepayMonths'] = my_qcut(data['prepayMonths'], 1)
data['deposit'] = my_qcut(data['deposit'], 4)

# из building_df
data['floorsCount'] = my_qcut(data['floorsCount'], 6)
data['totalArea'] = my_qcut(data['totalArea'], 10)
data['passengerLiftsCount'] = my_qcut(data['passengerLiftsCount'], 2)
data['buildYear'] = my_qcut(data['buildYear'], 10)
data['cargoLiftsCount'] = my_qcut(data['cargoLiftsCount'], 2)
data['ceilingHeight'] = my_qcut(data['ceilingHeight'], 10)

# из land_df
data['area'] = my_qcut(data['area'], 10)

In [21]:
data

Unnamed: 0,id,floornumber,totalarea,hasfurniture,roomscount,haskitchenfurniture,hastv,haswasher,hasbathtub,bedroomscount,repairtype,petsallowed,hasfridge,wclocationtype,kitchenarea,isapartments,loggiascount,balconiescount,allroomsarea,livingarea,flattype,combinedwcscount,loc_id,lat,lng,price,includedOptions,mortgageAllowed,clientFee,agentFee,prepayMonths,deposit,utilitiesTerms,saleType,materialType,parking,floorsCount,heatingType,cranageTypes,extinguishingSystemTypes,liftTypes,infrastructure,totalArea,openingHours,deadline,name,hasGarbageChute,passengerLiftsCount,buildYear,cargoLiftsCount,ceilingHeight,area
0,191401467,,0.0,,,,,,,0.0,,,,indoors,,,,,,,,,4593,0,0.0,0,[],True,,,,,,,monolith,{},0.0,centralGas,[],[],[],{},0.0,{},{},,,,,,,0.0
0,275557007,1.0,1.0,,,,,,,,euro,,,,,False,,,,,studio,1.0,4584,1,1.0,0,[],True,,,,,,,monolith,{},0.0,centralGas,[],[],[],{},0.0,{},{},,,,,,,0.0
0,274482535,,2.0,,,,,,,,,,,,,,,,,,,,4593,0,2.0,0,[],True,,,,,,,monolith,{},0.0,centralGas,[],[],[],{},0.0,{},{},,,,,,,0.0
1,212158376,,,,,,,,,,,,,,,,,,,,,,4579,2,3.0,1,[],,1.0,1.0,1.0,,,,,{},,,[],[],[],{},,{},{},,,,,,,1.0
1,275619810,2.0,4.0,,1.0,,,,,,euro,,,,,,1.0,1.0,,,rooms,1.0,4629,2,4.0,1,[],,1.0,1.0,1.0,,,,,{},,,[],[],[],{},,{},{},,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999997,274206329,,,,,,,,,,,,,,,,,,,,,,4591,5,7.0,2,[],,,,,,,,,{},,,[],[],[],{},,{},{},,,,,,,2.0
999998,275226384,1.0,1.0,,,,,,,,,,,,4.0,False,,,10.06,8.0,studio,,2,2,5.0,5,[],False,,,,,,fz214,,{},6.0,,[],[],[],{},2.0,{},"{'quarter': 'second', 'year': 2024, 'isComplet...",,,,,,1.0,
999998,274265180,4.0,0.0,,,,,,,,,,,,,,,,,,,,4596,5,8.0,5,[],False,,,,,,fz214,,{},6.0,,[],[],[],{},2.0,{},"{'quarter': 'second', 'year': 2024, 'isComplet...",,,,,,1.0,
999999,275347394,9.0,6.0,,2.0,,,,,,,,,,9.0,,,2.0,,4.0,rooms,,4599,9,4.0,8,[],True,,,,,"{'includedInPrice': False, 'price': 0.0, 'flow...",free,panel,{},4.0,,[],[],[],{},,{},{},,,2.0,,,4.0,


In [22]:
colss = get_nan_cols(data, 0.9)
data = data.drop(colss, axis=1)
data.head()

Unnamed: 0,id,floornumber,totalarea,hasfurniture,roomscount,haskitchenfurniture,hastv,haswasher,hasbathtub,bedroomscount,repairtype,petsallowed,hasfridge,wclocationtype,kitchenarea,isapartments,loggiascount,balconiescount,allroomsarea,livingarea,flattype,combinedwcscount,loc_id,lat,lng,price,includedOptions,mortgageAllowed,clientFee,agentFee,prepayMonths,deposit,utilitiesTerms,saleType,materialType,parking,floorsCount,heatingType,cranageTypes,extinguishingSystemTypes,liftTypes,infrastructure,totalArea,openingHours,deadline,hasGarbageChute,passengerLiftsCount,buildYear,cargoLiftsCount,ceilingHeight,area
0,191401467,,0.0,,,,,,,0.0,,,,indoors,,,,,,,,,4593,0,0.0,0,[],True,,,,,,,monolith,{},0.0,centralGas,[],[],[],{},0.0,{},{},,,,,,0.0
0,275557007,1.0,1.0,,,,,,,,euro,,,,,False,,,,,studio,1.0,4584,1,1.0,0,[],True,,,,,,,monolith,{},0.0,centralGas,[],[],[],{},0.0,{},{},,,,,,0.0
0,274482535,,2.0,,,,,,,,,,,,,,,,,,,,4593,0,2.0,0,[],True,,,,,,,monolith,{},0.0,centralGas,[],[],[],{},0.0,{},{},,,,,,0.0
1,212158376,,,,,,,,,,,,,,,,,,,,,,4579,2,3.0,1,[],,1.0,1.0,1.0,,,,,{},,,[],[],[],{},,{},{},,,,,,1.0
1,275619810,2.0,4.0,,1.0,,,,,,euro,,,,,,1.0,1.0,,,rooms,1.0,4629,2,4.0,1,[],,1.0,1.0,1.0,,,,,{},,,[],[],[],{},,{},{},,,,,,1.0


In [23]:
# удалим нечисловые признаки
data.drop(['hasfurniture', 'haskitchenfurniture', 'hastv', 'haswasher', 'hasbathtub', \
           'repairtype', 'petsallowed', 'hasfridge', 'wclocationtype', 'isapartments', \
           'flattype', 'loc_id', 'includedOptions', 'mortgageAllowed', 'utilitiesTerms', \
           'saleType', 'materialType', 'parking', 'heatingType', 'cranageTypes', \
           'extinguishingSystemTypes', 'liftTypes', 'infrastructure', 'openingHours',\
           'deadline', 'hasGarbageChute', 'allroomsarea'], axis=1, inplace=True)

In [24]:
data.head()

Unnamed: 0,id,floornumber,totalarea,roomscount,bedroomscount,kitchenarea,loggiascount,balconiescount,livingarea,combinedwcscount,lat,lng,price,clientFee,agentFee,prepayMonths,deposit,floorsCount,totalArea,passengerLiftsCount,buildYear,cargoLiftsCount,ceilingHeight,area
0,191401467,,0.0,,0.0,,,,,,0,0.0,0,,,,,0.0,0.0,,,,,0.0
0,275557007,1.0,1.0,,,,,,,1.0,1,1.0,0,,,,,0.0,0.0,,,,,0.0
0,274482535,,2.0,,,,,,,,0,2.0,0,,,,,0.0,0.0,,,,,0.0
1,212158376,,,,,,,,,,2,3.0,1,1.0,1.0,1.0,,,,,,,,1.0
1,275619810,2.0,4.0,1.0,,,1.0,1.0,,1.0,2,4.0,1,1.0,1.0,1.0,,,,,,,,1.0


In [25]:
data.shape

(2777998, 24)

### Обработка пропусков

In [26]:
# обработка пропусков
# средним???
def fillna_num(data_col):
    return data_col.fillna(data_col.mean())

for col in tqdm(data.columns[1:]):
    data[col] = fillna_num(data[col])

100%|███████████████████████████████████████████| 23/23 [00:00<00:00, 24.42it/s]


### Отфильтруем id, что есть в clickstream 

In [27]:
data1 = pd.read_csv('internship_clickstream_data_1.gzip', compression='gzip')
data2 = pd.read_csv('internship_clickstream_data_2.gzip', compression='gzip')
data3 = pd.read_csv('internship_clickstream_data_3.gzip', compression='gzip')
print(data1.shape, data2.shape, data3.shape)

(30000000, 8) (30000000, 8) (22463346, 8)


In [28]:
# drop duplicates
data1.drop_duplicates(subset=['uid', 'offer_id'], inplace=True)
data2.drop_duplicates(subset=['uid', 'offer_id'], inplace=True)
data3.drop_duplicates(subset=['uid', 'offer_id'], inplace=True)
print(data1.shape, data2.shape, data3.shape)

# filter by date
data1['timestamp'] = pd.to_datetime(data1['timestamp'])
data2['timestamp'] = pd.to_datetime(data2['timestamp'])
data3['timestamp'] = pd.to_datetime(data3['timestamp'])
data1 = data1[data1['timestamp'] < pd.to_datetime('2022-07-10 00:00:00.020000')]
data2 = data2[data2['timestamp'] < pd.to_datetime('2022-07-10 00:00:00.020000')]
data3 = data3[data3['timestamp'] < pd.to_datetime('2022-07-10 00:00:00.020000')]
print(data1.shape, data2.shape, data3.shape)

clickstream = pd.concat([data1, data2, data3])
print('after concating:', clickstream.shape)
clickstream.drop_duplicates(subset=['uid', 'offer_id'], inplace=True)
print('after drop duplicates:', clickstream.shape)

(22585345, 8) (22559623, 8) (17427910, 8)
(5602387, 8) (5352304, 8) (3440925, 8)
after concating: (14395616, 8)
after drop duplicates: (12581794, 8)


In [29]:
del data1
del data2
del data3

In [30]:
# check what offers is in embedding file
embedding_ID = data['id']
clickstream = clickstream[clickstream['offer_id'].isin(embedding_ID)]

In [31]:
# take users that have > 5 clicks
while not (clickstream['offer_id'].value_counts()[clickstream['offer_id'].value_counts() <= 5].empty) or \
      not (clickstream['uid'].value_counts()[clickstream['uid'].value_counts() <= 5].empty):
    offer_ids = clickstream['offer_id'].value_counts()[clickstream['offer_id'].value_counts() > 5].index
    clickstream = clickstream[clickstream['offer_id'].isin(offer_ids)]
    uids = clickstream['uid'].value_counts()[clickstream['uid'].value_counts() > 5].index
    clickstream = clickstream[clickstream['uid'].isin(uids)]
print(clickstream.shape)

(8663549, 8)


In [53]:
# фильтруем
def filter_embed(embeds, offers):
    """
        Filter embedding file, leaves only those that are in the dataset
        INPUT: embeds, offers - numpy arrays (of lists)
        OUTPUT: modified dataset
    """
    embeds['mask'] = np.isin(np.array(embeds['id']), np.array(offers))#.reshape((embeds.shape[0], 1))
    embeds = embeds[embeds['mask'] != 0]
    embeds.drop(['mask'], inplace=True, axis=1)
    return embeds
offers = clickstream['offer_id'].unique()
embeddings = filter_embed(data, offers)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [54]:
embeddings.shape

(388604, 24)

### Группировка по ID

In [55]:
sh1 = data.shape
sh2 = data.drop_duplicates(subset=embeddings.columns[1:], inplace=False).shape
print(sh1, sh2)
sh1[0] - sh2[0]

(2777998, 25) (2436445, 25)


341553

In [56]:
sh1 = embeddings.shape
sh2 = embeddings.drop_duplicates(subset=embeddings.columns[1:], inplace=False).shape
print(sh1, sh2)
sh1[0] - sh2[0]

(388604, 24) (375399, 24)


13205

In [57]:
embeddings.head()

Unnamed: 0,id,floornumber,totalarea,roomscount,bedroomscount,kitchenarea,loggiascount,balconiescount,livingarea,combinedwcscount,lat,lng,price,clientFee,agentFee,prepayMonths,deposit,floorsCount,totalArea,passengerLiftsCount,buildYear,cargoLiftsCount,ceilingHeight,area
2,274683091,4.946701,0.0,2.285319,2.391695,5.41219,1.0,1.031852,5.409267,1.05114,2,5.0,2,1.0,1.0,1.0,2.522954,2.927414,5.402581,1.79454,5.113357,1.892738,4.441047,2.0
3,238017518,4.0,1.0,2.285319,2.391695,5.41219,1.0,1.0,5.409267,1.05114,4,1.0,1,1.0,1.0,1.0,1.0,0.0,2.0,1.79454,5.113357,1.892738,4.441047,5.072844
6,246784149,4.946701,5.0,2.285319,3.0,5.41219,1.0,1.031852,5.409267,1.05114,8,3.0,1,1.0,1.0,1.0,2.0,2.927414,4.0,1.79454,1.0,1.892738,4.441047,0.0
12,261757282,6.0,9.0,1.0,2.391695,2.0,1.0,1.031852,4.0,1.05114,9,4.0,7,1.0,1.0,1.0,2.522954,4.0,3.0,1.79454,5.113357,1.892738,2.0,5.072844
16,263336719,4.946701,0.0,2.285319,2.0,5.41219,1.0,1.031852,5.409267,1.05114,5,0.0,6,1.0,1.0,1.0,1.0,3.0,0.0,1.79454,3.0,1.892738,4.441047,6.0


In [59]:
embeddings['id'] = embeddings['id'].astype(str) + '+'

In [60]:
embeddings_grouped = embeddings.groupby(list(embeddings.columns[1:])).sum().reset_index()
embeddings_grouped

Unnamed: 0,floornumber,totalarea,roomscount,bedroomscount,kitchenarea,loggiascount,balconiescount,livingarea,combinedwcscount,lat,lng,price,clientFee,agentFee,prepayMonths,deposit,floorsCount,totalArea,passengerLiftsCount,buildYear,cargoLiftsCount,ceilingHeight,area,id
0,1.0,0.0,1.0,2.391695,3.0,1.0,1.000000,5.409267,1.00000,2,5.0,0,1.0,1.0,1.0,2.522954,3.0,0.000000,1.79454,5.000000,1.892738,4.441047,8.000000,275275003+
1,1.0,0.0,1.0,2.391695,3.0,1.0,1.000000,9.000000,1.00000,2,5.0,0,1.0,1.0,1.0,2.522954,6.0,8.000000,2.00000,5.000000,2.000000,8.000000,5.072844,268076006+
2,1.0,0.0,1.0,2.391695,3.0,1.0,1.000000,9.000000,1.05114,4,2.0,2,1.0,1.0,1.0,2.522954,3.0,6.000000,1.79454,7.000000,1.892738,4.441047,8.000000,198216332+
3,1.0,0.0,1.0,2.391695,3.0,1.0,1.000000,9.000000,1.05114,8,4.0,0,1.0,1.0,1.0,2.522954,2.0,0.000000,1.79454,4.000000,1.892738,4.441047,5.072844,275296875+
4,1.0,0.0,1.0,2.391695,3.0,1.0,1.000000,9.000000,2.00000,2,5.0,0,1.0,1.0,1.0,2.522954,6.0,0.000000,1.00000,3.000000,2.000000,4.441047,5.072844,274837176+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375394,9.0,10.0,4.0,2.391695,3.0,1.0,2.000000,5.409267,1.00000,1,8.0,7,1.0,1.0,1.0,2.522954,4.0,5.402581,2.00000,5.113357,1.892738,3.000000,5.072844,272171273+
375395,9.0,10.0,4.0,2.391695,4.0,1.0,1.031852,5.409267,1.05114,7,4.0,1,1.0,1.0,1.0,3.000000,4.0,6.000000,2.00000,8.000000,1.892738,4.441047,5.072844,269924487+
375396,9.0,10.0,4.0,2.391695,6.0,1.0,1.000000,5.409267,1.00000,5,7.0,8,1.0,1.0,1.0,2.522954,4.0,6.000000,1.79454,5.113357,1.000000,6.000000,5.072844,275311757+
375397,9.0,10.0,4.0,2.391695,8.0,1.0,1.000000,4.000000,1.00000,6,7.0,1,1.0,1.0,1.0,2.000000,4.0,6.000000,2.00000,4.000000,2.000000,4.441047,5.072844,274778317+


### Кодирование clickstream-embeddings по offer_id

In [61]:
embeddings_grouped['counter_range'] = [i for i in range(1, embeddings_grouped.shape[0] + 1)]
embeddings_grouped['id'] = embeddings_grouped['id'].apply(lambda x: x[:-1])
embeddings_grouped

Unnamed: 0,floornumber,totalarea,roomscount,bedroomscount,kitchenarea,loggiascount,balconiescount,livingarea,combinedwcscount,lat,lng,price,clientFee,agentFee,prepayMonths,deposit,floorsCount,totalArea,passengerLiftsCount,buildYear,cargoLiftsCount,ceilingHeight,area,id,counter_range
0,1.0,0.0,1.0,2.391695,3.0,1.0,1.000000,5.409267,1.00000,2,5.0,0,1.0,1.0,1.0,2.522954,3.0,0.000000,1.79454,5.000000,1.892738,4.441047,8.000000,275275003,1
1,1.0,0.0,1.0,2.391695,3.0,1.0,1.000000,9.000000,1.00000,2,5.0,0,1.0,1.0,1.0,2.522954,6.0,8.000000,2.00000,5.000000,2.000000,8.000000,5.072844,268076006,2
2,1.0,0.0,1.0,2.391695,3.0,1.0,1.000000,9.000000,1.05114,4,2.0,2,1.0,1.0,1.0,2.522954,3.0,6.000000,1.79454,7.000000,1.892738,4.441047,8.000000,198216332,3
3,1.0,0.0,1.0,2.391695,3.0,1.0,1.000000,9.000000,1.05114,8,4.0,0,1.0,1.0,1.0,2.522954,2.0,0.000000,1.79454,4.000000,1.892738,4.441047,5.072844,275296875,4
4,1.0,0.0,1.0,2.391695,3.0,1.0,1.000000,9.000000,2.00000,2,5.0,0,1.0,1.0,1.0,2.522954,6.0,0.000000,1.00000,3.000000,2.000000,4.441047,5.072844,274837176,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375394,9.0,10.0,4.0,2.391695,3.0,1.0,2.000000,5.409267,1.00000,1,8.0,7,1.0,1.0,1.0,2.522954,4.0,5.402581,2.00000,5.113357,1.892738,3.000000,5.072844,272171273,375395
375395,9.0,10.0,4.0,2.391695,4.0,1.0,1.031852,5.409267,1.05114,7,4.0,1,1.0,1.0,1.0,3.000000,4.0,6.000000,2.00000,8.000000,1.892738,4.441047,5.072844,269924487,375396
375396,9.0,10.0,4.0,2.391695,6.0,1.0,1.000000,5.409267,1.00000,5,7.0,8,1.0,1.0,1.0,2.522954,4.0,6.000000,1.79454,5.113357,1.000000,6.000000,5.072844,275311757,375397
375397,9.0,10.0,4.0,2.391695,8.0,1.0,1.000000,4.000000,1.00000,6,7.0,1,1.0,1.0,1.0,2.000000,4.0,6.000000,2.00000,4.000000,2.000000,4.441047,5.072844,274778317,375398


In [78]:
embeddings_grouped['id'] = embeddings_grouped['id'].apply(lambda x: x.split('+'))
for_map = embeddings_grouped[['id', 'counter_range']].explode('id')

In [82]:
for_map['id'] = for_map['id'].astype(int)
print(for_map.shape)
for_map

(388604, 2)


Unnamed: 0,id,counter_range
0,275275003,1
1,268076006,2
2,198216332,3
3,275296875,4
4,274837176,5
...,...,...
375394,272171273,375395
375395,269924487,375396
375396,275311757,375397
375397,274778317,375398


In [83]:
# offer encoder
offer_encoder = dict(zip(for_map['id'], for_map['counter_range']))
clickstream['offer_id_enc'] = clickstream['offer_id'].map(offer_encoder)
clickstream.head()

Unnamed: 0,timestamp,hit_id,uid,platform,event_name,screen,offer_id,ptn_dadd,offer_id_enc
680507,2022-07-09 00:38:02.242,9d3c7688534e48c5,49913337,ios,OpenOfferScreen,RecommendationsScreen,274396956,2022-07-09,268210
680508,2022-07-09 00:38:51.240,637eb286bad14cd9,28AC55D5-B94C-4AF1-B1B3-0A72A71BE0DA,ios,OpenOfferScreen,SearchResultsList,275220523,2022-07-09,357908
680509,2022-07-09 00:39:20.699,998700915cc746ed,76885983,ios,OpenOfferScreen,NewBuildingOffersList,275324448,2022-07-09,367811
680511,2022-07-09 00:40:27.075,c45598e4b9294c8e,28AC55D5-B94C-4AF1-B1B3-0A72A71BE0DA,ios,OpenOfferScreen,MapScreen,275697649,2022-07-09,222192
680512,2022-07-09 00:40:37.734,c4d64940f28f4431,90050966,ios,OpenOfferScreen,RecommendationsScreen,272633121,2022-07-09,173239


In [86]:
# user's is encoder
uid_encoder = {uid: ind for ind, uid in enumerate(clickstream['uid'].unique())}
clickstream['uid_enc'] = clickstream['uid'].map(uid_encoder) + 1

In [87]:
# sort by time and user id
clickstream['timestamp'] = pd.to_datetime(clickstream['timestamp'])
clickstream.sort_values(by=['uid_enc', 'timestamp'], inplace=True)
print(clickstream.shape)
clickstream.head(5)

(8663549, 10)


Unnamed: 0,timestamp,hit_id,uid,platform,event_name,screen,offer_id,ptn_dadd,offer_id_enc,uid_enc
13567692,2022-07-03 10:31:36.281,172f4ac946ff4677,49913337,ios,OpenOfferScreen,RecommendationsScreen,271821092,2022-07-03,256223,1
13567695,2022-07-03 10:31:53.808,57e6bd3399b94d2f,49913337,ios,OpenOfferScreen,RecommendationsScreen,271133138,2022-07-03,251986,1
60022,2022-07-03 10:32:56.753,b2071702a85d48e3,49913337,ios,OpenOfferScreen,RecommendationsScreen,273896418,2022-07-03,340542,1
29665848,2022-07-03 10:33:42.214,64508d9634914757,49913337,ios,OpenOfferScreen,RecommendationsScreen,274877165,2022-07-03,225356,1
13567714,2022-07-03 10:36:08.711,85aa6497a7484120,49913337,ios,OpenOfferScreen,RecommendationsScreen,274931249,2022-07-03,80312,1


In [89]:
# create .txt file for input to model
clickstream[['uid_enc', 'offer_id_enc']].to_csv('out_bins.txt', sep='\t', header=False, index=False)

### Вернёмся к эмбеддингам и сохраним полный преобразованный файл с ними

### Масштабирование данных

In [96]:
embeddings_grouped.columns[:-2]

Index(['floornumber', 'totalarea', 'roomscount', 'bedroomscount',
       'kitchenarea', 'loggiascount', 'balconiescount', 'livingarea',
       'combinedwcscount', 'lat', 'lng', 'price', 'clientFee', 'agentFee',
       'prepayMonths', 'deposit', 'floorsCount', 'totalArea',
       'passengerLiftsCount', 'buildYear', 'cargoLiftsCount', 'ceilingHeight',
       'area'],
      dtype='object')

In [97]:
# отмасштабируем числовые признаки, кроме id
scaler = MinMaxScaler()
embeddings_scaled = scaler.fit_transform(embeddings_grouped[embeddings_grouped.columns[:-2]])
print(np.max(embeddings_scaled), np.min(embeddings_scaled))

1.0 0.0


In [98]:
embeddings = np.hstack((np.array(embeddings_grouped['id']).reshape((embeddings_grouped.shape[0], 1)), \
                        embeddings_scaled))
print(embeddings.shape)
embeddings

(375399, 24)


array([[list(['275275003']), 0.0, 0.0, ..., 0.8927382283214156,
        0.38233858717265956, 0.8],
       [list(['268076006']), 0.0, 0.0, ..., 1.0, 0.7777777777777777,
        0.507284382957014],
       [list(['198216332']), 0.0, 0.0, ..., 0.8927382283214156,
        0.38233858717265956, 0.8],
       ...,
       [list(['275311757']), 1.0, 1.0, ..., 0.0, 0.5555555555555556,
        0.507284382957014],
       [list(['274778317']), 1.0, 1.0, ..., 1.0, 0.38233858717265956,
        0.507284382957014],
       [list(['275528519']), 1.0, 1.0, ..., 0.8927382283214156,
        0.38233858717265956, 0.507284382957014]], dtype=object)

In [99]:
# save in binary format
np.save('embeddings_bin_num.npy', embeddings)
np.save('embeddings_bin_num_ID.npy', embeddings[:, 0])  

======================================================================================================================

### Сохранение без группировки

In [158]:
data['id'] = data['id'].apply(lambda x: x[:-1]).astype(int)
data.shape

(2777998, 24)

In [159]:
data

Unnamed: 0,id,floornumber,totalarea,roomscount,bedroomscount,kitchenarea,loggiascount,balconiescount,livingarea,combinedwcscount,lat,lng,price,clientFee,agentFee,prepayMonths,deposit,floorsCount,totalArea,passengerLiftsCount,buildYear,cargoLiftsCount,ceilingHeight,area
0,191401467,4.708525,0.000000,2.125113,0.000000,5.463984,1.0,1.031852,5.438916,1.05114,0,0.0,0,1.461308,1.0,1.0,2.522749,0.000000,0.000000,1.79454,5.131124,1.892738,4.417647,0.00000
0,275557007,9.000000,2.000000,2.125113,2.391695,5.463984,1.0,1.031852,5.438916,1.00000,5,3.0,0,1.461308,1.0,1.0,2.522749,0.000000,0.000000,1.79454,5.131124,1.892738,4.417647,0.00000
0,274482535,4.708525,10.000000,2.125113,2.391695,5.463984,1.0,1.031852,5.438916,1.05114,0,5.0,0,1.461308,1.0,1.0,2.522749,0.000000,0.000000,1.79454,5.131124,1.892738,4.417647,0.00000
1,212158376,4.708525,5.425487,2.125113,2.391695,5.463984,1.0,1.031852,5.438916,1.05114,1,1.0,1,1.000000,1.0,1.0,2.522749,2.953382,5.341775,1.79454,5.131124,1.892738,4.417647,1.00000
1,275619810,3.000000,5.000000,2.000000,2.391695,5.463984,1.0,1.000000,5.438916,1.00000,1,8.0,1,1.000000,1.0,1.0,2.522749,2.953382,5.341775,1.79454,5.131124,1.892738,4.417647,1.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999997,274206329,4.708525,5.425487,2.125113,2.391695,5.463984,1.0,1.031852,5.438916,1.05114,8,6.0,2,1.461308,1.0,1.0,2.522749,2.953382,5.341775,1.79454,5.131124,1.892738,4.417647,2.00000
999998,275226384,9.000000,2.000000,2.125113,2.391695,2.000000,1.0,1.031852,6.000000,1.05114,1,9.0,5,1.461308,1.0,1.0,2.522749,6.000000,2.000000,1.79454,5.131124,1.892738,1.000000,5.04352
999998,274265180,1.000000,0.000000,2.125113,2.391695,5.463984,1.0,1.031852,5.438916,1.05114,8,4.0,5,1.461308,1.0,1.0,2.522749,6.000000,2.000000,1.79454,5.131124,1.892738,1.000000,5.04352
999999,275347394,8.000000,3.000000,1.000000,2.391695,7.000000,1.0,2.000000,4.000000,1.05114,6,8.0,6,1.461308,1.0,1.0,2.522749,5.000000,5.341775,2.00000,5.131124,1.892738,4.000000,5.04352


In [161]:
# масштабиирование
scaler = MinMaxScaler()
data_scaled2 = scaler.fit_transform(data[data.columns[:-1]])
print(np.max(data_scaled2), np.min(data_scaled2))

1.0 0.0


In [162]:
embeddings2 = np.hstack((np.array(data['id']).reshape((data.shape[0], 1)), data_scaled2))
print(embeddings2.shape)
embeddings2

(2777998, 24)


array([[1.91401467e+08, 6.90938243e-01, 4.63565663e-01, ...,
        4.59013788e-01, 8.92738228e-01, 3.79738522e-01],
       [2.75557007e+08, 9.96195365e-01, 1.00000000e+00, ...,
        4.59013788e-01, 8.92738228e-01, 3.79738522e-01],
       [2.74482535e+08, 9.92297936e-01, 4.63565663e-01, ...,
        4.59013788e-01, 8.92738228e-01, 3.79738522e-01],
       ...,
       [2.74265180e+08, 9.91509525e-01, 0.00000000e+00, ...,
        4.59013788e-01, 8.92738228e-01, 0.00000000e+00],
       [2.75347394e+08, 9.95435036e-01, 8.75000000e-01, ...,
        4.59013788e-01, 8.92738228e-01, 3.33333333e-01],
       [2.74270034e+08, 9.91527132e-01, 5.00000000e-01, ...,
        4.59013788e-01, 8.92738228e-01, 3.33333333e-01]])

In [163]:
np.save('embeddings_for_bin_num.npy', embeddings2)
np.save('embeddings_for_bin_num_ID.npy', embeddings2[:, 0])  