#### **此notebook要求较大内存，当前结果是在60G内存、0G Swap环境下运行，峰值占用能达到70%~80%。**

将user和user-item连接，将showPos、refresh分桶，将network转为One-Hot向量。

In [1]:
import gc
from pathlib import Path
import pandas as pd
import numpy as np
from time import time
import matplotlib.pyplot as plt
def _two_decimals(value):
    """Render a float with exactly two digits after the decimal point."""
    return '%.2f' % value


# pandas display settings used for every frame shown in this notebook:
# no line-wrapping of wide frames, two-decimal floats, every column visible,
# up to 1000 rows, and up to 200 characters per cell.
_DISPLAY_OPTIONS = {
    'expand_frame_repr': False,
    'display.float_format': _two_decimals,
    'display.max_columns': None,
    'display.max_rows': 1000,
    'max_colwidth': 200,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

In [2]:
# Load the user table and the user-item table from their pickles and report
# how long the two reads took together.
load_started = time()
user_df = pd.read_pickle('~/data/user.pkl')
ui_df = pd.read_pickle('~/data/user_item.pkl')
print(f'read {time() - load_started: .2f}s')

read  24.53s


In [3]:
# Left-join the user table onto the user-item rows by userId, so every
# user-item row is kept and gains the columns of user_df.
data = pd.merge(ui_df, user_df, how='left', on='userId')
# Store userId as a pandas categorical.
data['userId'] = data['userId'].astype('category')
# The two source frames are not used again below; drop the references.
del ui_df, user_df

In [4]:
# Preview the merged frame: a bare expression on the last line of a cell is
# rendered as the cell's output.
data

Unnamed: 0,userId,itemId,showTime,network,refresh,showPos,click,duration,deviceName,OS,province,city,0-24,25-29,30-39,40-,female,male
0,1000014754,463510256,2021-06-28 01:29:16.147,5,0,16,0,0,DVC-AN20,Android,江西,南昌,0.01,0.04,0.29,0.66,0.36,0.64
1,1000014754,463852707,2021-06-28 01:29:16.147,5,0,13,1,80,DVC-AN20,Android,江西,南昌,0.01,0.04,0.29,0.66,0.36,0.64
2,1000014754,464757134,2021-06-30 11:36:39.841,5,0,13,1,1050,DVC-AN20,Android,江西,南昌,0.01,0.04,0.29,0.66,0.36,0.64
3,1000014754,464617167,2021-06-30 11:36:39.841,5,0,16,1,286,DVC-AN20,Android,江西,南昌,0.01,0.04,0.29,0.66,0.36,0.64
4,1000014754,465426190,2021-07-04 07:07:01.168,5,0,5,0,0,DVC-AN20,Android,江西,南昌,0.01,0.04,0.29,0.66,0.36,0.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189766954,999938860,465384048,2021-07-02 23:46:39.623,2,2,18,0,0,vivoX9,Android,江西,景德镇,0.31,0.07,0.23,0.40,0.07,0.93
189766955,999938860,464821689,2021-07-02 23:46:08.356,2,1,15,1,96,vivoX9,Android,江西,景德镇,0.31,0.07,0.23,0.40,0.07,0.93
189766956,999938860,464835275,2021-07-02 23:46:08.356,2,1,13,0,0,vivoX9,Android,江西,景德镇,0.31,0.07,0.23,0.40,0.07,0.93
189766957,999938860,463270375,2021-06-26 22:41:30.961,2,7,67,0,0,vivoX9,Android,江西,景德镇,0.31,0.07,0.23,0.40,0.07,0.93


In [5]:
# One-hot encode the `network` column: one 0/1 indicator column per distinct
# value, named network_<value>.
#
# The dtype is pinned to uint8 explicitly. pandas.get_dummies produced uint8
# by default before pandas 2.0 and produces bool from 2.0 on; pinning keeps
# the indicators as compact 0/1 integers regardless of the installed version.
nw_df = pd.get_dummies(data['network'], prefix='network', dtype='uint8')
print(nw_df)

           network_2  network_3  network_4  network_5
0                  0          0          0          1
1                  0          0          0          1
2                  0          0          0          1
3                  0          0          0          1
4                  0          0          0          1
...              ...        ...        ...        ...
189766954          1          0          0          0
189766955          1          0          0          0
189766956          1          0          0          0
189766957          1          0          0          0
189766958          1          0          0          0

[189766959 rows x 4 columns]


In [6]:
# Bucket `refresh` into 8 quantile-based bins labelled refresh0..refresh7,
# then one-hot encode the bin each row falls into.
refresh_labels = ['refresh' + str(i) for i in range(8)]
refresh_bins = pd.qcut(data['refresh'].astype('int'), q=8, labels=refresh_labels)
# dtype is pinned to uint8: pandas.get_dummies defaulted to uint8 before
# pandas 2.0 and defaults to bool from 2.0 on, so pinning keeps the 0/1
# integer indicators regardless of the installed version.
rfr_df = pd.get_dummies(refresh_bins, dtype='uint8')
print(rfr_df)
# Only rfr_df is needed from here on.
del refresh_labels, refresh_bins

           refresh0  refresh1  refresh2  refresh3  refresh4  refresh5  refresh6  refresh7
0                 1         0         0         0         0         0         0         0
1                 1         0         0         0         0         0         0         0
2                 1         0         0         0         0         0         0         0
3                 1         0         0         0         0         0         0         0
4                 1         0         0         0         0         0         0         0
...             ...       ...       ...       ...       ...       ...       ...       ...
189766954         0         1         0         0         0         0         0         0
189766955         1         0         0         0         0         0         0         0
189766956         1         0         0         0         0         0         0         0
189766957         0         0         0         0         1         0         0         0
189766958 

In [7]:
# Bucket `showPos` into 8 quantile-based bins labelled showPos0..showPos7,
# then one-hot encode the bin each row falls into.
show_pos_labels = ['showPos' + str(i) for i in range(8)]
show_pos_bins = pd.qcut(data['showPos'].astype('int'), q=8, labels=show_pos_labels)
# dtype is pinned to uint8: pandas.get_dummies defaulted to uint8 before
# pandas 2.0 and defaults to bool from 2.0 on, so pinning keeps the 0/1
# integer indicators regardless of the installed version.
sp_df = pd.get_dummies(show_pos_bins, dtype='uint8')
print(sp_df)
# Only sp_df is needed from here on.
del show_pos_labels, show_pos_bins

           showPos0  showPos1  showPos2  showPos3  showPos4  showPos5  showPos6  showPos7
0                 0         1         0         0         0         0         0         0
1                 1         0         0         0         0         0         0         0
2                 1         0         0         0         0         0         0         0
3                 0         1         0         0         0         0         0         0
4                 1         0         0         0         0         0         0         0
...             ...       ...       ...       ...       ...       ...       ...       ...
189766954         0         0         1         0         0         0         0         0
189766955         0         1         0         0         0         0         0         0
189766956         1         0         0         0         0         0         0         0
189766957         0         0         0         0         0         0         1         0
189766958 

In [12]:
# Disabled attempts at removing the raw showPos / refresh / network columns.
# NOTE(review): the dtypes listing recorded further down contains none of
# these three columns, so one of these lines was presumably executed in the
# recorded session before being commented out - confirm.
# garbage = data[['showPos', 'refresh', 'network']]
# data.drop(['showPos', 'refresh', 'network'], axis=1, inplace=True)  # Why does memory usage go up after the drop?
# data = data.drop(['showPos', 'refresh', 'network'], axis=1)

In [8]:
# Assemble the final feature table: the merged user / user-item columns plus
# the one-hot blocks for network, refresh and showPos.
#
# The raw showPos / refresh / network columns are removed first, because the
# one-hot columns replace them and the recorded dtypes of the final frame do
# not contain them. Each column is deleted from `data` directly, one at a
# time, instead of calling DataFrame.drop, which returns a second, reduced
# frame alongside the original.
for encoded_col in ('showPos', 'refresh', 'network'):
    if encoded_col in data.columns:  # tolerate a column that is already gone
        del data[encoded_col]
del encoded_col

# Column-wise concatenation; all four frames share the row index of `data`.
data = pd.concat([data, nw_df, rfr_df, sp_df], axis=1)
# The one-hot frames are now part of `data`; drop the standalone references.
del nw_df, rfr_df, sp_df

In [20]:
# Show the dtype of every column in the combined frame.
data.dtypes

userId                object
itemId              category
showTime      datetime64[ns]
click               category
duration            category
deviceName          category
OS                  category
province            category
city                category
0-24                 float64
25-29                float64
30-39                float64
40-                  float64
female               float64
male                 float64
network_2              uint8
network_3              uint8
network_4              uint8
network_5              uint8
refresh0               uint8
refresh1               uint8
refresh2               uint8
refresh3               uint8
refresh4               uint8
refresh5               uint8
refresh6               uint8
refresh7               uint8
showPos0               uint8
showPos1               uint8
showPos2               uint8
showPos3               uint8
showPos4               uint8
showPos5               uint8
showPos6               uint8
showPos7      

In [19]:
# Write the first half of the rows (positions 0 up to, but excluding, the
# midpoint) to its own pickle; the remaining rows are written by a later cell.
first_half_end = len(data) // 2
data.iloc[:first_half_end].to_pickle('../data/all_data1.pkl')

In [20]:
# Write the second half of the rows (from the midpoint position to the end)
# to its own pickle.
second_half_start = len(data) // 2
data.iloc[second_half_start:].to_pickle('../data/all_data2.pkl')

In [1]:
# After all of the code above has finished, restart the kernel and run this cell.
# (Presumably the restart releases the memory held by the previous session
# before both halves are loaded again - confirm.)

# import pandas as pd
# from time import time
# st = time()
# data1 = pd.read_pickle('../data/all_data1.pkl')
# data2 = pd.read_pickle('../data/all_data2.pkl')
# ed = time()
# print(f'read {ed - st:.2f}s')

# data = pd.concat([data1, data2], axis=0)
# del data1, data2
# data.to_pickle('../data/all_data.pkl')
# print('write finished')

read 104.87s
