## 对train_data.txt的EDA和预处理

In [35]:
import gc
from pathlib import Path
import pandas as pd
import numpy as np
from time import time
import matplotlib.pyplot as plt
# Notebook-wide pandas display settings: keep wide frames on one line, render
# floats with two decimals, show every column, and raise the row / cell-width
# limits.
def _two_decimals(value):
    """Format a number with exactly two digits after the decimal point."""
    return '%.2f' % value


for _option_name, _option_value in (
    ('expand_frame_repr', False),
    ('display.float_format', _two_decimals),
    ('display.max_columns', None),
    ('display.max_rows', 1000),
    ('max_colwidth', 200),
):
    pd.set_option(_option_name, _option_value)

In [3]:
# Columns: user id, article id, impression time, network environment, refresh
# count, display position, clicked flag, consumption duration (seconds).
# Load the raw interaction log from train_data.txt and time the read.
ui_path = '~/data/train_data.txt'
st = time()
ui_df = pd.read_csv(
    ui_path,
    header=None,  # no header row is parsed; column names come from `names`
    sep='\t',
    names=['userId', 'itemId', 'showTime', 'network', 'refresh', 'showPos', 'click', 'duration'],
    # Every column except the timestamp is read as a pandas categorical;
    # the analysis cells cast the numeric ones with .astype('int64') on demand.
    dtype={'userId': 'category', 'itemId': 'category', 'showTime': 'int64', 'network': 'category',
           'refresh': 'category', 'showPos': 'category', 'click': 'category', 'duration': 'category'},
)
ed = time()
print(f'read csv: {ed - st: .2f}s')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
ui_df.info()
print(ui_df.isnull().sum())

read csv:  292.59s
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189766959 entries, 0 to 189766958
Data columns (total 8 columns):
 #   Column    Dtype   
---  ------    -----   
 0   userId    category
 1   itemId    category
 2   showTime  int64   
 3   network   category
 4   refresh   category
 5   showPos   category
 6   click     category
 7   duration  category
dtypes: category(7), int64(1)
memory usage: 4.3 GB
None


In [18]:
# Quick sanity check of the loaded frame: row count, first rows, and the
# number of null values per column.
print('total %d records.\n' % len(ui_df))
print(ui_df.head())
print('\nnull values:', ui_df.isnull().sum(), sep='\n')

total 189766959 records.

       userId     itemId       showTime network refresh showPos click duration
0  1000014754  463510256  1624843756147       5       0      16     0        0
1  1000014754  463852707  1624843756147       5       0      13     1       80
2  1000014754  464757134  1625052999841       5       0      13     1     1050
3  1000014754  464617167  1625052999841       5       0      16     1      286
4  1000014754  465426190  1625382421168       5       0       5     0        0

null values:
userId      0
itemId      0
showTime    0
network     0
refresh     0
showPos     0
click       0
duration    0
dtype: int64


In [19]:
# "The training data is taken from 12 days of users' historical behaviour logs."
# showTime is interpreted as epoch milliseconds (unit='ms'), converted to
# pandas timestamps, and written back into ui_df.
# NOTE(review): the converted values are timezone-naive; presumably the epoch
# is UTC while the logs were produced in UTC+8 - confirm before deriving
# hour-of-day or day-boundary features.
showTime = pd.to_datetime(ui_df['showTime'], unit='ms')
ui_df['showTime'] = showTime
print(f'earliest time is {showTime.min()}\nlatest time is {showTime.max()}.\n')
try:
    # pandas 1.x needs this flag to summarise datetimes numerically
    # (mean / min / quantiles / max).
    show_time_summary = showTime.describe(datetime_is_numeric=True)
except TypeError:
    # pandas >= 2.0 removed the keyword and always summarises datetimes
    # numerically, so the plain call yields the same kind of summary.
    show_time_summary = showTime.describe()
print(show_time_summary)
print('\nthe distribution of records is basically average in time.')

earliest time is 2021-06-24 14:56:53.949000
latest time is 2021-07-06 15:59:59.972000.

count                        189766959
mean     2021-06-30 16:07:51.462944768
min         2021-06-24 14:56:53.949000
25%      2021-06-27 14:10:33.423000064
50%         2021-06-30 13:17:46.568000
75%      2021-07-03 15:31:24.436999936
max         2021-07-06 15:59:59.972000
Name: showTime, dtype: object

the distribution of records is basically average in time.


In [20]:
# Per-user activity: number of distinct users, the id range, and how many
# records each user contributes.
userId = ui_df['userId'].astype('int64')
print('total {} users.'.format(userId.nunique()))
print('max Id is {}, min Id is {}.'.format(userId.max(), userId.min()))

counts = userId.value_counts()
print('\nthe number of records of each user:', counts, sep='\n')

print('\nand distribution:', counts.describe(), sep='\n')

print('users with few records account for not little ratio.')

total 1478694 users.
max Id is 2447273874, min Id is 17340.

the number of records of each user:
1170017440    27202
2445143950    20515
1159427472    18189
1767331530    15355
2445841240    14015
              ...  
1715540286        1
1715351880        1
1715303324        1
1715182266        1
1430069648        1
Name: userId, Length: 1478694, dtype: int64

and distribution:
count   1478694.00
mean        128.33
std         335.12
min           1.00
25%           2.00
50%          13.00
75%          86.00
max       27202.00
Name: userId, dtype: float64
users with few records account for not little ratio.


In [21]:
# Per-item exposure: number of distinct items, the id range, and how many
# records each item has.
itemId = ui_df['itemId'].astype('int64')

# value_counts() runs over every row (clicked or not), so these are record
# counts per item, not click counts. It is computed once and reused: a second
# pass over the whole column is wasted work. The `clicks` name is kept only so
# that anything relying on it keeps working.
clicks = itemId.value_counts()

# Each distinct id appears exactly once in the value_counts index, so its
# length is the number of distinct items.
print(f'total {clicks.size} items.')
print(f'max Id is {itemId.max()}, min Id is {itemId.min()}.')

print('\nrecord counts for each item:')
print(clicks)

print('\ncounts distribution:')
print(clicks.describe())

print('\nsimilarly with users records, items with few records also account for not little ratio.')
print('even more concentrated than users records.')

total 590949 items.
max Id is 466910675, min Id is 325279629.

clicks counts for each item:
465497699    204351
463963285    197206
463154982    188750
465266173    158444
465797210    156024
              ...  
465270165         1
463560625         1
462177360         1
464531418         1
464700121         1
Name: itemId, Length: 590949, dtype: int64

counts distribution:
count   590949.00
mean       321.12
std       2713.34
min          1.00
25%          2.00
50%          9.00
75%         52.00
max     204351.00
Name: itemId, dtype: float64

 similarly with users records, items with few clicks account for not little ratio.
even more concentrated than users records.


In [22]:
# Network environment codes: 0 = unknown, 1 = offline, 2 = WiFi, 3 = 2G,
# 4 = 3G, 5 = 4G.
network = ui_df['network']
print('network status distribution:', network.value_counts(), sep='\n')

print('\nmost users are "wifi-2", then "4g-5", no 0 or 1')

network status distribution:
2    121109653
5     67834372
4       636315
3       186619
Name: network, dtype: int64

most users are "wifi-2", then "4g-5", no 0 or 1


In [24]:
# Refresh count: how many times the user has refreshed the recommendation page
# since opening the app; it resets to zero when the app is closed.
print('refresh distribution:')
refresh = ui_df['refresh'].astype('int64')
counts = refresh.value_counts()
print(counts, '\nand refresh counts distribution:', refresh.describe(), sep='\n')

refresh distribution:
1      25081571
2      21967996
0      18751054
3      16297941
4      13208328
         ...   
647           3
650           3
645           3
646           3
649           3
Name: refresh, Length: 651, dtype: int64

and refresh counts distribution:
count   189766959.00
mean            8.92
std            15.02
min             0.00
25%             2.00
50%             4.00
75%            10.00
max           650.00
Name: refresh, dtype: float64


In [25]:
# Display position: number of distinct values, their range, how often each
# position occurs, and summary statistics of the position values.
showPos = ui_df['showPos'].astype('int64')
print('total {} unique values.'.format(showPos.nunique()))
print('min position is {}, max position is {}.'.format(showPos.min(), showPos.max()))

counts = showPos.value_counts()
print('show position distribution:', counts, sep='\n')

print('\nposition counts distribution:', showPos.describe(), sep='\n')

total 2634 unique values.
min position is 0, max position is 2698.
show position distribution:
15      6941048
12      6208301
17      5977890
14      5769413
16      5742256
         ...   
2697          1
2386          1
2269          1
2266          1
2151          1
Name: showPos, Length: 2634, dtype: int64

position counts distribution:
count   189766959.00
mean           52.76
std            63.84
min             0.00
25%            17.00
50%            32.00
75%            64.00
max          2698.00
Name: showPos, dtype: float64


In [26]:
# How often each value of the click label occurs.
click = ui_df['click']
click_counts = click.value_counts()
print(click_counts)


0    162674966
1     27091993
Name: click, dtype: int64


In [31]:
# duration contains -1 values, and the competition website does not explain
# what -1 means. They are left untouched for now; later training does not use
# duration.
duration = ui_df['duration'].astype('int64')
print(duration.describe(), duration.value_counts(dropna=False), sep='\n')


count   189766959.00
mean           24.59
std         44957.50
min            -1.00
25%             0.00
50%             0.00
75%             0.00
max     619315213.00
Name: duration, dtype: float64
0       162663361
2          250474
1          237243
3          226274
4          220248
          ...    
3228            1
3350            1
3547            1
2949            1
3065            1
Name: duration, Length: 3256, dtype: int64


In [34]:
# Click label breakdown for the rows whose duration is -1.
ui_df.loc[duration == -1, 'click'].value_counts()

1    92496
0    11605
Name: click, dtype: int64

In [37]:
# Preview the first five rows of the frame.
ui_df.head(n=5)

Unnamed: 0,userId,itemId,showTime,network,refresh,showPos,click,duration
0,1000014754,463510256,2021-06-28 01:29:16.147,5,0,16,0,0
1,1000014754,463852707,2021-06-28 01:29:16.147,5,0,13,1,80
2,1000014754,464757134,2021-06-30 11:36:39.841,5,0,13,1,1050
3,1000014754,464617167,2021-06-30 11:36:39.841,5,0,16,1,286
4,1000014754,465426190,2021-07-04 07:07:01.168,5,0,5,0,0


In [36]:
# Persist the frame to disk as a pickle file.
pickle_path = '~/data/user_item.pkl'
ui_df.to_pickle(pickle_path)