# 导入包和数据

In [1]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns

# Use the ggplot look for all figures.
plt.style.use('ggplot')
# Render figures inline in the notebook output.
%matplotlib inline
# %matplotlib widget
plt.rcParams['font.sans-serif'] = 'SimHei'  # font with CJK glyphs so Chinese labels render
plt.rcParams['axes.unicode_minus'] = False  # draw minus signs as ASCII hyphens (the CJK font may lack U+2212)
# Emit inline figures as SVG.
%config InlineBackend.figure_format = 'svg'

In [2]:
# def reduce_mem_usage(df):
#     """
#     function：来自kaggle，遍历数据框的所有列并修改数据类型 减少内存使用量。
#     Parameters：
#         df：数据集 DataFrame
#     """
#     start_mem = df.memory_usage().sum() / 1024**2
#     print('数据集原始大小是 {:.2f} MB'.format(start_mem))

#     for col in df.columns:
#         col_type = df[col].dtype

#         if col_type != object:
#             c_min = df[col].min()
#             c_max = df[col].max()
#             if str(col_type)[:3] == 'int':
#                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
#                     df[col] = df[col].astype(np.int8)
#                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
#                     df[col] = df[col].astype(np.int16)
#                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
#                     df[col] = df[col].astype(np.int32)
#                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
#                     df[col] = df[col].astype(np.int64)
#             else:
#                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
#                     df[col] = df[col].astype(np.float16)
#                 elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
#                     df[col] = df[col].astype(np.float32)
#                 else:
#                     df[col] = df[col].astype(np.float64)
#         else:
#             df[col] = df[col].astype('category')

#     end_mem = df.memory_usage().sum() / 1024**2
#     print('压缩后的数据集大小是 {:.2f} MB'.format(end_mem))
#     print('降低了{:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

#     return df

In [3]:
# user_action = pd.read_csv('dataset/jdata_action.csv')
# # user_action = reduce_mem_usage(user_action)

In [4]:
# # 数据集太大，本次仅截取部分并保存
# user_data = user_action[(user_action['action_time'] > '2018-03-30')
#                         & (user_action['action_time'] < '2018-04-15')]
# user_data.to_csv('dataset/my_user_data.csv', sep=',')

# 浏览数据

In [5]:
# Load the sampled action log and preview the first rows.
# `usecols` makes the parser skip every column the analysis does not use,
# instead of loading the whole file and discarding columns afterwards.
# The trailing `[used_columns]` pins the column order, because `usecols`
# keeps the order found in the file rather than the order of the list.
used_columns = ['user_id', 'sku_id', 'action_time', 'module_id', 'type']
data = pd.read_csv('dataset/my_user_data.csv', usecols=used_columns)[used_columns]
data.head(10)

Unnamed: 0,user_id,sku_id,action_time,module_id,type
0,1455298,208441,2018-04-11 15:21:43,6190659,1
1,1455298,334318,2018-04-11 15:14:54,6190659,1
2,1455298,237755,2018-04-11 15:14:13,6190659,1
3,1455298,6422,2018-04-11 15:22:25,6190659,1
4,1455298,268566,2018-04-11 15:14:26,6190659,1
5,1455298,115915,2018-04-11 15:13:35,6190659,1
6,1455298,208254,2018-04-11 15:22:16,6190659,1
7,1455298,177209,2018-04-14 14:09:59,6628254,1
8,1455298,71793,2018-04-14 14:10:29,6628254,1
9,1455298,141950,2018-04-12 15:37:53,10207258,1


In [6]:
# Show dtype and non-null count for every column.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2
# and removed in pandas 2.0 (where passing it raises TypeError).
data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7540394 entries, 0 to 7540393
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   user_id      7540394 non-null  int64 
 1   sku_id       7540394 non-null  int64 
 2   action_time  7540394 non-null  object
 3   module_id    7540394 non-null  int64 
 4   type         7540394 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 287.6+ MB


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
# NOTE(review): user_id, sku_id and module_id look like identifiers, so their
# mean/std are probably not meaningful; `type` is the column worth reading here.
data.describe()

Unnamed: 0,user_id,sku_id,module_id,type
count,7540394.0,7540394.0,7540394.0,7540394.0
mean,803372.5,189089.4,6202712.0,1.452293
std,465459.8,108590.8,3435805.0,1.132476
min,2.0,1.0,1.0,1.0
25%,398774.0,95885.0,3188662.0,1.0
50%,802801.0,188152.0,6381248.0,1.0
75%,1208782.0,282974.0,9574079.0,1.0
max,1608707.0,378457.0,11363610.0,5.0


# 数据预处理

In [8]:
# Derive calendar features from the action timestamp.
# Parse the string column once and reuse the result: the original parsed the
# same column three times with pd.to_datetime.
action_ts = pd.to_datetime(data['action_time'])
data['date'] = action_ts.dt.date            # datetime.date objects (object dtype)
data['hour'] = action_ts.dt.hour            # hour of day, 0-23
data['weekday'] = action_ts.dt.day_name()   # English day name, e.g. 'Wednesday'

In [9]:
# Drop the column that the rest of the analysis does not use.
# errors='ignore' keeps the cell idempotent: `data` is rebound here, so
# re-running the cell would otherwise raise KeyError once the column is gone.
data = data.drop(columns='module_id', errors='ignore')

## 对大数据进行压缩处理

In [10]:
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as a string.

    Parameters
    ----------
    pandas_obj : pd.DataFrame or any object exposing ``memory_usage(deep=True)``
        For a DataFrame the per-column figures are summed; for anything else
        the single value returned by ``memory_usage`` is used as-is.

    Returns
    -------
    str
        Size in mebibytes formatted with two decimals, e.g. ``'1.00 MB'``.
    """
    footprint = pandas_obj.memory_usage(deep=True)
    # DataFrame.memory_usage yields one entry per column (plus the index),
    # so it has to be collapsed into a single byte count.
    total_bytes = footprint.sum() if isinstance(pandas_obj, pd.DataFrame) else footprint
    return '{:03.2f} MB'.format(total_bytes / 1024**2)

In [11]:
# Downcast every integer column to the smallest unsigned type that holds its values.
# Select on np.integer rather than the literal 'int64': from pandas 2.0 the
# `.dt.hour` accessor returns int32, so an int64-only filter would silently
# leave `hour` out of `data_newint` (and therefore out of `behavior`).
data_int = data.select_dtypes(include=[np.integer])
data_newint = data_int.apply(pd.to_numeric, downcast='unsigned')
# Deep memory footprint before and after the downcast.
print(mem_usage(data_int))
print(mem_usage(data_newint))

230.11 MB
71.91 MB


In [12]:
# Count distinct values in each object-dtype column, to see which columns are
# low-cardinality enough to benefit from a categorical encoding.
data_obj = data.select_dtypes(include=['object'])
data_obj.nunique()

action_time    1669414
date                16
weekday              7
dtype: int64

In [13]:
# Object columns with few distinct values shrink considerably when stored as
# `category`; print the deep memory footprint before and after the conversion.
data_obj = data.select_dtypes(include=['object'])
data_newobj = data_obj.astype('category')
for frame in (data_obj, data_newobj):
    print(mem_usage(frame))

1296.34 MB
244.94 MB


In [18]:
# Stitch the downcast integer columns and the categorical columns back
# together side by side, then report the size of the compressed frame.
compressed_parts = [data_newint, data_newobj]
behavior = pd.concat(compressed_parts, axis='columns')
print(mem_usage(behavior))

316.85 MB


In [19]:
# Replace the numeric action codes with readable labels.
behavior_type = {1: 'pv', 2: 'pay', 3: 'fav', 4: 'comm', 5: 'cart'}

# Series.map with a dict is vectorized, unlike a Python lambda called once per row.
type_labels = behavior['type'].map(behavior_type)

# map() yields NaN for keys missing from the dict. Keep the original's loud
# failure on unknown codes, which also triggers if the cell is re-run after
# the column already holds labels, instead of silently writing NaN.
unmapped = behavior.loc[type_labels.isna(), 'type'].unique()
if len(unmapped):
    raise KeyError('unmapped action type codes: {}'.format(list(unmapped)))

behavior['type'] = type_labels
behavior.head(10)

Unnamed: 0,user_id,sku_id,type,hour,action_time,date,weekday
0,1455298,208441,pv,15,2018-04-11 15:21:43,2018-04-11,Wednesday
1,1455298,334318,pv,15,2018-04-11 15:14:54,2018-04-11,Wednesday
2,1455298,237755,pv,15,2018-04-11 15:14:13,2018-04-11,Wednesday
3,1455298,6422,pv,15,2018-04-11 15:22:25,2018-04-11,Wednesday
4,1455298,268566,pv,15,2018-04-11 15:14:26,2018-04-11,Wednesday
5,1455298,115915,pv,15,2018-04-11 15:13:35,2018-04-11,Wednesday
6,1455298,208254,pv,15,2018-04-11 15:22:16,2018-04-11,Wednesday
7,1455298,177209,pv,14,2018-04-14 14:09:59,2018-04-14,Saturday
8,1455298,71793,pv,14,2018-04-14 14:10:29,2018-04-14,Saturday
9,1455298,141950,pv,15,2018-04-12 15:37:53,2018-04-12,Thursday


# 构建分析指标

## 流量分析指标

In [16]:
# 总访问量
# pv = behavior_new[behavior_new['type'] == 'pv']
# 总访客数

# 消费用户数

# 日均访问量

# 人均访问量

# 消费用户访问量

# 消费用户占比

# 消费用户访问量占比

# 消费用户人均访问量

In [17]:
# Compare the deep memory footprint of `action_time` stored as `category`
# (printed first) against the original object-dtype column (printed second).
action_time_obj = data['action_time']
temp = action_time_obj.astype('category')
for series in (temp, action_time_obj):
    print(mem_usage(series))

230.56 MB
547.62 MB
