In [18]:
import pandas as pd
import numpy as np 
import plotly.express as ex 
import plotly.graph_objects as go

# 数据清洗&预处理

In [19]:
#加载三张数据集
user_df = pd.read_csv('databases/user—test.csv',encoding='gb18030')
order_df = pd.read_csv('databases/Order-test.csv')
consult_df = pd.read_csv('databases/consult-test.csv')

## 处理 Consult 表格

In [20]:
print(consult_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 662158 entries, 0 to 662157
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   consult_no          662158 non-null  int64  
 1   consult_start_time  662158 non-null  object 
 2   consult_end_time    662158 non-null  object 
 3   user_id             662158 non-null  int64  
 4   consult_round       0 non-null       float64
 5   like_level          94404 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 30.3+ MB
None


In [21]:
# 删除 consult_round 这个variable 全是None 所以可以直接去除
del consult_df['consult_round']

In [35]:
consult_df['consult_start_time'] = pd.to_datetime(consult_df['consult_start_time'].tolist(),errors='coerce')
consult_df['consult_end_time'] = pd.to_datetime(consult_df['consult_end_time'].tolist(),errors='coerce')
consult_df = consult_df[consult_df['consult_start_time'] != consult_df['consult_end_time']] 

In [41]:
consult_df.loc[:,'duration'] = (consult_df['consult_end_time'] - consult_df['consult_start_time']).dt.seconds/60
consult_df.loc[:, 'duration'] = consult_df['duration'].round()
consult_df.head(5)

Unnamed: 0,consult_no,consult_start_time,consult_end_time,user_id,like_level,duration
0,22002906,2023-01-03 00:47:06+00:00,2023-01-03 00:58:20+00:00,570051393580120,,11.0
1,22698650,2023-01-25 02:38:35+00:00,2023-01-25 02:58:37+00:00,570051393580120,,20.0
2,22004214,2023-01-03 01:19:09+00:00,2023-01-03 01:34:09+00:00,570051393580120,,15.0
3,22698026,2023-01-25 02:06:16+00:00,2023-01-25 02:21:26+00:00,570051393580120,,15.0
4,22629434,2023-01-20 02:46:12+00:00,2023-01-20 02:46:47+00:00,570051393580120,,1.0


## 处理 User 表格

In [23]:
print(user_df.info(),"\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 870460 entries, 0 to 870459
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         870460 non-null  int64  
 1   sex             62753 non-null   object 
 2   age             61970 non-null   float64
 3   country         866918 non-null  object 
 4   province_name   866918 non-null  object 
 5   city_name       866918 non-null  object 
 6   city_level      863913 non-null  object 
 7   is_member_flag  870460 non-null  int64  
 8   first_day       811277 non-null  object 
 9   last_day        870460 non-null  object 
 10  be_member_time  596011 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 73.1+ MB
None 



In [24]:
# 这里 sex & age Missing value 是80w+, 这个数据集总共才87W, 缺失值总共约占数据集92%
# 直接去掉这两行
user_df = user_df.drop(['sex','age'],axis=1)

In [25]:
# 查看province_name 为缺失值有3542个，由于省名缺失，对后续分析有影响。
# 所以,直接将这三千多行数据去掉，约占总数据集的0.4%，影响不大
# user_df = user_df.dropna(subset=['province_name'])
# columnlist = user_df.columns.values
print(user_df.isna().sum())
print(user_df.shape)

user_id                0
country             3542
province_name       3542
city_name           3542
city_level          6547
is_member_flag         0
first_day          59183
last_day               0
be_member_time    274449
dtype: int64
(870460, 9)


In [26]:
#缺失city_level这些地方属于一些特殊的省直管县级市，按照这种方法，使用‘其他城市’去替换Nan
user_df['city_level'] = user_df['city_level'].fillna("其他城市")

In [27]:
#关于填补first_day 的策略，我分下面两种情况进行填补
# 1. 如果开通会员的话，be_member_time就是他的first_time
# 2. 如果没有开通会员的话，first_time 就是它的last_day, 把这类临时用户
user_df['first_day'] = user_df['first_day'].fillna(user_df['be_member_time'])
user_df.reset_index(drop=True)

Unnamed: 0,user_id,country,province_name,city_name,city_level,is_member_flag,first_day,last_day,be_member_time
0,570009353210116,中国,山东,潍坊,三线城市,1,2023/4/3,2023/8/2,2023/4/3 19:42
1,570020406530107,中国,贵州,六盘水,四线城市,1,2023/3/9,2023/3/9,2023/3/9 19:52
2,570020458000111,中国,广东,广州,一线城市,1,2023/6/25,2023/6/29,2023/6/29 12:38
3,570020914800114,中国,山东,青岛,新一线城市,1,2023/6/25,2023/7/2,2022/4/20 10:22
4,570021004330160,中国,江西,上饶,三线城市,1,2023/5/11,2023/7/9,2022/4/22 10:17
...,...,...,...,...,...,...,...,...,...
870455,570072597650156,中国,重庆,重庆,新一线城市,0,2023/6/30,2023/6/30,
870456,570072523900118,中国,山东,潍坊,三线城市,0,2023/6/30,2023/8/3,
870457,570072538870133,中国,黑龙江,大庆,三线城市,1,2023/6/30,2023/6/30,2023/6/30 19:08
870458,570072510060139,中国,陕西,西安,新一线城市,1,2023/6/30,2023/6/30,2023/6/30 21:03


In [28]:
#检测是否有重复row
user_df.duplicated().sum()

np.int64(0)

In [29]:
user_df['first_day'] = pd.to_datetime(user_df['first_day'],errors='coerce')
user_df['last_day'] = pd.to_datetime(user_df['last_day'],errors='coerce')
user_df['be_member_time'] = pd.to_datetime(user_df['be_member_time'],errors='coerce')

condition1 = user_df['first_day'] > user_df['be_member_time']
user_df = user_df[~condition1]
condition2 = user_df['first_day'] > user_df['last_day']
user_df = user_df[~condition2]
condition3 = user_df['last_day'] < user_df['be_member_time']
user_df = user_df[~condition3]

In [30]:
user_df['renew_time'] = (user_df['be_member_time'] + pd.DateOffset(days=365))
user_df['is_renew'] = (user_df['renew_time'] <= user_df['last_day']).astype(int)
user_df.head(5)

Unnamed: 0,user_id,country,province_name,city_name,city_level,is_member_flag,first_day,last_day,be_member_time,renew_time,is_renew
0,570009353210116,中国,山东,潍坊,三线城市,1,2023-04-03,2023-08-02,2023-04-03 19:42:00,2024-04-02 19:42:00,0
9,570023426180132,中国,北京,北京,一线城市,0,2023-03-04,2023-03-09,NaT,NaT,0
12,570025239650135,中国,上海,上海,一线城市,1,2023-03-08,2023-08-17,2023-07-15 18:25:00,2024-07-14 18:25:00,0
17,570026463920104,中国,黑龙江,哈尔滨,二线城市,1,NaT,2023-07-18,2022-08-16 14:37:00,2023-08-16 14:37:00,0
18,570027668630107,中国,四川,成都,新一线城市,0,2023-03-25,2023-03-25,NaT,NaT,0


In [31]:
user_df['mbr_days'] = (user_df['be_member_time'] - user_df['first_day']).dt.days
user_df['mbr_lose_days'] = (user_df['last_day'] - user_df['be_member_time']).dt.days

In [32]:
user_df.to_csv('databases/clean_user_df.csv')