In [None]:
import pandas as pd
import numpy as np
from pyecharts.charts import Pie,Bar,Line,Map,Map3D,Funnel
from pyecharts import options as opts
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from pyecharts.commons.utils import JsCode
from pyecharts.globals import ThemeType, ChartType
import textwrap

# Font setup: Microsoft YaHei so Chinese labels render, ASCII hyphen for minus
# signs, and a default font size of 15.
plt.rcParams.update({
    'font.sans-serif': ['Microsoft YaHei'],
    'axes.unicode_minus': False,
})
plt.rc('font', **{'family': 'Microsoft YaHei', 'size': '15'})
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

%matplotlib inline

# 一、导入数据及数据预处理

In [50]:
# Load the raw order report and preview the first 10 rows to check the import.
df = pd.read_csv(
    'tmall_order_report.csv',
)
df.head(n=10)

Unnamed: 0,订单编号,总金额,买家实际支付金额,收货地址,订单创建时间,订单付款时间,退款金额
0,1,178.8,0.0,上海,2020-02-21 00:00:00,,0.0
1,2,21.0,21.0,内蒙古自治区,2020-02-20 23:59:54,2020-02-21 00:00:02,0.0
2,3,37.0,0.0,安徽省,2020-02-20 23:59:35,,0.0
3,4,157.0,157.0,湖南省,2020-02-20 23:58:34,2020-02-20 23:58:44,0.0
4,5,64.8,0.0,江苏省,2020-02-20 23:57:04,2020-02-20 23:57:11,64.8
5,6,327.7,148.9,浙江省,2020-02-20 23:56:39,2020-02-20 23:56:53,178.8
6,7,357.0,357.0,天津,2020-02-20 23:56:36,2020-02-20 23:56:40,0.0
7,8,53.0,53.0,浙江省,2020-02-20 23:56:12,2020-02-20 23:56:16,0.0
8,9,43.0,0.0,湖南省,2020-02-20 23:54:53,2020-02-20 23:55:04,43.0
9,10,421.0,421.0,北京,2020-02-20 23:54:28,2020-02-20 23:54:33,0.0


In [52]:
# Remove leading/trailing whitespace from every column name.
df.columns = [name.strip() for name in df.columns]
# Show the frame's schema and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28010 entries, 0 to 28009
Data columns (total 7 columns):
订单编号        28010 non-null int64
总金额         28010 non-null float64
买家实际支付金额    28010 non-null float64
收货地址        28010 non-null object
订单创建时间      28010 non-null object
订单付款时间      24087 non-null object
退款金额        28010 non-null float64
dtypes: float64(3), int64(1), object(3)
memory usage: 1.5+ MB


In [9]:
# Basic description: covered time span, number of distinct delivery regions,
# and summary statistics of the numeric columns.
created_at = df['订单创建时间']
print('数据的时间区间为', created_at.min(), '到', created_at.max())
region_count = df['收货地址'].nunique()
print('收货地址总计有：', region_count, '个')
df.describe()

数据的时间区间为 2020-02-01 00:14:15 到 2020-02-29 23:59:18
收货地址总计有： 31 个


Unnamed: 0,订单编号,总金额,买家实际支付金额,退款金额
count,28010.0,28010.0,28010.0,28010.0
mean,14005.5,106.953253,67.921712,20.433271
std,8085.934856,1136.587094,151.493434,71.501963
min,1.0,1.0,0.0,0.0
25%,7003.25,38.0,0.0,0.0
50%,14005.5,75.0,45.0,0.0
75%,21007.75,119.0,101.0,0.0
max,28010.0,188320.0,16065.0,3800.0


In [10]:
# Derive calendar features from the payment timestamp for the later analysis.
# pd.to_datetime is used instead of astype('datetime64'): the unit-less cast is
# rejected by pandas >= 2.0, while to_datetime works on old and new versions.
df['订单创建时间'] = pd.to_datetime(df['订单创建时间'])
df['订单付款时间'] = pd.to_datetime(df['订单付款时间'])
# On the full frame these stay float because unpaid orders have NaT payment times.
df['月'] = df['订单付款时间'].dt.month
df['日'] = df['订单付款时间'].dt.day

# Paid orders only. The explicit copy makes df2 independent of df, so the column
# assignments below neither raise SettingWithCopyWarning nor depend on view semantics.
df2 = df[df['订单付款时间'].notnull()].copy()
paid_at = df2['订单付款时间']
df2['月'] = paid_at.dt.month.astype('int')
df2['日'] = paid_at.dt.day.astype('int')
df2['日期'] = df2['月'].astype('str') + '月' + df2['日'].astype('str') + '日'
# weekday is 0-based (Monday = 0); shift to 1..7 before building the label.
df2['周'] = '星期' + (paid_at.dt.weekday + 1).astype('str')
df2 = df2.sort_values(by = '订单付款时间')
df2['小时'] = df2['订单付款时间'].dt.hour
df2.head()

Unnamed: 0,订单编号,总金额,买家实际支付金额,收货地址,订单创建时间,订单付款时间,退款金额,月,日,日期,周,小时
6063,6064,38.0,0.0,四川省,2020-02-01 00:14:15,2020-02-01 00:14:20,38.0,2,1,2月1日,星期6,0
6062,6063,38.0,38.0,江苏省,2020-02-01 00:17:47,2020-02-01 00:17:58,0.0,2,1,2月1日,星期6,0
6061,6062,76.0,0.0,湖北省,2020-02-01 00:33:01,2020-02-01 00:33:08,76.0,2,1,2月1日,星期6,0
6060,6061,38.0,38.0,贵州省,2020-02-01 00:50:18,2020-02-01 00:50:25,0.0,2,1,2月1日,星期6,0
6059,6060,38.0,0.0,陕西省,2020-02-01 00:54:18,2020-02-01 00:54:23,38.0,2,1,2月1日,星期6,0


In [11]:
# Inspect the distinct delivery-address values.
df2['收货地址'].unique()

array(['四川省', '江苏省', '湖北省', '贵州省', '陕西省', '上海', '重庆', '浙江省', '湖南省', '河北省',
       '北京', '广东省', '新疆维吾尔自治区', '河南省', '吉林省', '黑龙江省', '云南省', '安徽省', '天津',
       '山西省', '辽宁省', '江西省', '内蒙古自治区', '福建省', '广西壮族自治区', '海南省', '山东省',
       '青海省', '甘肃省', '宁夏回族自治区', '西藏自治区'], dtype=object)

In [12]:
# Shorten province names. str.strip('省|自治区') would treat its argument as a
# set of characters to remove from BOTH ends, so an anchored regex is used to
# drop only a trailing '省' or '自治区'.
df2['收货地址'] = df2['收货地址'].str.replace(r'(省|自治区)$', '', regex=True)
# Collapse the remaining ethnic-autonomous-region names to their short form.
df2['收货地址'] = df2['收货地址'].replace(
    {'新疆维吾尔': '新疆', '广西壮族': '广西', '宁夏回族': '宁夏'}
)
df2['收货地址'].unique()

array(['四川', '江苏', '湖北', '贵州', '陕西', '上海', '重庆', '浙江', '湖南', '河北', '北京',
       '广东', '新疆', '河南', '吉林', '黑龙江', '云南', '安徽', '天津', '山西', '辽宁', '江西',
       '内蒙古', '福建', '广西', '海南', '山东', '青海', '甘肃', '宁夏', '西藏'],
      dtype=object)

In [13]:
# Look at rows whose payment time is missing.
unpaid_mask = df['订单付款时间'].isna()
df.loc[unpaid_mask].head()

Unnamed: 0,订单编号,总金额,买家实际支付金额,收货地址,订单创建时间,订单付款时间,退款金额,月,日
0,1,178.8,0.0,上海,2020-02-21 00:00:00,NaT,0.0,,
2,3,37.0,0.0,安徽省,2020-02-20 23:59:35,NaT,0.0,,
13,14,34.9,0.0,天津,2020-02-20 23:53:44,NaT,0.0,,
14,15,96.8,0.0,贵州省,2020-02-20 23:51:37,NaT,0.0,,
37,38,37.0,0.0,广东省,2020-02-20 23:43:56,NaT,0.0,,


In [14]:
# Sanity checks. The refund-vs-total filter used to be a bare expression in the
# middle of the cell, so its result was never shown; report its row count instead.
invalid_refunds = df[df['退款金额'] > df['总金额']]
print('退款金额大于总金额的订单数量为：', len(invalid_refunds))
# Count fully duplicated rows.
print('重复值数量为：',df.duplicated().sum())

重复值数量为： 0


In [54]:
def kde_plot_array(df):
    """
    Draw a grid of kernel-density plots, one per column of ``df``.

    df: DataFrame whose columns are plotted; two plots per grid row.
    Returns the result of ``plt.show()``.
    """
    columns = list(df.columns)
    # Integer ceiling division: plt.subplot needs an int row count, and
    # round(n / 2, 0) both yields a float and rounds halves to even
    # (5 columns -> 2 rows, which is one slot short).
    n_rows = (len(columns) + 1) // 2
    plt.figure(figsize = (24,16))
    for position, column in enumerate(columns, start = 1):
        plt.subplot(n_rows, 2, position)
        # NOTE(review): seaborn deprecates `shade` in favour of `fill` from 0.11;
        # kept as-is so older seaborn versions still work -- confirm target version.
        sns.kdeplot(df[column],
                   shade = True,label = column,alpha = 0.7)
        plt.legend()
        plt.title('{}分布图'.format(column))
    return plt.show()

# Filter out extreme orders before plotting the densities. The two bare
# expressions that used to sit here (describe() and a > 5000 filter) were never
# displayed because they were not the last statement, so they are dropped.
amount_cap = 500   # keep orders with total amount below this value
refund_cap = 400   # keep orders with refund amount below this value
typical_orders = (df['总金额'] < amount_cap) & (df['退款金额'] < refund_cap)
plot_df = df.loc[typical_orders, ['总金额','买家实际支付金额','退款金额']]
kde_plot_array(plot_df)

In [17]:
# Orders whose total amount exceeds 3000.
df.loc[df['总金额'] > 3000]

Unnamed: 0,订单编号,总金额,买家实际支付金额,收货地址,订单创建时间,订单付款时间,退款金额,月,日
3143,3144,11400.0,11400.0,江苏省,2020-02-18 09:34:43,2020-02-18 09:34:53,0.0,2.0,18.0
3841,3842,3800.0,0.0,广东省,2020-02-09 23:50:33,2020-02-10 00:52:40,3800.0,2.0,10.0
5311,5312,3800.0,3800.0,上海,2020-02-04 10:41:53,2020-02-04 10:48:51,0.0,2.0,4.0
5764,5765,3800.0,0.0,河南省,2020-02-02 16:52:17,2020-02-02 16:52:22,3800.0,2.0,2.0
13511,13512,16065.0,16065.0,内蒙古自治区,2020-02-26 15:41:27,2020-02-26 15:42:24,0.0,2.0,26.0
19257,19258,188320.0,0.0,上海,2020-02-24 19:35:06,NaT,0.0,,
19550,19551,4000.0,4000.0,江苏省,2020-02-24 17:20:40,2020-02-24 17:20:47,0.0,2.0,24.0
22031,22032,4800.0,4800.0,重庆,2020-02-29 11:08:38,2020-02-29 11:08:55,0.0,2.0,29.0
22060,22061,4800.0,0.0,重庆,2020-02-29 10:57:33,NaT,0.0,,
27737,27738,3200.0,0.0,上海,2020-02-27 08:32:00,NaT,0.0,,


## 1. 成交金额

### a. 成交金额在时间维度上的变化

In [18]:
# Total amount actually paid per day of month, ordered by day.
change = (
    df2.groupby('日')[['买家实际支付金额']]
    .sum()
    .round(2)
    .reset_index()
    .sort_values(by = '日')
)

In [19]:

def echarts_line(x,y,title = '主标题',subtitle = '副标题',label = '图例'):
    """
    Render a smoothed, area-filled pyecharts line chart in the notebook.

    x: x-axis category labels (list, pandas Series or numpy array)
    y: y-axis values, one per label (list, pandas Series or numpy array)
    title: main title
    subtitle: subtitle
    label: series name shown in the legend

    Returns the value of ``Line.render_notebook()``.
    """
    def _as_list(values):
        # Series / ndarray expose tolist(), which yields built-in Python scalars;
        # pyecharts expects native types rather than pandas/numpy objects.
        return values.tolist() if hasattr(values, 'tolist') else list(values)

    # Labels are stringified: on a category axis ECharts reads a numeric x value
    # as the ordinal index of the category, so integer labels such as days 1..29
    # would be drawn one position off.
    x_labels = [str(value) for value in _as_list(x)]
    y_values = _as_list(y)

    line = Line(
        init_opts=opts.InitOpts(
            bg_color='#080b30',  # background colour
            theme='dark'         # chart theme
        )
    )
    line.add_xaxis(x_labels)
    line.add_yaxis(
        label,
        y_values,
        is_symbol_show=False,  # hide the per-point symbols
        is_smooth=True,        # smooth the curve
        label_opts=opts.LabelOpts(
            is_show=False,     # hide the value labels
        ),
        itemstyle_opts=opts.ItemStyleOpts(color='#00ca95'),  # series colour
        # Line colour, width and drop shadow
        linestyle_opts={
            "normal": {
                "color": "#4ADEDE",  # line colour
                "shadowColor": 'rgba(0, 0, 0, .3)',  # shadow colour and opacity
                "shadowBlur": 2,     # shadow blur size
                "shadowOffsetY": 5,  # shadow y offset
                "shadowOffsetX": 5,  # shadow x offset
                "width": 6   # line width
            },
        },
        # Gradient fill under the line
        areastyle_opts={
            "normal": {
                "color": JsCode("""new echarts.graphic.LinearGradient(0, 0, 0, 1, [{
                                offset: 0,
                                color: '#7BD5F5'
                            },
                            {
                                offset: 1,
                                color: 'rgba(0,202,149, 0)'
                            }
                        ], false)"""),  # vertical gradient for the filled area
                "shadowColor": 'rgba(0,202,149, 0.9)',  # fill shadow colour
                "shadowBlur": 20  # fill shadow size
            }
        },
    )
    line.set_global_opts(
        # Title
        title_opts=opts.TitleOpts(
            title=title,
            subtitle=subtitle,
            pos_left='center',  # centre the title
            title_textstyle_opts=dict(color='#fff')  # title font colour
        ),
        # Legend
        legend_opts=opts.LegendOpts(
            is_show=True,
            pos_left='right',    # legend position
            pos_top='3%',        # distance from the top
            orient='horizontal'  # horizontal layout
        ),
    )
    return line.render_notebook()


In [20]:
# Pass plain Python lists (as the other chart calls do) and string day labels:
# numeric labels on a category axis are read by ECharts as ordinal positions.
echarts_line(
    change['日'].astype(str).tolist(),
    change['买家实际支付金额'].tolist(),
    title = '成交金额变化图',
    subtitle = "成交金额在时间维度上的变化",
    label = '成交金额',
)

In [21]:
# Total amount actually paid per weekday label.
week_change = (
    df2.groupby('周')[['买家实际支付金额']]
    .sum()
    .round(2)
    .reset_index()
)

In [22]:

def echarts_bar(x,y,title = '主标题',subtitle = '副标题',label = '图例'):
    """
    Render a gradient bar chart with min / max / average mark points.

    x: x-axis category labels (list, pandas Series or numpy array)
    y: y-axis values, one per label (list, pandas Series or numpy array)
    title: main title
    subtitle: subtitle
    label: series name shown in the legend

    Returns the value of ``Bar.render_notebook()``.
    """
    def _as_list(values):
        # Series / ndarray expose tolist(), which yields built-in Python scalars;
        # pyecharts expects native types rather than pandas/numpy objects.
        return values.tolist() if hasattr(values, 'tolist') else list(values)

    bar = Bar(
        init_opts=opts.InitOpts(
            bg_color='#080b30',  # background colour
            theme='dark'         # chart theme
        )
    )
    bar.add_xaxis(_as_list(x))
    # No label_opts here: set_series_opts below sets the series label options
    # (is_show=False), which replaced the is_show=True previously passed here.
    bar.add_yaxis(
        label,
        _as_list(y),
        category_gap="50%",  # gap between bars, i.e. bar width
    )
    bar.set_series_opts(  # custom series styling
        label_opts=opts.LabelOpts(is_show=False),  # hide value labels
        markpoint_opts=opts.MarkPointOpts(
            data=[
                opts.MarkPointItem(type_="min", name="最小值"),      # minimum marker
                opts.MarkPointItem(type_="max", name="最大值"),      # maximum marker
                opts.MarkPointItem(type_="average", name="平均值")   # average marker
            ]
        ),
        itemstyle_opts={
            "normal": {
                "color": JsCode(
                    """new echarts.graphic.LinearGradient(0, 0, 0, 1, [{
                        offset: 0,color: 'rgba(0, 244, 255, 1)'}
                        ,{offset: 1,color: 'rgba(0, 77, 167, 1)'}], false)
                    """
                ),       # vertical colour gradient for the bars
                "barBorderRadius": [100, 100, 100, 100],  # rounded bar corners
                "shadowColor": "rgb(0, 160, 221)",  # shadow colour
            }
        }
    )
    bar.set_global_opts(
        # Title
        title_opts=opts.TitleOpts(
            title=title,
            subtitle=subtitle,
            pos_left='center',  # centre the title
            title_textstyle_opts=dict(color='#fff')  # title font colour
        ),
        # Legend
        legend_opts=opts.LegendOpts(
            is_show=True,
            pos_left='right',    # legend position
            pos_top='3%',        # distance from the top
            orient='horizontal'  # horizontal layout
        ),
    )
    return bar.render_notebook()

In [23]:
# Bar chart of paid amount per weekday.
# NOTE(review): the title says "average" but week_change holds per-weekday sums -- confirm intent.
echarts_bar(
    week_change['周'].tolist(),
    week_change['买家实际支付金额'].tolist(),
    title = '订单成交金额平均每周对比',
    subtitle = '每周对比图',
    label = '成交金额',
)

In [24]:
# Total amount actually paid per hour of day.
hour_change = (
    df2.groupby('小时')[['买家实际支付金额']]
    .sum()
    .round(2)
    .reset_index()
)

In [25]:
# Pass plain Python lists (as the other chart calls do) and string hour labels:
# numeric labels on a category axis are read by ECharts as ordinal positions,
# which only lines up if every hour 0..23 is present in the data.
echarts_line(
    hour_change['小时'].astype(str).tolist(),
    hour_change['买家实际支付金额'].tolist(),
    title = '每天各时段成交金额变化图',
    subtitle = '一天24小时哪个时间段成交金额多',
    label = '成交金额',
)

### b. 成交金额在地区维度上的分布

In [26]:
# Total amount actually paid per delivery region, largest first.
change_map = (
    df2.groupby('收货地址')[['买家实际支付金额']]
    .sum()
    .round(2)
    .reset_index()
    .sort_values(by = '买家实际支付金额', ascending = False)
)

In [27]:

def echarts_map(province,data,title = '主标题',subtitle = '副标题',label = '图例'):
    """
    Render a pyecharts map coloured by a per-province value.

    province: province names (list, pandas Series or numpy array)
    data: one value per province (list, pandas Series or numpy array)
    title: main title
    subtitle: subtitle
    label: series name shown in the legend

    Returns the value of ``Map.render_notebook()``.
    """
    def _as_list(values):
        # Series / ndarray expose tolist(), which yields built-in Python scalars;
        # pyecharts expects native types rather than pandas/numpy objects.
        return values.tolist() if hasattr(values, 'tolist') else list(values)

    names = _as_list(province)
    values = _as_list(data)

    # Upper bound of the colour scale. Computed with the builtin max so a plain
    # list works (the previous data.max() required a Series/ndarray); NaN
    # entries are skipped and empty input falls back to 0.
    comparable = [value for value in values if value == value]
    scale_max = int(max(comparable)) if comparable else 0

    map_ = Map(
        init_opts=opts.InitOpts(
            bg_color='#080b30',  # background colour
            theme='dark'         # chart theme
        )
    )
    map_.add(label, [[name, value] for name, value in zip(names, values)])
    map_.set_global_opts(
        # Title
        title_opts=opts.TitleOpts(
            title=title,
            subtitle=subtitle,
            pos_left='center',  # centre the title
            title_textstyle_opts=dict(color='#fff')  # title font colour
        ),
        # Legend
        legend_opts=opts.LegendOpts(
            is_show=True,
            pos_left='right',    # legend position
            pos_top='3%',        # distance from the top
            orient='horizontal'  # horizontal layout
        ),
        # Continuous (non-piecewise) colour scale from the default minimum to scale_max
        visualmap_opts = opts.VisualMapOpts(max_ = scale_max,is_piecewise = False)
    )
    return map_.render_notebook()


In [28]:
# Map of paid amount by delivery region.
echarts_map(
    change_map['收货地址'],
    change_map['买家实际支付金额'],
    title = '成交金额分布图',
    subtitle = '成交金额在全国各地分布情况',
    label = '成交金额',
)

In [29]:
# Per-region paid totals, largest first (same aggregation as the earlier change_map cell).
change_map = (
    df2.groupby('收货地址')[['买家实际支付金额']]
    .sum()
    .round(2)
    .reset_index()
    .sort_values(by = '买家实际支付金额', ascending = False)
)

## 2. 退款金额

### a. 退款金额在时间维度上的变化

In [32]:
# Total refund amount per day of month.
back_money = (
    df2.groupby('日')[['退款金额']]
    .sum()
    .round(2)
    .reset_index()
)

In [33]:
# Bar chart of refund amount per day of month.
echarts_bar(
    back_money['日'].tolist(),
    back_money['退款金额'].tolist(),
    title = '退款金额变化图',
    subtitle = '退款金额日变化图',
    label = '退款金额',
)

In [34]:
# Total refund amount per hour of day.
hour_back_money = (
    df2.groupby('小时')[['退款金额']]
    .sum()
    .round(2)
    .reset_index()
)

In [35]:
# Hour labels are passed as strings: numeric labels on a category axis are read
# by ECharts as ordinal positions, which only lines up if every hour 0..23 is
# present in the data.
echarts_line(
    hour_back_money['小时'].astype(str).tolist(),
    hour_back_money['退款金额'].tolist(),
    title = '退款金额变化图',
    subtitle = '退款金额每日小时变化图',
    label = '退款金额',
)

### b. 退款金额在地区维度上的分布

In [36]:
# Total refund amount per delivery region.
local_back_money = (
    df2.groupby('收货地址')[['退款金额']]
    .sum()
    .round(2)
    .reset_index()
)

In [37]:
# Map of refund amount by delivery region.
echarts_map(
    local_back_money['收货地址'],
    local_back_money['退款金额'],
    title = '退款金额分布图',
    subtitle = '退款金额在全国各地分布情况',
    label = '退款金额',
)

In [38]:
# 3D map/bar view of refund amount by delivery region.
# NOTE(review): map3d_with_bar3d is not defined in the visible part of this
# notebook -- confirm it is defined before this cell runs, otherwise a fresh
# kernel will raise NameError here.
map3d_with_bar3d(local_back_money['收货地址'],local_back_money['退款金额'],title = '退款金额分布图'
            ,subtitle = '退款金额在全国各地分布情况',label = '退款金额')