# 探索性分析

## 导入模块

In [1]:
from load_data import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import visualization
import util

# Configure the seaborn plotting theme once, up front: "ticks" style with
# color codes enabled.
sns.set(style="ticks", color_codes=True)

## 加载数据

In [2]:
# load_data is presumably provided by the star-import of the project-local
# load_data module; its result is unpacked into the training and test sets.
ijcai_18_train, ijcai_18_test = load_data()

### 数据基本信息

本数据集共有 27 个原始特征字段（含 id 和 label），共 496,509 条记录。其中包括：

- 训练集：478,138条记录
- 测试集：18,371条记录

（训练集包含分类标签：is_trade。）

![数据字段结构示意图](../etc/Inter_Ad_Algo.svg)

在分析时，将合并训练集和测试集，方便统一进行数据处理。

In [3]:
# The test set ships without labels; tag its rows with the sentinel -1 so that
# training rows and test rows can still be told apart after they are merged.
ijcai_18_test['is_trade'] = -1

# Stack training rows on top of test rows (row-wise concatenation is the default).
ijcai_18_merged = pd.concat((ijcai_18_train, ijcai_18_test))

# Show the first five records transposed, one column per record.
ijcai_18_merged.head().T

Unnamed: 0,0,1,2,3,4
instance_id,108641074714126964,5754713551599725161,842679481291040981,937088850059189027,7975697065017708072
item_id,3412720377098676069,3412720377098676069,3412720377098676069,3412720377098676069,3412720377098676069
item_category_list,7908382889764677758;5799347067982556520,7908382889764677758;5799347067982556520,7908382889764677758;5799347067982556520,7908382889764677758;5799347067982556520,7908382889764677758;5799347067982556520
item_property_list,2072967855524022579;5131280576272319091;263639...,2072967855524022579;5131280576272319091;263639...,2072967855524022579;5131280576272319091;263639...,2072967855524022579;5131280576272319091;263639...,2072967855524022579;5131280576272319091;263639...
item_brand_id,1975590437749032870,1975590437749032870,1975590437749032870,1975590437749032870,1975590437749032870
item_city_id,3948283326616421003,3948283326616421003,3948283326616421003,3948283326616421003,3948283326616421003
item_price_level,3,3,3,3,3
item_sales_level,3,3,3,3,3
item_collected_level,4,4,4,4,4
item_pv_level,14,14,14,14,14


In [4]:
# Inspect column dtypes and non-null counts of the merged frame before any
# dtype corrections are applied.
ijcai_18_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496509 entries, 0 to 18370
Data columns (total 27 columns):
instance_id                  496509 non-null int64
item_id                      496509 non-null int64
item_category_list           496509 non-null object
item_property_list           496509 non-null object
item_brand_id                496509 non-null int64
item_city_id                 496509 non-null int64
item_price_level             496509 non-null int64
item_sales_level             496509 non-null int64
item_collected_level         496509 non-null int64
item_pv_level                496509 non-null int64
user_id                      496509 non-null int64
user_gender_id               496509 non-null int64
user_age_level               496509 non-null int64
user_occupation_id           496509 non-null int64
user_star_level              496509 non-null int64
context_id                   496509 non-null int64
context_timestamp            496509 non-null int64
context_page_id      

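可以看出，部分字段的数据类型不符合其含义：各类 ID 字段（如 instance_id、item_id、user_id 等）以及标签 is_trade 被读取为 int64 数值类型，但它们表示的是类别标识而非数量，因此需要手动将其转换为 object 类型。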

In [5]:
# Columns that were read as int64 (see the info() output above) but are cast
# to object dtype here. Judging by their names these are identifiers plus the
# is_trade label, i.e. categorical codes rather than quantities.
#
# The builtin `object` is used as the target dtype instead of `np.object`:
# that alias was deprecated in NumPy 1.20 and removed in NumPy 1.24, where it
# raises AttributeError. Both spell the same dtype.
#
# NOTE: despite the name, each entry is a (column name, target dtype) pair;
# the name is kept unchanged in case later cells refer to it.
remodified_rows = (
    ('instance_id', object),
    ('item_id', object),
    ('item_brand_id', object),
    ('item_city_id', object),
    ('user_id', object),
    ('user_gender_id', object),
    ('user_occupation_id', object),
    ('context_id', object),
    ('context_page_id', object),
    ('shop_id', object),
    ('is_trade', object),
)
# Cast one column at a time, writing the converted column back into the frame.
for row, retype in remodified_rows:
    ijcai_18_merged[row] = ijcai_18_merged[row].astype(retype)

In [6]:
# Re-run info() to confirm that the columns cast in the previous cell are now
# reported with object dtype.
ijcai_18_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496509 entries, 0 to 18370
Data columns (total 27 columns):
instance_id                  496509 non-null object
item_id                      496509 non-null object
item_category_list           496509 non-null object
item_property_list           496509 non-null object
item_brand_id                496509 non-null object
item_city_id                 496509 non-null object
item_price_level             496509 non-null int64
item_sales_level             496509 non-null int64
item_collected_level         496509 non-null int64
item_pv_level                496509 non-null int64
user_id                      496509 non-null object
user_gender_id               496509 non-null object
user_age_level               496509 non-null int64
user_occupation_id           496509 non-null object
user_star_level              496509 non-null int64
context_id                   496509 non-null object
context_timestamp            496509 non-null int64
context_page_

In [7]:
# Undo the merge using the sentinel label: rows whose is_trade equals -1 are
# test rows, every other row belongs to the training set.
is_test_row = ijcai_18_merged.is_trade == -1
ijcai_18_test = ijcai_18_merged[is_test_row]
ijcai_18_train = ijcai_18_merged[~is_test_row]

## 字段分析

In [8]:
# Summarise instance_id (count / unique / top / freq). A unique count below
# the total count means some instance ids occur more than once.
ijcai_18_merged.instance_id.describe()

count                  496509
unique                 496455
top       3370133319712542740
freq                        3
Name: instance_id, dtype: int64

In [65]:
# Count how many records share each context_id.
# Possible follow-up (currently disabled): chain
# .value_counts().sort_index().cumsum().plot() onto this result to chart the
# cumulative distribution of those per-id frequencies.
ijcai_18_merged.context_id.value_counts()

7259819519976136023    3
290808226164722964     3
2679502183138154630    2
4497638035869905621    2
2849679115978568915    2
8192490307717568017    2
6005263855939584311    2
1647541348423817807    2
3429462095177897954    2
5572499327546912720    2
1081933224899281574    2
393440996556281300     2
2909846917991242231    2
305737608670045906     2
4451796609054987809    2
304650542434805843     2
8100475734761363134    2
5716302324470047044    2
1969302545828967535    2
4408407584681704919    2
6603062751043810606    2
1174885052158715015    2
1582607348662732737    2
8957097032852481362    2
3171393537650530997    2
9089193198971348589    1
4289566347573798055    1
3251230813065130684    1
8158429610870400629    1
504606308740458076     1
                      ..
2665471288907840028    1
4938371623029455045    1
4957571136408435192    1
363497451686640106     1
2353617452388760042    1
6838188135414413802    1
6396934341396183531    1
1961135405395524078    1
1874266491345091056    1
