In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
pd.set_option('display.max_columns', 500)

In [2]:
'''
用户标识（uId）匿名化处理后的用户唯一标识（ID取值从1000001开始，依次递增）
年龄段（age_group）年龄范围（取值1； 2； 3； 4； 5；6；）'''
label = pd.read_csv("data/age_train.csv",header=None)
label.columns = ['uId','age_group']

In [3]:
'''
用户标识（uId）匿名化处理后的用户唯一标识（ID取值从1000001开始，依次递增）
'''
test = pd.read_csv("data/age_test.csv",header=None)
test.columns = ['uId']

In [4]:
'''
用户标识（uId）匿名化处理后的用户唯一标识（ID取值从1000001开始，依次递增）
开机次数（bootTimes）一段时间内(30天)手机的总开机次数
手机A特性使用次数（AFuncTimes）一段时间内(30天) 手机A特性使用次数
手机B特性使用次数（BFuncTimes）一段时间内(30天) 手机B特性使用次数
手机C特性使用次数（CFuncTimes）一段时间内(30天) 手机C特性使用次数
手机D特性使用次数（DFuncTimes）一段时间内(30天) 手机D特性使用次数
手机E特性使用次数（EFuncTimes）一段时间内(30天) 手机E特性使用次数
手机F特性使用次数（FFuncTimes）一段时间内(30天) 手机F特性使用次数
手机G特性使用情况（GFuncSum）一段时间内(30天)G特性使用情况（数值）'''
behavior = pd.read_csv("data/user_behavior_info.csv",header=None)
behavior.columns = ['uId','bootTimes','AFuncTimes','BFuncTimes','CFuncTimes','DFuncTimes','EFuncTimes','FFuncTimes','GFuncTimes']

behavior['ABCDEFTimes'] = behavior['AFuncTimes'] + behavior['BFuncTimes'] + behavior['CFuncTimes'] + behavior['DFuncTimes'] + \
                          behavior['EFuncTimes'] + behavior['FFuncTimes']
behavior['G_boot'] = behavior['GFuncTimes'] / behavior['bootTimes']
behavior['A_all'] = behavior['AFuncTimes'] / behavior['ABCDEFTimes']
behavior['B_all'] = behavior['BFuncTimes'] / behavior['ABCDEFTimes']
behavior['C_all'] = behavior['CFuncTimes'] / behavior['ABCDEFTimes']
behavior['D_all'] = behavior['DFuncTimes'] / behavior['ABCDEFTimes']
behavior['E_all'] = behavior['EFuncTimes'] / behavior['ABCDEFTimes']
behavior['F_all'] = behavior['FFuncTimes'] / behavior['ABCDEFTimes']
behavior['all_boot'] = behavior['ABCDEFTimes']/ behavior['bootTimes']

In [5]:
'''
应用标识（appId）appId为app应用的唯一标识
应用类型（category）app所属的应用类型
'''
app_info = pd.read_csv("data/app_info.csv",header=None)
app_info.columns = ['appId','category']


#  category 为主键，每个下的appid数量


In [61]:
app_info.nunique()

appId       167622
category        40
dtype: int64

In [6]:
'''
用户标识（uId）匿名化处理后的用户唯一标识（ID取值从1000001开始，依次递增）
性别（gender）男/女（取值空间0,1）
常住地（city）如深圳市、南京市等（匿名化处理，实际取值c001，c002….）
手机型号（prodName）如mate10、honor 10等（匿名化处理，实际取值p001、p002……）
手机ram容量（ramCapacity）手机ram的大小，以G为单位
ram剩余容量占比（ramLeftRation）手机剩余的容量占总容量的比例
rom容量（romCapacity）手机rom的大小，以G为单位
rom剩余容量占比（romLeftRation）手机剩余rom容量占总rom容量的比例
手机颜色（color）手机机身的颜色
字体大小（fontSize）手机设置的字体大小
上网类型（ct）2G/3G/4G/WIFI
移动运营商（carrier）移动/联通/电信/其他
手机系统版本（os）AndroId操作系统的版本号
'''

user_info = pd.read_csv('data/user_basic_info.csv',header=None)
user_info.columns = ['uId','gender','city','prodName','ramCapacity','ramLeftRation','romCapacity'
,'romLeftRation','color','fontSize','ct','carrier','os']

user_info['ramLeft'] = user_info['ramCapacity'] * user_info['ramLeftRation']
user_info['romLeft'] = user_info['romCapacity'] * user_info['romLeftRation']
user_info['rom_ram'] = user_info['romCapacity'] / user_info['ramCapacity']
user_info['ct_2g'] = user_info['ct'].apply(lambda x: 1 if isinstance(x,str) and '2g' in x else 0)
user_info['ct_3g'] = user_info['ct'].apply(lambda x: 1 if isinstance(x,str) and '3g' in x else 0)
user_info['ct_4g'] = user_info['ct'].apply(lambda x: 1 if isinstance(x,str) and '4g' in x else 0)
user_info['ct_wifi'] = user_info['ct'].apply(lambda x: 1 if isinstance(x,str) and 'wifi' in x else 0)
del user_info['ct']

user_info['os_first'] = user_info['os'].apply(lambda x:int(x) if not np.isnan(x) else -1)

In [69]:
cate = ['gender','city','prodName','color','ct','carrier','os']
user_info[cate].nunique()

gender        2
city        363
prodName    227
color       136
ct            7
carrier       4
os           16
dtype: int64

In [None]:
# 内存不够，这个先不用
# ---------------------------------------------------------------------
'''
用户标识（uId）匿名化处理后的用户唯一标识（ID取值从1000001开始，依次递增）
应用标识（appId）匿名化处理后的app唯一标识
使用时长（duration）1天内用户对某app的累计使用时长
打开次数（times）1天内用户对某app的累计打开次数
使用日期（use_date）用户对某app的使用日期

usage = pd.read_csv('data/user_app_usage.csv',header=None)
usage.columns = ['uId','appId','duration','times','use_date']
'''

In [7]:
'''
用户标识（uId）匿名化处理后的用户唯一标识（ID取值从1000001开始，依次递增）
应用标识（appId）匿名化处理后的app唯一标识'''
active = pd.read_csv("data/user_app_actived.csv",header=None)
active.columns = ['uId','appId']

active['appId'] = active['appId'].apply(lambda x:x.split('#'))
active['appNum'] = active['appId'].apply(lambda x:len(x) if x[0]!='\\N' else 0)
active['appNum'].describe()

count    2.512500e+06
mean     4.051719e+01
std      2.283547e+01
min      0.000000e+00
25%      2.400000e+01
50%      3.700000e+01
75%      5.200000e+01
max      8.880000e+02
Name: appNum, dtype: float64

In [12]:
tmp = active['appId'].values
appid= set()
for each in tmp:
    appid |= set(each)
len(appid)

9401

In [8]:
# 统计每个appId激活人数，和激活人的属性分布，年龄、性别、城市、手机型号等属性

key = app_info.appId.values
val = app_info.category.values

from collections import defaultdict as dd

app_map = dd(int)
for i in range(len(key)):
     app_map[key[i]] = val[i]
        
"""
# 统计每个用户 每个app类别下的激活app的比例，贝叶斯平滑 ,有的激活appid不在app_info中,那就不计算，也把其的数目减掉
tmp = active['appId'].values
res = []
for i in tqdm(range(len(tmp))):
    line = [0.0]*(len(app_map)+1)
    # 这个用户总共激活多少app
    cnt = len(tmp[i])
    for app in tmp[i]:
        line[app_map[app]] += 1
    for j in range(len(line)):
        line[j] /= cnt
    res.append(line[:])
"""

"\n# 统计每个用户 每个app类别下的激活app的比例，贝叶斯平滑 ,有的激活appid不在app_info中,那就不计算，也把其的数目减掉\ntmp = active['appId'].values\nres = []\nfor i in tqdm(range(len(tmp))):\n    line = [0.0]*(len(app_map)+1)\n    # 这个用户总共激活多少app\n    cnt = len(tmp[i])\n    for app in tmp[i]:\n        line[app_map[app]] += 1\n    for j in range(len(line)):\n        line[j] /= cnt\n    res.append(line[:])\n"

In [9]:
# 每个用户激活了几种类别的app
def app_cate_data(x,t):
    cate = dd(int)
    for each in x:    
        cate[app_map[each]] += 1
    tmp = cate.values()
    s = sum(tmp)+1
    # all_num
    if t == 0:
        return len(cate)
    # max_num
    elif t == 1:
        return max(tmp)/s
    # min_num
    else:
        return min(tmp)/s
        
active['app_cate_num'] = active['appId'].apply(lambda x: app_cate_data(x,0))

#平均每种app激活多少
active['app_cate_mean'] = active['appNum']/active['app_cate_num']

# 激活最多种类的app数目、占所有激活数目比例
active['app_cate_maxRate'] = active['appId'].apply(lambda x: app_cate_data(x,1))
active['app_cate_minRate'] = active['appId'].apply(lambda x: app_cate_data(x,2))
active['app_cate_max'] = active['app_cate_maxRate'] * (active['appNum']+1)
active['app_cate_min'] = active['app_cate_minRate'] * (active['appNum']+1)

In [16]:
train_cols = label.shape[0]
data = label.append(test)
data = data.merge(user_info,how='left',on='uId')
data = data.merge(active,how='left',on='uId')
data = data.merge(behavior,how='left',on='uId')
del data['appId']

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [17]:
i = 1
for f1 in ['prodName','city','gender','color','os_first','carrier','ct_2g','ct_3g','ct_4g','ct_wifi']:
    for f2 in ['appNum','ramCapacity','romCapacity','fontSize','bootTimes','AFuncTimes','BFuncTimes',
               'CFuncTimes','DFuncTimes','EFuncTimes','FFuncTimes','GFuncTimes']:
        if i==1:
            print(data.groupby(f1)[f2].agg(['mean','min','max','std','size']).reindex())
            i+=1
            break
        


               mean  min  max        std    size
prodName                                        
p001      44.043576    0  264  19.501422   94342
p0010     21.448759    0   92  11.372716    6850
p00100    39.994117    0  248  18.221258   50311
p00101     5.363636    0   24   8.065640      11
p00102    45.236699    0  393  21.804048   19510
p00103    49.350192    0  230  22.566456   58745
p00104    43.842393    0  199  22.354452    3293
p00105    47.588709    0  312  23.048764   21751
p00106    17.116841    0  123  11.285017   19839
p00107    39.948569    0  372  21.373881   87438
p00108    51.702743    0  641  25.483575  153399
p00109     1.000000    1    1        NaN       1
p0011     35.335126    0  745  17.889290   85914
p00110    56.106049    0  888  26.900636   39529
p00111    58.435125    0  266  30.742959    2158
p00112    57.721435    0  240  27.855572    6774
p00113    27.006240    0  129  13.976167    5609
p00114     3.250000    0   13   6.500000       4
p00115     0.500000 

In [22]:
data.merge(data.groupby(f1)[f2].agg(['mean','min','max','std','size']).reset_index(),how='left',on=f1)

Unnamed: 0,age_group,uId,gender,city,prodName,ramCapacity,ramLeftRation,romCapacity,romLeftRation,color,fontSize,carrier,os,ramLeft,romLeft,rom_ram,ct_2g,ct_3g,ct_4g,ct_wifi,os_first,appNum,app_cate_num,app_cate_mean,app_cate_maxRate,app_cate_minRate,app_cate_max,app_cate_min,bootTimes,AFuncTimes,BFuncTimes,CFuncTimes,DFuncTimes,EFuncTimes,FFuncTimes,GFuncTimes,ABCDEFTimes,G_boot,A_all,B_all,C_all,D_all,E_all,F_all,all_boot,mean,min,max,std,size
0,4.0,1000001,0,c00145,p00169,3.0,0.43,32.0,0.46,皓月银,1.15000,China_Mobile,8.0,1.29,14.72,10.666667,0,0,1,0,8,21,9,2.333333,0.409091,0.045455,9.0,1.0,108,0.00,0.00,1.00,0.07,0.00,0.00,3319,1.07,3.073148e+01,0.000000,0.000000,0.934579,0.065421,0.000000,0.000000,0.009907,16459.038760,0,41577926,123824.303513,237383
1,3.0,1000011,0,c00126,p0023,,,,,幻夜黑,,China_Mobile,8.1,,,,0,0,1,1,8,16,8,2.000000,0.176471,0.058824,3.0,1.0,0,0.00,0.00,0.00,0.00,0.00,0.00,220,0.00,inf,,,,,,,,11474.113781,0,39726854,84523.183426,2275117
2,5.0,1000015,1,c00306,p00169,3.0,0.34,32.0,0.06,皓月银,1.30000,China_Telecom,8.0,1.02,1.92,10.666667,0,0,0,1,8,19,9,2.111111,0.250000,0.050000,5.0,1.0,12,0.00,0.00,0.03,0.13,0.00,0.00,21881,0.16,1.823417e+03,0.000000,0.000000,0.187500,0.812500,0.000000,0.000000,0.013333,11474.113781,0,39726854,84523.183426,2275117
3,3.0,1000019,0,c00150,p0049,2.0,,17.0,,蓝色,,China_Unicom,8.1,,,8.500000,0,0,0,0,8,2,2,1.000000,0.333333,0.333333,1.0,1.0,0,0.00,0.00,0.00,0.00,0.00,0.00,0,0.00,,,,,,,,,16459.038760,0,41577926,123824.303513,237383
4,2.0,1000023,1,c0037,p0047,2.0,0.34,16.0,0.06,银色,1.00000,China_Telecom,7.0,0.68,0.96,8.000000,0,0,0,1,7,13,5,2.600000,0.428571,0.071429,6.0,1.0,5,0.00,0.00,0.00,0.13,0.00,0.00,0,0.13,0.000000e+00,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.026000,11474.113781,0,39726854,84523.183426,2275117
5,4.0,1000025,0,c006,p00171,4.0,0.31,64.0,0.20,琥珀金,1.00000,China_Telecom,8.0,1.24,12.80,16.000000,0,0,1,1,8,54,16,3.375000,0.181818,0.018182,10.0,1.0,97,0.00,0.17,3.97,1.17,1.60,0.00,0,6.91,0.000000e+00,0.000000,0.024602,0.574530,0.169320,0.231548,0.000000,0.071237,11474.113781,0,39726854,84523.183426,2275117
6,4.0,1000029,0,c0093,p00156,6.0,0.20,68.0,0.27,极光色,1.15000,China_Unicom,9.0,1.20,18.36,11.333333,0,0,1,1,9,65,15,4.333333,0.287879,0.015152,19.0,1.0,15,1.50,0.00,1.07,1.27,4.03,0.00,440,7.87,2.933333e+01,0.190597,0.000000,0.135959,0.161372,0.512071,0.000000,0.524667,11474.113781,0,39726854,84523.183426,2275117
7,2.0,1000035,0,c00316,p00156,6.0,0.26,68.0,0.64,宝石蓝,1.15000,China_Telecom,9.0,1.56,43.52,11.333333,0,0,0,1,9,29,9,3.222222,0.166667,0.066667,5.0,2.0,5,1.00,0.10,1.37,2.07,3.30,0.00,146,7.84,2.920000e+01,0.127551,0.012755,0.174745,0.264031,0.420918,0.000000,1.568000,11474.113781,0,39726854,84523.183426,2275117
8,4.0,1000037,0,c00324,p00107,4.0,0.40,64.0,0.02,香槟金,,China_Unicom,8.0,1.60,1.28,16.000000,0,0,0,1,8,69,15,4.600000,0.242857,0.014286,17.0,1.0,5,0.00,0.00,0.30,0.00,0.00,0.00,0,0.30,0.000000e+00,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.060000,11474.113781,0,39726854,84523.183426,2275117
9,3.0,1000038,0,c0076,p0011,4.0,0.29,64.0,0.16,流光金,1.00000,China_Mobile,7.0,1.16,10.24,16.000000,0,0,1,1,7,27,9,3.000000,0.250000,0.035714,7.0,1.0,42,0.00,0.13,0.60,0.73,1.00,0.00,0,2.46,0.000000e+00,0.000000,0.052846,0.243902,0.296748,0.406504,0.000000,0.058571,11474.113781,0,39726854,84523.183426,2275117


In [None]:
feat_dict = {}
for f in ['city','prodName','color','gender','ct','carrier','os']
    le = LabelEncoder()
    data[f] = le.fit_transform(data[f])
    feat_dict[f] = data[f].nunique()

In [52]:
train = data.iloc[:train_cols]
test = data.iloc[train_cols:]
del test['age_group']
y = train['age_group'] - 1
del train['age_group']
X = train
del train