# 特征工程与机器学习建模

### 自定义工具函数库

In [None]:
import pandas as pd
import numpy as np
import scipy as sp

# File loading
def read_csv_file(f, logging=False):
    """Load a CSV file into a DataFrame, optionally printing a summary.

    Args:
        f: Path or file-like object accepted by ``pd.read_csv``.
        logging: When true, print the first five rows, the column names,
            the ``describe()`` statistics and the ``info()`` report.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    data = pd.read_csv(f)
    if logging:
        print(data.head(5))
        print(f," 包含以下列....")
        print(data.columns.values)
        print(data.describe())
        # DataFrame.info() writes its report itself and returns None, so
        # wrapping it in print() only appended a stray "None" line.
        data.info()
    return data

# First-level category encoding
def categories_process_first_class(cate):
    """Return the first-level category, i.e. the leading digit of the code.

    Args:
        cate: Category code (int or str); it is converted with ``str()``.

    Returns:
        The first character of the code as an int. A code of ``0`` yields 0.

    Raises:
        ValueError: If the first character is not a digit.
        IndexError: If the code is an empty string.
    """
    # Taking the leading digit unconditionally also covers single-digit
    # non-zero codes, which would otherwise produce no return value (None).
    return int(str(cate)[0])
    
# Second-level category encoding
def categories_process_second_class(cate):
    """Return the second-level category: everything after the first digit.

    Args:
        cate: Category code (int or str); it is converted with ``str()``.

    Returns:
        0 when the code has fewer than three characters, otherwise the
        characters after the first one parsed as an int.
    """
    code = str(cate)
    # Codes shorter than three characters carry no second-level part.
    return int(code[1:]) if len(code) >= 3 else 0
    
# Age handling: bucket into segments
def age_process(age):
    """Map an age to a bucket index.

    Args:
        age: Age value; it is converted with ``int()``.

    Returns:
        0 for an age of exactly 0, then 1 for ages below 15, 2 below 25,
        3 below 40, 4 below 60, and 5 for everything else.
    """
    years = int(age)
    if years == 0:
        return 0
    # Exclusive upper bounds of buckets 1..4, checked in ascending order.
    for bucket, upper in enumerate((15, 25, 40, 60), start=1):
        if years < upper:
            return bucket
    return 5
    
# Province encoding
def process_province(hometown):
    """Return the first two characters of a location code as an int.

    Args:
        hometown: Location code (int or str); it is converted with ``str()``.

    Returns:
        The leading (up to two) characters parsed as an int.
    """
    # NOTE(review): assumes non-zero codes are written with four digits;
    # a three-digit code would have its first two digits taken - confirm
    # against the data.
    return int(str(hometown)[:2])

# City encoding
def process_city(hometown):
    """Return the part of a location code after its first two characters.

    Args:
        hometown: Location code (int or str); it is converted with ``str()``.

    Returns:
        The characters from index 2 onwards parsed as an int, or 0 when the
        code has two characters or fewer (no city part present).
    """
    city_digits = str(hometown)[2:]
    # A one- or two-character code leaves nothing after the slice; int('')
    # would raise ValueError, so fall back to 0 instead.
    return int(city_digits) if city_digits else 0


# Day part of the timestamp
def get_time_day(t):
    """Return the first two characters of a timestamp as an int.

    Args:
        t: Timestamp value (int or str); it is converted with ``str()``.

    Returns:
        The leading two characters parsed as an int.
    """
    # NOTE(review): presumably the timestamp is laid out as DDHHMM, making
    # this the day - confirm against the data description.
    return int(str(t)[:2])


# Split the day into 4 segments
def get_time_hour(t):
    """Map a timestamp to one of four time-of-day segments.

    The characters at index 2-3 of the timestamp are parsed as an int and
    bucketed.

    Args:
        t: Timestamp value (int or str); it is converted with ``str()``.

    Returns:
        0 for values below 6, 1 below 12, 2 below 18, and 3 otherwise.
    """
    # NOTE(review): presumably characters 2-3 hold the hour (DDHHMM) - confirm.
    hour = int(str(t)[2:4])
    # Exclusive upper bounds of segments 0..2, checked in ascending order.
    for segment, upper in enumerate((6, 12, 18)):
        if hour < upper:
            return segment
    return 3
    

    
# Evaluation: compute the log loss
def logloss(act, pred):
    """Compute the mean binary log loss (cross-entropy).

    Args:
        act: Sequence of true labels (0 or 1).
        pred: Sequence of predicted probabilities, same length as ``act``.

    Returns:
        The average negative log-likelihood as a float.

    Raises:
        ValueError: If ``act`` is empty.
    """
    epsilon = 1e-15
    act = np.asarray(act, dtype=float)
    if act.size == 0:
        raise ValueError("logloss requires at least one sample")
    # Clip probabilities away from 0 and 1 so the logarithms stay finite.
    pred = np.clip(np.asarray(pred, dtype=float), epsilon, 1 - epsilon)
    log_likelihood = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred))
    # Negate and average over the samples.
    return float(log_likelihood * -1.0 / len(act))



### 特征工程+随机森林建模

In [None]:
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

#### 特征工程

In [None]:
# Load the data: the training click log first, then the ad table
# (each load prints its summary because logging=True).
train_data, ad = (
    read_csv_file(csv_path, logging=True)
    for csv_path in ('./data/train.csv', './data/ad.csv')
)


In [None]:
# app_categories: load the table and derive both category levels from
# the raw 'appCategory' code.
app_categories = read_csv_file('./data/app_categories.csv', logging=True).assign(
    app_categories_first_class=lambda df: df['appCategory'].apply(categories_process_first_class),
    app_categories_second_class=lambda df: df['appCategory'].apply(categories_process_second_class),
)

In [None]:
# user: load the table, bucket the age, and split the hometown and
# residence codes into province and city parts.
user = read_csv_file('./data/user.csv', logging=True).assign(
    age_process=lambda df: df['age'].apply(age_process),
    hometown_province=lambda df: df['hometown'].apply(process_province),
    hometown_city=lambda df: df['hometown'].apply(process_city),
    residence_province=lambda df: df['residence'].apply(process_province),
    residence_city=lambda df: df['residence'].apply(process_city),
)

In [None]:
# train_data: add the two clickTime-derived columns
train_data = train_data.assign(
    clickTime_day=lambda df: df['clickTime'].apply(get_time_day),
    clickTime_hour=lambda df: df['clickTime'].apply(get_time_hour),
)

# test_data: load it and add the same two columns
test_data = read_csv_file('./data/test.csv', logging=True).assign(
    clickTime_day=lambda df: df['clickTime'].apply(get_time_day),
    clickTime_hour=lambda df: df['clickTime'].apply(get_time_hour),
)


#### 合并数据

In [None]:
# Join the click log with the user, ad and app-category tables in turn.
# No `how` is given, so each step uses pandas' default inner join.
train_user = train_data.merge(user, on='userID')
train_user_ad = train_user.merge(ad, on='creativeID')
train_user_ad_app = train_user_ad.merge(app_categories, on='appID')


#### 取出数据和 label

In [None]:
# Feature part
# TODO: extend the list with further feature columns.
# The ad join key is 'creativeID'; a column named 'creative' does not come
# out of the merge, so selecting it fails.
feat_labels = ['creativeID', 'userID', 'positionID']
x_user_ad_app = train_user_ad_app.loc[:, feat_labels]
x_user_ad_app = x_user_ad_app.values
x_user_ad_app = np.array(x_user_ad_app, dtype='int32')

# Label part: kept as an (n, 1) column array; it is flattened where the
# models are fitted.
y_user_ad_app = train_user_ad_app.loc[:,['label']].values

### 随机森林建模

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split

# 100 trees, fixed seed, all available cores.
forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)

# The label array is a single column; flatten it to 1-D for fit().
labels_1d = y_user_ad_app.reshape(-1)
forest.fit(x_user_ad_app, labels_1d)
importances = forest.feature_importances_

# Feature indices ordered from most to least important.
ascending_order = np.argsort(importances)
indices = ascending_order[::-1]


### 特征重要度展示

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline #生成的图可以嵌入在notebook中

plt.title('Feature Importances')
plt.bar(range(x_user_ad_app.shape[1]),
       importances[indices],
       color='lightblue'
        align='center'
       )
plt.xticks(range(x_user_ad_app.shape[1]),
          feat_labels[indices],
           rotation=90
          )
plt.xlim([-1, x_user_ad_app.shape[1]])
plt.tight_layout()
#plt.savefig("./rf.png", dpi=300)
plt.show()


### 随机森林调参

In [None]:
from sklearn.ensemble import RandomForestClassifier
# The class is spelled GridSearchCV; "GridsearchCV" fails at import time.
from sklearn.model_selection import GridSearchCV

# Search grid: number of trees x max_features (given as floats).
param_grid = {'n_estimators': [10, 100, 500, 1000],
             'max_features':[0.6, 0.7, 0.8, 0.9]
             }

rf = RandomForestClassifier()
# 3-fold cross-validation scored by negative log loss, on 2 workers.
rfc = GridSearchCV(rf, param_grid, scoring='neg_log_loss', cv=3, n_jobs=2)
# The label array is a single column; flatten it to 1-D for fit().
rfc.fit(x_user_ad_app, y_user_ad_app.reshape(y_user_ad_app.shape[0],))

print(rfc.best_score_)
print(rfc.best_params_)

### Xgboost 调参

In [None]:
import xgboost as xgb
import os
import numpy as np
# The class is spelled GridSearchCV; "GridsearchCV" fails at import time.
from sklearn.model_selection import GridSearchCV

# Parallel training.
# NOTE(review): this is set after xgboost/numpy are imported; confirm the
# thread count is actually picked up.
os.environ["OMP_NUM_THREADS"] = "8"
rng = np.random.RandomState(4315)

import warnings
# Silence all warnings for the duration of the search.
warnings.filterwarnings("ignore")

param_grid = {'n_estimators': [10, 50, 100, 400, 800, 1000, 1200],
              'max_depth':[3,4,5,7,9],
              'learning_rate':[0.1,0.2,0.3],
              'gamma':[0,0.2],
              'subsample':[0.8, 1]
             }

xgb_model = xgb.XGBClassifier()
# NOTE(review): no `scoring` is passed, so the estimator's default score is
# used here, unlike the random-forest search which uses 'neg_log_loss'.
rgs = GridSearchCV(xgb_model, param_grid, n_jobs=8)
# The label array is a single column; flatten it to 1-D for fit().
rgs.fit(x_user_ad_app, y_user_ad_app.reshape(y_user_ad_app.shape[0],))

print(rgs.best_score_)
print(rgs.best_params_)


### 正负样本比

In [None]:
# Count the rows of each class.
positive_num = train_user_ad_app[train_user_ad_app['label']==1].values.shape[0]
negative_num = train_user_ad_app[train_user_ad_app['label']==0].values.shape[0]

# Number of negative samples per positive sample. Dividing negative_num by
# itself would always display 1.0 and hide the imbalance.
negative_num/float(positive_num)

我们可以看到正负样本数量相差非常大，数据严重类别不平衡

我们使用在 Bagging 基础上修正、专门用于处理不均衡样本的 B(l)agging 来进行训练和试验。

In [None]:
from blagging import BlaggingClassifier