# 导入库文件

In [None]:
# 导入库文件
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import GridSearchCV
import xgboost as xgb 
from sklearn.model_selection import train_test_split
import pickle
sns.set(style='white', context='notebook', palette='Set2')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

import gc 
import os

# 获取数据

In [None]:
# Load the pre-sampled train/test frames from disk
def genarate_sample_data(begin):
    """Return the pickled ``(train_df, test_df)`` pair.

    Reads ``train_df.pkl`` and then ``test_df.pkl`` from the current
    working directory.  ``begin`` is accepted but never consulted.
    """
    loaded = []
    for pickle_path in ('train_df.pkl', 'test_df.pkl'):
        with open(pickle_path, 'rb') as handle:
            loaded.append(pickle.load(handle))

    train_df, test_df = loaded
    return train_df, test_df

In [None]:
# Build the test frame
def create_test_df(begin):
    """Load ``../input/test_V2.csv`` and shrink its column dtypes.

    Args:
        begin: Switch for the load.  When truthy the CSV is read and passed
            through ``reduce_mem_usage``.

    Returns:
        The memory-reduced test DataFrame, or ``None`` when ``begin`` is
        falsy.  (A falsy ``begin`` used to reach ``return test_df`` with the
        name unbound and raised ``UnboundLocalError``.)
    """
    if not begin:
        return None

    test_df = reduce_mem_usage(pd.read_csv('../input/test_V2.csv'))
    # Release the wider column arrays that were replaced during down-casting
    gc.collect()
    return test_df

In [None]:
# Load the iris dataset
from sklearn import datasets

iris = datasets.load_iris()      # dataset object returned by scikit-learn
X, y = iris.data, iris.target    # feature matrix and per-sample labels

In [None]:
# Build a synthetic classification dataset.
# make_classification is exported by sklearn.datasets itself; the private
# `samples_generator` module path used before is gone from current releases.
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=6, n_features=5, n_informative=2, n_redundant=2,
    n_classes=2, n_clusters_per_class=2, scale=1.0, random_state=20)

# n_samples: number of samples to generate
# n_features: total number of features
# n_classes: number of target classes
# random_state: random seed, makes the generated data reproducible

# Pandas备忘手册

In [None]:
# Data-transformation examples: column-wise apply vs element-wise map
import numpy as np
import pandas as pd
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('abc'), index=['a', 'b', 'c', 'd'])
print(frame)
# Sample output (values are random):
#          a         b         c
#a -0.074178  0.217576  0.489068
#b  0.922744  1.524651  2.127485
#c -1.457947 -0.498123 -0.687133
#d  0.106064 -2.890129  0.981858

# apply() hands each column to the function as a Series
f = lambda x: x.max() - x.min()
print(frame.apply(f))


# Element-wise transform.  The helper used to be called `format`, which
# shadowed the builtin of the same name for the rest of the session.
square = lambda x: x * x
# Newer pandas names the element-wise DataFrame method `map`; older releases
# only have `applymap`.  Use whichever this installation provides.
elementwise = frame.map if hasattr(frame, 'map') else frame.applymap
print(elementwise(square))

# Sample output:
#          a         b         c
#a  0.005502  0.047339  0.239188
#b  0.851457  2.324562  4.526193
#c  2.125609  0.248126  0.472152
#d  0.011250  8.352848  0.964045

# Series.map applies the function to every element of one column
print(frame['a'].map(square))
# Sample output:
#a    0.005502
#b    0.851457
#c    2.125609
#d    0.011250
#Name: a, dtype: float64

作者：wong小尧
链接：https://www.jianshu.com/p/f5d6423709fc
來源：简书
简书著作权归作者所有，任何形式的转载都请联系作者获得授权并注明出处。

# 数据预处理

from sklearn import preprocessing

## 数据归一化

In [None]:
data = [[0, 0], [0, 0], [1, 1], [1, 1]]
# The scaler calls below referenced train_data/test_data without defining
# them; use the sample matrix for fitting and a small hold-out for transform.
train_data = data
test_data = [[0.5, 0.5], [2, 2]]

# 1. Standardisation based on mean and standard deviation
scaler = preprocessing.StandardScaler().fit(train_data)
scaler.transform(train_data)
scaler.transform(test_data)

# 2. Rescale every feature into a fixed range
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit(train_data)
scaler.transform(train_data)
scaler.transform(test_data)
# feature_range: target range, written as a (min, max) tuple

### 正则化

In [None]:
 X_normalized = preprocessing.normalize(X, norm='l2')

### 独热编码

In [None]:
data = [[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]
encoder = preprocessing.OneHotEncoder().fit(data)
# Was `enc.transform(...)`: `enc` is never defined, the fitted object is `encoder`
encoder.transform(data).toarray()

# 数据集拆分

In [None]:
# Module name was misspelled as `sklearn.mode_selection`
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 定义模型

In [None]:
# Fit the model (`model` is any estimator built in the sections that follow)
model.fit(X_train, y_train)
# Predict on the held-out samples
model.predict(X_test)

# Inspect the model's parameters
model.get_params()
# Score the model -- linear regression: R squared; classification: accuracy.
# Was `model.score(data_X, data_y)`, names that are not defined in this notebook.
model.score(X_test, y_test)

### 线性回归

In [None]:
from sklearn.linear_model import LinearRegression
# Define the linear regression model.
# `normalize=False` is no longer passed: it only restated the default, and
# current scikit-learn releases reject the keyword altogether.
model = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=1)
"""
参数
---
    fit_intercept：是否计算截距。False-模型没有截距
    normalize： 当fit_intercept设置为False时，该参数将被忽略。 如果为真，则回归前的回归系数X将通过减去平均值并除以l2-范数而归一化。
     n_jobs：指定线程数
"""

### 逻辑回归

In [None]:
from sklearn.linear_model import LogisticRegression
# Define the logistic regression model.
# The string arguments were wrapped in typographic quotes (’…’), which is a
# SyntaxError; they are plain ASCII quotes now.
model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0,
    fit_intercept=True, intercept_scaling=1, class_weight=None,
    random_state=None, solver='liblinear', max_iter=100, multi_class='ovr',
    verbose=0, warm_start=False, n_jobs=1)

"""参数
---
    penalty：使用指定正则化项（默认：l2）
    dual: n_samples > n_features取False（默认）
    C：正则化强度的反，值越小正则化强度越大
    n_jobs: 指定线程数
    random_state：随机数生成器
    fit_intercept: 是否需要常量
"""

### 朴素贝叶斯

In [None]:
from sklearn import naive_bayes
# Three naive Bayes variants; each assignment replaces the previous `model`
model = naive_bayes.GaussianNB()  # Gaussian naive Bayes
model = naive_bayes.MultinomialNB(
    alpha=1.0,
    fit_prior=True,
    class_prior=None,
)
model = naive_bayes.BernoulliNB(
    alpha=1.0,
    binarize=0.0,
    fit_prior=True,
    class_prior=None,
)
"""
文本分类问题常用MultinomialNB
参数
---
    alpha：平滑参数
    fit_prior：是否要学习类的先验概率；false-使用统一的先验概率
    class_prior: 是否指定类的先验概率；若指定则不能根据参数调整
    binarize: 二值化的阈值，若为None，则假设输入由二进制向量组成
"""

### 决策树

In [None]:
from sklearn import tree
# Typographic quotes around 'gini' were a SyntaxError and are now ASCII.
# `min_impurity_split=None` and `presort=False` are no longer passed: both
# only restated defaults, and current scikit-learn rejects the keywords.
model = tree.DecisionTreeClassifier(criterion='gini', max_depth=None,
    min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
    max_features=None, random_state=None, max_leaf_nodes=None,
    min_impurity_decrease=0.0, class_weight=None)
"""参数
---
    criterion ：特征选择准则gini/entropy
    max_depth：树的最大深度，None-尽量下分
    min_samples_split：分裂内部节点，所需要的最小样本树
    min_samples_leaf：叶子节点所需要的最小样本数
    max_features: 寻找最优分割点时的最大特征数
    max_leaf_nodes：优先增长到最大叶子节点数
    min_impurity_decrease：如果这种分离导致杂质的减少大于或等于这个值，则节点将被拆分。
"""

### 支持向量机

In [None]:
from sklearn.svm import SVC
# Typographic quotes around 'rbf' / 'auto' were a SyntaxError; now ASCII quotes
model = SVC(C=1.0, kernel='rbf', gamma='auto')
"""参数
---
    C：误差项的惩罚参数C
    gamma: 核相关系数。浮点数，If gamma is ‘auto’ then 1/n_features will be used instead.
"""

### KNN

In [None]:
from sklearn import neighbors
# k-nearest-neighbour models; the regressor assignment replaces the classifier
knn_settings = dict(n_neighbors=5, n_jobs=1)
model = neighbors.KNeighborsClassifier(**knn_settings)  # classification
model = neighbors.KNeighborsRegressor(**knn_settings)   # regression
"""参数
---
    n_neighbors： 使用邻居的数目
    n_jobs：并行任务数
"""

### 多层感知机MLP(DNN)

In [None]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron classifier
mlp_settings = {'activation': 'relu', 'solver': 'adam', 'alpha': 0.0001}
model = MLPClassifier(**mlp_settings)
"""参数
---
    hidden_layer_sizes: 元祖
    activation：激活函数
    solver ：优化算法{‘lbfgs’, ‘sgd’, ‘adam’}
    alpha：L2惩罚(正则化项)参数。
"""

#  模型评估与选择篇

### 交叉验证

In [None]:
from sklearn.model_selection import cross_val_score
# Pass the labels: the call used to say `y=None` (copied from the function
# signature), which leaves a supervised model without targets.
cross_val_score(model, X, y, scoring=None, cv=None, n_jobs=1)
"""参数
---
    model：拟合数据的模型
    cv ： k-fold
    scoring: 打分参数-‘accuracy’、‘f1’、‘precision’、‘recall’ 、‘roc_auc’、'neg_log_loss'等等
"""

### 检验曲线

In [None]:
# A validation curve re-fits the model over a range of values of one
# parameter, which makes it easy to see how that parameter affects the score.
from sklearn.model_selection import validation_curve
# `param_name` / `param_range` are placeholders to be set for the model in
# use.  They are passed by keyword because current scikit-learn no longer
# accepts them positionally; the keyword form works on older releases too.
train_score, test_score = validation_curve(model, X, y, param_name=param_name, param_range=param_range, cv=None, scoring=None, n_jobs=1)
"""参数
---
    model:用于fit和predict的对象
    X, y: 训练集的特征和标签
    param_name：将被改变的参数的名字
    param_range： 参数的改变范围
    cv：k-fold
   
返回值
---
   train_score: 训练集得分（array）
    test_score: 验证集得分（array）
"""

# 保存模型

In [None]:
import pickle

# Write the fitted model to disk
with open('model.pickle', 'wb') as sink:
    pickle.dump(model, sink)

# Read it back and use the restored copy for prediction
with open('model.pickle', 'rb') as source:
    model = pickle.load(source)
model.predict(X_test)

In [None]:
# Prefer the standalone joblib package; the copy vendored under
# sklearn.externals only exists in old scikit-learn releases.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the model
joblib.dump(model, 'model.pickle')

# Load the model
model = joblib.load('model.pickle')

# 缺失值检测

In [None]:
# Per-column count of missing values, train and test side by side
train_missing = train_df.isnull().sum()
test_missing = test_df.isnull().sum()
pd.DataFrame({'train': train_missing, 'test': test_missing})

# 序列化

In [None]:
# Serialise the model to disk
with open('model001.pkl', 'wb') as sink:
    pickle.dump(model, sink)

# Deserialise it again
with open('model001.pkl', 'rb') as source:
    model = pickle.load(source)

# 特征工程

In [None]:
def rank_by_team(df, feature_cols=None):
    """Append each team's within-match percentile rank for every feature.

    Features are averaged per (matchId, groupId) team, the team means are
    ranked against the other teams of the same match, and the ranks are
    merged back onto ``df`` as ``<feature>_mean_rank`` columns.

    Args:
        df: Frame with ``matchId`` and ``groupId`` columns plus the features.
        feature_cols: Columns to rank.  Defaults to every column except
            ``Id``, ``groupId``, ``matchId``, ``matchType`` and
            ``winPlacePerc``.  (The function used to read an undefined
            global of this name.)

    Returns:
        A new DataFrame: ``df`` left-merged with the rank columns.
    """
    key_cols = ['matchId', 'groupId']
    if feature_cols is None:
        excluded = {'Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc'}
        feature_cols = [col for col in df.columns if col not in excluded]
    else:
        feature_cols = list(feature_cols)

    df_mean = df.groupby(by=key_cols)[feature_cols].mean()
    # Rank within the match only.  Grouping by both keys again would put every
    # team in its own group of one row, so every percentile rank would be 1.0.
    df_mean_rank = df_mean.groupby(level='matchId').rank(pct=True).reset_index()
    return df.merge(df_mean_rank, on=key_cols, how='left', suffixes=('', '_mean_rank'))

In [None]:
# Map an hour of the day to one of four time-of-day segments
def getSeg(x):
    """Return the segment number (1-4) whose inclusive hour range contains ``x``.

    Ranges are 0-6, 7-12, 13-18 and 19-23.  A value that falls in none of
    them yields ``None``.
    """
    hour_ranges = ((0, 6), (7, 12), (13, 18), (19, 23))
    for segment, (first_hour, last_hour) in enumerate(hour_ranges, start=1):
        if first_hour <= x <= last_hour:
            return segment
    return None

# Derive the segment feature from the hour column, one value at a time
train_df['hour_seg'] = train_df['hour'].apply(getSeg)

In [None]:
# Cross two categorical features into one combined feature (to be counted later)
first_feature = ['app_cate_id', 'f_channel', 'app_id']
# A comma was missing after 'creative_id', so Python silently fused it with
# 'carrier' into the single name 'creative_idcarrier'.
second_feature = ['make', 'model', 'osv1', 'osv2', 'osv3', 'adid', 'advert_name', 'creative_id',
                  'carrier', 'nnt', 'devtype', 'os']
cross_feature = []
for feat_1 in first_feature:
    for feat_2 in second_feature:
        # Use the variable, not the literal 'feat_2': with the literal every
        # pair for a given feat_1 wrote to the same column.
        col_name = 'cross_' + feat_1 + '_and_' + feat_2
        cross_feature.append(col_name)
        data[col_name] = data[feat_1].astype(str).values + '_' + data[feat_2].astype(str).values

# Count features over the crossed columns
# ...

# 求count计数特征
# ...

In [None]:
# Build nunique features for categorical columns (e.g. how many distinct
# ad ids each advertiser id has)
# 'moedel' was a typo for 'model'.
# NOTE(review): 'dtype' may be meant as 'devtype' (the name used in the
# cross-feature list) -- confirm against the real column names.
adid_nuq = ['model', 'make', 'os', 'city', 'province', 'user_tags', 'f_channel', 'app_id', 'carrier', 'nnt', 'dtype', 'app_cate_id', 'inner_slot_id']
for feat in adid_nuq:
    # nunique() yields the distinct-value count; unique() produced arrays of values
    gp1 = data.groupby('adid')[feat].nunique().reset_index().rename(columns={feat:'adid_%snuq_num' % feat})
    gp2 = data.groupby(feat)['adid'].nunique().reset_index().rename(columns={'adid': '%s_adid_nuq_num' % feat})
    data = pd.merge(data, gp1, how='left', on=['adid'])
    # keyword was misspelled `hpw`, which raised TypeError
    data = pd.merge(data, gp2, how='left', on=[feat])

# 减小内存

In [None]:
# Memory reduction, adapted from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Down-cast the numeric columns of ``df`` in place to save memory.

    Signed and unsigned integer columns move to the narrowest integer type
    of the same signedness that holds their min and max.  Float columns move
    to float16 or float32 when their range fits strictly inside that type
    (which can cost precision), else float64.  Every other column (object,
    bool, datetime, category, pandas extension dtypes) is left untouched.

    Args:
        df: DataFrame to shrink; it is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    narrow_float_types = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy integer/float columns are candidates.  Unsigned and
        # bool columns used to fall into the float branch and were cast to
        # float16; other non-numeric dtypes raised on the range comparison.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            target = np.float64
            for candidate in narrow_float_types:
                limits = np.finfo(candidate)
                if limits.min < c_min and c_max < limits.max:
                    target = candidate
                    break
        else:
            # None means "no integer type fits", e.g. an empty column whose
            # min/max are NaN; the column is then left as it is.
            target = None
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column peaking at exactly 127 fits int8
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero starting size
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# New features: percentile rank, within each match, of every feature's per-team mean
def rank_by_team(df):
    """Return ``df`` left-merged with ``<feature>_mean_rank`` columns.

    Every column other than the id/label columns is averaged per
    (matchId, groupId) team, then ranked as a percentile among the teams of
    the same match.
    """
    group_keys = ['matchId', 'groupId']
    excluded = ['Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc']

    feature_names = []
    for column in df.columns:
        if column not in excluded:
            feature_names.append(column)

    team_means = df.groupby(group_keys)[feature_names].mean()
    team_ranks = team_means.groupby('matchId').rank(pct=True)
    merged = df.merge(team_ranks, how='left', on=group_keys, suffixes=['', '_mean_rank'])
    return merged

In [None]:
# Produce the feature matrix and the matching feature names
def feature_creation(df):
    """Return ``(X, features_name)`` for ``df``.

    ``rank_by_team`` adds the rank features, the id/label columns are then
    dropped, and what remains is returned as a value array together with
    its column index.
    """
    non_feature_cols = ('Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc')

    ranked = rank_by_team(df)
    kept_names = [name for name in ranked.columns if name not in non_feature_cols]
    kept = ranked[kept_names]

    return kept.values, kept.columns

In [None]:
# Produce the label frame
def label_creation(df):
    """Return the ``winPlacePerc`` column of ``df`` as a one-column DataFrame."""
    label_columns = ['winPlacePerc']
    return df[label_columns]

### 绘制特征重要度

In [None]:
# Plot feature importance
def plot_feature_importance(model, feature_names=None, model_name=None):
    """Draw a bar chart of ``model.feature_importances_`` and save it as PNG.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Names matching the importances, in the same order.
            Defaults to the module-level ``features_name`` (the previous
            behaviour).
        model_name: Tag for the output file name.  Defaults to a
            module-level ``model_name`` if one exists, else to the
            estimator's class name.

    The figure is written to ``feature_importance_<model_name>.png`` in the
    current working directory.
    """
    if feature_names is None:
        feature_names = features_name
    if model_name is None:
        # The global of this name is not defined anywhere in the notebook, so
        # fall back to the class name instead of raising NameError.
        model_name = globals().get('model_name', type(model).__name__)

    feature_importance = pd.DataFrame({
        'feature': list(feature_names),
        'importance': model.feature_importances_,
    })
    feature_importance = feature_importance.sort_values(by='importance', ascending=False)

    plt.figure(figsize=(10, 10))
    plt.title('Feature Importance')
    sns.barplot(data=feature_importance, y='feature', x='importance')
    plt.tight_layout()
    plt.savefig('feature_importance_{}.png'.format(model_name))

In [None]:
# Feature selection by chi-squared test: keep the top 95% of features
# SelectPercentile and chi2 were used without ever being imported
from sklearn.feature_selection import SelectPercentile, chi2

SKB = SelectPercentile(chi2, percentile=95).fit(train_new, train_y)
train_new = SKB.transform(train_new)
test_new = SKB.transform(test_new)

In [None]:
# Build the training frame
def create_train_df(begin, max_rows=1000000):
    """Load ``../input/train_V2.csv``, shrink dtypes and keep a sorted head.

    Args:
        begin: Switch for the load.  When truthy the CSV is read.
        max_rows: Number of rows kept after sorting by
            ``['matchId', 'groupId']``.  Defaults to the previously
            hard-coded 1,000,000.

    Returns:
        The reduced training DataFrame, or ``None`` when ``begin`` is falsy.
        (A falsy ``begin`` used to reach ``return train_df`` with the name
        unbound and raised ``UnboundLocalError``.)
    """
    if not begin:
        return None

    train_df = reduce_mem_usage(pd.read_csv('../input/train_V2.csv'))
    train_df = train_df.sort_values(by=['matchId', 'groupId']).iloc[:max_rows, :]
    # Release the wider column arrays that were replaced during down-casting
    gc.collect()
    return train_df

In [None]:
# Read the full train and test CSV files
def read_data(begin):
    """Return ``(train_df, test_df)`` read from ``../input``.

    Args:
        begin: Switch for the load.  When falsy nothing is read.

    Returns:
        The two DataFrames, or ``(None, None)`` when ``begin`` is falsy.
        (The falsy path used to print its message and then raise
        ``UnboundLocalError`` on the return statement.)
    """
    if not begin:
        print('未读取数据')
        return None, None

    train_df = pd.read_csv('../input/train_V2.csv')
    test_df = pd.read_csv('../input/test_V2.csv')
    return train_df, test_df

In [None]:
# Draw the feature-correlation heat map
drop_features = ['Id', 'groupId', 'matchId']
feats = [f for f in train_df.columns if f not in drop_features]
plt.figure(figsize=(18, 16))
# Correlate numeric columns only: current pandas raises on non-numeric
# columns instead of skipping them silently as older releases did.
corr_matrix = train_df[feats].select_dtypes(include='number').corr()
sns.heatmap(corr_matrix, vmax=1.0, annot=True, square=True, linewidths=0.1, linecolor='black', cmap='RdBu')
plt.show()

In [None]:
# Build the model
import lightgbm as lgb  # `lgb` was used below without ever being imported

params = {
    'num_leaves': 144,
    'learning_rate': 0.1,
    'n_estimators': 800,
    'max_depth':12,
    'max_bin':55,
    'bagging_fraction':0.8,
    'bagging_freq':5,
    'feature_fraction':0.9,
    'verbose':50, 
    'early_stopping_rounds':100
    }

# LightGBM parameters.  As before, 'verbose' and 'early_stopping_rounds'
# stay in `params` but are not handed to the constructor.
model_keys = ('num_leaves', 'learning_rate', 'n_estimators', 'max_depth',
              'max_bin', 'bagging_fraction', 'bagging_freq', 'feature_fraction')
model = lgb.LGBMRegressor(**{key: params[key] for key in model_keys})

In [None]:
# Persist the model, then release the training arrays
model_path = 'new_model_{}.pkl'.format(model_name)
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

del X_train, X_valid, y_train, y_valid
gc.collect()

In [None]:
# Model parameters and 5-fold training
import lightgbm as lgb  # `lgb` was used without ever being imported
from sklearn.model_selection import StratifiedKFold

n_folds = 5
# `lgb..LGBMClassifier` (double dot) was a SyntaxError
lgb_clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=48, max_depth=-1, learning_rate=0.02, n_estimators=6000, max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0, min_child_weight=5, min_child_samples=10, subsample=0.8, subsample_freq=1, colsample_bytree=0.8, reg_alpha=3, reg_lambda=0.1, seed=1000, n_jobs=-1, silent=True)
# model_selection API: the splitter is configured first and then asked to
# split the data (the old call passed the labels to the constructor).
skf = list(StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1024).split(X_train, y_train))
baseloss = [] 
loss = 0 
# NOTE(review): `res` (the frame collecting per-fold test predictions) must
# already exist; it is not created anywhere in this notebook.
for i, (train_index, test_index) in enumerate(skf):
    print('Fold', i)
    # `early_stopping_rounds` had been typed inside the eval_set list (a
    # SyntaxError); it is a keyword argument of fit().
    # NOTE(review): newer LightGBM releases may expect early stopping to be
    # configured through `callbacks` instead -- confirm for the installed version.
    lgb_model = lgb_clf.fit(X_train[train_index], y_train[train_index],
                            eval_names=['train', 'valid'],
                            eval_metric='logloss',
                            eval_set=[(X_train[train_index], y_train[train_index]),
                                      (X_train[test_index], y_train[test_index])],
                            early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['binary_logloss'])
    loss += lgb_model.best_score_['valid']['binary_logloss']
    test_pred = lgb_model.predict_proba(X_test, num_iteration=lgb_model.best_iteration_)[:, 1]
    print('test mean:', test_pred.mean())
    res['prob_%s' % str(i)] = test_pred 
# Average over the actual number of folds rather than a literal 5
print('logloss:', baseloss, loss/n_folds)

In [None]:
# Predict and write the submission file
# NOTE(review): `begin`, `model`, `model_name` and `drop_na` must be defined
# earlier in the session; `drop_na` has no definition in this notebook.

# Load the test data
test_df = create_test_df(begin)
#train_df, test_df = genarate_sample_data(begin)
# Transform the data and predict
test_df = drop_na(test_df)
X_test, features_name = feature_creation(test_df)
y_pred = model.predict(X_test)

# Assemble the result positionally.  An index-aligned pd.concat misaligns ids
# and predictions whenever test_df's index is no longer 0..n-1.
submission = pd.DataFrame({
    'Id': test_df['Id'].values,
    'winPlacePerc': np.ravel(y_pred),
})
# Cap predictions at 1.  The old line used `== 1`, a comparison whose result
# was discarded, so nothing was ever capped.
submission['winPlacePerc'] = submission['winPlacePerc'].clip(upper=1)
submission.to_csv('../output/submission_{}.csv'.format(model_name), index=False)
print('产出结果成功')

In [None]:
# Hold out one fold for validation.
# (Plain KFold does not stratify, despite the original heading of this cell.)
# sklearn.cross_validation no longer exists; KFold lives in model_selection,
# where the splitter is configured first and then asked to split the data.
from sklearn.model_selection import KFold
eval_size = 0.10 
kf = KFold(n_splits=round(1. / eval_size))
# Take the first train/validation split only
train_indices, valid_indices = next(iter(kf.split(X)))
X_train, y_train = X[train_indices], y[train_indices]
X_valid, y_valid = X[valid_indices], y[valid_indices]

In [None]:
# Build toy data for a regression problem
import numpy as np 
np.random.seed(1337)

from keras.models import Sequential
from keras.layers import Dense 
import matplotlib.pyplot as plt 

# 200 evenly spaced inputs in [-1, 1], shuffled in place
X = np.linspace(-1, 1, 200)
np.random.shuffle(X)
# Targets are 1 * X + 2 plus Gaussian noise; the closer the trained weight
# gets to 1 and the bias to 2, the better the fit
noise = np.random.normal(0, 0.05, (200,))
y = 1 * X + 2 + noise

# Scatter plot of the generated data
plt.scatter(X, y)
plt.show()

# Keras搭建DNN解决回归类问题模板

In [None]:
# Keras DNN regression template: learn y = sin(x) + noise
from keras.models import Sequential
from keras.layers import Dense, Activation, LeakyReLU
from keras.optimizers import SGD
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
# SGD and LeakyReLU are not used by the network below; they are imported for
# the alternative optimiser / activation variants of this template.

# part 1: training data
# 1000 evenly spaced points in [-2*pi, 2*pi], as a (1000, 1) column
x_train_raw = np.linspace(-2 * np.pi, 2 * np.pi, 1000).reshape(-1, 1)
# uniform noise in [0, 0.1), shape (1000, 1)
n = 0.1 * np.random.rand(len(x_train_raw), 1)
y_train = np.sin(x_train_raw) + n

# Standardise the inputs only, with ONE scaler fitted on the raw training
# inputs and reused for every other input set.  The old code fitted the
# scaler on already-scaled x and then applied it to the targets.
scaler = preprocessing.StandardScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)

# part 2: test data
x_test_raw = np.linspace(-5, 5, 2000).reshape(-1, 1)
y_test = np.sin(x_test_raw)
x_test = scaler.transform(x_test_raw)
# plot the test curve against the unscaled inputs
fig, ax = plt.subplots()
ax.plot(x_test_raw, y_test, 'g')

# prediction inputs; the reference curve is computed before scaling
x_prd_raw = np.linspace(-3, 3, 101).reshape(-1, 1)
y_prd = np.sin(x_prd_raw)
x_prd = scaler.transform(x_prd_raw)
fig, ax = plt.subplots()
ax.plot(x_prd_raw, y_prd, 'r')

# Network: 1 -> 100 -> 50 -> 1
model = Sequential()
# `kernel_initializer` is the Keras 2 spelling of the old `init` argument
model.add(Dense(100, kernel_initializer='uniform', input_dim=1))
model.add(Activation('relu'))

model.add(Dense(50))
model.add(Activation('relu'))

model.add(Dense(1))
model.add(Activation('tanh'))

# Accuracy is a classification metric; report mean absolute error instead
model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])

# Train on the TRAINING data (the old call fitted on x_test / y_test);
# `epochs` is the Keras 2 spelling of the old `nb_epoch` argument.
hist = model.fit(x_train, y_train, batch_size=10, epochs=100, shuffle=True, verbose=0, validation_split=0.2)
# Evaluate on data the model has not been fitted on
score = model.evaluate(x_test, y_test, batch_size=10)

# Predict
y_pred = model.predict(x_prd, batch_size=1)

y_pred

In [None]:
# DNN baseline2
# 导入库文件
import pandas as pd 
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from keras.layers.advanced_activations import LeakyReLU
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pickle
import numpy as np
import warnings 
from warnings import filterwarnings
filterwarnings('ignore')

begin = True 

# Load the pre-sampled train/test frames from disk
def genarate_sample_data(begin):
    """Return the pickled ``(train_df, test_df)`` pair.

    Reads ``train_df.pkl`` and then ``test_df.pkl`` from the current
    working directory.  ``begin`` is accepted but never consulted.
    """
    loaded = []
    for pickle_path in ('train_df.pkl', 'test_df.pkl'):
        with open(pickle_path, 'rb') as handle:
            loaded.append(pickle.load(handle))

    train_df, test_df = loaded
    return train_df, test_df

train_df, test_df = genarate_sample_data(begin)

# Identifier / categorical columns that are not fed to the network
id_cols = ['Id', 'groupId', 'matchId', 'matchType']
train_df = train_df.drop(columns=id_cols)
test_df = test_df.drop(columns=id_cols)

train_df.head()

# Work on the first 1000 rows only
X = train_df.drop(columns=['winPlacePerc']).iloc[:1000, :]
y = train_df[['winPlacePerc']].iloc[:1000,:]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

X_train.shape

X_test = test_df.iloc[:1000,:]

X_test.shape

# Build the model.  The input width comes from the data instead of the
# hard-coded 24, so the cell keeps working when the feature set changes.
model = Sequential()
model.add(Dense(units=100, activation='tanh', input_dim=X_train.shape[1]))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(loss='mean_absolute_error', optimizer='rmsprop')

model.fit(X_train, y_train, batch_size=8, epochs=10)

# Training-set error
model.evaluate(X_train, y_train, batch_size=8)

# Validation-set error
model.evaluate(X_valid, y_valid, batch_size=8)

# Compare validation labels with predictions, side by side.  Both frames get
# a fresh 0..n-1 index so that concat lines them up row by row.
y_pred = pd.DataFrame(model.predict(x=X_valid, batch_size=3))
y_valid = y_valid.reset_index(drop=True)
df = pd.concat([y_valid, y_pred], axis=1)
# column label was misspelled 'Perdiction'
df.columns = ['winPlacePerc', 'Prediction']
df.head()

# Predict on the test data
y_pred = model.predict(X_test)
df = pd.DataFrame(y_pred)
df.columns = ['prediction']
df.head()

# 模型融合

In [None]:
#https://www.leiphone.com/news/201709/zYIOJqMzR0mJARzj.html

# Manual stacking
# Out-of-Fold Predictions 
from sklearn.model_selection import KFold  # was used without an import

ntrain = train.shape[0]  # number of training rows
ntest = test.shape[0]    # number of test rows
# No random_state: without shuffle=True it never affected the splits, and
# current scikit-learn rejects the combination.
kf = KFold(n_splits=5)

def get_ood(clf, X_train, y_train, X_test, cv=None):
    """Compute out-of-fold predictions for stacking.

    For each fold the model is fitted on the fold's training rows, predicts
    the fold's held-out rows (filling one slice of the out-of-fold vector)
    and predicts the whole test set.  The per-fold test predictions are
    averaged.

    Args:
        clf: Model with ``predict`` and either ``train`` or ``fit``.
        X_train: Training features, indexable by an integer array.
        y_train: Training targets, indexable by an integer array.
        X_test: Test features.
        cv: Splitter with a ``split(X)`` method.  Defaults to the
            module-level ``kf`` (the previous behaviour).

    Returns:
        ``(oof_train, oof_test)`` as column vectors of shape
        ``(n_train, 1)`` and ``(n_test, 1)``.
    """
    splitter = kf if cv is None else cv
    folds = list(splitter.split(X_train))

    # Sizes come from the arguments themselves rather than from the globals
    # ntrain / ntest, and the fold count from the splitter rather than a
    # literal 5, so the function works for any data and any number of folds.
    n_train = X_train.shape[0]
    n_test = X_test.shape[0]
    oof_train = np.zeros((n_train,))
    oof_test_skf = np.empty((len(folds), n_test))

    # Wrapper classes expose `train`; plain scikit-learn estimators expose `fit`
    fit = getattr(clf, 'train', None) or clf.fit

    for i, (train_index, test_index) in enumerate(folds):
        fit(X_train[train_index], y_train[train_index])

        oof_train[test_index] = clf.predict(X_train[test_index])
        oof_test_skf[i, :] = clf.predict(X_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
from sklearn import datasets

iris = datasets.load_iris()
# Columns 1 and 2 of the feature matrix, plus the class labels
X, y = iris.data[:, 1:3], iris.target

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import numpy as np

# Three base classifiers, stacked under a logistic-regression meta-classifier
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

print('3-fold cross validation:\n')

# Score each base model and the stacked model with the same 3-fold CV
models = [clf1, clf2, clf3, sclf]
labels = ['KNN', 'Random Forest', 'Naive Bayes', 'StackingClassifier']
for clf, label in zip(models, labels):
    scores = model_selection.cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    summary = (scores.mean(), scores.std(), label)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % summary)

In [None]:
'''5折stacking'''
# 5-fold stacking.  Expects `clfs` (list of classifiers), `X`, `y` and
# `X_predict` (features to predict) to be defined already.
from sklearn.model_selection import StratifiedKFold

n_folds = 5
# model_selection API: configure the splitter, then split the data
skf = list(StratifiedKFold(n_splits=n_folds).split(X, y))
# One column of meta-features per base model.  These matrices were written
# to below without being allocated with a matching shape first.
dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))
for j, clf in enumerate(clfs):
    '''依次训练各个单模型'''
    # Train each base model in turn
    dataset_blend_test_j = np.zeros((X_predict.shape[0], len(skf)))
    for i, (train, test) in enumerate(skf):
        '''使用第i个部分作为预测，剩余的部分来训练模型，获得其预测的输出作为第i部分的新特征。'''
        # Fit on every fold except i; the predictions for fold i become that
        # fold's new feature
        X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
        clf.fit(X_train, y_train)
        y_submission = clf.predict_proba(X_test)[:, 1]
        dataset_blend_train[test, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1]
    '''对于测试集，直接用这k个模型的预测值均值作为新的特征。'''
    # For the prediction set, average the k fold-models' outputs
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

'''融合使用的模型'''

In [None]:
'''切分训练数据集为d1,d2两部分'''
# Blending: split the training data into halves d1 and d2.  Expects `clfs`,
# `X`, `y`, `X_predict` and `y_predict` (labels for X_predict) to exist.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

X_d1, X_d2, y_d1, y_d2 = train_test_split(X, y, test_size=0.5, random_state=2017)
dataset_blend_train = np.zeros((X_d2.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))

for j, clf in enumerate(clfs):
    '''依次训练各个单模型'''
    # print(j, clf)
    '''使用第1个部分作为预测，第2部分来训练模型，获得其预测的输出作为第2部分的新特征。'''
    # Fit each base model on d1; its predictions for d2 become d2's new
    # feature.  The old code fitted and predicted on X_train / X_test, names
    # left over from another cell whose row count does not match d2.
    clf.fit(X_d1, y_d1)
    y_submission = clf.predict_proba(X_d2)[:, 1]
    dataset_blend_train[:, j] = y_submission
    '''对于测试集，直接用这k个模型的预测值作为新的特征。'''
    # For the prediction set, use each model's output directly as a feature
    dataset_blend_test[:, j] = clf.predict_proba(X_predict)[:, 1]
    print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j]))

'''融合使用的模型'''
# clf = LogisticRegression()
clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30)
# The meta-features were built from d2, so the matching labels are y_d2
# (the old code passed y_test)
clf.fit(dataset_blend_train, y_d2)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

# MLXTEND模型融合

In [None]:
# [Translation] MLXTEND StackingRegressor
# https://www.jianshu.com/p/cc748e4f29c5?from=timeline

# Stacking regression (without cross-validation)

from mlxtend.regressor import StackingRegressor
from mlxtend.data import boston_housing_data 
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Ridge 
from sklearn.svm import SVR 
import matplotlib.pyplot as plt 
import numpy as np 

# Generate a sample dataset: a sine curve with every fifth target perturbed
np.random.seed(1)
X = np.sort(5 * np.random.rand(40, 1), axis=0)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - np.random.rand(8))

# Initialise the models
lr = LinearRegression() 
svr_lin = SVR(kernel='linear')
ridge = Ridge(random_state=1)
svr_rbf = SVR(kernel='rbf')

# Stack three base regressors under an RBF-SVR meta-regressor
model = StackingRegressor(regressors=[lr, svr_lin, ridge], meta_regressor=svr_rbf)

# Train the stacked regressor
model.fit(X, y)
model.predict(X)

# Evaluate and visualise the fit
print("Mean Squared Error: %.4f" % np.mean((model.predict(X) - y) ** 2))
print('Variance Score: %.4f' % model.score(X, y))
# Newer matplotlib renamed the seaborn styles; use whichever name exists here
grid_style = 'seaborn-whitegrid' if 'seaborn-whitegrid' in plt.style.available else 'seaborn-v0_8-whitegrid'
with plt.style.context(grid_style):
    plt.scatter(X, y, c='lightgray')
    # was `stregr.predict(X)`: `stregr` is undefined, the fitted object is `model`
    plt.plot(X, model.predict(X), c='darkgreen', lw=2)
plt.show()

In [None]:
# MLXTEND stacking regression with grid search (without cross-validated stacking)
# Reuses X, y, np, plt, LinearRegression, Ridge, SVR and StackingRegressor
# from the previous cell.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

# Initialise the models
lr = LinearRegression()
svr_lin = SVR(kernel='linear')
ridge = Ridge(random_state=1)
lasso = Lasso(random_state=1)
svr_rbf = SVR(kernel='rbf')
model = StackingRegressor(regressors=[svr_lin, lr, ridge, lasso], meta_regressor=svr_rbf)

params = {
    'lasso__alpha':[0.1, 1],
    'ridge__alpha':[0.1, 1],
    'svr__C':[0.1, 1],
    'meta-svr__C':[0.1, 1],
    'meta-svr__gamma':[0.1, 1]
}

grid = GridSearchCV(estimator=model, 
                    param_grid=params,
                    cv=5, 
                    refit = True)

grid.fit(X, y)
# `grid_scores_` no longer exists; the per-candidate results live in
# `cv_results_`.  The loop variable is no longer called `params`, which used
# to overwrite the grid definition above.
results = grid.cv_results_
for mean_score, std_score, candidate in zip(results['mean_test_score'],
                                            results['std_test_score'],
                                            results['params']):
    print('%0.3f +/- %0.2f %r' % (mean_score, std_score / 2.0, candidate))
    
# Evaluate and visualise the fit
print("Mean Squared Error: %.4f"
% np.mean((grid.predict(X) - y) ** 2))
print('Variance Score: %.4f' % grid.score(X, y))
# Newer matplotlib renamed the seaborn styles; use whichever name exists here
grid_style = 'seaborn-whitegrid' if 'seaborn-whitegrid' in plt.style.available else 'seaborn-v0_8-whitegrid'
with plt.style.context(grid_style):
    plt.scatter(X, y, c='lightgray')
    plt.plot(X, grid.predict(X), c='darkgreen', lw=2)
plt.show() 

In [None]:
# Stacking regression with built-in cross-validation (StackingCVRegressor)
from mlxtend.regressor import StackingCVRegressor
from sklearn.datasets import load_boston
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
RANDOM_SEED = 42
# NOTE(review): load_boston is not shipped by recent scikit-learn releases;
# confirm the installed version or substitute another regression dataset.
X, y = load_boston(return_X_y=True)
svr = SVR(kernel='linear')
lasso = Lasso()
rf = RandomForestRegressor(n_estimators=5,
                           random_state=RANDOM_SEED)
# The StackingCVRegressor uses scikit-learn's check_cv
# internally, which doesn't support a random seed. Thus
# NumPy's random seed needs to be specified explicitly for
# deterministic behavior
np.random.seed(RANDOM_SEED)
stack = StackingCVRegressor(regressors=(svr, lasso, rf),
                            meta_regressor=lasso)
print('5-fold cross validation scores:\n')
# The loop bodies in this cell had lost their indentation (IndentationError)
for clf, label in zip([svr, lasso, rf, stack], ['SVM', 'Lasso','Random Forest','StackingClassifier']):
    scores = cross_val_score(clf, X, y, cv=5)
    print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
# Sample output:
# 5-fold cross validation scores:
# R^2 Score: 0.45 (+/- 0.29) [SVM]
# R^2 Score: 0.43 (+/- 0.14) [Lasso]
# R^2 Score: 0.52 (+/- 0.28) [Random Forest]
# R^2 Score: 0.58 (+/- 0.24) [StackingClassifier]

# Same comparison, scored by negative mean squared error.  Re-seed first,
# for the reason given above.
np.random.seed(RANDOM_SEED)
stack = StackingCVRegressor(regressors=(svr, lasso, rf),
                            meta_regressor=lasso)
print('5-fold cross validation scores:\n')
for clf, label in zip([svr, lasso, rf, stack], ['SVM', 'Lasso','Random Forest','StackingClassifier']):
    scores = cross_val_score(clf, X, y, cv=5, scoring='neg_mean_squared_error')
    # This statement was cut off after the format string; completed to
    # mirror the R^2 report above.
    print("Neg. MSE Score: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [None]:
# StackingCVRegressor with grid-search parameter tuning
from mlxtend.regressor import StackingCVRegressor
from sklearn.datasets import load_boston
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np

# Defined here as well so the cell does not depend on the previous one
RANDOM_SEED = 42
# NOTE(review): load_boston is not shipped by recent scikit-learn releases;
# confirm the installed version or substitute another regression dataset.
X, y = load_boston(return_X_y=True)
ridge = Ridge()
lasso = Lasso()
rf = RandomForestRegressor(random_state=RANDOM_SEED)
# The StackingCVRegressor uses scikit-learn's check_cv
# internally, which doesn't support a random seed. Thus
# NumPy's random seed needs to be specified explicitly for
# deterministic behavior
np.random.seed(RANDOM_SEED)


stack = StackingCVRegressor(regressors=(lasso, ridge),
                            meta_regressor=rf,
                            use_features_in_secondary=True)
# Coarse grid, kept for reference; the search below uses the finer grid
params = {'lasso__alpha': [0.1, 1.0, 10.0],
          'ridge__alpha': [0.1, 1.0, 10.0]}

fine_grid = {
    'lasso__alpha': [x/5.0 for x in range(1, 10)],
    'ridge__alpha': [x/20.0 for x in range(1, 10)],
    'meta-randomforestregressor__n_estimators': [10,100],
}
grid = GridSearchCV(
    estimator=stack,
    param_grid=fine_grid,
    cv=5,
    refit=True
)

grid.fit(X, y)

print("Best: %f using %s" % (grid.best_score_, grid.best_params_))

# Sample output (truncated in the source this cell was copied from):
#Best: 0.673590 using {'lasso__alpha': 0.4, 'meta-randomforestregressor__n_estimators': 10, 'ridge__alpha

# Print the first candidates only
cv_keys = ('mean_test_score', 'std_test_score', 'params')
for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_[cv_keys[0]][r],
          grid.cv_results_[cv_keys[1]][r] / 2.0,
          grid.cv_results_[cv_keys[2]][r]))
    if r > 10:
        # `break` had lost its indentation (IndentationError)
        break
print('...')

print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)


# NLP 

In [None]:
# Word segmentation
import jieba  # the bare `jieba` name was used without ever being imported

print(jieba.lcut("我在网易云课堂学习自然语言处理"))
print(jieba.lcut_for_search("小明硕士毕业于中国科学院计算所，后在斯坦福大学深造"))

In [None]:
# Add a custom segmentation rule for the pair ('中', '将'), then segment a
# sentence containing it with the HMM switched off
jieba.suggest_freq(('中', '将'), True)
tokens = jieba.cut('如果放到旧字典中将出错。', HMM=False)
print('/'.join(tokens))

In [None]:
# Part-of-speech tagging: print every token with its tag
import jieba.posseg as pseg
words = pseg.cut("我在网易云课堂学习自然语言处理")
for token in words:
    word, flag = token
    print('%s %s' % (word, flag))

In [None]:
# Keyword extraction based on TF-IDF
import jieba.analyse as analyse
# Read inside a `with` block so the file handle is closed; the bare
# open(...).read() never closed it.
with open('data/NBA.txt', 'r', encoding='UTF-8') as text_file:
    lines = text_file.read()
print("  ".join(analyse.extract_tags(lines, topK=20, withWeight=False, allowPOS=())))

In [None]:
# Keyword extraction based on TextRank
import jieba.analyse as analyse
# Read inside a `with` block so the file handle is closed; the bare
# open(...).read() never closed it.
with open('data/NBA.txt', 'r', encoding='UTF-8') as text_file:
    lines = text_file.read()
print("  ".join(analyse.textrank(lines, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))))

In [None]:
# Read the text data row by row
import jieba  # the bare `jieba` name is not imported anywhere else

# Load with pandas, dropping rows that contain missing values
df = pd.read_csv("./data/entertainment_news.csv", encoding='utf-8').dropna()
# Convert the content column to a list
content=df["content"].values.tolist()
# Segment every row and collect the tokens
segment=[]
for line in content:
    try:
        segs=jieba.lcut(line)
    except Exception:
        # Best effort: show the row that could not be segmented and carry on.
        # `except Exception` (not a bare `except:`) lets Ctrl-C through.
        print(line)
        continue
    # Keep tokens longer than one character, skipping the CRLF line break
    segment.extend(seg for seg in segs if len(seg)>1 and seg!='\r\n')

In [None]:
# Remove stop words
words_df = pd.DataFrame({'segment': segment})
stopwords = pd.read_csv(
    "data/stopwords.txt",
    index_col=False,
    quoting=3,  # 3 means "quote nothing": quote characters are ordinary text
    sep="\t",
    names=['stopword'],
    encoding='utf-8',
)
is_stopword = words_df.segment.isin(stopwords.stopword)
words_df = words_df[~is_stopword]

In [None]:
# Word-frequency statistics
# size() counts the rows of each group.  The old `.agg({"计数": numpy.size})`
# used a name that is never bound (numpy is imported as `np`) and a
# dict-renaming form of agg that current pandas rejects.
words_stat = words_df.groupby(by=['segment']).size().reset_index(name="计数")
words_stat = words_stat.sort_values(by=["计数"], ascending=False)
words_stat.head()

In [None]:
# Build the word cloud
from wordcloud import WordCloud  # was only imported by a later cell

# plt.rcParams is the same settings object as matplotlib.rcParams; the bare
# `matplotlib` name is never bound in this notebook.
plt.rcParams['figure.figsize'] = (10.0, 6.0)
wordcloud=WordCloud(font_path="data/simhei.ttf",background_color="black",max_font_size=80)
# word -> count for the first 1000 rows of the frequency table
word_frequence = {x[0]:x[1] for x in words_stat.head(1000).values}
wordcloud=wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)

In [None]:
# Word cloud with a custom background image
from wordcloud import WordCloud,ImageColorGenerator

# plt.rcParams is the same settings object as matplotlib.rcParams; the bare
# `matplotlib` name is never bound in this notebook.
plt.rcParams['figure.figsize'] = (15.0, 15.0)
# scipy.misc.imread no longer exists in current SciPy; matplotlib's reader
# is already imported.
# NOTE(review): reading JPEG through matplotlib relies on Pillow being installed.
bimg=plt.imread('image/entertainment.jpeg')
wordcloud=WordCloud(background_color="white",mask=bimg,font_path='data/simhei.ttf',max_font_size=200)
# word -> count for the first 1000 rows of the frequency table
word_frequence = {x[0]:x[1] for x in words_stat.head(1000).values}
wordcloud=wordcloud.fit_words(word_frequence)
# Recolour the words from the image's own colours
bimgColors=ImageColorGenerator(bimg)
plt.axis("off")
plt.imshow(wordcloud.recolor(color_func=bimgColors))

In [None]:
# 在 Jupyter Notebook 的 Markdown 单元格中显示图片（下面是 HTML，不是 Python 代码）
<img src="./image/LSTM.png" width="500" height="40" align="center">