In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC, LinearSVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, log_loss
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.calibration import CalibratedClassifierCV
from sklearn.externals import joblib
import pickle
import jieba

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
import pandas as pd
import os
import sys
import logging

# Send log records of level DEBUG and above to stdout, each prefixed with an
# HH:MM:SS timestamp and its level name.
logging.basicConfig(
    stream=sys.stdout,
    datefmt="%H:%M:%S",
    format="[%(asctime)s] %(levelname)s %(message)s",
    level=logging.DEBUG,
)
# Logger named after the current module; the Ensembler class reports progress through it.
logger = logging.getLogger(__name__)


class Ensembler(object):
    """
    Multi-level stacking ensembler.

    ``model_dict`` maps a level number (0, 1, ...) to the list of models of that level.
    Level-0 models are trained on caller-supplied features; the models of every deeper
    level are trained on the out-of-fold predictions of the level directly below.
    Models must provide ``fit`` and ``predict_proba`` (classification) or ``predict``
    (any other task type).
    """

    def __init__(self, model_dict, num_folds=3, task_type='classification', optimize=roc_auc_score,
                 lower_is_better=False, save_path=None):
        """
        Ensembler init function
        :param model_dict: dict mapping level -> list of models; levels are looked up as
                           0 .. len(model_dict) - 1
        :param num_folds: number of folds used for ensembling
        :param task_type: 'classification'; any other value is handled as regression
        :param optimize: metric such as AUC, logloss or F1; called as optimize(y_true, y_pred)
        :param lower_is_better: whether a lower metric value is better (stored only; this
                                class never reads it)
        :param save_path: directory where per-level prediction CSVs are written;
                          None disables saving
        """

        self.model_dict = model_dict
        self.levels = len(self.model_dict)
        self.num_folds = num_folds
        self.task_type = task_type
        self.optimize = optimize
        self.lower_is_better = lower_is_better
        self.save_path = save_path

        self.training_data = None
        self.test_data = None
        self.y = None
        self.lbl_enc = None
        self.y_enc = None
        self.train_prediction_dict = None
        self.test_prediction_dict = None
        self.num_classes = None

    def _score_predictions(self, y_true, predictions):
        """
        Evaluate ``self.optimize`` on one fold.

        Metrics such as ``roc_auc_score`` reject the two-column probability matrix of a
        binary problem with a ValueError; in exactly that situation the metric is retried
        with the positive-class column only. Any other ValueError propagates unchanged.
        """
        try:
            return self.optimize(y_true, predictions)
        except ValueError:
            is_binary_proba = (self.task_type == 'classification'
                               and self.num_classes == 2
                               and np.ndim(predictions) == 2)
            if not is_binary_proba:
                raise
            # Labels are encoded as 0/1, so column 1 is the probability of class 1.
            return self.optimize(y_true, predictions[:, 1])

    def _save_predictions(self, predictions, file_prefix, level):
        """Write one level's prediction matrix to ``save_path`` as a header-less CSV."""
        target = os.path.join(self.save_path, file_prefix + str(level) + ".csv")
        pd.DataFrame(predictions).to_csv(target, index=False, header=False)

    def fit(self, training_data, y, lentrain=None):
        """
        Train every model of every level with K-fold cross-validation and collect the
        out-of-fold predictions.

        :param training_data: dict whose entry 0 is a list holding one feature set per
                              level-0 model (same order as model_dict[0]); each feature set
                              must support indexing with an integer index array, or be a
                              list of such objects
        :param y: targets (binary, multi-class or regression)
        :param lentrain: number of training rows; defaults to len(y)
        :return: dict level -> array of out-of-fold predictions with one column block per
                 model (num_classes columns for classification, 1 column otherwise)

        """

        self.training_data = training_data
        self.y = y
        if lentrain is None:
            lentrain = len(self.y)

        if self.task_type == 'classification':
            self.num_classes = len(np.unique(self.y))
            logger.info("Found %d classes", self.num_classes)
            self.lbl_enc = LabelEncoder()
            self.y_enc = self.lbl_enc.fit_transform(self.y)
            kf = StratifiedKFold(n_splits=self.num_folds)
            columns_per_model = self.num_classes
        else:
            self.num_classes = -1
            # Fold indices are positional; an ndarray guarantees positional selection even
            # when y is a list or a pandas Series with a non-default index.
            self.y_enc = np.asarray(self.y)
            kf = KFold(n_splits=self.num_folds)
            columns_per_model = 1

        self.train_prediction_dict = {
            level: np.zeros((lentrain, columns_per_model * len(self.model_dict[level])))
            for level in range(self.levels)
        }

        for level in range(self.levels):

            # Level 0 reads the caller's features, deeper levels read the out-of-fold
            # predictions of the previous level.
            if level == 0:
                temp_train = self.training_data
            else:
                temp_train = self.train_prediction_dict[level - 1]

            for model_num, model in enumerate(self.model_dict[level]):
                validation_scores = []
                # The level-0 prediction matrix only supplies the row count to the splitter.
                fold_iter = kf.split(self.train_prediction_dict[0], self.y_enc)
                for foldnum, (train_index, valid_index) in enumerate(fold_iter, start=1):
                    logger.info("Training Level %d Fold # %d. Model # %d", level, foldnum, model_num)

                    if level != 0:
                        l_training_data = temp_train[train_index]
                        l_validation_data = temp_train[valid_index]
                    else:
                        l0_training_data = temp_train[0][model_num]
                        if isinstance(l0_training_data, list):
                            # Multi-input model: slice every input with the same rows.
                            l_training_data = [x[train_index] for x in l0_training_data]
                            l_validation_data = [x[valid_index] for x in l0_training_data]
                        else:
                            l_training_data = l0_training_data[train_index]
                            l_validation_data = l0_training_data[valid_index]
                    model.fit(l_training_data, self.y_enc[train_index])

                    logger.info("Predicting Level %d. Fold # %d. Model # %d", level, foldnum, model_num)

                    if self.task_type == 'classification':
                        temp_train_predictions = model.predict_proba(l_validation_data)
                        first_col = model_num * self.num_classes
                        self.train_prediction_dict[level][
                            valid_index, first_col:first_col + self.num_classes] = temp_train_predictions
                    else:
                        # Flatten so that an (n, 1) prediction fits the single output column.
                        temp_train_predictions = np.ravel(model.predict(l_validation_data))
                        self.train_prediction_dict[level][valid_index, model_num] = temp_train_predictions

                    validation_score = self._score_predictions(self.y_enc[valid_index], temp_train_predictions)
                    validation_scores.append(validation_score)
                    logger.info("Level %d. Fold # %d. Model # %d. Validation Score = %f", level, foldnum, model_num,
                                validation_score)
                avg_score = np.mean(validation_scores)
                std_score = np.std(validation_scores)
                logger.info("Level %d. Model # %d. Mean Score = %f. Std Dev = %f", level, model_num,
                            avg_score, std_score)

            if self.save_path is not None:
                logger.info("Saving predictions for level # %d", level)
                self._save_predictions(self.train_prediction_dict[level], "train_predictions_level_", level)

        return self.train_prediction_dict

    def predict(self, test_data, lentest):
        """
        Refit every model on the full training data stored by ``fit`` and predict the
        test data level by level.

        :param test_data: dict whose entry 0 is a list holding one test feature set per
                          level-0 model (same order as model_dict[0])
        :param lentest: number of test rows
        :return: dict level -> array of test predictions, laid out like the result of fit
        :raises RuntimeError: if called before fit
        """
        if self.train_prediction_dict is None:
            raise RuntimeError("Ensembler.fit() must be called before Ensembler.predict()")

        self.test_data = test_data
        if self.task_type == 'classification':
            columns_per_model = self.num_classes
        else:
            columns_per_model = 1

        self.test_prediction_dict = {
            level: np.zeros((lentest, columns_per_model * len(self.model_dict[level])))
            for level in range(self.levels)
        }

        for level in range(self.levels):
            if level == 0:
                temp_train = self.training_data
                temp_test = self.test_data
            else:
                # Deeper levels train on out-of-fold train predictions and predict from
                # the previous level's test predictions.
                temp_train = self.train_prediction_dict[level - 1]
                temp_test = self.test_prediction_dict[level - 1]

            for model_num, model in enumerate(self.model_dict[level]):

                logger.info("Training Fulldata Level %d. Model # %d", level, model_num)
                if level == 0:
                    model_train = temp_train[0][model_num]
                    model_test = temp_test[0][model_num]
                else:
                    model_train = temp_train
                    model_test = temp_test
                model.fit(model_train, self.y_enc)

                logger.info("Predicting Test Level %d. Model # %d", level, model_num)

                if self.task_type == 'classification':
                    temp_test_predictions = model.predict_proba(model_test)
                    first_col = model_num * self.num_classes
                    self.test_prediction_dict[level][:, first_col:first_col + self.num_classes] = \
                        temp_test_predictions
                else:
                    # Flatten so that an (n, 1) prediction fits the single output column.
                    temp_test_predictions = np.ravel(model.predict(model_test))
                    self.test_prediction_dict[level][:, model_num] = temp_test_predictions

            if self.save_path is not None:
                self._save_predictions(self.test_prediction_dict[level], "test_predictions_level_", level)

        return self.test_prediction_dict

In [5]:
# df_train = pd.read_csv('data/train_after_preprocess.csv')
# Read the raw training CSV, splitting rows on '\n' only, then preview the first rows.
df_train = pd.read_csv("data/train.csv", lineterminator="\n")
df_train.head(5)

Unnamed: 0,ID,review,label
0,1,Jo bhi ap se tou behtar hoon,Negative
1,2,ya Allah meri sister Affia ki madad farma,Positive
2,3,Yeh khud chahta a is umar main shadi krna. ha...,Negative
3,4,Tc ? Apky mun xe exe alfax achy nae lgty 😒💃,Negative
4,5,Good,Positive


In [6]:
# Number of training reviews, followed by the container type of the column.
print(df_train['review'].shape[0])
type(df_train['review'])

6328


pandas.core.series.Series

In [7]:
# df_test = pd.read_csv("data/test_after_preprocess.csv")
# Read the raw test CSV, splitting rows on '\n' only, then preview the first rows.
df_test = pd.read_csv('data/test.csv', lineterminator='\n')
df_test.head(5)

Unnamed: 0,ID,review
0,1,Phr tissuw se saaf
1,2,Jail Road Per Firing Se 1 Shakhs Janbahaq
2,3,mehfil loot li aunty ne
3,4,Rehnay do butt sahb nay galiya boht deni hain
4,5,Zabardast


In [8]:
# Number of test reviews, followed by the container type of the column.
print(df_test['review'].shape[0])
type(df_test['review'])

2712


pandas.core.series.Series

In [9]:
# Review texts as NumPy arrays; labels are binarised (1 for 'Positive', 0 for anything else).
X = np.array(df_train['review'].tolist())
Y = np.array([int(label == 'Positive') for label in df_train['label']])
X_test = np.array(df_test['review'].tolist())

In [10]:
print(X.shape)
print(X_test.shape)
# Stack the train texts followed by the test texts into one 1-D array.
X_all = np.concatenate([X, X_test])
print(X_all.shape)

(6328,)
(2712,)
(9040,)


In [11]:
# Hold out 30% of the labelled reviews for validation. The seed is fixed so that a
# Restart & Run All reproduces the same split instead of a new random one each time.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, Y, test_size=0.3, shuffle=True, random_state=42)
print(X_train.shape)
print(Y_train.shape)
print(X_valid.shape)
print(Y_valid.shape)

(4429,)
(4429,)
(1899,)
(1899,)


In [12]:
# TF-IDF over 1- and 2-grams with sub-linear term frequency. Terms must occur in at
# least 3 documents and in at most 90% of them. The vocabulary is learned from the
# combined train + test texts.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
tfidf.fit(X_all)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [13]:
#print(tfidf.vocabulary_)

In [14]:
# TF-IDF features for the train/validation split, the full labelled set and the test set.
X_train_vec = tfidf.transform(X_train)
X_valid_vec = tfidf.transform(X_valid)
X_train_all = tfidf.transform(X)
X_test_vec = tfidf.transform(X_test)
# Each feature matrix is printed next to its label vector so row counts can be compared.
print(X_train_vec.shape)
print(Y_train.shape)
print(X_valid_vec.shape)
print(Y_valid.shape)  # was X_valid.shape: the raw texts, not the labels paired with X_valid_vec
print(X_train_all.shape)
print(Y.shape)
print(X_test_vec.shape)

(4429, 11403)
(4429,)
(1899, 11403)
(1899,)
(6328, 11403)
(6328,)
(2712, 11403)


In [15]:
# Raw term counts over 1- and 2-grams with the same document-frequency bounds as the
# TF-IDF vectorizer; the vocabulary is learned from the combined train + test texts.
Counter = CountVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
)
Counter.fit(X_all)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=3,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [16]:
# Count features for the train/validation split and for the full labelled set.
X_train_cou_vec = Counter.transform(X_train)
X_valid_cou_vec = Counter.transform(X_valid)
X_train_cou_all = Counter.transform(X)
# Each feature matrix is printed next to its label vector so row counts can be compared.
print(X_train_cou_vec.shape)
print(Y_train.shape)
print(X_valid_cou_vec.shape)
print(Y_valid.shape)  # was X_valid.shape: the raw texts, not the labels paired with X_valid_cou_vec
print(X_train_cou_all.shape)
print(Y.shape)

(4429, 11403)
(4429,)
(1899, 11403)
(1899,)
(6328, 11403)
(6328,)


In [17]:
# Specify the data used by each ensemble level. Entry 0 lists one feature matrix per
# level-0 model, in the same order as model_dict[0]: TF-IDF for the first SVM/NB pair,
# counts for the second pair. Ensembler only reads entry 0 of these dicts; deeper levels
# are fed with the previous level's predictions.
X_train_data_dict = {
    0: [X_train_vec, X_train_vec, X_train_cou_vec, X_train_cou_vec],
    1: [X_train_vec],
}
X_test_data_dict = {
    0: [X_valid_vec, X_valid_vec, X_valid_cou_vec, X_valid_cou_vec],
    1: [X_valid_vec],
}

# Level 0: calibrated linear SVM and multinomial naive Bayes, once per feature type.
# Level 1: a single XGBoost classifier stacked on top.
model_dict = {
    0: [
        CalibratedClassifierCV(LinearSVC()),
        MultinomialNB(),
        CalibratedClassifierCV(LinearSVC()),
        MultinomialNB(),
    ],
    1: [xgb.XGBClassifier()],
}

ens = Ensembler(
    model_dict=model_dict,
    num_folds=2,
    task_type='classification',
    optimize=log_loss,
    lower_is_better=True,
)

pred_train = ens.fit(X_train_data_dict, Y_train, lentrain=X_train_vec.shape[0])
# Column 1 of the level-1 out-of-fold predictions is the probability of the positive class.
Y_train_predict_postive = pred_train[1][:, 1].copy()
print('AUC = ' + str(roc_auc_score(Y_train, Y_train_predict_postive)))

[23:40:14] INFO Found 2 classes
[23:40:14] INFO Training Level 0 Fold # 1. Model # 0
[23:40:14] INFO Predicting Level 0. Fold # 1. Model # 0
[23:40:14] INFO Level 0. Fold # 1. Model # 0. Validation Score = 0.533386
[23:40:14] INFO Training Level 0 Fold # 2. Model # 0
[23:40:14] INFO Predicting Level 0. Fold # 2. Model # 0
[23:40:14] INFO Level 0. Fold # 2. Model # 0. Validation Score = 0.521883
[23:40:14] INFO Level 0. Model # 0. Mean Score = 0.527634. Std Dev = 0.005751
[23:40:14] INFO Training Level 0 Fold # 1. Model # 1
[23:40:14] INFO Predicting Level 0. Fold # 1. Model # 1
[23:40:14] INFO Level 0. Fold # 1. Model # 1. Validation Score = 0.546037
[23:40:14] INFO Training Level 0 Fold # 2. Model # 1
[23:40:14] INFO Predicting Level 0. Fold # 2. Model # 1
[23:40:14] INFO Level 0. Fold # 2. Model # 1. Validation Score = 0.541650
[23:40:14] INFO Level 0. Model # 1. Mean Score = 0.543844. Std Dev = 0.002194
[23:40:14] INFO Training Level 0 Fold # 1. Model # 2
[23:40:14] INFO Predicting 

In [18]:
# Refit the ensemble on the whole training split and score the held-out validation split.
pred_test = ens.predict(X_test_data_dict, lentest=X_valid_vec.shape[0])
# Column 1 of the level-1 predictions is the probability of the positive class.
Y_valid_predict_postive = pred_test[1][:, 1].copy()
print('AUC = ' + str(roc_auc_score(Y_valid, Y_valid_predict_postive)))

[23:40:21] INFO Training Fulldata Level 0. Model # 0
[23:40:21] INFO Predicting Test Level 0. Model # 0
[23:40:21] INFO Training Fulldata Level 0. Model # 1
[23:40:21] INFO Predicting Test Level 0. Model # 1
[23:40:21] INFO Training Fulldata Level 0. Model # 2
[23:40:21] INFO Predicting Test Level 0. Model # 2
[23:40:21] INFO Training Fulldata Level 0. Model # 3
[23:40:21] INFO Predicting Test Level 0. Model # 3
[23:40:21] INFO Training Fulldata Level 1. Model # 0
[23:40:21] INFO Predicting Test Level 1. Model # 0
AUC = 0.8430070707519901


In [19]:
# Retrain the ensemble on every labelled review and score it on those same reviews
# (the predictions come from models refit on this very data, so this is a training score).
# NOTE(review): all four level-0 models receive TF-IDF features here, whereas the
# validation run gave count features to models 2 and 3; X_train_cou_all is computed
# earlier but never used. If counts are intended, the test-time feature dict must be
# changed in the same way - confirm before altering.
X_train_data_dict = {
    0: [X_train_all, X_train_all, X_train_all, X_train_all],
    1: [X_train_all],
}
ens.fit(X_train_data_dict, Y, lentrain=X_train_all.shape[0])
Y_train_all_predict = ens.predict(X_train_data_dict, lentest=X_train_all.shape[0])
# Column 1 of the level-1 predictions is the probability of the positive class.
Y_train_all_predict_postive = Y_train_all_predict[1][:, 1].copy()
print('AUC = ' + str(roc_auc_score(Y, Y_train_all_predict_postive)))

[23:40:24] INFO Found 2 classes
[23:40:24] INFO Training Level 0 Fold # 1. Model # 0
[23:40:24] INFO Predicting Level 0. Fold # 1. Model # 0
[23:40:24] INFO Level 0. Fold # 1. Model # 0. Validation Score = 0.516057
[23:40:24] INFO Training Level 0 Fold # 2. Model # 0
[23:40:24] INFO Predicting Level 0. Fold # 2. Model # 0
[23:40:24] INFO Level 0. Fold # 2. Model # 0. Validation Score = 0.518933
[23:40:24] INFO Level 0. Model # 0. Mean Score = 0.517495. Std Dev = 0.001438
[23:40:24] INFO Training Level 0 Fold # 1. Model # 1
[23:40:24] INFO Predicting Level 0. Fold # 1. Model # 1
[23:40:24] INFO Level 0. Fold # 1. Model # 1. Validation Score = 0.526562
[23:40:24] INFO Training Level 0 Fold # 2. Model # 1
[23:40:24] INFO Predicting Level 0. Fold # 2. Model # 1
[23:40:24] INFO Level 0. Fold # 2. Model # 1. Validation Score = 0.525773
[23:40:24] INFO Level 0. Model # 1. Mean Score = 0.526167. Std Dev = 0.000395
[23:40:24] INFO Training Level 0 Fold # 1. Model # 2
[23:40:24] INFO Predicting 

In [20]:
# Predict the unlabelled test reviews with the ensemble trained on all labelled data.
# NOTE(review): every level-0 model is given TF-IDF test features, which matches the
# full-data fit above but not the validation run (models 2 and 3 used count features
# there) - confirm which set-up is intended.
X_test_data_dict = {0: [X_test_vec, X_test_vec, X_test_vec, X_test_vec], 1: [X_test_vec]}
Y_predict = ens.predict(X_test_data_dict, lentest=X_test_vec.shape[0])
# Column 1 of the level-1 predictions is the probability of the positive class.
Y_predict_positive = [item[1] for item in Y_predict[1]]
test_ids = df_test['ID']
Data = {'ID': test_ids, 'Pred': Y_predict_positive}
result = pd.DataFrame(Data, columns=['ID', 'Pred'])
# index=False keeps the file to the two intended columns; the row index would otherwise
# be written as an extra unnamed first column next to ID.
result.to_csv('test_pred.csv', header=True, index=False)
result.head()

[23:40:28] INFO Training Fulldata Level 0. Model # 0
[23:40:28] INFO Predicting Test Level 0. Model # 0
[23:40:28] INFO Training Fulldata Level 0. Model # 1
[23:40:28] INFO Predicting Test Level 0. Model # 1
[23:40:28] INFO Training Fulldata Level 0. Model # 2
[23:40:28] INFO Predicting Test Level 0. Model # 2
[23:40:28] INFO Training Fulldata Level 0. Model # 3
[23:40:28] INFO Predicting Test Level 0. Model # 3
[23:40:28] INFO Training Fulldata Level 1. Model # 0
[23:40:28] INFO Predicting Test Level 1. Model # 0


Unnamed: 0,ID,Pred
0,1,0.184122
1,2,0.023978
2,3,0.339775
3,4,0.315381
4,5,0.895368
