In [1]:
import features as ft
t_data = ft.gen_common_ratio_feature()[:25000]

read the data for common_ratio_features_train
data for common_ratio_features_train read


In [23]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import roc_auc_score

from time import time

from sklearn.metrics import accuracy_score, fbeta_score, make_scorer, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier

from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
seed = 42
rand_number = seed
np.random.seed(seed)

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss


def one_hot(y):
    oc = OneHotEncoder(categories='auto')
    y_r = y.reshape(-1,1)
    oc.fit(y_r)
    r = oc.transform(y_r).toarray()
    return r


def get_log_loss(y_true, y_pred):
    y_p = one_hot(y_pred)
    return log_loss(y_true, y_p)


def time_cnt(f, tag="func"):
    print("function '%s'starts" % tag)
    t_start = time()
    ret = f()
    t_end = time()
    t_used = t_end - t_start
    print("function '%s' use: %f s" % (tag, t_used))
    return ret


def prepare_train_set(data, features):
    t_data = data[data!=np.inf].dropna()
    feature_data = t_data.drop_duplicates(features, keep='last')
    input_data = feature_data[features]
    input_data = input_data.astype(np.float64)
    result = feature_data[['is_duplicate']]
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(input_data, result, test_size = 0.2, random_state = 0,
                                                        stratify = result)
    return X_train, X_test, y_train, y_test


def dist_features_for(data, q1, q2, tag="tfidf"):
    q1 = np.nan_to_num(q1)
    q2 = np.nan_to_num(q2)

    def add_dist_for(func):
        col_name = '{d}_distance_{t}'.format(d=func.__name__, t=tag)
        data[col_name] = [func(x, y)  for (x, y) in zip(q1, q2)]
        return col_name

    feats = []
    feats .append( add_dist_for(cosine))
    feats .append( add_dist_for(cityblock))
    feats .append( add_dist_for(jaccard))
    feats .append( add_dist_for(canberra))
    feats .append( add_dist_for(euclidean))
    feats .append( add_dist_for(minkowski))
    feats .append( add_dist_for(braycurtis))

    data['skew_q1vec_{t}'.format(t=tag)] = [skew(x) for x in q1]
    feats .append( 'skew_q1vec_{t}'.format(t=tag))
    data['skew_q2vec_{t}'.format(t=tag)] = [skew(x) for x in q2]
    feats .append( 'skew_q2vec_{t}'.format(t=tag))
    data['kur_q1vec_{t}'.format(t=tag)] = [kurtosis(x) for x in q1]
    feats .append( 'kur_q1vec_{t}'.format(t=tag))
    data['kur_q2vec_{t}'.format(t=tag)] = [kurtosis(x) for x in q2]
    feats .append( 'kur_q2vec_{t}'.format(t=tag))

    return data, feats


def prepare_vec_dist_train_set(data, vec_gen, tag="tag"):
    vec = time_cnt(vec_gen, tag="vec gen for %s" % tag)
    single_set_size = int(vec.shape[0]/2)
    q1 = vec[:single_set_size]
    q2 = vec[single_set_size:]
    
    print("dist features for %s starts to gen" % tag)
    dist_features_data, features = dist_features_for(data, q1, q2, tag=tag)
    
    print("train sets for %s starts to gen" % tag)
    X_train, X_test, y_train, y_test = prepare_train_set(dist_features_data, features)
    return X_train, X_test, y_train, y_test


def test_perform_for(X_train, X_test, y_train, y_test):
    rf_clf = RandomForestClassifier(random_state=rand_number)
    gbdt_clf = GradientBoostingClassifier(random_state=rand_number)
    lr_clf = LogisticRegression(random_state=rand_number)
    sgd_clf = SGDClassifier(random_state=rand_number)
    xgb_clf = XGBClassifier(random_state=rand_number)
#     lgb_clf = LGBMClassifier(random_state=rand_number)
#     models=[(rf_clf, "RandomForest"), (gbdt_clf, "GBDT"), (lr_clf, "LogsitcRegression"), (sgd_clf, "SGD"), 
#             (xgb_clf, "XGBoost")] , (lgb_clf, "lightGBM")]
    models=[(xgb_clf, "XGBoost")]

    perform = []
    for t in models:
        model, name = t
        t_start = time()
        model.fit(X_train, y_train.values.ravel())
        t_end = time()
        y_predprob = model.predict(X_train)
        print(name, "training time cost:", (t_end-t_start))
        y_t = model.predict(X_test)
        res = [get_log_loss(y_train, y_predprob), get_log_loss(y_test, y_t)]
        perform.append((name, res))
    return perform


def tfidf():
    ft = ['question1', "question2"]
    train = t_data.loc[:, ft]
    
    print('Generate tfidf')
    feats= ft
    vect_orig = TfidfVectorizer(max_features=None,ngram_range=(1, 1), min_df=3)

    corpus = []
    for f in feats:
        train.loc[:, f] = train.loc[:, f].astype(str)
        corpus+=train[f].values.tolist()
    vect_orig.fit(corpus)
    
    train_tfidf = vect_orig.transform(corpus)
    return train_tfidf


def try_n_for_transfer(transfer, tag="svd300"):
    X_train, X_test, y_train, y_test = prepare_vec_dist_train_set(
        t_data, lambda: transfer.fit_transform(ti), tag=tag)
    performance = test_perform_for(X_train, X_test, y_train, y_test)
    return ("%s performance:"%tag, performance)
    


features=[ 'cosine_distance_pca300', 'cityblock_distance_pca300', 'jaccard_distance_pca300',
       'canberra_distance_pca300', 'euclidean_distance_pca300', 'minkowski_distance_pca300', 'braycurtis_distance_pca300',
       'skew_q1vec_pca300', 'skew_q2vec_pca300', 'kur_q1vec_pca300', 'kur_q2vec_pca300']
target_col = "is_duplicate"

In [3]:
ti = tfidf()

Generate tfidf


In [27]:
ms = [
      (TruncatedSVD, [50, 100, 200, 300]),
      (NMF, [10, 20, 30]),
      (LatentDirichletAllocation, [10, 20])
     ]
reports = []
for model_func, n_list in ms:
    for n in n_list:
        model = model_func(n_components=n)
        tag = "%s_%d" % (model_func.__name__, n)
        reports.append(try_n_for_transfer(model, tag=tag))

print("================= result =================")
for report in reports:
    tags, performances = report
    print("%s\t%s" % (tags, str(performances)))

function 'vec gen for TruncatedSVD_50'starts
function 'vec gen for TruncatedSVD_50' use: 0.780925 s
dist features for TruncatedSVD_50 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for TruncatedSVD_50 starts to gen
XGBoost training time cost: 0.9808001518249512
function 'vec gen for TruncatedSVD_100'starts
function 'vec gen for TruncatedSVD_100' use: 1.593821 s
dist features for TruncatedSVD_100 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for TruncatedSVD_100 starts to gen
XGBoost training time cost: 0.997204065322876
function 'vec gen for TruncatedSVD_200'starts
function 'vec gen for TruncatedSVD_200' use: 3.320833 s
dist features for TruncatedSVD_200 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for TruncatedSVD_200 starts to gen
XGBoost training time cost: 0.9678761959075928
function 'vec gen for TruncatedSVD_300'starts
function 'vec gen for TruncatedSVD_300' use: 5.459460 s
dist features for TruncatedSVD_300 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for TruncatedSVD_300 starts to gen
XGBoost training time cost: 0.9741940498352051
function 'vec gen for NMF_10'starts
function 'vec gen for NMF_10' use: 1.315125 s
dist features for NMF_10 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for NMF_10 starts to gen
XGBoost training time cost: 0.9557170867919922
function 'vec gen for NMF_20'starts
function 'vec gen for NMF_20' use: 8.124760 s
dist features for NMF_20 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for NMF_20 starts to gen
XGBoost training time cost: 0.9860780239105225
function 'vec gen for NMF_30'starts
function 'vec gen for NMF_30' use: 7.182917 s
dist features for NMF_30 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for NMF_30 starts to gen
XGBoost training time cost: 0.9736499786376953
function 'vec gen for LatentDirichletAllocation_10'starts
function 'vec gen for LatentDirichletAllocation_10' use: 67.426340 s
dist features for LatentDirichletAllocation_10 starts to gen
train sets for LatentDirichletAllocation_10 starts to gen
XGBoost training time cost: 0.8143031597137451
function 'vec gen for LatentDirichletAllocation_20'starts
function 'vec gen for LatentDirichletAllocation_20' use: 63.568054 s
dist features for LatentDirichletAllocation_20 starts to gen
train sets for LatentDirichletAllocation_20 starts to gen
XGBoost training time cost: 0.8660018444061279
TruncatedSVD_50 performance:	[('XGBoost', [10.379990387737232, 10.94262575153517])]
TruncatedSVD_100 performance:	[('XGBoost', [10.09786930100949, 10.685990880489394])]
TruncatedSVD_200 performance:	[('XGBoost', [9.924485101040071, 10.633894837227968])]
TruncatedSVD_300 performance:	[('XGBoost', [9.742208778814193, 10.38444019962

In [28]:
ms = [
      (NMF, [50, 100, 150])
     ]
reports = []
for model_func, n_list in ms:
    for n in n_list:
        model = model_func(n_components=n)
        tag = "%s_%d" % (model_func.__name__, n)
        reports.append(try_n_for_transfer(model, tag=tag))

print("================= result =================")
for report in reports:
    tags, performances = report
    print("%s\t%s" % (tags, str(performances)))

function 'vec gen for NMF_50'starts
function 'vec gen for NMF_50' use: 19.773376 s
dist features for NMF_50 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for NMF_50 starts to gen
XGBoost training time cost: 0.9852018356323242
function 'vec gen for NMF_100'starts
function 'vec gen for NMF_100' use: 57.651311 s
dist features for NMF_100 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for NMF_100 starts to gen
XGBoost training time cost: 0.9566638469696045
function 'vec gen for NMF_150'starts
function 'vec gen for NMF_150' use: 165.686871 s
dist features for NMF_150 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for NMF_150 starts to gen
XGBoost training time cost: 1.10664701461792
NMF_50 performance:	[('XGBoost', [10.483362544757968, 10.721634612579086])]
NMF_100 performance:	[('XGBoost', [10.387119502014524, 10.714505866161147])]
NMF_150 performance:	[('XGBoost', [10.358603044905355, 10.693119626907333])]


In [7]:
def prepare_vec_dist_train_set(data, vec_gen, tag="tag"):
    ms = [
          (TruncatedSVD, [50, 100, 200, 300]),
          (NMF, [10, 20, 30]),
          (LatentDirichletAllocation, [10, 20])
         ]
    reports = []
    for model_func, n_list in ms:
        for n in n_list:
            model = model_func(n_components=n)
            tag = "%s_%d" % (model_func.__name__, n)
            reports.append(try_n_for_transfer(model, tag=tag))
            
    vec = time_cnt(vec_gen, tag="vec gen for %s" % tag)
    single_set_size = int(vec.shape[0]/2)
    q1 = vec[:single_set_size]
    q2 = vec[single_set_size:]
    
    print("dist features for %s starts to gen" % tag)
    dist_features_data, features = dist_features_for(data, q1, q2, tag=tag)
    


function 'vec gen for nmf100'starts
function 'vec gen for nmf100' use: 89.899767 s
dist features for nmf100 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for nmf100 starts to gen




RandomForest training time cost: 0.5205888748168945
[0.6379569677730905, 11.432149677556074]


  y = column_or_1d(y, warn=True)


GBDT training time cost: 1.982990026473999
[10.103955327579383, 10.69804031546862]
LogsitcRegression training time cost: 0.08344411849975586
[11.88595803085617, 11.93105895276114]
SGD training time cost: 0.010377168655395508
[12.898135566317384, 13.206841242214095]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBoost training time cost: 1.0862648487091064
[10.246515543841523, 10.69804031546862]


In [8]:
n=50
model = NMF(n_components=n, init='random', random_state=seed)
X_train, X_test, y_train, y_test = prepare_vec_dist_train_set(t_data, lambda: model.fit_transform(ti), tag="nmf%d"%n)
test_perform_for(X_train, X_test, y_train, y_test)

function 'vec gen for nmf50'starts
function 'vec gen for nmf50' use: 31.636093 s
dist features for nmf50 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for nmf50 starts to gen




RandomForest training time cost: 0.5087840557098389
[0.5914418074639141, 11.394162049816831]


  y = column_or_1d(y, warn=True)


GBDT training time cost: 1.8231358528137207
[10.284317935208344, 10.838349266898936]
LogsitcRegression training time cost: 0.08590579032897949
[12.103179638282608, 12.33476829783173]
SGD training time cost: 0.010394811630249023
[13.084759264525426, 13.175613277117776]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBoost training time cost: 1.0082080364227295
[10.460681606711137, 10.838349266898938]


In [47]:
n=20
model = NMF(n_components=n, init='random', random_state=seed)
X_train, X_test, y_train, y_test = prepare_vec_dist_train_set(t_data, lambda: model.fit_transform(ti), tag="nmf%d"%n)
test_perform_for(X_train, X_test, y_train, y_test)

  del sys.path[0]


RandomForest training time cost: 13.27211308479309
[0.5555409806166707, 10.368125418735417]


  y = column_or_1d(y, warn=True)


GBDT training time cost: 70.59147572517395
[9.954190191819691, 9.963558735729595]


  y = column_or_1d(y, warn=True)


LogsitcRegression training time cost: 4.357003927230835
[10.37515913365344, 10.286858748786647]


  y = column_or_1d(y, warn=True)


SGD training time cost: 0.3970022201538086
[11.271201878041868, 11.18918378522867]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBoost training time cost: 33.20322251319885
[9.997253244230228, 9.976367069580215]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


lightGBM training time cost: 3.113046884536743
[9.608719614308256, 9.73963372668598]
