In [51]:
import features as ft
t_data = ft.gen_common_ratio_feature()

read the data for common_ratio_features_train
data for common_ratio_features_train read


In [56]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import roc_auc_score

from time import time

from sklearn.metrics import accuracy_score, fbeta_score, make_scorer, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
seed = 42
rand_number = seed
np.random.seed(seed)

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD 

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss


def one_hot(y):
    oc = OneHotEncoder(categories='auto')
    y_r = y.reshape(-1,1)
    oc.fit(y_r)
    r = oc.transform(y_r).toarray()
    return r


def get_log_loss(y_true, y_pred):
    y_p = one_hot(y_pred)
    return log_loss(y_true, y_p)


def time_cnt(f, tag="func"):
    print("function '%s'starts" % tag)
    t_start = time()
    ret = f()
    t_end = time()
    t_used = t_end - t_start
    print("function '%s' use: %f s" % (tag, t_used))
    return ret


def prepare_train_set(data, features):
    t_data = data[data!=np.inf].dropna()
    feature_data = t_data.drop_duplicates(features, keep='last')
    input_data = feature_data[features]
    input_data = input_data.astype(np.float64)
    result = feature_data[['is_duplicate']]
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(input_data, result, test_size = 0.2, random_state = 0,
                                                        stratify = result)
    return X_train, X_test, y_train, y_test


def dist_features_for(data, q1, q2, tag="tfidf"):
    q1 = np.nan_to_num(q1)
    q2 = np.nan_to_num(q2)

    def add_dist_for(func):
        col_name = '{d}_distance_{t}'.format(d=func.__name__, t=tag)
        data[col_name] = [func(x, y)  for (x, y) in zip(q1, q2)]
        return col_name

    feats = []
    feats += add_dist_for(cosine)
    feats += add_dist_for(cityblock)
    feats += add_dist_for(jaccard)
    feats += add_dist_for(canberra)
    feats += add_dist_for(euclidean)
    feats += add_dist_for(minkowski)
    feats += add_dist_for(braycurtis)

    data['skew_q1vec_{t}'.format(t=tag)] = [skew(x) for x in q1]
    feats += 'skew_q1vec_{t}'.format(t=tag)
    data['skew_q2vec_{t}'.format(t=tag)] = [skew(x) for x in q2]
    feats += 'skew_q2vec_{t}'.format(t=tag)
    data['kur_q1vec_{t}'.format(t=tag)] = [kurtosis(x) for x in q1]
    feats += 'kur_q1vec_{t}'.format(t=tag)
    data['kur_q2vec_{t}'.format(t=tag)] = [kurtosis(x) for x in q2]
    feats += 'kur_q2vec_{t}'.format(t=tag)

    return data, feats


def prepare_vec_dist_train_set(data, vec_gen, tag="tag"):
    print("vec data %s starts to gen" % tag)
    vec = time_cnt(vec_gen, tag="vec gen for %s" % tag)
    single_set_size = int(vec.shape[0]/2)
    q1 = vec[:single_set_size]
    q2 = vec[single_set_size:]
    
    print("dist features for %s starts to gen" % tag)
    dist_features_data, features = dist_features_for(data, q1, q2, tag=tag)
    
    print("train sets for %s starts to gen" % tag)
    X_train, X_test, y_train, y_test = prepare_train_set(dist_features_data, features)
    return X_train, X_test, y_train, y_test


def test_perform_for(X_train, X_test, y_train, y_test):
    rf_clf = RandomForestClassifier(random_state=rand_number)
    gbdt_clf = GradientBoostingClassifier(random_state=rand_number)
    lr_clf = LogisticRegression(random_state=rand_number)
    sgd_clf = SGDClassifier(random_state=rand_number)
    xgb_clf = XGBClassifier(random_state=rand_number)
    lgb_clf = LGBMClassifier(random_state=rand_number)
    models=[(rf_clf, "RandomForest"), (gbdt_clf, "GBDT"), (lr_clf, "LogsitcRegression"), (sgd_clf, "SGD"), 
            (xgb_clf, "XGBoost"), (lgb_clf, "lightGBM")]

    for t in models:
        model, name = t
        t_start = time()
        model.fit(X_train, y_train)
        t_end = time()
        y_predprob = model.predict(X_train)
        print(name, "training time cost:", (t_end-t_start))
        y_t = model.predict(X_test)
        res = [get_log_loss(y_train, y_predprob), get_log_loss(y_test, y_t)]
        print(res)


features=[ 'cosine_distance_pca300', 'cityblock_distance_pca300', 'jaccard_distance_pca300',
       'canberra_distance_pca300', 'euclidean_distance_pca300', 'minkowski_distance_pca300', 'braycurtis_distance_pca300',
       'skew_q1vec_pca300', 'skew_q2vec_pca300', 'kur_q1vec_pca300', 'kur_q2vec_pca300']
target_col = "is_duplicate"

In [19]:
tfidf = pd.read_pickle("train_tfidf.pkl")

In [20]:
tfidf.shape

(808580, 39919)

In [21]:
pca = TruncatedSVD (n_components =300)
pca300 = time_cnt(lambda: pca.fit_transform(tfidf), "TSVD300")

function 'TSVD300'starts
function 'TSVD300' use: 195.672060 s


In [None]:
svd200 = TruncatedSVD (n_components =200)
X_train, X_test, y_train, y_test = prepare_vec_dist_train_set(t_data, lambda: pca.fit_transform(tfidf), tag="svd200")
test_perform_for(X_train, X_test, y_train, y_test)

vec data svd200 starts to gen
dist features for svd200 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


['a', 'b']

In [29]:
data300 = time_cnt(lambda: dist_features_for(t_data, q1, q2, tag="pca300"), tag="dist cal")

function 'dist cal'starts
function 'dist cal' use: 323.766199 s


  del sys.path[0]


RandomForest training time cost: 13.27211308479309
[0.5555409806166707, 10.368125418735417]


  y = column_or_1d(y, warn=True)


GBDT training time cost: 70.59147572517395
[9.954190191819691, 9.963558735729595]


  y = column_or_1d(y, warn=True)


LogsitcRegression training time cost: 4.357003927230835
[10.37515913365344, 10.286858748786647]


  y = column_or_1d(y, warn=True)


SGD training time cost: 0.3970022201538086
[11.271201878041868, 11.18918378522867]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBoost training time cost: 33.20322251319885
[9.997253244230228, 9.976367069580215]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


lightGBM training time cost: 3.113046884536743
[9.608719614308256, 9.73963372668598]
