In [1]:
# Project-local feature helpers.
# NOTE(review): the alias `ft` is reused as a local variable name inside
# tfidf() and tfidf_for(); there it only shadows this module locally.
import features_mp as ft
# Load the working dataset that the rest of the notebook reads as `t_data`.
# NOTE(review): 50000 is presumably the number of rows to load -- confirm
# against features_mp.gen_n_feature.
t_data = ft.gen_n_feature(50000)



read the data for pos_vec_train_0_50000
data for pos_vec_train_0_50000 read


In [5]:
import pandas as pd
import numpy as np
from time import time

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss

from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
# Seed used for estimator random_state values; `rand_number` is the alias
# that test_perform_for reads.
seed = 42
rand_number = seed
# Seed NumPy's global random generator as well.
np.random.seed(seed)

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA


def one_hot(y):
    """One-hot encode a collection of class labels.

    Parameters
    ----------
    y : array-like
        Class labels. Anything ``np.asarray`` accepts (ndarray, list,
        pandas Series, single-column frame); it is flattened into one
        column before encoding.

    Returns
    -------
    numpy.ndarray
        Dense 2-D array with one row per label and one column per distinct
        label found in ``y``.
    """
    # np.asarray lets lists and pandas objects through; calling .reshape
    # directly on those raises AttributeError.
    labels = np.asarray(y).reshape(-1, 1)
    encoder = OneHotEncoder(categories='auto')
    encoder.fit(labels)
    return encoder.transform(labels).toarray()


def get_log_loss(y_true, y_pred):
    """Log loss of hard class predictions, scored as one-hot probabilities.

    ``y_pred`` is one-hot encoded with ``one_hot`` and the resulting matrix
    is handed to ``log_loss`` as the predicted probabilities.

    NOTE(review): the one-hot columns are derived from the labels present in
    ``y_pred`` only, so a class that is never predicted has no column --
    verify this is acceptable for the callers.
    """
    return log_loss(y_true, one_hot(y_pred))


def time_cnt(f, tag="func"):
    """Call ``f()`` and report how long it took.

    Parameters
    ----------
    f : callable
        Zero-argument callable to run.
    tag : str
        Label printed in the start and duration messages.

    Returns
    -------
    Whatever ``f()`` returns.
    """
    print("function '%s' starts" % tag)
    started_at = time()
    outcome = f()
    elapsed = time() - started_at
    print("function '%s' use: %f s" % (tag, elapsed))
    return outcome


def prepare_train_set(data, features):
    """Clean ``data`` and split it into stratified train/test sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column named in ``features`` plus an
        ``is_duplicate`` column.
    features : list of str
        Columns used as model inputs.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the X frames hold the
        feature columns as float64, the y frames hold ``is_duplicate``.
        20% of the rows go to the test split, stratified on the target.
    """
    from sklearn.model_selection import train_test_split

    # Mask both +inf and -inf so they become NaN and are dropped below;
    # masking only +inf would let -inf through to the estimators.
    finite = data[(data != np.inf) & (data != -np.inf)]
    # dropna looks at every column, not just the requested features.
    cleaned = finite.dropna()
    # Rows with identical feature vectors collapse to the last occurrence.
    deduped = cleaned.drop_duplicates(features, keep='last')
    inputs = deduped[features].astype(np.float64)
    target = deduped[['is_duplicate']]
    X_train, X_test, y_train, y_test = train_test_split(
        inputs, target, test_size=0.2, random_state=0, stratify=target)
    return X_train, X_test, y_train, y_test


def dist_features_for(data, q1, q2, tag="tfidf"):
    """Add pairwise distance and shape statistics of two vector sets to ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that receives the new columns; it is modified in place.
    q1, q2 : array-like
        Row-aligned vectors for the first and second question. NaNs are
        replaced through ``np.nan_to_num`` before use.
    tag : str
        Suffix appended to every generated column name.

    Returns
    -------
    tuple
        ``(data, feats)`` where ``data`` is the same frame that was passed
        in and ``feats`` lists the added column names in creation order.
    """
    left = np.nan_to_num(q1)
    right = np.nan_to_num(q2)
    feats = []

    def _store(column, values):
        # Write the column first, then record its name.
        data[column] = values
        feats.append(column)

    # One column per distance metric, named after the metric function.
    metrics = (cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis)
    for metric in metrics:
        _store('{d}_distance_{t}'.format(d=metric.__name__, t=tag),
               [metric(u, v) for u, v in zip(left, right)])

    # Per-vector skewness and kurtosis for each side.
    per_vector_stats = (
        ('skew_q1vec_{t}', skew, left),
        ('skew_q2vec_{t}', skew, right),
        ('kur_q1vec_{t}', kurtosis, left),
        ('kur_q2vec_{t}', kurtosis, right),
    )
    for template, stat, vectors in per_vector_stats:
        _store(template.format(t=tag), [stat(row) for row in vectors])

    return data, feats


def prepare_vec_dist_train_set(data, vec_gen, tag="tag"):
    """Build distance features from generated vectors and split them.

    ``vec_gen`` is a zero-argument callable returning a 2-D array whose
    first half of rows belongs to question 1 and second half to question 2.
    The distance features are added to ``data`` by ``dist_features_for`` and
    the result is split by ``prepare_train_set``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``prepare_train_set``.
    """
    stacked = time_cnt(vec_gen, tag="vec gen for %s" % tag)
    half = int(stacked.shape[0] / 2)
    q1_vecs, q2_vecs = stacked[:half], stacked[half:]

    print("dist features for %s starts to gen" % tag)
    enriched, dist_cols = dist_features_for(data, q1_vecs, q2_vecs, tag=tag)

    print("train sets for %s starts to gen" % tag)
    return prepare_train_set(enriched, dist_cols)


def test_data_performace_with_features(data, features):
    """Split ``data`` on the given feature columns and benchmark the model(s).

    Returns the performance list produced by ``test_perform_for``.
    """
    splits = prepare_train_set(data, features)
    return test_perform_for(*splits)


def test_perform_for(X_train, X_test, y_train, y_test):
    """Fit the benchmark model(s) and report train/test log loss.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices.
    y_train, y_test : pandas.DataFrame
        Single-column target frames; ``y_train`` is flattened with
        ``.values.ravel()`` before fitting.

    Returns
    -------
    list
        One ``(name, [train_log_loss, test_log_loss])`` tuple per model.
    """
    # Only LightGBM is benchmarked. Build just that estimator instead of
    # constructing several others that are never fitted; add further
    # (estimator, name) pairs here to compare more models.
    models = [(LGBMClassifier(random_state=rand_number), "lightGBM")]

    perform = []
    for model, name in models:
        t_start = time()
        model.fit(X_train, y_train.values.ravel())
        t_end = time()
        train_proba = model.predict_proba(X_train)
        print(name, "training time cost:", (t_end - t_start))
        test_proba = model.predict_proba(X_test)
        # Pass the fitted classes explicitly so log_loss does not have to
        # infer them from y_true, which fails when a split holds one class.
        labels = model.classes_
        res = [log_loss(y_train, train_proba, labels=labels),
               log_loss(y_test, test_proba, labels=labels)]
        perform.append((name, res))
    return perform


def tfidf():
    """Fit a unigram TF-IDF model on the questions in the global ``t_data``.

    The corpus is every ``question1`` text followed by every ``question2``
    text (both cast to ``str``), so the returned matrix has twice as many
    rows as ``t_data``: question-1 rows first, question-2 rows second.

    Returns
    -------
    The TF-IDF matrix returned by ``TfidfVectorizer.transform`` for that
    corpus.
    """
    question_cols = ['question1', "question2"]

    print('Generate tfidf')
    vectorizer = TfidfVectorizer(max_features=None, ngram_range=(1, 1), min_df=3)

    # Column-major order: all of question1, then all of question2.
    corpus = [
        text
        for col in question_cols
        for text in t_data[col].astype(str).values.tolist()
    ]
    vectorizer.fit(corpus)
    return vectorizer.transform(corpus)


def try_n_for_transfer(transfer, tag="svd300"):
    """Benchmark distance features derived from one dimensionality reducer.

    ``transfer`` is fitted on the global TF-IDF matrix ``ti``; the reduced
    vectors become distance features on the global ``t_data`` and are then
    scored by ``test_perform_for``.

    Returns
    -------
    tuple
        ``("<tag> performance:", performance)``.
    """
    def reduce_tfidf():
        # Deferred so that time_cnt can time the fit.
        return transfer.fit_transform(ti)

    splits = prepare_vec_dist_train_set(t_data, reduce_tfidf, tag=tag)
    performance = test_perform_for(*splits)
    return ("%s performance:" % tag, performance)
    


# Column names that dist_features_for generates for the tag "pca300":
# seven distance columns followed by skew/kurtosis columns for each question.
features = (
    ['{d}_distance_pca300'.format(d=d)
     for d in ('cosine', 'cityblock', 'jaccard', 'canberra',
               'euclidean', 'minkowski', 'braycurtis')]
    + ['{s}_{q}vec_pca300'.format(s=s, q=q)
       for s in ('skew', 'kur')
       for q in ('q1', 'q2')]
)
# Name of the label column.
target_col = "is_duplicate"

In [3]:
# Build the TF-IDF matrix once; try_n_for_transfer reads this global `ti`,
# so this cell has to run before any call to it.
ti = tfidf()

Generate tfidf


In [6]:
# Sweep several dimensionality reducers and component counts over the TF-IDF
# matrix and collect the benchmark result for each combination.
ms = [
    (TruncatedSVD, [50, 100, 200, 300]),
    (NMF, [10, 20, 30]),
    (LatentDirichletAllocation, [10, 20]),
]
reports = []
for model_func, n_list in ms:
    for n in n_list:
        # Fix random_state so every run of the sweep fits the same models.
        model = model_func(n_components=n, random_state=seed)
        tag = "%s_%d" % (model_func.__name__, n)
        reports.append(try_n_for_transfer(model, tag=tag))

print("================= result =================")
for report in reports:
    tags, performances = report
    print("%s\t%s" % (tags, str(performances)))

function 'vec gen for TruncatedSVD_50' starts
function 'vec gen for TruncatedSVD_50' use: 3.461020 s
dist features for TruncatedSVD_50 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for TruncatedSVD_50 starts to gen
lightGBM training time cost: 0.6639974117279053
function 'vec gen for TruncatedSVD_100' starts
function 'vec gen for TruncatedSVD_100' use: 6.968017 s
dist features for TruncatedSVD_100 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for TruncatedSVD_100 starts to gen
lightGBM training time cost: 0.8199977874755859
function 'vec gen for TruncatedSVD_200' starts
function 'vec gen for TruncatedSVD_200' use: 15.738045 s
dist features for TruncatedSVD_200 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for TruncatedSVD_200 starts to gen
lightGBM training time cost: 0.6700310707092285
function 'vec gen for TruncatedSVD_300' starts
function 'vec gen for TruncatedSVD_300' use: 20.068002 s
dist features for TruncatedSVD_300 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for TruncatedSVD_300 starts to gen
lightGBM training time cost: 0.6999993324279785
function 'vec gen for NMF_10' starts
function 'vec gen for NMF_10' use: 4.708004 s
dist features for NMF_10 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for NMF_10 starts to gen
lightGBM training time cost: 0.6239991188049316
function 'vec gen for NMF_20' starts
function 'vec gen for NMF_20' use: 11.239999 s
dist features for NMF_20 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for NMF_20 starts to gen
lightGBM training time cost: 0.6229958534240723
function 'vec gen for NMF_30' starts
function 'vec gen for NMF_30' use: 32.194003 s
dist features for NMF_30 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for NMF_30 starts to gen
lightGBM training time cost: 0.6369955539703369
function 'vec gen for LatentDirichletAllocation_10' starts
function 'vec gen for LatentDirichletAllocation_10' use: 133.117587 s
dist features for LatentDirichletAllocation_10 starts to gen
train sets for LatentDirichletAllocation_10 starts to gen
lightGBM training time cost: 0.581002950668335
function 'vec gen for LatentDirichletAllocation_20' starts
function 'vec gen for LatentDirichletAllocation_20' use: 131.473919 s
dist features for LatentDirichletAllocation_20 starts to gen
train sets for LatentDirichletAllocation_20 starts to gen
lightGBM training time cost: 0.5700032711029053
TruncatedSVD_50 performance:	[('lightGBM', [0.5310311505349541, 0.5665983960828199])]
TruncatedSVD_100 performance:	[('lightGBM', [0.5117958128700166, 0.5571301109523962])]
TruncatedSVD_200 performance:	[('lightGBM', [0.5001674887803823, 0.5495351528648251])]
TruncatedSVD_300 performance:	[('lightGBM', [0.4988523688347255, 

In [7]:
def prepare_vec_dist_train_set(data, vec_gen, tag="tag"):
    """Build distance features from generated vectors and split them.

    ``vec_gen`` is a zero-argument callable returning a 2-D array whose
    first half of rows belongs to question 1 and second half to question 2.

    The reducer sweep must not be run from inside this function:
    ``try_n_for_transfer`` calls this function, so invoking it here recurses
    without bound. The sweep belongs at cell level.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``prepare_train_set``.
    """
    vec = time_cnt(vec_gen, tag="vec gen for %s" % tag)
    single_set_size = int(vec.shape[0]/2)
    q1 = vec[:single_set_size]
    q2 = vec[single_set_size:]

    print("dist features for %s starts to gen" % tag)
    dist_features_data, features = dist_features_for(data, q1, q2, tag=tag)

    # Split the enriched frame and hand the result back to the caller,
    # which unpacks four values.
    print("train sets for %s starts to gen" % tag)
    return prepare_train_set(dist_features_data, features)
    


function 'vec gen for nmf100'starts
function 'vec gen for nmf100' use: 89.899767 s
dist features for nmf100 starts to gen


  dist = 1.0 - uv / np.sqrt(uu * vv)


train sets for nmf100 starts to gen




RandomForest training time cost: 0.5205888748168945
[0.6379569677730905, 11.432149677556074]


  y = column_or_1d(y, warn=True)


GBDT training time cost: 1.982990026473999
[10.103955327579383, 10.69804031546862]
LogsitcRegression training time cost: 0.08344411849975586
[11.88595803085617, 11.93105895276114]
SGD training time cost: 0.010377168655395508
[12.898135566317384, 13.206841242214095]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBoost training time cost: 1.0862648487091064
[10.246515543841523, 10.69804031546862]


In [23]:
def tfidf_for(data):
    """Fit a unigram TF-IDF model on the two question columns of ``data``.

    The corpus is every ``question1`` text followed by every ``question2``
    text (both cast to ``str``); ``data`` itself is not modified.

    Returns
    -------
    The TF-IDF matrix returned by ``TfidfVectorizer.transform`` for that
    corpus: question-1 rows first, question-2 rows second.
    """
    vectorizer = TfidfVectorizer(max_features=None, ngram_range=(1, 1), min_df=3)

    corpus = []
    for column in ('question1', "question2"):
        corpus.extend(data[column].astype(str).values.tolist())

    return vectorizer.fit(corpus).transform(corpus)



def add_vec_features(data, transfer, tag="svd300"):
    """Append reduced TF-IDF components of both questions to ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``question1`` and ``question2`` columns.
    transfer : estimator
        Object exposing ``fit_transform``; it is fitted on the TF-IDF
        matrix built from ``data``.
    tag : str
        Label used in the new column names (spaces are removed).

    Returns
    -------
    pandas.DataFrame
        ``data`` plus columns ``q1_<tag>_<k>`` and ``q2_<tag>_<k>`` for each
        component ``k``.
    """
    tfidf_matrix = time_cnt(lambda: tfidf_for(data), tag="generate tfidf")

    vec = time_cnt(lambda: transfer.fit_transform(tfidf_matrix), tag="transfer tfidf matrix")
    # First half of the rows is question 1, second half is question 2.
    single_set_size = int(vec.shape[0]/2)
    q1 = vec[:single_set_size]
    q2 = vec[single_set_size:]
    width = int(vec.shape[1])
    shortTag = tag.replace(' ', '')
    cols = [["q{i}_{s}_{sub_num}".format(i=i, s=shortTag, sub_num=x) for x in range(width)] for i in range(1,3)]
    print(cols)
    # Reuse data's index: with the default RangeIndex the concat below would
    # align on labels and scatter NaNs whenever data is not indexed 0..n-1.
    dq1 = pd.DataFrame(q1, columns=cols[0], index=data.index)
    dq2 = pd.DataFrame(q2, columns=cols[1], index=data.index)
    return pd.concat([data, dq1, dq2], axis=1)

In [24]:
# Attach the components of a 20-component NMF to t_data. The "nmf20" tag
# has to match the column names requested by the evaluation cell.
n = 20
model = NMF(random_state=seed, init='random', n_components=n)
data = add_vec_features(data=t_data, transfer=model, tag="nmf20")

function 'generate tfidf'starts
function 'generate tfidf' use: 1.600985 s
function 'transfer tfidf matrix'starts
function 'transfer tfidf matrix' use: 12.259087 s
[['q1_nmf20_0', 'q1_nmf20_1', 'q1_nmf20_2', 'q1_nmf20_3', 'q1_nmf20_4', 'q1_nmf20_5', 'q1_nmf20_6', 'q1_nmf20_7', 'q1_nmf20_8', 'q1_nmf20_9', 'q1_nmf20_10', 'q1_nmf20_11', 'q1_nmf20_12', 'q1_nmf20_13', 'q1_nmf20_14', 'q1_nmf20_15', 'q1_nmf20_16', 'q1_nmf20_17', 'q1_nmf20_18', 'q1_nmf20_19'], ['q2_nmf20_0', 'q2_nmf20_1', 'q2_nmf20_2', 'q2_nmf20_3', 'q2_nmf20_4', 'q2_nmf20_5', 'q2_nmf20_6', 'q2_nmf20_7', 'q2_nmf20_8', 'q2_nmf20_9', 'q2_nmf20_10', 'q2_nmf20_11', 'q2_nmf20_12', 'q2_nmf20_13', 'q2_nmf20_14', 'q2_nmf20_15', 'q2_nmf20_16', 'q2_nmf20_17', 'q2_nmf20_18', 'q2_nmf20_19']]


In [34]:
# Benchmark on the raw NMF components: q1_nmf20_0..19 followed by
# q2_nmf20_0..19, the columns added by add_vec_features with tag "nmf20".
test_data_performace_with_features(
    data,
    ["q{i}_nmf20_{k}".format(i=i, k=k) for i in (1, 2) for k in range(20)],
)

XGBoost training time cost: 5.134984493255615
lightGBM training time cost: 1.6410515308380127


[('XGBoost', [10.065270578321838, 11.047276808112168]),
 ('lightGBM', [7.274510357774387, 10.327421996744858])]

In [37]:
# NOTE(review): nltk is imported here, outside the main import cell;
# lemmatize_all relies on it.
import nltk
# Peek at the raw question1 texts.
t_data['question1'].values

array(['What is the step by step guide to invest in share market in india?',
       'What is the story of Kohinoor (Koh-i-Noor) Diamond?',
       'How can I increase the speed of my internet connection while using a VPN?',
       ...,
       "How can Kaprekar's constant (6174) be proved using MS Excel?",
       'Is Hillary Clinton a dishonest candidate?',
       'What is it like to work at a mine in Australia?'], dtype=object)

In [16]:
# Scratch check of the question indices produced by range(1, 3).
list(range(1, 3))

[1, 2]

In [62]:
# Sample sentence (with punctuation and digits) used to try out lemmatize_all.
c = "How can Kaprekar's constant !(1=1) (6174) be proved using MS Excel?"
def lemmatize_all(sentence):
    """Yield the lemmatized, lower-cased tokens of ``sentence``.

    The sentence is lower-cased, tokenized and POS-tagged with nltk. Tokens
    whose tag starts with ``NN``, ``VB``, ``JJ`` or ``R`` are lemmatized as
    noun, verb, adjective or adverb respectively; every other token is
    yielded unchanged.
    """
    # POS-tag prefix -> WordNet part-of-speech code, checked in this order.
    prefix_to_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('R', 'r'))

    lowered = sentence.lower()
    lemmatizer = nltk.WordNetLemmatizer()
    for token, pos_tag in nltk.pos_tag(nltk.word_tokenize(lowered)):
        wordnet_pos = next(
            (pos for prefix, pos in prefix_to_pos if pos_tag.startswith(prefix)),
            None,
        )
        if wordnet_pos is None:
            yield token
        else:
            yield lemmatizer.lemmatize(token, pos=wordnet_pos)

# Re-join the lemmatized tokens of the sample sentence with single spaces.
' '.join(lemmatize_all(c))

"how can kaprekar 's constant ! ( 1=1 ) ( 6174 ) be prove use ms excel ?"