In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import tqdm
import pyemd
from fuzzywuzzy import fuzz
import gensim
#import keras



In [2]:
# One-time setup, deliberately disabled by wrapping it in a string literal:
# run these two statements once if the NLTK 'stopwords' corpus has not been downloaded yet.
"""import nltk
nltk.download('stopwords')"""

"import nltk\nnltk.download('stopwords')"

In [3]:
# English stopwords from NLTK; cal_wmd and cal_tfidf_for_row filter tokens against this collection.
stop_words = stopwords.words('english')


In [4]:
import os

def read_or_gen_by_list(l):
    """Resolve the final stage of a cached feature pipeline.

    Parameters
    ----------
    l : sequence of (name, func) pairs
        Pipeline stages in execution order.

    Returns
    -------
    Whatever ``r_or_g_by_list_n`` returns for the last stage, or ``None``
    (after printing an error) when ``l`` is empty.
    """
    if len(l) == 0:
        print("Error: input list is empty")
        return None
    last_stage = len(l) - 1
    return r_or_g_by_list_n(l, last_stage)

def r_or_g_by_list_n(l, i):
    """Return the DataFrame for pipeline stage ``i``, using its CSV cache when present.

    Parameters
    ----------
    l : sequence of (name, func) pairs
        Pipeline stages. The function of stage 0 is called with no arguments;
        the function of every later stage is called with the previous stage's
        DataFrame.
    i : int
        Index of the stage to resolve.

    Returns
    -------
    pandas.DataFrame
        The frame read from ``data/<name>.csv`` if that file exists; otherwise
        the freshly generated frame, which is also written to that path.

    Notes
    -----
    A cache hit returns immediately, so earlier stages are neither read nor
    regenerated in that case.
    """
    name, func = l[i][0], l[i][1]
    path = "data/{name}.csv".format(name=name)

    if os.path.exists(path):
        print("read the data for {name}".format(name=name))
        return pd.read_csv(path, sep=",")

    print("generate the data for {name}".format(name=name))
    if i == 0:
        d = func()
    else:
        # Resolve the previous stage first (from cache or by recursion).
        d = func(r_or_g_by_list_n(l, i - 1))

    # Make sure the cache directory exists so a finished computation is not
    # lost to a failed write.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    d.to_csv(path, index=False)
    return d

In [5]:
import string
# Separator used to pack several values into one string cell: basic_feature
# joins tokens with it and cal_dists joins distance values with it.
sep = ":"
# Characters deleted from a question before tokenising. This is a subset of
# string.punctuation (e.g. '!', '(', ')' and '-' are kept). It contains ':',
# so a cleaned token can never contain `sep`.
# The backslash is escaped explicitly: the previous "\]" was an invalid escape
# sequence that only worked because Python left the backslash in place.
punctuation = "\"',./:;?[\\]`{}~"
# Translation table for str.translate that removes every character above.
trans_table = str.maketrans('', '', punctuation)

print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [6]:
def col_gen(data, name, func, target_col="question"):
    """Derive a pair of columns by applying ``func`` to a pair of source columns.

    For each suffix 1 and 2, ``data[name + suffix]`` is set to
    ``data[target_col + suffix].apply(func)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the source columns; it is modified in place.
    name : str
        Prefix of the columns to create.
    func : callable
        Applied element-wise to each source column.
    target_col : str, default "question"
        Prefix of the source columns.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the two new columns added.
    """
    for suffix in (1, 2):
        destination = "%s%d" % (name, suffix)
        source = "%s%d" % (target_col, suffix)
        derived = data[source].apply(func)
        data[destination] = derived
    return data

In [7]:
def basic_feature():
    """Load the raw training pairs and derive the basic length/overlap features.

    Reads ``data/train.csv``, drops the ``id``, ``qid1`` and ``qid2`` columns
    and adds, in this order:

    * ``len1`` / ``len2``: character length of ``str(question)``;
    * ``words1`` / ``words2``: the question with ``punctuation`` removed,
      lower-cased, split on whitespace and re-joined with ``sep``;
    * ``len_diff``: absolute difference of ``len1`` and ``len2``;
    * ``word_len1`` / ``word_len2``: number of ``sep``-separated tokens;
    * ``word_len_diff``: absolute difference of the token counts;
    * ``common_word``: number of distinct tokens the two questions share.

    Returns
    -------
    pandas.DataFrame
        The enriched frame.
    """
    data = pd.read_csv('data/train.csv', sep=',')
    data = data.drop(['id', 'qid1', 'qid2'], axis=1)

    # str() is applied everywhere so that non-string cells do not raise.
    # NOTE(review): a missing question presumably arrives as NaN and is then
    # treated as the literal text "nan" - confirm this is intended.
    data = col_gen(data, 'len', lambda x: len(str(x)))

    data = col_gen(data, "words", lambda v: sep.join(str(v).translate(trans_table).lower().split()))
    data['len_diff'] = (data['len1'] - data['len2']).abs()
    data = col_gen(data, 'word_len', lambda x: len(str(x).split(sep)), target_col="words")
    data['word_len_diff'] = (data['word_len1'] - data['word_len2']).abs()

    # Size of the intersection of the two token sets, computed row by row.
    data["common_word"] = data.apply(lambda x: len(
        set(str(x['words1']).split(sep)).intersection(
            set(str(x['words2']).split(sep)))
    ), axis=1)
    return data

In [8]:
import time
def cal_wmd(row, model):
    """Compute ``model.wmdistance`` between the two questions of one row.

    Parameters
    ----------
    row : mapping
        Must provide ``'words1'`` and ``'words2'``, each a ``sep``-joined
        token string.
    model : object
        Anything exposing ``wmdistance(tokens_a, tokens_b)``.

    Returns
    -------
    The value returned by ``model.wmdistance`` for the two token lists after
    tokens found in ``stop_words`` have been removed.
    """
    def content_tokens(column):
        # Split the packed string and keep only non-stopword tokens.
        return [token for token in str(row[column]).split(sep)
                if token not in stop_words]

    first = content_tokens('words1')
    second = content_tokens('words2')
    return model.wmdistance(first, second)

In [9]:
def wmd_feature(input_data):
    """Add a ``'wmd'`` column computed with the word2vec vectors loaded from disk.

    Loads ``data/GoogleNews-vectors-negative300.bin.gz`` in binary word2vec
    format, prints how long the load took, then fills ``input_data['wmd']``
    with ``cal_wmd(row, model)`` for every row.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame passed row-wise to ``cal_wmd``; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``input_data`` object with the new column.
    """
    print("start reading model")
    started_at = time.time()
    vectors = gensim.models.KeyedVectors.load_word2vec_format(
        'data/GoogleNews-vectors-negative300.bin.gz', binary=True)
    elapsed = time.time() - started_at
    print("model read in {t}s".format(t=elapsed))

    def row_distance(row):
        return cal_wmd(row, vectors)

    input_data['wmd'] = input_data.apply(row_distance, axis=1)
    return input_data

In [10]:
def normal_wmd_feature(input_data):
    """Add a ``'normal_wmd'`` column using the vectors after ``init_sims(replace=True)``.

    Same flow as ``wmd_feature``: the vectors are loaded from
    ``data/GoogleNews-vectors-negative300.bin.gz``, ``init_sims(replace=True)``
    is called on them, the elapsed time is printed, and then
    ``cal_wmd(row, model)`` is evaluated for every row.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame passed row-wise to ``cal_wmd``; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``input_data`` object with the new column.
    """
    print("start to read and initalize model")
    started_at = time.time()
    normalised = gensim.models.KeyedVectors.load_word2vec_format(
        'data/GoogleNews-vectors-negative300.bin.gz', binary=True)
    normalised.init_sims(replace=True)
    elapsed = time.time() - started_at
    print("model initialized in {t}s".format(t=elapsed))

    def row_distance(row):
        return cal_wmd(row, normalised)

    input_data['normal_wmd'] = input_data.apply(row_distance, axis=1)
    return input_data

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import string

def cal_tfidf(corps):
    """Return the dense TF-IDF matrix of a small corpus.

    Parameters
    ----------
    corps : iterable of str
        Documents to vectorise.

    Returns
    -------
    The result of ``toarray()`` on the TF-IDF matrix obtained by running a
    fresh ``CountVectorizer`` followed by a fresh ``TfidfTransformer``, both
    fitted on ``corps`` alone.
    """
    term_counts = CountVectorizer().fit_transform(corps)
    weighted = TfidfTransformer().fit_transform(term_counts)
    return weighted.toarray()
#cal_tfidf(["this is an Example", "example for an test"])

In [12]:
def cal_tfidf_for_row(row):
    """Build the two-document TF-IDF matrix for the question pair of one row.

    The ``sep``-joined token strings in ``row['words1']`` and
    ``row['words2']`` are split and filtered against ``stop_words``. If either
    question keeps three or fewer tokens after filtering, the unfiltered
    tokens of both questions are used instead.

    Parameters
    ----------
    row : mapping
        Must provide ``'words1'`` and ``'words2'``; ``'question1'`` and
        ``'question2'`` are read only for the diagnostic printout on failure.

    Returns
    -------
    The value of ``cal_tfidf`` for the two documents, or the fallback
    ``[[0, 1], [1, 0]]`` when ``cal_tfidf`` raises an ordinary exception.
    KeyboardInterrupt and SystemExit are no longer swallowed, so a long
    row-wise run can be interrupted.
    """
    q1 = str(row['words1']).split(sep)
    q2 = str(row['words2']).split(sep)
    words1 = [x for x in q1 if x not in stop_words]
    words2 = [x for x in q2 if x not in stop_words]

    if len(words1) <= 3 or len(words2) <= 3:
        # Too little left after stopword removal: keep the stopwords.
        corps = [' '.join(q1), ' '.join(q2)]
    else:
        corps = [' '.join(words1), ' '.join(words2)]

    try:
        return cal_tfidf(corps)
    except Exception as e:
        # Best-effort: report the offending pair and fall back to two
        # orthogonal unit vectors.
        print(str(e))
        print("corps:")
        print(corps, row['question1'], row['question2'])
        return [[0, 1], [1, 0]]

In [13]:
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
def cal_dists(row):
    """Return the vector distances between the two TF-IDF rows of a question pair.

    Parameters
    ----------
    row : mapping
        Passed unchanged to ``cal_tfidf_for_row``.

    Returns
    -------
    str
        ``cosine``, ``euclidean``, ``minkowski`` and ``braycurtis`` distances,
        in that order, each converted with ``str`` and joined with ``sep``.

    Notes
    -----
    ``minkowski`` is called without an explicit order.
    NOTE(review): with SciPy's default order this presumably equals the
    Euclidean distance (the sample output shows identical values) - confirm
    whether a different order was intended.
    """
    tfidf = cal_tfidf_for_row(row)
    first, second = tfidf[0], tfidf[1]
    # Order must stay aligned with the names listed in `funcs`.
    metrics = (cosine, euclidean, minkowski, braycurtis)
    return sep.join(str(metric(first, second)) for metric in metrics)

In [14]:
# Names of the distances packed into the 'dists' column, in the same order in
# which cal_dists computes them; cal_dist_cols uses the position as the field index.
funcs = ["cosine", "euclidean", "minkowski", "braycurtis"]

In [15]:
def ab(v, index):
    """Debug helper: extract one field of a ``sep``-packed value as a float.

    Prints the raw value and the selected field before returning it.

    Parameters
    ----------
    v : object
        Packed value; converted with ``str`` before splitting on ``sep``.
    index : int
        Position of the field to extract.

    Returns
    -------
    float
        The selected field converted to ``float``.

    Raises
    ------
    IndexError
        If ``index`` is outside the packed fields.
    ValueError
        If the selected field is not a valid float literal.
    """
    print(v)
    nv = str(v).split(sep)
    print(nv[index])
    # Convert the selected field; the whole list cannot be passed to float().
    return float(nv[index])

def abstract_col(col_name, index, input_data):
    """Unpack one field of the ``'dists'`` column into its own float column.

    Parameters
    ----------
    col_name : str
        Name of the column to create.
    index : int
        Position of the wanted field inside each ``sep``-packed ``'dists'`` value.
    input_data : pandas.DataFrame
        Frame containing ``'dists'``; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``input_data`` object with ``col_name`` added.
    """
    print("to abstract {col}".format(col=col_name))

    def field_as_float(packed):
        fields = str(packed).split(sep)
        return float(fields[index])

    input_data[col_name] = input_data['dists'].apply(field_as_float)
    print("{col} is abstracted".format(col=col_name))
    return input_data

def cal_dist_all(input_data):
    """Add the packed ``'dists'`` column by running ``cal_dists`` on every row.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame whose rows are passed to ``cal_dists``; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``input_data`` object with ``'dists'`` added.
    """
    print("to calculate basic distance")
    packed_distances = input_data.apply(cal_dists, axis=1)
    input_data['dists'] = packed_distances
    print("basic distance calculated")
    return input_data
    
def cal_dist_cols(input_data):
    """Expand the packed ``'dists'`` column into one ``<name>_dist`` column per metric.

    For every entry of ``funcs`` (by position), ``abstract_col`` is called to
    extract the field at that position into the column ``"<name>_dist"``.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame handed to ``abstract_col``.

    Returns
    -------
    The value returned by the last ``abstract_col`` call (or ``input_data``
    itself when ``funcs`` is empty).
    """
    result = input_data
    for position, metric_name in enumerate(funcs):
        column = "{n}_dist".format(n=metric_name)
        result = abstract_col(column, int(position), result)
    return result

In [16]:
def fuzz_match(input_data):
    """Add the fuzzy string-matching scores of each question pair.

    For every scorer listed below, the scorer is called on
    ``str(row['question1'])`` and ``str(row['question2'])`` for each row, the
    result is stored in the named column, and a progress line is printed.
    Columns are created in the listed order.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame with ``'question1'`` and ``'question2'``; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``input_data`` object with the seven score columns added.
    """
    # (output column, attribute of the `fuzz` module that computes it)
    scorers = (
        ('qratio', 'QRatio'),
        ('wratio', 'WRatio'),
        ('partial_ratio', 'partial_ratio'),
        ('partial_token_set_ratio', 'partial_token_set_ratio'),
        ('partial_token_sort_ratio', 'partial_token_sort_ratio'),
        ('token_set_ratio', 'token_set_ratio'),
        ('token_sort_ratio', 'token_sort_ratio'),
    )
    for column, scorer_name in scorers:
        scorer = getattr(fuzz, scorer_name)
        # Bind the scorer as a default argument so each column uses its own.
        input_data[column] = input_data.apply(
            lambda row, score=scorer: score(str(row['question1']), str(row['question2'])),
            axis=1)
        print("fuzz: {name} finished".format(name=column))

    return input_data

In [17]:
# Feature pipeline as (cache name, stage function) pairs, in execution order.
# The first stage takes no argument; each later stage receives the previous
# stage's DataFrame. Every stage is cached at data/<cache name>.csv.
operations = [
    ("basic_feature", basic_feature)
    ,("dist_base_feature", cal_dist_all)
    ,("dist_feature", cal_dist_cols)
    ,("add_wmd_feature", wmd_feature)
    ,("match_feature", fuzz_match)
    #,("add_normal_wmd_feature", normal_wmd_feature)
]
# Resolve the last stage: read its cache if present, otherwise generate it
# (recursively resolving earlier stages as needed).
features_data = read_or_gen_by_list(operations)

read the data for match_feature


In [18]:
# Preview the first three rows of the feature table.
features_data[:3]

Unnamed: 0,question1,question2,is_duplicate,len1,len2,words1,words2,len_diff,word_len1,word_len2,...,minkowski_dist,braycurtis_dist,wmd,qratio,wratio,partial_ratio,partial_token_set_ratio,partial_token_sort_ratio,token_set_ratio,token_sort_ratio
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,66,57,what:is:the:step:by:step:guide:to:invest:in:sh...,what:is:the:step:by:step:guide:to:invest:in:sh...,9,14,12,...,0.457094,0.149261,0.640008,93,95,98,100,89,100,93
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,51,88,what:is:the:story:of:kohinoor:(koh-i-noor):dia...,what:would:happen:if:the:indian:government:sto...,37,8,13,...,1.085361,0.6233,3.170188,66,86,73,100,75,86,63
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,73,59,how:can:i:increase:the:speed:of:my:internet:co...,how:can:internet:speed:be:increased:by:hacking...,14,14,10,...,1.244375,0.72546,1.922139,54,63,53,100,71,66,66


In [22]:
# Every column of the cached feature table (kept for reference; not referenced
# again in the visible cells).
all_columns = ['question1', 'question2', 'is_duplicate', 'len1', 'len2', 'words1', 'words2', 'len_diff', 'word_len1', 'word_len2',
               'word_len_diff', 'common_word', 'dists', 'cosine_dist', 'euclidean_dist', 'minkowski_dist', 'braycurtis_dist', 'wmd',
        'qratio','wratio','partial_ratio','partial_token_set_ratio','partial_token_sort_ratio','token_set_ratio','token_sort_ratio']
# Numeric model inputs: all_columns without the raw questions, the packed
# 'words1'/'words2'/'dists' strings and the 'is_duplicate' label.
feature_columns = ['len1', 'len2', 'len_diff', 'word_len1', 'word_len2', 'word_len_diff', 'common_word', 'cosine_dist',
                   'euclidean_dist', 'minkowski_dist', 'braycurtis_dist', 'wmd', 'qratio','wratio','partial_ratio',
                   'partial_token_set_ratio','partial_token_sort_ratio','token_set_ratio','token_sort_ratio']
# Work on the first 30,000 rows only (this rebinds features_data to the slice).
features_data = features_data[:30000]
input_values = features_data[feature_columns]
# Target labels.
tag = features_data["is_duplicate"]
# Replace +/-inf with the sentinel 100 and NaN with 0, then force float64.
# NOTE(review): the infinities presumably come from 'wmd' and the NaNs from the
# distance columns - confirm, and confirm that 100 is a sensible sentinel.
input_values = input_values.replace([np.inf, -np.inf], 100).fillna(0).astype(np.float64)
#input_values = input_values.astype("float")
#input_values = input_values.fillna(0)

In [23]:
# Spot-check three randomly drawn rows of the cleaned inputs (unseeded, so the
# rows differ between runs).
input_values.sample(n=3)

Unnamed: 0,len1,len2,len_diff,word_len1,word_len2,word_len_diff,common_word,cosine_dist,euclidean_dist,minkowski_dist,braycurtis_dist,wmd,qratio,wratio,partial_ratio,partial_token_set_ratio,partial_token_sort_ratio,token_set_ratio,token_sort_ratio
13106,60.0,36.0,24.0,12.0,8.0,4.0,4.0,0.725388,1.204482,1.204482,0.687676,2.098477,51.0,86.0,58.0,100.0,69.0,70.0,62.0
2280,113.0,48.0,65.0,20.0,10.0,10.0,2.0,0.885532,1.330814,1.330814,0.895695,2.970346,42.0,86.0,50.0,100.0,49.0,46.0,41.0
11800,152.0,308.0,156.0,26.0,58.0,32.0,9.0,0.928654,1.362831,1.362831,0.915282,2.912138,40.0,86.0,47.0,100.0,54.0,52.0,46.0


In [24]:
from sklearn.model_selection import train_test_split

# Two stratified splits with a fixed seed: first hold out 20% as the test set,
# then hold out 20% of the remainder as the validation set
# (64% train / 16% validation / 20% test overall).
X_train, X_test, y_train, y_test = train_test_split(input_values, tag, test_size = 0.2, random_state = 0,
                                                    stratify = tag)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0,
                                                    stratify = y_train)

In [25]:
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder

def cal_log_loss(y_true, y_pred):
    """Log loss between two label vectors after one-hot encoding both.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels. Any array-like is accepted (NumPy array, pandas Series,
        list); it is converted with ``np.asarray`` before reshaping.
    y_pred : array-like of shape (n_samples,)
        Predicted labels (hard class labels, not probabilities).

    Returns
    -------
    The value of ``log_loss`` on the two one-hot matrices.

    Notes
    -----
    The encoder is fitted once on the union of both label vectors, so the two
    matrices always have the same columns in the same order, even when the
    predictions contain only a subset of the classes.
    """
    true_col = np.asarray(y_true).reshape(-1, 1)
    pred_col = np.asarray(y_pred).reshape(-1, 1)

    # NOTE(review): newer scikit-learn releases name this argument
    # `sparse_output` - confirm against the installed version.
    one_hot = OneHotEncoder(sparse=False)
    one_hot.fit(np.vstack([true_col, pred_col]))

    oy_true = one_hot.transform(true_col)
    oy_pred = one_hot.transform(pred_col)
    return log_loss(oy_true, oy_pred)

In [26]:
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression fitted on the training split, then hard-label
# predictions for the validation split.
lr_clf = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
y_pred = lr_clf.predict(X_val)




In [27]:
from sklearn.metrics import fbeta_score, make_scorer
# F-beta with beta=1 (i.e. F1) of the logistic-regression baseline on the validation split.
print(fbeta_score(y_val, y_pred, beta=1))

0.5171800947867298


In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.ensemble import RandomForestClassifier

# Base forest for the search. The seed is pinned (0, like the data splits) so
# that re-running the notebook selects the same model and reports the same score.
clf = RandomForestClassifier(n_estimators=30, random_state=0)
# Hyper-parameter grid explored by the search.
# NOTE(review): scikit-learn reads a float min_samples_split as a fraction of
# the samples, so 0.5 allows a split only on nodes holding at least half of the
# data. This presumably yields forests that predict a single class and would
# explain the repeated precision warnings in the output - confirm the value.
parameters = {'criterion': ["gini", "entropy"], 'max_depth': [10, 30], 'min_samples_split': [0.5, 3],
             'min_samples_leaf': [1, 5]}

# Model selection optimises F-beta with beta=0.5.
scorer = make_scorer(fbeta_score, beta=0.5)

# 5-fold cross-validated grid search on the training split.
grid_obj = GridSearchCV(clf, param_grid=parameters, scoring=scorer, cv=5)
grid_fit = grid_obj.fit(X_train, y_train)

best_clf = grid_fit.best_estimator_

# Score the selected model on the validation split.
# NOTE(review): this uses beta=0.5 whereas the logistic-regression baseline is
# reported with beta=1, so the two printed numbers are not directly comparable.
best_predictions = best_clf.predict(X_val)
print(fbeta_score(y_val, best_predictions, beta = 0.5))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.6402144772117963
