In [13]:
import pandas as pd
# Load the training and test tables from CSV files addressed by relative path
# (resolved against the kernel's current working directory).
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [22]:
from nltk.corpus import stopwords
# English stop words from the NLTK corpus, held in a set; the word-match
# feature functions in this notebook skip any lower-cased token found here.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally — confirm.
stops = set(stopwords.words("english"))
def word_match_share(row):
    """Return the share of non-stop-word tokens that two questions have in common.

    Both ``row['question1']`` and ``row['question2']`` are cast to ``str``,
    lower-cased and split on whitespace; tokens present in the module-level
    ``stops`` set are discarded and duplicates collapse.

    Returns:
        ``0`` when either question has no remaining tokens, otherwise
        ``2 * |shared| / (|q1| + |q2|)``.
    """
    def content_words(text):
        # Unique lower-cased tokens of `text` that are not stop words.
        return {tok for tok in str(text).lower().split() if tok not in stops}

    q1_tokens = content_words(row['question1'])
    q2_tokens = content_words(row['question2'])

    # Nothing to compare if either side is empty after stop-word removal.
    if not q1_tokens or not q2_tokens:
        return 0

    common = q1_tokens & q2_tokens
    return 2 * len(common) / (len(q1_tokens) + len(q2_tokens))

In [28]:
from collections import Counter
def get_weight(count, eps=10000, min_count=2):
    """Map a word's occurrence count to an inverse-frequency weight.

    Args:
        count: number of times the word was seen.
        eps: smoothing term added to ``count`` in the denominator.
        min_count: counts strictly below this value get weight ``0``.

    Returns:
        ``0`` if ``count < min_count``, otherwise ``1 / (count + eps)``.
    """
    return 0 if count < min_count else 1 / (count + eps)
# NOTE(review): `eps` is assigned here but never passed to get_weight() below,
# so the weights are computed with get_weight's own default eps — confirm intent.
eps = 5000

# Pool every question from both training columns into one Series; the cast to
# str makes each entry safe to join regardless of its original type.
train_qs = pd.Series(
    df_train['question1'].tolist() + df_train['question2'].tolist()
).astype(str)

# One flat list of lower-cased, whitespace-separated tokens over the pooled questions.
words = " ".join(train_qs).lower().split()
counts = Counter(words)

# Per-word weight looked up by the TF-IDF style feature.
weights = {word: get_weight(counts[word]) for word in counts}

In [48]:
import numpy as np
def tfidf_word_match_share(row):
    """Return the weighted share of non-stop-word tokens two questions have in common.

    Both ``row['question1']`` and ``row['question2']`` are cast to ``str``,
    lower-cased and split on whitespace; tokens found in the module-level
    ``stops`` set are dropped. Every remaining token is weighted with the
    module-level ``weights`` mapping (tokens missing from it weigh ``0``).

    Returns:
        ``0`` when either question has no remaining tokens, or when every
        remaining token has weight ``0`` (previously this case divided 0 by 0
        and produced ``NaN`` plus a RuntimeWarning). Otherwise the summed
        weight of the shared tokens, counted once per question, divided by
        the summed weight of all tokens of both questions.
    """
    # dict.fromkeys keeps first-seen order, so the sums below are evaluated in
    # a reproducible order (set iteration order of strings is not).
    q1words = dict.fromkeys(
        w for w in str(row['question1']).lower().split() if w not in stops
    )
    q2words = dict.fromkeys(
        w for w in str(row['question2']).lower().split() if w not in stops
    )
    if not q1words or not q2words:
        return 0

    # Shared tokens contribute once from each question's side.
    shared_weights = (
        [weights.get(w, 0) for w in q1words if w in q2words]
        + [weights.get(w, 0) for w in q2words if w in q1words]
    )
    total_weights = (
        [weights.get(w, 0) for w in q1words]
        + [weights.get(w, 0) for w in q2words]
    )

    denominator = np.sum(total_weights)
    if denominator == 0:
        # No token carries any weight: report "no overlap" instead of NaN,
        # consistent with the empty-question case above.
        return 0
    return np.sum(shared_weights) / denominator



In [50]:
from collections import defaultdict
# Map each question to the set of questions it is paired with anywhere in the
# training or test data (the relation is stored in both directions).
q_dict = defaultdict(set)

# Stack the question columns of both tables and renumber the rows 0..n-1.
# `drop` takes a boolean; the old index is discarded rather than kept as a column.
ques = pd.concat(
    [df_train[['question1', 'question2']], df_test[['question1', 'question2']]],
    axis=0,
).reset_index(drop=True)

# Walk the two columns in lockstep instead of doing a label lookup per element.
for q1, q2 in zip(ques['question1'], ques['question2']):
    q_dict[q1].add(q2)
    q_dict[q2].add(q1)
def q1_freq(row):
    """Return how many distinct questions ``q_dict`` pairs with ``row['question1']``."""
    partners = q_dict[row['question1']]
    return len(partners)
    
def q2_freq(row):
    """Return how many distinct questions ``q_dict`` pairs with ``row['question2']``."""
    partners = q_dict[row['question2']]
    return len(partners)
    
def q1_q2_intersect(row):
    """Return how many questions are paired with both questions of ``row``.

    Looks up the partner sets of ``row['question1']`` and ``row['question2']``
    in ``q_dict`` and returns the size of their intersection.
    """
    q1_partners = q_dict[row['question1']]
    q2_partners = q_dict[row['question2']]
    return len(q1_partners & q2_partners)

In [52]:
def _build_features(df):
    """Compute the five pairwise question features for every row of ``df``.

    The feature functions index each row by column name
    (``row['question1']`` / ``row['question2']``), so rows must be passed as
    Series: ``raw=True`` is deliberately not used, because it hands the
    functions plain ndarrays, which cannot be indexed by label.

    Returns:
        A DataFrame with columns ``word_match``, ``tfidf_word_match``,
        ``q1_q2_intersect``, ``q1_freq`` and ``q2_freq``, in that order.
    """
    features = pd.DataFrame()
    features['word_match'] = df.apply(word_match_share, axis=1)
    features['tfidf_word_match'] = df.apply(tfidf_word_match_share, axis=1)
    features['q1_q2_intersect'] = df.apply(q1_q2_intersect, axis=1)
    features['q1_freq'] = df.apply(q1_freq, axis=1)
    features['q2_freq'] = df.apply(q2_freq, axis=1)
    return features


# Same feature set, same column order, for both tables.
x_train = _build_features(df_train)
x_test = _build_features(df_test)

# Training labels as a NumPy array.
y_train = df_train['is_duplicate'].values



In [53]:
# Rebalance the training set by repeating negative rows.
# NOTE(review): this cell overwrites x_train / y_train, so re-running it on its
# own compounds the resampling — re-run the feature cell first.
pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]

# `scale` is (current positive share / p) - 1.
# NOTE(review): p looks like the positive share being aimed for, but the
# doubling-then-fraction steps below do not obviously land exactly on it — confirm.
p = 0.165
scale = len(pos_train) / (len(pos_train) + len(neg_train)) / p - 1

# Whole units of `scale` above 1: double the negatives once per unit.
while scale > 1:
    neg_train = pd.concat([neg_train] * 2)
    scale -= 1

# Remainder: append the leading `scale` fraction of the (already grown) negatives.
neg_train = pd.concat([neg_train, neg_train.iloc[:int(scale * len(neg_train))]])

# Positives first, then negatives; labels are rebuilt as a list of floats in that order.
x_train = pd.concat([pos_train, neg_train])
y_train = [1.0] * len(pos_train) + [0.0] * len(neg_train)
del pos_train, neg_train

In [54]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
# NOTE(review): x_train / y_train are overwritten with the training part, so this
# cell is not idempotent when re-run on its own.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=4242,
)

In [55]:
import xgboost as xgb

# Booster settings: binary logistic objective evaluated with log loss,
# learning rate 0.02 and trees of depth at most 4.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 4,
}

# Wrap the training and validation splits in xgboost's matrix type.
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

# Both sets are reported during training under these names.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Up to 400 boosting rounds, logging every 10, with early stopping after
# 50 rounds without improvement.
bst = xgb.train(
    params,
    d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)


[0]	train-logloss:0.679414	valid-logloss:0.679473
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.567426	valid-logloss:0.567989
[20]	train-logloss:0.48825	valid-logloss:0.489194
[30]	train-logloss:0.430304	valid-logloss:0.431549
[40]	train-logloss:0.386895	valid-logloss:0.388407
[50]	train-logloss:0.35366	valid-logloss:0.355421
[60]	train-logloss:0.327965	valid-logloss:0.329964
[70]	train-logloss:0.30792	valid-logloss:0.310102
[80]	train-logloss:0.292252	valid-logloss:0.294604
[90]	train-logloss:0.279946	valid-logloss:0.282441
[100]	train-logloss:0.270292	valid-logloss:0.272905
[110]	train-logloss:0.262489	valid-logloss:0.265209
[120]	train-logloss:0.255829	valid-logloss:0.25864
[130]	train-logloss:0.250483	valid-logloss:0.253376
[140]	train-logloss:0.245983	valid-logloss:0.248943
[150]	train-logloss:0.242276	valid-logloss:0.245282
[160]	train-logloss:0.23922	vali

In [56]:
# Score the test features with the trained booster.
d_test = xgb.DMatrix(x_test)
p_test = bst.predict(d_test)

# Two-column submission table (test_id, is_duplicate), written without the index.
sub = pd.DataFrame({
    'test_id': df_test['test_id'],
    'is_duplicate': p_test,
})
sub.to_csv('simple_xgb.csv', index=False)