In [1]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None

from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.utils import shuffle

from nltk.corpus import stopwords
from nltk.metrics import jaccard_distance



In [2]:
# Fixed seed so the shuffled row order (and therefore the subset sliced from
# it further down) is identical on every Restart & Run All.
RANDOM_STATE = 42
data = shuffle(pd.read_csv('train.csv'), random_state=RANDOM_STATE)

## Intuitive Features

In [3]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(404290, 6)

In [4]:
# TODO: temporarily use only the first 10,000 shuffled rows (presumably to
# keep feature extraction and cross-validation fast); switch to the full set later.
# .copy() hands process() an independent frame to add columns to, rather than
# a row slice of `data` -- assigning into such a slice is what triggers
# pandas' SettingWithCopyWarning.
train_data = data[:10000].copy()
# test_data = data[10000:11000]

In [5]:
def _as_text(value):
    """Return ``value`` as a string, mapping missing values (None/NaN) to ''."""
    return '' if pd.isnull(value) else str(value)


def process(df):
    """Add hand-crafted similarity features for every question pair in ``df``.

    The new columns are written onto ``df`` itself (in place) and the same
    frame is returned, so ``out = process(df)`` and a bare ``process(df)``
    both work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``question1`` and ``question2`` columns. Missing values
        (None/NaN) are treated as empty strings instead of being featurized
        as the literal text ``'None'`` / ``'nan'``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with 18 extra columns:

        * feature set 1 -- per-question character/word/question-mark counts,
          their absolute differences, and the number of shared lowercase
          whitespace-separated tokens;
        * feature set 2 -- seven ``fuzz`` string-similarity scores.
    """
    # Convert each question to text exactly once; every feature below reuses it.
    q1 = [_as_text(value) for value in df.question1]
    q2 = [_as_text(value) for value in df.question2]
    pairs = list(zip(q1, q2))

    # feature set 1
    for prefix, texts in (('q1', q1), ('q2', q2)):
        df[prefix + '_char_length_with_space'] = [len(t) for t in texts]
        df[prefix + '_char_length_without_space'] = [len(t.replace(' ', '')) for t in texts]
        # split(' ') (not split()) is kept from the original feature definition:
        # consecutive spaces yield empty tokens that are counted as words.
        df[prefix + '_word_length'] = [len(t.split(' ')) for t in texts]
        df[prefix + '_question_mark'] = [t.count('?') for t in texts]  # TODO: prob not good?

    df['word_length_diff'] = (df.q2_word_length - df.q1_word_length).abs()
    df['char_length_diff'] = (df.q2_char_length_without_space - df.q1_char_length_without_space).abs()
    df['common_words'] = [
        len(set(a.lower().split()) & set(b.lower().split())) for a, b in pairs
    ]

    # feature set 2
    # (column name, scorer) pairs; a tuple keeps the column order fixed.
    fuzz_scorers = (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    )
    for column, scorer in fuzz_scorers:
        # Iterating the pre-built pairs avoids a row-wise DataFrame.apply per scorer.
        df[column] = [scorer(a, b) for a, b in pairs]

    return df

In [6]:
# Compute the feature columns for the training subset. process() adds them to
# train_data itself and returns that same frame, so train_df and train_data
# refer to one object.
train_df = process(train_data)
# test_df = process(test_data)

In [7]:
# Inspect the first row to eyeball the raw questions next to their new features.
train_df.iloc[0]

id                                                                    126335
qid1                                                                   35698
qid2                                                                   74317
question1                                 How can I hack a Facebook account?
question2                        How do paid hackers hack Facebook profiles?
is_duplicate                                                               1
q1_char_length_with_space                                                 34
q1_char_length_without_space                                              28
q1_word_length                                                             7
q1_question_mark                                                           1
q2_char_length_with_space                                                 43
q2_char_length_without_space                                              37
q2_word_length                                                             7

In [8]:
# "Feature set 1" from process(): per-question length / question-mark counts
# plus the pairwise differences and shared-word count.
basic_feature_cols = [
    'q1_char_length_with_space',
    'q1_char_length_without_space',
    'q1_word_length',
    'q1_question_mark',
    'q2_char_length_with_space',
    'q2_char_length_without_space',
    'q2_word_length',
    'q2_question_mark',
    'word_length_diff',
    'char_length_diff',
    'common_words',
]

# "Feature set 2" from process(): fuzzywuzzy similarity scores.
fuzzy_feature_cols = [
    'fuzz_qratio',
    'fuzz_WRatio',
    'fuzz_partial_ratio',
    'fuzz_partial_token_set_ratio',
    'fuzz_partial_token_sort_ratio',
    'fuzz_token_set_ratio',
    'fuzz_token_sort_ratio',
]

# Columns fed to the models. Use `basic_feature_cols` on its own to run the
# experiment without the fuzzy-matching features.
train_cols = basic_feature_cols + fuzzy_feature_cols

In [9]:
# Target column. Kept as a one-element list, so train_df[label_col] selects a
# single-column DataFrame rather than a Series.
label_col = ['is_duplicate']

In [10]:
from sklearn import svm
from sklearn import tree
from sklearn import linear_model
from sklearn import ensemble

from sklearn.model_selection import cross_val_score

In [11]:
# Feature matrix: one column per entry in train_cols.
X = train_df[train_cols]
# Labels: the single-column label frame flattened to a 1-D array.
y = train_df[label_col].values.ravel()

In [12]:
# Support-vector classifier with default hyper-parameters, scored with
# cross_val_score's default folds and scoring; the cell displays the mean score.
my_svm = svm.SVC()
scores = cross_val_score(estimator=my_svm, X=X, y=y)
scores.mean()
# Hold-out alternative, disabled while test_df is not being built:
#   my_svm.fit(X, y)
#   my_svm.score(test_df[train_cols], test_df[label_col])

0.64589993165832615

In [13]:
# Logistic regression with default hyper-parameters, scored with
# cross_val_score's default folds and scoring; the cell displays the mean score.
my_lr = linear_model.LogisticRegression()
scores = cross_val_score(estimator=my_lr, X=X, y=y)
scores.mean()
# Hold-out alternative, disabled while test_df is not being built:
#   my_lr.fit(X, y)
#   my_lr.score(test_df[train_cols], test_df[label_col])

0.66369889287448636

In [14]:
# Single decision tree, scored by cross-validation (mean score displayed).
# random_state is pinned because the tree randomly permutes candidate features
# at each split, so an unseeded run can give a different score each time.
my_dt = tree.DecisionTreeClassifier(random_state=42)
scores = cross_val_score(my_dt, X, y)
scores.mean()
# my_dt.fit(X, np.asarray(y).ravel())
# my_dt.score(test_df[train_cols], test_df[label_col])

0.66459817280241429

In [15]:
# Random forest, scored by cross-validation (mean score displayed).
# random_state is pinned so the bootstrap samples and per-split feature
# subsets -- and therefore the reported score -- are reproducible.
my_rf = ensemble.RandomForestClassifier(random_state=42)
scores = cross_val_score(my_rf, X, y)
scores.mean()
# my_rf.fit(X, np.asarray(y).ravel())
# my_rf.score(test_df[train_cols], test_df[label_col])

0.69539831529492435

In [16]:
# Extremely randomized trees, scored by cross-validation (mean score displayed).
# random_state is pinned so the randomly drawn split thresholds -- and
# therefore the reported score -- are reproducible.
my_et = ensemble.ExtraTreesClassifier(random_state=42)
scores = cross_val_score(my_et, X, y)
scores.mean()
# my_et.fit(X, np.asarray(y).ravel())
# my_et.score(test_df[train_cols], test_df[label_col])

0.69509927546303463

In [17]:
# Gradient-boosted trees, scored by cross-validation (mean score displayed).
# random_state is pinned so the randomness in the underlying tree learners
# does not change the reported score between runs.
my_gb = ensemble.GradientBoostingClassifier(random_state=42)
scores = cross_val_score(my_gb, X, y)
scores.mean()
# my_gb.fit(X, np.asarray(y).ravel())
# my_gb.score(test_df[train_cols], test_df[label_col])

0.71479917701933804

In [18]:
# AdaBoost with 100 boosting rounds, scored by cross-validation (mean score
# displayed). random_state is pinned so the seed passed to the base estimators
# is fixed and the reported score is reproducible.
my_ab = ensemble.AdaBoostClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(my_ab, X, y)
scores.mean()

0.6972981754189379