# Quora questions pairs
link: https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs  
link: https://www.kaggle.com/c/quora-question-pairs/data


In [13]:
import pandas as pd
import numpy as np
import scipy as sp
def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = np.sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    return ll

train_data = pd.read_csv('./quora/quora_duplicate_questions.tsv',sep='\t').sample(5000, random_state=40)
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
29716,29716,54951,54952,What is swarm robot?,Will swarm robots exist in the near future?,0
310538,310538,434667,434668,"What is the intuition behind the ""potential"" f...","How the infix to postfix algorithm came to be,...",0
372778,372778,64563,217304,How do I learn ethical hacking in online?,"How do I learn ethical hacking, for free of co...",0
395532,395532,528527,1772,What is the significance of 9 9 5 to a Pythago...,How can you increase your height?,0
10026,10026,19463,19464,What is a love crime?,"If I love someone, is it a crime?",0


In [10]:
train_data.shape

(5000, 6)

In [17]:
n, d = train_data.shape
print "Duplicated questions %d"%sum(train_data['is_duplicate'])
print "Not duplicated questions %d"% (n- sum(train_data['is_duplicate']))
#porcentajes
print "Duplicated questions per cent %f "% (100*np.sum(train_data['is_duplicate']) / float(n))
print "Not duplicated questions per cent %f "% (100*(n- np.sum(train_data['is_duplicate'])) / float(n))

Duplicated questions 1894
Not duplicated questions 3106
Duplicated questions per cent 37.880000 
Not duplicated questions per cent 62.120000 


In [19]:
import re, time
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer, word_tokenize
from nltk.stem.porter import PorterStemmer

def word_extractor(text):
    stemmer = PorterStemmer()
    commonwords = stopwords.words('english')
    try:
        text = re.sub(r'([a-z])\1+', r'\1\1',text)#substitute multiple letter by two
        text = re.sub(r'([^\s\w]|_)+', r'',text) #delete chars that are not letter or number and downcase
        text = re.sub(r'[^\w]', r' ',text)
    except: 
        if text != text:
            text = " "
        print text
    words = []
    wordtokens = [ stemmer.stem(word.lower())  for word in word_tokenize(text.decode('utf-8', 'ignore')) ]
    for word in wordtokens:
        if word not in commonwords:
            words.append(word)
    return words

question1 =  train_data.iloc[0][3]
question2 = train_data.iloc[0][4]
print question1
print question2
print 1==train_data.iloc[0].is_duplicate

print word_extractor(question1)
print word_extractor(question2)

#tokeniza
pregunta1 = [word_extractor(word) for word in train_data.question1 ]
pregunta2 = [word_extractor(word) for word in train_data.question2 ]

print "listo"

What is swarm robot?
Will swarm robots exist in the near future?
False
[u'swarm', u'robot']
[u'swarm', u'robot', u'exist', u'near', u'futur']
listo


## Vectorizar

In [41]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False,ngram_range=(1, 1), binary='False')
#vectorizer = TfidfVectorizer(ngram_range=(1, 1), binary='False')
preguntas = np.concatenate((np.asarray(pregunta1),np.asarray(pregunta2)))
vectorizer.fit(np.asarray(preguntas))
vocab = vectorizer.get_feature_names()
print "Cantidad de palabras en el vocabulario: %d" %len(vocab)

pregunta1_vectorizada = vectorizer.transform(pregunta1)
pregunta2_vectorizada = vectorizer.transform(pregunta1)

Cantidad de palabras en el vocabulario: 7931


In [44]:

vector_pregunta1 = vectorizer.transform(pregunta1)  #+pregunta2])
vector_pregunta2 = vectorizer.transform(pregunta2)  #+pregunta2])
    
dist=list(np.array(vector_pregunta1.sum(axis=0)).reshape(-1,))
dist2=list(np.array(vector_pregunta2.sum(axis=0)).reshape(-1,))

# Se ordenan las palabras por cantidad
lista_train = zip(vocab, dist)
lista_train.sort(key=lambda x: x[1])
lista_train.reverse()
# Se ordenan las palabras por cantidad
lista_test = zip(vocab, dist2)
lista_test.sort(key=lambda x: x[1])
lista_test.reverse()

N = 25
pals_train = []
count_train =[]
pals_test = []
count_test = []
for i in range(N):
    tag, count = lista_train[i]
    pals_train.append(tag)
    count_train.append(count)
    tag_test, count_t = lista_test[i]
    pals_test.append(tag_test)
    count_test.append(count_t)    

a = [range(1,N+1),pals_train,count_train, ["#"]*N, range(1,N+1), pals_test,count_test]
table =  zip(*a)
from tabulate import tabulate
print tabulate(table, headers=["Pregunta 1","Palabra","Frecuencia","#", "Pregunta 2","Palabra","Frecuencia"],  tablefmt="rst")

  Pregunta 1  Palabra      Frecuencia  #      Pregunta 2  Palabra      Frecuencia
           1  whi                 507  #               1  whi                 529
           2  best                413  #               2  best                457
           3  doe                 381  #               3  doe                 324
           4  get                 239  #               4  get                 250
           5  like                178  #               5  like                180
           6  india               161  #               6  peopl               178
           7  peopl               159  #               7  way                 174
           8  use                 154  #               8  india               172
           9  way                 149  #               9  use                 167
          10  differ              147  #              10  good                143
          11  good                132  #              11  would               142
          12  wo

In [59]:
from scipy.sparse import hstack
X = hstack((pregunta1_vectorizada, pregunta2_vectorizada))

y = np.asarray(train_data.is_duplicate.values)

In [73]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)

In [56]:
X.shape

(5000, 15862)

In [76]:
from sklearn.naive_bayes import BernoulliNB
model = BernoulliNB()
model = model.fit(X_train, y_train)
print "Naive Bayes"
print "LogLoss en Train set predict model: %f"%logloss(y_train,model.predict_proba(X_train)[:,1])
print "LogLoss en Val set predict model: %f"%logloss(y_val,model.predict_proba(X_val)[:,1])
#print "Cross validation predict model: %f"%C_V(model,X.todense(),y)
#score_the_model(model,x,y,xt,yt,"BernoulliNB")

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model = model.fit(X_train, y_train)
print "Multinomial Bayes"
print "LogLoss en Train set predict model: %f"%logloss(y_train,model.predict_proba(X_train)[:,1])
print "LogLoss en Val set predict model: %f"%logloss(y_val,model.predict_proba(X_val)[:,1])
#print "Cross validation predict model: %f"%C_V(model,X.todense(),y)

from sklearn.ensemble import RandomForestClassifier as RandomFor
model = RandomFor(max_depth=15)
model = model.fit(X_train, y_train)
print "Random Forest"
print "LogLoss en Train set predict model: %f"%logloss(y_train,model.predict_proba(X_train)[:,1])
print "LogLoss en Val set predict model: %f"%logloss(y_val,model.predict_proba(X_val)[:,1])
#print "Cross validation predict model: %f"%C_V(model,X.todense(),y)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty='l2',C=0.1)
model = model.fit(X_train, y_train)
print "Logistic Regression"
print "LogLoss en Train set predict model: %f"%logloss(y_train,model.predict_proba(X_train)[:,1])
print "LogLoss en Val set predict model: %f"%logloss(y_val,model.predict_proba(X_val)[:,1])
#print "Cross validation predict model: %f"%C_V(model,X.todense(),y)

Naive Bayes
LogLoss en Train set predict model: 0.421424
LogLoss en Val set predict model: 1.207405
Multinomial Bayes
LogLoss en Train set predict model: 0.304714
LogLoss en Val set predict model: 1.086724
Random Forest
LogLoss en Train set predict model: 0.618490
LogLoss en Val set predict model: 0.642403
Logistic Regression
LogLoss en Train set predict model: 0.492190
LogLoss en Val set predict model: 0.616440
