# Kaggle - quora challenge
Team: marsamag
* Marcelo Barata Ribeiro
* Magno Mendes 
* Sayuri Takeda)

In [1]:
import os
import pickle

import numpy
import scipy
import pandas
import string
import warnings
import math
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize

from fuzzywuzzy import fuzz

import gensim
from gensim import corpora, models, similarities
from gensim.models import Word2Vec

from sklearn import model_selection
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

import csv



In [2]:
list_stopwords = set(stopwords.words("english"))

Pick a small sample to test if every function is working

In [3]:
if __name__ == "__main__":
    quora_train = pandas.read_csv("/Dados/Kaggle/train.csv")
    quora_train = quora_train.head(20) # temporary data reduction

In [4]:
if __name__ == "__main__":
    print (type(quora_train))
    print(quora_train.tail())

<class 'pandas.core.frame.DataFrame'>
    id  qid1  qid2                                          question1  \
15  15    31    32  What would a Trump presidency mean for current...   
16  16    33    34                       What does manipulation mean?   
17  17    35    36  Why do girls want to be friends with the guy t...   
18  18    37    38  Why are so many Quora users posting questions ...   
19  19    39    40  Which is the best digital marketing institutio...   

                                            question2  is_duplicate  
15  How will a Trump presidency affect the student...             1  
16                      What does manipulation means?             1  
17           How do guys feel after rejecting a girl?             0  
18  Why do people ask Quora questions which can be...             1  
19  Which is the best digital marketing institute ...             0  


### Functions to process data
We used some functions made by Guilherme Wang and Lucas Medeiros team, because its functions were well writen compared to ours.

In [5]:
# Editing questions with NLTK package

def remove_stopwords(phrase,list_stopwords):
    """
    Receives a phrase and removes all stopwords from a list
    :param phrase: String. A phrase.
    :param list_stopwords: List. A list of stopwords
    :return: The same phrase without stopwords
    """
    final_phrase = []
    words = phrase.split(" ")
    for word in words:
        if word not in list_stopwords:
            final_phrase.append((word))
    
    final_phrase = ' '.join(final_phrase)
    
    return final_phrase
    
def remove_punctuation(phrase):
    """
    Receives a phrase and removes all punctuation from it
    :param phrase: String. A phrase.
    :return: The same phrase without punctuation
    """
    #Check if NA
    if type(phrase) is float:
        if math.isnan(phrase):
            return ("")
    
    translator = str.maketrans('', '', string.punctuation)
    phrase = phrase.translate(translator) #removing punctuation
        
    return phrase

def lemm_wordnet(phrase):
    """
    Receives a phrase and removes lemmatizes it
    :param phrase: String. A phrase.
    :return: The same phrase in lemmas
    """
    lemm = WordNetLemmatizer()
    
    #NA is a float type, so this if is to avoid conflict
    if type(phrase) is not float:
        phrase = [lemm.lemmatize(i) for i in phrase.split()]
        phrase = ' '.join(phrase)
    else:
        return ""
    return phrase
    
def remove_duplicate(phrase):
    """
    Receives a phrase and removes all duplicate words
    :param phrase: String. A phrase.
    :return: The same phrase with just unique words
    """
    aux_phrase = []
        
    if type(phrase) is not float:
        
        for i in phrase.split():
            
            if i not in aux_phrase:
                aux_phrase.append(i)
    
    phrase = ' '.join(aux_phrase)
    
    return phrase
    
    
def all_lower_case(phrase):    
    """
    Receives a phrase and makes it lower case
    :param phrase: String. A phrase.
    :return: The same phrase in lower case
    """
    if type(phrase) is not float:
            phrase = phrase.lower()
    return phrase
    
def stem_snowball(phrase):
    """
    Receives a phrase and returns the same phrase stemmed, lowercase phrase without stopwords
    :param phrase: String. A phrase.
    :return: String. Stemmed, lowercase phrase without stopwords
    """
    stemmer = SnowballStemmer("english")
    
    #Stem words according to stemmer
    final_phrase = []
    words = phrase.split(" ")
    for word in words:
        final_phrase.append((stemmer.stem(word)))
    
    final_phrase = ' '.join(final_phrase)
    
    return final_phrase

In [6]:
#cleaning tool is used so you can easily choose which functions you want to use to clean te text
def cleaning_tool(data, drop_na = True, lower_case = True, rm_duplicate = True, stopwords = True, 
                  punctuation = False, lemm = False, stem = False):
    """
    Function to process all data using calling functions from above, according to what was chosen.
    :param data: data frame.
    :param drop_na: If True drop all lines of data frame with NA
    :param lower_case: If True transform for lower case
    :param rm_duplicate: If True remove all duplicate words in questions
    :param stopwords: If True removes stopwords
    :param punctuation: If True removes punctuation
    :param lemm: If True returns the phrase lemmatized
    :param stem: If True returns the phrase stemmed
    :param list_of_stopwords: List of stopwords to be used
    :return: Question1 and Question2 processed according to parameters
    """
    if drop_na == True:
        data = data.dropna(0)
    
    if rm_duplicate == True:
        data["question1"] = data["question1"].apply(lambda x: remove_duplicate(x))
        data["question2"] = data["question2"].apply(lambda x: remove_duplicate(x))
    
    if lower_case == True:
        data["question1"] = data["question1"].apply(lambda x: all_lower_case(x))
        data["question2"] = data["question2"].apply(lambda x: all_lower_case(x))
    
    if stopwords == True:
        data["question1"] = data["question1"].apply(lambda x: remove_stopwords(x, list_stopwords))
        data["question2"] = data["question2"].apply(lambda x: remove_stopwords(x, list_stopwords))
       
    if punctuation == True:
        data["question1"] = data["question1"].apply(lambda x: remove_punctuation(x))
        data["question2"] = data["question2"].apply(lambda x: remove_punctuation(x))
        
    if lemm_wordnet == True:
        data["question1"] = data["question1"].apply(lambda x: lemm_wordnet(x))
        data["question2"] = data["question2"].apply(lambda x: lemm_wordnet(x))
        
    if stem_snowball == True:
        data["question1"] = data["question1"].apply(lambda x: stem_snowball(x))
        data["question2"] = data["question2"].apply(lambda x: stem_snowball(x))
    
    #We used it two times if some function create a new NA.
    if drop_na == True:
        data = data.dropna(0)    
        
    return data

### create basic features
Here we create some features for the data, such as phrase length, word length and fuzzyWuzzy features which use Levenstein distance

In [7]:
def make_basic_features(data):
    data["len_q1"] = data.question1.apply(lambda x: len(str(x)))
    data["len_q2"] = data.question2.apply(lambda x: len(str(x)))
    data["diff_len"] = data.len_q1 - data.len_q2
    data["len_char_q1"] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
    data["len_char_q2"] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
    data["len_word_q1"] = data.question1.apply(lambda x: len(str(x).split()))
    data["len_word_q2"] = data.question2.apply(lambda x: len(str(x).split()))
    data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
    return data

def make_fuzz_features(data):
    data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1) 
    data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1) 
    data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1) 
    data['fuzz_partial_token _set_ratio'] = data.apply(lambda x : fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1) 
    data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz. partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1) 
    data ['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1) 
    data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
    return data

### Word2Vec features (google news model)
Here we create word2vec features and some distance features, such as cossine distance, euclidean distance and Cabberra distance.

But we don't use those features for the submission, because it was too slow to process them. 

In [11]:
path_io_files = '/Dados/Word2vec/'

In [12]:
googlenews = os.path.join(path_io_files,'GoogleNews-vectors-negative300.bin.gz')
wikimedia = os.path.join(path_io_files,'model_wikimedia_w2v')

In [13]:
model_wikimedia = Word2Vec.load(wikimedia)

In [14]:
#teste com wikimedia em vez de google cloud
num_features = 400    # Word vector dimensionality

def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given paragraph
    # Pre-initialize an empty numpy array (for speed)
    featureVec = numpy.zeros((num_features,),dtype="float32")
    nwords = 0.
    # Index2word is a list that contains the names of the words in the model's vocabulary. 
    #Convert it to a set, for speed
    index2word_set = set(model.wv.index2word)
    # Loop over each word in the review and, if it is in the model's
    # vocabulary, add its feature vector to the total
    for word in words.split(" "):
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = numpy.add(featureVec,model[word])
    # Divide the result by the number of words to get the average
    featureVec = numpy.divide(featureVec,nwords)
    return featureVec

In [33]:
def make_vec_features(data):
    data['featureVec1'] = data['question1'].apply(lambda x: makeFeatureVec(str(x), model_wikimedia, num_features))
    data['featureVec2'] = data['question2'].apply(lambda x: makeFeatureVec(str(x), model_wikimedia, num_features))
    return data

def make_dist_features(data):
    data['dist_cosine'] = data.apply(lambda x: scipy.spatial.distance.cosine(x['featureVec1'], x['featureVec2']), axis=1) 
    data['dist_canber'] = data.apply(lambda x: scipy.spatial.distance.canberra(x['featureVec1'], x['featureVec2']), axis=1) 
    data['dist_euclid'] = data.apply(lambda x: scipy.spatial.distance.euclidean(x['featureVec1'], x['featureVec2']), axis=1)
    data['dist_braycu'] = data.apply(lambda x: scipy.spatial.distance.braycurtis(x['featureVec1'], x['featureVec2']), axis=1)
    #data['dist_jaccar'] = data.apply(lambda x: scipy.spatial.distance.jaccard(x['featureVec1'], x['featureVec2']), axis=1) 
    return data

### clean text features function
Here we delete features that are irrelevant for the machine learning step

In [9]:
def delete_features(data):
    del data['question1']
    del data['question2']
    try: 
        del data['featureVec1']
        del data['featureVec2']
    except: pass
    try: 
        del data['qid1']
        del data['qid2']
   #     del data['id']
    except: pass
    #try: #del data['test_id']
    #except: pass
    return data

### use features 

In [86]:
quora_train = cleaning_tool(quora_train)

In [87]:
quora_train = make_basic_features(quora_train)

In [88]:
quora_train = make_fuzz_features(quora_train)

In [92]:
quora_train = make_vec_features(quora_train)

In [93]:
quora_train = make_dist_features(quora_train)

ValueError: ('array must not contain infs or NaNs', 'occurred at index 292')

In [98]:
quora_train.ix[292]

id                                                                             292
qid1                                                                           584
qid2                                                                           585
question1                                                               cpagripcom
question2                                                            bestmytestcom
is_duplicate                                                                     0
len_q1                                                                          10
len_q2                                                                          13
diff_len                                                                        -3
len_char_q1                                                                      8
len_char_q2                                                                      8
len_word_q1                                                                      1
len_

### delete features

In [None]:
quora_train = delete_features(quora_train)

### train/test split

In [90]:
x_quora_train = quora_train.drop("is_duplicate", axis=1)
y_quora_train = quora_train["is_duplicate"]
quora_train_features, quora_test_features, quora_train_y, quora_test_y = model_selection.train_test_split(
    x_quora_train, y_quora_train, test_size = 0.3, random_state = 0)

### Machine Learning Models

In [91]:
randomforest = RandomForestClassifier(n_estimators=300, max_features='auto', bootstrap=False, 
                               oob_score=False, n_jobs=-1, random_state=0)
randomforest.fit(quora_train_features, quora_train_y)

predict = randomforest.predict_proba(quora_test_features)

print(log_loss(quora_test_y,predict))

0.638441541148


In [67]:
predict_sample = predict[0:10]
predict_sample

array([[ 0.45      ,  0.55      ],
       [ 0.61333333,  0.38666667],
       [ 0.69      ,  0.31      ],
       [ 0.61666667,  0.38333333],
       [ 0.87666667,  0.12333333],
       [ 0.58      ,  0.42      ],
       [ 0.95666667,  0.04333333],
       [ 0.66      ,  0.34      ],
       [ 0.98333333,  0.01666667],
       [ 0.77666667,  0.22333333]])

# Submission
Here we submit our predictions using all the training and testing databases

In [10]:
quora_train = pandas.read_csv("/Dados/Kaggle/train.csv")
quora_test = pandas.read_csv("/Dados/Kaggle/test.csv")

In [12]:
def use_selected_functions(quora_data):
    quora_data = cleaning_tool(quora_data)
    quora_data = make_basic_features(quora_data)
    quora_data = delete_features(quora_data)
    return quora_data

In [13]:
quora_train = use_selected_functions(quora_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [14]:
quora_test = use_selected_functions(quora_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [15]:
quora_train.tail()

Unnamed: 0,id,is_duplicate,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words
404285,404285,0,57,55,2,20,19,7,7,6
404286,404286,1,19,16,3,11,11,3,3,2
404287,404287,0,9,12,-3,6,11,2,2,1
404288,404288,0,63,80,-17,20,24,9,13,0
404289,404289,0,16,16,0,11,11,3,3,3


In [16]:
quora_test.tail()

Unnamed: 0,test_id,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words
2345791,2345791,44,23,21,18,14,7,4,0
2345792,2345792,33,34,-1,16,20,5,4,1
2345793,2345793,51,38,13,19,15,6,4,0
2345794,2345794,82,98,-16,21,23,11,12,10
2345795,2345795,46,29,17,18,15,5,4,2


In [17]:
x_quora_train = quora_train.drop("is_duplicate", axis=1)
y_quora_train = quora_train["is_duplicate"]
x_quora_test = quora_test

In [18]:
randomforest = RandomForestClassifier(n_estimators=300, max_features='auto', bootstrap=False, 
                               oob_score=False, n_jobs=-1, random_state=0)
randomforest.fit(x_quora_train, y_quora_train)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [19]:
predict = randomforest.predict_proba(x_quora_test)
prediction_submission = [i[1] for i in predict]

In [24]:
len(prediction_submission)

2345790

### save pickle with predictions

In [36]:
predictions_list_file = 'predictions_list.pkl'
pickle.dump(prediction_submission, open(predictions_list_file, 'wb'))

### create csv file with predictions

In [37]:
count = 0
test_list = []
submission_file = "submission.csv"
with open(submission_file, 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(['test_id'] + ['is_duplicate'])
    for pred in prediction_submission: 
        writer.writerow([count] + [float(pred)])
        count += 1