In [1]:
import matplotlib as plt
import pandas as pd
import numpy as np

import sklearn
import sklearn.feature_extraction.text as sklearnText
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
data_set = pd.read_csv("Data/fuzz.csv")

In [3]:
data_set

Unnamed: 0.1,Unnamed: 0,id,question1,question2,is_duplicate_x,is_duplicate_y,merged,fuzzRatio,fuzzPartial,fuzzTokenSort,fuzzTokenSet
0,0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,,0,What is the step by step guide to invest in sh...,93,98,93,100
1,1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,,0,What is the story of Kohinoor (Koh-i-Noor) Dia...,65,73,63,86
2,2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,,0,How can I increase the speed of my internet co...,45,41,63,63
3,3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,,0,Why am I mentally very lonely? How can I solve...,7,20,24,28
4,4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,,0,"Which one dissolve in water quikly sugar, salt...",37,54,47,67
5,5,5,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",,1,Astrology: I am a Capricorn Sun Cap moon and c...,66,67,74,78
6,6,6,Should I buy tiago?,What keeps childern active and far from phone ...,,0,Should I buy tiago?!SPLIT!What keeps childern ...,17,32,23,24
7,7,7,How can I be a good geologist?,What should I do to be a great geologist?,,1,How can I be a good geologist?!SPLIT!What shou...,59,67,61,71
8,8,8,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",,0,When do you use シ instead of し?!SPLIT!When do ...,85,90,87,93
9,9,9,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,,0,Motorola (company): Can I hack my Charter Moto...,50,57,44,65


In [67]:
def split_train_test(X, y, split_random_state = 1):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = split_random_state)
    return X_train, X_test, y_train, y_test

def create_classifier(hidden_layer_sizes_tuple, max_iterations):
    '''create mlp classifiers with parameters'''
    from sklearn.neural_network import MLPClassifier       
    mlp = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes_tuple, max_iter=max_iterations)
    return mlp
    
def create_regressor(hidden_layer_sizes_tuple, max_iterations):
    from sklearn.neural_network import MLPRegressor
    '''create mlp classifiers with parameters'''
    mlp = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes_tuple, max_iter=max_iterations)
    return mlp

def createNeuralNetwork(X_train, X_test, y_train, y_test, hidden_layer_tuples, max_iterations, kind = 'classification', 
                        split_random_state = 1, use_Scaler = True):
    ''' create neural network and evaluate using training and test set.
        Standard scaler by default enabled, 'use_Scaler = False' to disable.
        'hidden_layer_tuples' & 'max_iterations' to define the parameters of the neural network
    '''
    # if enabled, use a standard scaler on the data
    if use_Scaler == True:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()

        # Fit the training data
        scaler.fit(X_train)

        StandardScaler(copy=True, with_mean=True, with_std=True)

        # Apply the transformations to the data:
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        
    if kind == 'classification':
        # create the classifier
        mlp = create_classifier(hidden_layer_tuples, max_iterations)
        mlp.fit(X_train,y_train)
        predictions = mlp.predict(X_test)
        from sklearn.metrics import classification_report,confusion_matrix
        try:
            print(confusion_matrix(y_test,predictions))
        except:
            print()
        
    elif kind == 'regression':
        mlp = create_regressor(hidden_layer_tuples, max_iterations)
        mlp.fit(X_train,y_train)
        predictions = mlp.predict(X_test)
    
    # run only if y is know, so do not run if classifying unknown data 
    if y_test!= []:
        score_train = mlp.score(X_train, y_train)
        score_test = mlp.score(X_test, y_test)
        print(score_train, score_test)
    
    #print(predictions)
    return predictions
    
X_train, X_test, y_train, y_test = split_train_test(data_set.iloc[:, -4:], data_set['is_duplicate_y'], split_random_state = 1)
createNeuralNetwork(X_train, X_test, y_train, y_test, (30,20,15,15), 3000, kind = 'classification')

[[37317 13701]
 [10784 18989]]


array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

# Running on the test set (a.k.a. the assignment)

### create FuzzyWuzzy paramaters as input

In [50]:
test_data = pd.read_csv("Data/test_data.csv")

In [51]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [52]:
test_data['merged'] = test_data['question1'] + '!SPLIT!' + test_data['question2']

In [53]:
def calcFuzzRation(merged):
    merged = str(merged)
    merged = merged.split('!SPLIT!')
    try:
        fuzzRatio = fuzz.ratio(merged[0],merged[1])
    except:
        fuzzRatio = 0
    
    return fuzzRatio

def calcFuzzPartial(merged):
    merged = str(merged)
    merged = merged.split('!SPLIT!')
    try:
        fuzzRatio = fuzz.partial_ratio(merged[0],merged[1])
    except:
        fuzzRatio = 0
    
    return fuzzRatio

def calcFuzzTokenSort(merged):
    merged = str(merged)
    merged = merged.split('!SPLIT!')
    try:
        fuzzRatio = fuzz.token_sort_ratio(merged[0],merged[1])
    except:
        fuzzRatio = 0
    
    return fuzzRatio

def calcFuzzTokenSet(merged):
    merged = str(merged)
    merged = merged.split('!SPLIT!')
    try:
        fuzzRatio = fuzz.token_set_ratio(merged[0],merged[1])
    except:
        fuzzRatio = 0
    
    return fuzzRatio

In [54]:
test_data['fuzzRatio'] = test_data['merged'].apply(calcFuzzRation)

In [55]:
test_data['fuzzPartial'] = test_data['merged'].apply(calcFuzzPartial)

In [56]:
test_data['fuzzTokenSort'] = test_data['merged'].apply(calcFuzzTokenSort)

In [57]:
test_data['fuzzTokenSet'] = test_data['merged'].apply(calcFuzzTokenSet)

In [58]:
test_data

Unnamed: 0,test_id,question1,question2,merged,fuzzRatio,fuzzPartial,fuzzTokenSort,fuzzTokenSet
0,15,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...,What would a Trump presidency mean for current...,51,52,46,52
1,20,Why do rockets look white?,Why are rockets and boosters painted white?,Why do rockets look white?!SPLIT!Why are rocke...,64,65,66,81
2,21,What's causing someone to be jealous?,What can I do to avoid being jealous of someone?,What's causing someone to be jealous?!SPLIT!Wh...,45,65,75,78
3,23,How much is 30 kV in HP?,Where can I find a conversion chart for CC to ...,How much is 30 kV in HP?!SPLIT!Where can I fin...,7,20,23,23
4,34,What is the best travel website in spain?,What is the best travel website?,What is the best travel website in spain?!SPLI...,88,97,87,100
5,46,How did Darth Vader fought Darth Maul in Star ...,Does Quora have a character limit for profile ...,How did Darth Vader fought Darth Maul in Star ...,31,33,36,35
6,48,What are some examples of products that can be...,What are some of the products made from crude ...,What are some examples of products that can be...,77,68,82,90
7,51,Will a Blu Ray play on a regular DVD player? I...,How can you play a Blu Ray DVD on a regular DV...,Will a Blu Ray play on a regular DVD player? I...,67,67,82,91
8,54,How GST affects the CAs and tax officers?,Why can't I do my homework?,How GST affects the CAs and tax officers?!SPLI...,32,39,42,42
9,56,Who is israil friend?,Is my boyfriend lying about his true feelings ...,Who is israil friend?!SPLIT!Is my boyfriend ly...,24,48,28,62


In [90]:
X_train = data_set.iloc[:, -4:]
y_train = data_set['is_duplicate_y']

# there is no training data available, we only need to classify
y_test = []
X_test = test_data.iloc[:, -4:]

output = createNeuralNetwork(X_train, X_test, y_train, y_test, (30,20,15,15), 3000, kind = 'classification')




In [91]:
len(X_test),len(output)

(81126, 81126)

In [89]:
len(test_data)

81126

In [94]:
submission = pd.concat([test_data.iloc[:, 0], pd.DataFrame(output)], axis=1, sort=False)

In [95]:
submission.to_csv('result/submission.csv', index = False)