In [1]:
import matplotlib as plt
import pandas as pd
import numpy as np

import sklearn
import sklearn.feature_extraction.text as sklearnText
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

import difflib
from functools import partial

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

  from numpy.core.umath_tests import inner1d


In [2]:
data_set = pd.read_csv("Data/fuzz.csv")

In [3]:
def stringmatcher(dataframe, column1, column2):
    return difflib.SequenceMatcher(None, str(dataframe[column1]), str(dataframe[column2])).ratio()

In [4]:
# Adds the sequence matching ratio to the dataframe
data_set['SMratio'] = data_set.apply(partial(stringmatcher, column1='question1', column2='question2'), axis=1)

In [5]:
data_set.head()

Unnamed: 0.1,Unnamed: 0,id,question1,question2,is_duplicate_x,is_duplicate_y,merged,fuzzRatio,fuzzPartial,fuzzTokenSort,fuzzTokenSet,SMratio
0,0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,,0,What is the step by step guide to invest in sh...,93,98,93,100,0.926829
1,1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,,0,What is the story of Kohinoor (Koh-i-Noor) Dia...,65,73,63,86,0.647482
2,2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,,0,How can I increase the speed of my internet co...,45,41,63,63,0.454545
3,3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,,0,Why am I mentally very lonely? How can I solve...,7,20,24,28,0.069565
4,4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,,0,"Which one dissolve in water quikly sugar, salt...",37,54,47,67,0.365217


In [6]:
def split_train_test(X, y, split_random_state = 1):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = split_random_state)
    return X_train, X_test, y_train, y_test

def create_classifier(hidden_layer_sizes_tuple, max_iterations):
    '''create mlp classifiers with parameters'''
    from sklearn.neural_network import MLPClassifier       
    mlp = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes_tuple, max_iter=max_iterations)
    return mlp
    
def create_regressor(hidden_layer_sizes_tuple, max_iterations):
    from sklearn.neural_network import MLPRegressor
    '''create mlp classifiers with parameters'''
    mlp = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes_tuple, max_iter=max_iterations)
    return mlp

def createNeuralNetwork(X_train, X_test, y_train, y_test, hidden_layer_tuples, max_iterations, kind = 'classification', 
                        split_random_state = 1, use_Scaler = True):
    ''' create neural network and evaluate using training and test set.
        Standard scaler by default enabled, 'use_Scaler = False' to disable.
        'hidden_layer_tuples' & 'max_iterations' to define the parameters of the neural network
    '''
    # if enabled, use a standard scaler on the data
    if use_Scaler == True:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()

        # Fit the training data
        scaler.fit(X_train)

        StandardScaler(copy=True, with_mean=True, with_std=True)

        # Apply the transformations to the data:
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        
    if kind == 'classification':
        # create the classifier
        mlp = create_classifier(hidden_layer_tuples, max_iterations)
        mlp.fit(X_train,y_train)
        predictions = mlp.predict(X_test)
        from sklearn.metrics import classification_report,confusion_matrix
        try:
            print(confusion_matrix(y_test,predictions))
        except:
            print()
        
    elif kind == 'regression':
        mlp = create_regressor(hidden_layer_tuples, max_iterations)
        mlp.fit(X_train,y_train)
        predictions = mlp.predict(X_test)
    
    # run only if y is know, so do not run if classifying unknown data 
    if y_test!= []:
        score_train = mlp.score(X_train, y_train)
        score_test = mlp.score(X_test, y_test)
        print(score_train, score_test)
    
    #print(predictions)
    return predictions
    
#X_train, X_test, y_train, y_test = split_train_test(data_set.iloc[:, -4:], data_set['is_duplicate_y'], split_random_state = 1)
#createNeuralNetwork(X_train, X_test, y_train, y_test, (30,20,15,15), 3000, kind = 'classification')

# Running on the test set (a.k.a. the assignment)

### create FuzzyWuzzy paramaters as input

In [7]:
test_data = pd.read_csv("Data/test_data.csv")

In [8]:
test_data['merged'] = test_data['question1'] + '!SPLIT!' + test_data['question2']

In [9]:
def calcFuzzRation(merged):
    merged = str(merged)
    merged = merged.split('!SPLIT!')
    try:
        fuzzRatio = fuzz.ratio(merged[0],merged[1])
    except:
        fuzzRatio = 0
    
    return fuzzRatio

def calcFuzzPartial(merged):
    merged = str(merged)
    merged = merged.split('!SPLIT!')
    try:
        fuzzRatio = fuzz.partial_ratio(merged[0],merged[1])
    except:
        fuzzRatio = 0
    
    return fuzzRatio

def calcFuzzTokenSort(merged):
    merged = str(merged)
    merged = merged.split('!SPLIT!')
    try:
        fuzzRatio = fuzz.token_sort_ratio(merged[0],merged[1])
    except:
        fuzzRatio = 0
    
    return fuzzRatio

def calcFuzzTokenSet(merged):
    merged = str(merged)
    merged = merged.split('!SPLIT!')
    try:
        fuzzRatio = fuzz.token_set_ratio(merged[0],merged[1])
    except:
        fuzzRatio = 0
    
    return fuzzRatio

In [10]:
test_data['fuzzRatio'] = test_data['merged'].apply(calcFuzzRation)

In [11]:
test_data['fuzzPartial'] = test_data['merged'].apply(calcFuzzPartial)

In [12]:
test_data['fuzzTokenSort'] = test_data['merged'].apply(calcFuzzTokenSort)

In [13]:
test_data['fuzzTokenSet'] = test_data['merged'].apply(calcFuzzTokenSet)

In [14]:
test_data['SMratio'] = test_data.apply(partial(stringmatcher, column1='question1', column2='question2'), axis=1)

In [15]:
test_data.head()

Unnamed: 0,test_id,question1,question2,merged,fuzzRatio,fuzzPartial,fuzzTokenSort,fuzzTokenSet,SMratio
0,15,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...,What would a Trump presidency mean for current...,53,54,56,61,0.51087
1,20,Why do rockets look white?,Why are rockets and boosters painted white?,Why do rockets look white?!SPLIT!Why are rocke...,64,65,66,81,0.637681
2,21,What's causing someone to be jealous?,What can I do to avoid being jealous of someone?,What's causing someone to be jealous?!SPLIT!Wh...,59,65,75,78,0.447059
3,23,How much is 30 kV in HP?,Where can I find a conversion chart for CC to ...,How much is 30 kV in HP?!SPLIT!Where can I fin...,25,33,30,30,0.074074
4,34,What is the best travel website in spain?,What is the best travel website?,What is the best travel website in spain?!SPLI...,88,97,87,100,0.876712


In [16]:
# Hiermee haalt hij momenteel de 70% accuracy
X_train = data_set.iloc[:, -5:]
y_train = data_set['is_duplicate_y']

# there is no training data available, we only need to classify
y_test = []
X_test = test_data.iloc[:, -5:]



In [47]:
output = createNeuralNetwork(X_train, X_test, y_train, y_test, (30,20,15,15), 3000, kind = 'classification')




In [48]:
len(X_test),len(output)

(81126, 81126)

In [49]:
len(test_data)

81126

In [52]:
from collections import Counter
Counter(output)

Counter({0: 47762, 1: 33364})

In [53]:
submission = pd.concat([test_data.iloc[:, 0], pd.DataFrame(output)], axis=1, sort=False)
submission.head()

Unnamed: 0,test_id,0
0,15,0
1,20,1
2,21,1
3,23,0
4,34,0


In [54]:
submission.to_csv('result/submission.csv', index = False)

# 2
Ingebouwde bag of words, gekloot vooral

In [80]:
lol = ["this is stupid", "lol test"]
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect.fit(lol)
print(format(vect.vocabulary_))

{'this': 4, 'is': 0, 'stupid': 2, 'lol': 1, 'test': 3}


In [81]:
data_set.question1.values.astype('U')

array(['What is the step by step guide to invest in share market in india?',
       'What is the story of Kohinoor (Koh-i-Noor) Diamond?',
       'How can I increase the speed of my internet connection while using a VPN?',
       ..., 'What is one coin?',
       'What is the approx annual cost of living while studying in UIC Chicago, for an Indian student?',
       'What is like to have sex with cousin?'], dtype='<U623')

In [82]:
# Dit maakt dus zo'n rare sparsematrix. Het heeft ook geen zin om er
# een array van te maken aangezien de lengte daarvan een stuk langer is. 
# Het is hierdoor vrij lastig om deze door dat neural network heen te gooien.
vector = CountVectorizer().fit(data_set.merged.values.astype('U'))
Q1 = vect.transform(data_set.merged.values.astype('U'))
#print(format(repr(X_train)))

In [86]:
Q1

<323164x5 sparse matrix of type '<class 'numpy.int64'>'
	with 159352 stored elements in Compressed Sparse Row format>

In [85]:
len(Q1.toarray())

323164

In [61]:
# Test met merged questions als input
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
scores = cross_val_score(LogisticRegression(), Q1, y_train, cv=5)
print(format(np.mean(scores)))

0.6311965445413981


In [63]:
from sklearn.model_selection import GridSearchCV
param = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param, cv=5)
grid.fit(Q1, y_train)
print(format(grid.best_score_))
print(grid.best_params_)

0.6311965441695239
{'C': 0.1}


In [68]:
y = grid.predict(testQ1)

In [82]:
# Werkt dus om een of andere reden echt totaal niet..
from collections import Counter
Counter(y)

Counter({0: 81118, 1: 8})

In [83]:
testQ1 = vect.transform(test_data.merged.values.astype('U'))

# 3

In [17]:
# Basis functie die even alle onnodige worden eruit haalt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string
def cleantext(text):
    text = str(text)
    stripped = re.sub("!SPLIT!"," ",text)
    stripped = re.sub(r'[^\w\s]','',stripped)
    stripped = stripped.lower()
    stopwordss = set(stopwords.words('english'))
    tokens = word_tokenize(stripped)
    stripped = [word for word in tokens if word not in stopwordss]
    result = ' '.join(stripped)
    return result

In [18]:
# Deze cellen duren heel lang rip
# Wat ik wilde doen hier is even kijken of het cleanen en de score scalen naar 
# Zelfde score range als fuzzy wuzzy, kijken of dat invloed heeft
# Alleen de code volledig runnen heb ik nog niet gekregen,
# Doe het vanmiddag
data_set['clean1'] = data_set.question1.apply(cleantext)

In [19]:
data_set['clean2'] = data_set.question2.apply(cleantext)

In [None]:
test_datadata['clean1'] = test_data.question1.apply(cleantext)

In [None]:
test_data['clean2'] = test_data.question2.apply(cleantext)

In [None]:
data_set['SMratioClean'] = data_set.apply(partial(stringmatcher, column1='clean1', column2='clean2'), axis=1)

In [None]:
data_set['SMratio'] = data_set.apply(partial(stringmatcher, column1='clean1', column2='clean2'), axis=1)

In [None]:
test_data['SMratioClean'] = test_data.apply(partial(stringmatcher, column1='clean1', column2='clean2'), axis=1)

In [None]:
test_data['SMratio'] = test_data.apply(partial(stringmatcher, column1='clean1', column2='clean2'), axis=1)

In [156]:
data_set.head()

Unnamed: 0.1,Unnamed: 0,id,question1,question2,is_duplicate_x,is_duplicate_y,merged,fuzzRatio,fuzzPartial,fuzzTokenSort,fuzzTokenSet,SMratio,clean1,clean2,SMratioClean
0,0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,,0,What is the step by step guide to invest in sh...,93,98,93,100,0.926829,step step guide invest share market india,step step guide invest share market,0.921053
1,1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,,0,What is the story of Kohinoor (Koh-i-Noor) Dia...,65,73,63,86,0.647482,story kohinoor kohinoor diamond,would happen indian government stole kohinoor ...,0.591837
2,2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,,0,How can I increase the speed of my internet co...,45,41,63,63,0.454545,increase speed internet connection using vpn,internet speed increased hacking dns,0.55
3,3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,,0,Why am I mentally very lonely? How can I solve...,7,20,24,28,0.069565,mentally lonely solve,find remainder math2324math divided 2423,0.229508
4,4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,,0,"Which one dissolve in water quikly sugar, salt...",37,54,47,67,0.365217,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0.247191


In [2]:
import pandas as pd
def multiplier(x):
    return x * 100

In [6]:
df = pd.DataFrame([[4,9],] * 3, columns = ['a', 'b'])
df['a'] = df['a'].apply(multiplier)
df

Unnamed: 0,a,b
0,400,9
1,400,9
2,400,9


In [None]:
data_set['SMratio'] = data_set['SMratio'].apply(multiplier)
data_set['SMratioClean'] = data_set['SMratioClean'].apply(multiplier)
test_data['SMratio'] = test_data['SMratio'].apply(multiplier)
test_data['SMratioClean'] = test_data['SMratioClean'].apply(multiplier)

In [None]:
data_set.head()

In [159]:
# Hiermee haalt hij momenteel de 70% accuracy
X_train = data_set[['fuzzRatio', 'fuzzPartial', 'fuzzTokenSort', 'fuzzTokenSet', 'SMratio', 'SMratioClean']]
y_train = data_set['is_duplicate_y']

# there is no training data available, we only need to classify
y_test = []
X_test = test_data[['fuzzRatio', 'fuzzPartial', 'fuzzTokenSort', 'fuzzTokenSet', 'SMratio', 'SMratioClean']]


Unnamed: 0,fuzzRatio,fuzzPartial,fuzzTokenSort,fuzzTokenSet,SMratio,SMratioClean
0,93,98,93,100,0.926829,0.921053
1,65,73,63,86,0.647482,0.591837
2,45,41,63,63,0.454545,0.550000
3,7,20,24,28,0.069565,0.229508
4,37,54,47,67,0.365217,0.247191
5,66,67,74,78,0.659091,0.640777
6,17,32,23,24,0.172840,0.192308
7,59,67,61,71,0.591549,0.758621
8,85,90,87,93,0.852941,0.846154
9,50,57,44,65,0.495413,0.512195


In [None]:
output = createNeuralNetwork(X_train, X_test, y_train, y_test, (30,20,15,15), 3000, kind = 'classification')

In [7]:
# https://www.linkedin.com/pulse/text-classification-using-bag-words-approach-nltk-scikit-rajendran