In [None]:
import pickle
import pandas as pd
import numpy as np
# tokenization
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# model
import tensorflow as tf
import keras
from keras import backend as K
# from keras.layers import Embedding
from keras.models import Model,load_model
from keras import layers
from keras import Input
from keras import optimizers
from tensorflow.keras import regularizers
# data split and result analysis
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve, auc, roc_curve
from sklearn.metrics import f1_score, recall_score, precision_score
import matplotlib.pyplot as plt
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [None]:
def test_model(logreg, X_test, y_test):
    '''
    Print evaluation metrics for a fitted classifier and plot its PR curve.

    Prints accuracy, F1, recall, the confusion matrix and the classification
    report computed from logreg.predict, then delegates to pr_curve_lr for
    the precision-recall curve.

    Input: fitted estimator (must expose predict and score; pr_curve_lr also
    calls predict_proba), test features and test labels.
    Output: (precision, recall) arrays as returned by pr_curve_lr.
    '''
    y_pred = logreg.predict(X_test)
    print("accuracy = %s"%logreg.score(X_test, y_test))
    # Label fixed: previously misspelled as "f1 socre".
    print("f1 score = %s"%f1_score(y_test, y_pred))
    print("recall__ = %s"%recall_score(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    print("confusion matrix :")
    print(cm)
    print("classification report :")
    print(classification_report(y_test,y_pred))
    # PR curve
    precision, recall = pr_curve_lr(logreg, X_test, y_test)
    return precision, recall

def pr_curve_lr(logreg, X_test, y_test):
    '''
    Plot the precision-recall curve of a fitted classifier and print F1 / AUC.

    Input: fitted estimator exposing predict_proba and predict, test features
    and test labels (y_test must support boolean-mask indexing).
    Output: (precision, recall) arrays from precision_recall_curve.
    '''
    # Column 1 of predict_proba is used as the positive-class score.
    positive_scores = logreg.predict_proba(X_test)[:, 1]
    # Hard class predictions, needed for the single F1 figure.
    hard_labels = logreg.predict(X_test)

    lr_precision, lr_recall, _ = precision_recall_curve(y_test, positive_scores)
    f1_value = f1_score(y_test, hard_labels)
    pr_auc = auc(lr_recall, lr_precision)
    print('Logistic: f1=%.3f auc=%.3f' % (f1_value, pr_auc))

    # Horizontal reference line at the share of positive labels.
    positives = y_test[y_test == 1]
    baseline = len(positives) / len(y_test)
    plt.plot([0, 1], [baseline, baseline], linestyle='--', label='No Skill')
    plt.plot(lr_recall, lr_precision, marker='.', label='Logistic Regression')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.show()
    return lr_precision, lr_recall

def extract_features(data):
    '''
    Pull the model input features out of a dataset frame.

    Input: frame with the columns amazon_name, google_name,
    amazon_description, google_description, price_diff and
    price_nan_indicator.
    Output: 6-tuple in that column order; the four text columns come back as
    Python lists, the two price columns as numpy arrays.
    '''
    text_columns = ("amazon_name", "google_name", "amazon_description", "google_description")
    numeric_columns = ("price_diff", "price_nan_indicator")

    text_features = tuple(data[column].tolist() for column in text_columns)
    numeric_features = tuple(np.asarray(data[column]) for column in numeric_columns)

    return text_features + numeric_features

def tokenization(tokenizer, text, maxlen):
    '''
    Turn raw texts into integer sequences of a fixed length.

    Input: fitted tokenizer, the texts to encode and the sequence length
    passed to pad_sequences.
    Output: the padded sequences returned by pad_sequences.
    '''
    encoded = tokenizer.texts_to_sequences(text)
    return pad_sequences(encoded, maxlen=maxlen)

In [None]:
def sigmoid_prob(model, X_test):
    '''
    Run the model on the given data and return its raw predictions.

    Input: trained model exposing predict, and validation or test data.
    Output: whatever model.predict returns (progress output is silenced).
    '''
    return model.predict(X_test, verbose=0)

def label_pred(prob_y, threshold):
    '''
    Convert probabilities into 0/1 labels.

    Input: array of probabilities and the decision threshold.
    Output: array of the same shape holding 1 where the probability is
    strictly greater than the threshold and 0 elsewhere.
    '''
    is_positive = prob_y > threshold
    return np.where(is_positive, 1, 0)
 
def c_matrix(model, X_test, y_test, threshold):
    '''
    Print the confusion matrix of a model at a given decision threshold.

    Input: trained model, test data, test labels and the threshold applied
    to the model's predicted probabilities.
    Output: nothing is returned; the matrix is printed.
    '''
    # Reuse the shared helpers instead of repeating predict / np.where inline,
    # so thresholding stays identical everywhere in the notebook.
    prob_y = sigmoid_prob(model, X_test)
    pred_y = label_pred(prob_y, threshold)
    cm = confusion_matrix(y_test, pred_y)
    print(cm)

def area_under_curve(model, X_test, y_test):
    '''
    Compute the precision-recall curve of a model and the area under it.

    Input: trained model, test data and test labels.
    Output: (auc, recall, precision) - note recall comes before precision.
    '''
    scores = sigmoid_prob(model, X_test)
    precision_pts, recall_pts, _ = precision_recall_curve(y_test, scores)
    return auc(recall_pts, precision_pts), recall_pts, precision_pts

def pr_curve(model, X_test, y_test):
    '''
    Print the PR-curve AUC of a model and draw its precision-recall curve.

    Input: trained model, test data and test labels (y_test must support
    boolean-mask indexing).
    Output: nothing is returned; the figure is left for the notebook to render.
    '''
    nn_auc, nn_recall, nn_precision = area_under_curve(model, X_test, y_test)
    print("The AUC is ",nn_auc)

    # Horizontal reference line at the share of positive labels.
    positives = y_test[y_test == 1]
    baseline = len(positives) / len(y_test)

    _, ax = plt.subplots(1, 1, figsize=(6,6))
    ax.plot([0, 1], [baseline, baseline], linestyle='--', label='No Skill')
    ax.plot(nn_recall, nn_precision, marker='.', label='DNN')
    ax.set(xlabel='Recall', ylabel='Precision')
    ax.legend()

## Logistic Regression Test

In [None]:
filename = '../input/trained-models/LR_model.sav'
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
# The context manager closes the file handle as soon as the model is loaded.
with open(filename, 'rb') as model_file:
    logreg = pickle.load(model_file)

In [None]:
# Numeric test split used by the logistic-regression model.
numeric_test = pd.read_csv("../input/numeric-dataset/numeric_test.csv")
# Class balance of the split.
print(numeric_test["label"].value_counts())

# Positional split: columns 0-3 are taken as inputs, column 4 as the target.
X_test, y_test = numeric_test.iloc[:, :4], numeric_test.iloc[:, 4]

In [None]:
# Evaluate the logistic-regression model on the numeric test split; prints the
# metrics, shows the PR curve and captures the precision / recall arrays.
lr_precision, lr_recall = test_model(logreg, X_test, y_test)

## Siamese Neural Network

In [None]:
# Prepared train / validation / test splits.
path = '../input/prepared-datasets/'
train_set, val_set, test_set = (
    pd.read_csv(path + split_file)
    for split_file in ('train_set.csv', 'val_set.csv', 'test_set.csv')
)

# "info" text columns of both sources, for the train and validation splits.
train_amazon_info = train_set["amazon_info"].tolist()
train_google_info = train_set["google_info"].tolist()
val_amazon_info = val_set["amazon_info"].tolist()
val_google_info = val_set["google_info"].tolist()

# Corpus for the tokenizer: training and validation text only (no test text).
all_data_text = [
    *train_amazon_info,
    *train_google_info,
    *val_amazon_info,
    *val_google_info,
]

# Retrieve all the tokens in the dataset
def create_tokenizer(all_text):
    '''
    Fit a fresh Tokenizer on the given texts.

    Input: list of texts to build the vocabulary from.
    Output: (fitted tokenizer, its word_index mapping); the vocabulary size
    is printed as a side effect.
    '''
    fitted = Tokenizer()
    fitted.fit_on_texts(all_text)
    vocabulary = fitted.word_index
    print("Found %s unique tokens" % len(vocabulary))
    return fitted, vocabulary

# Fit the tokenizer on the combined training and validation text.
tokenizer, data_word_index = create_tokenizer(all_data_text)

# Test features, in the order returned by extract_features.
(test_amazon_name, test_google_name,
 test_amazon_desc, test_google_desc,
 test_price_diff, test_price_indicator) = extract_features(test_set)

# Every text field is encoded to this sequence length.
# NOTE(review): presumably has to match the input length the saved models
# were trained with - confirm.
maxlen = 200
(test_amazon_name_seq, test_google_name_seq,
 test_amazon_desc_seq, test_google_desc_seq) = (
    tokenization(tokenizer, texts, maxlen)
    for texts in (test_amazon_name, test_google_name, test_amazon_desc, test_google_desc)
)

# Model inputs (four text sequences followed by the two price features) and labels.
test_data = [
    test_amazon_name_seq,
    test_google_name_seq,
    test_amazon_desc_seq,
    test_google_desc_seq,
    test_price_diff,
    test_price_indicator,
]
test_labels = np.asarray(test_set["label"])

# Class balance of the test split.
print(test_set["label"].value_counts())

In [None]:
# Load the two trained Siamese variants. load_model is already imported in the
# notebook's first (imports) cell, so it is not re-imported here.
average_model = load_model('../input/trained-models/model_average.h5')
lstm_model = load_model('../input/trained-models/model_LSTM.h5')

In [None]:
# Confusion matrices on the test set at a 0.5 decision threshold:
# first the Siamese average model, then the Siamese LSTM model.
c_matrix(average_model, test_data, test_labels, threshold=0.5)
c_matrix(lstm_model, test_data, test_labels, threshold=0.5)

In [None]:
# Print the PR-curve AUC and draw the precision-recall curve of the average model.
pr_curve(average_model, test_data, test_labels)

In [None]:
# Print the PR-curve AUC and draw the precision-recall curve of the LSTM model.
pr_curve(lstm_model, test_data, test_labels)

In [None]:
# NOTE(review): this redefines the sigmoid_prob helper declared earlier in the
# notebook with the same behaviour; consider keeping a single definition.
def sigmoid_prob(model, X_test):
    '''
    Run the model on the given data and return its raw predictions.

    Input: trained model exposing predict, and validation or test data.
    Output: whatever model.predict returns (progress output is silenced).
    '''
    return model.predict(X_test, verbose=0)

# NOTE(review): this redefines the area_under_curve helper declared earlier in
# the notebook with the same behaviour; consider keeping a single definition.
def area_under_curve(model, X_test, y_test):
    '''
    Compute the precision-recall curve of a model and the area under it.

    Input: trained model, test data and test labels.
    Output: (auc, recall, precision) - note recall comes before precision.
    '''
    scores = sigmoid_prob(model, X_test)
    precision_pts, recall_pts, _ = precision_recall_curve(y_test, scores)
    return auc(recall_pts, precision_pts), recall_pts, precision_pts

# NOTE(review): this redefines the label_pred helper declared earlier in the
# notebook with the same behaviour; consider keeping a single definition.
def label_pred(prob_y, threshold):
    '''
    Convert probabilities into 0/1 labels.

    Input: array of probabilities and the decision threshold.
    Output: array of the same shape holding 1 where the probability is
    strictly greater than the threshold and 0 elsewhere.
    '''
    is_positive = prob_y > threshold
    return np.where(is_positive, 1, 0)

In [None]:
# F1 of the Siamese average model on the test set at a 0.5 threshold.
# NOTE(review): ave_f1 holds a single score here but is rebound to a
# per-threshold array in a later cell.
ave_prob = sigmoid_prob(average_model, test_data)
ave_pred = label_pred(ave_prob, 0.5)
ave_f1 = f1_score(test_labels, ave_pred)
print(ave_f1)

# F1 of the Siamese LSTM model on the test set at a 0.5 threshold.
lst_prob = sigmoid_prob(lstm_model, test_data)
lst_pred = label_pred(lst_prob, 0.5)
lst_f1 = f1_score(test_labels, lst_pred)
print(lst_f1)

In [None]:
# Positive-class scores of the logistic-regression model (column 1 of predict_proba).
lr_probs = logreg.predict_proba(X_test)[:, 1]
# Precision-recall curve points for those scores, and the area under that curve.
lr_precision, lr_recall, threds = precision_recall_curve(y_test, lr_probs)
lr_auc = auc(lr_recall, lr_precision)

In [None]:
# PR-curve AUC, recall and precision arrays for the average and LSTM models
# (area_under_curve returns them in the order auc, recall, precision).
ave_auc, ave_recall, ave_precision = area_under_curve(average_model, test_data, test_labels)
lst_auc, lst_recall, lst_precision = area_under_curve(lstm_model, test_data, test_labels)

In [None]:
# Precision-recall curves of the two Siamese variants.
plt.plot(ave_recall, ave_precision, marker='.', label="Siamese Average NN")
plt.plot(lst_recall, lst_precision, marker='.', label="Siamese LSTM NN")
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision and Recall Curve ')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.8))
# The legend is anchored outside the axes; bbox_inches='tight' expands the
# saved area so it is not cut off in the PNG.
plt.savefig("PR_Ave_LSTM.png", bbox_inches='tight')
plt.show()

In [None]:
# Precision-recall curves: logistic regression vs. the Siamese average model.
plt.plot(lr_recall, lr_precision, label="LR", marker='.')
plt.plot(ave_recall, ave_precision, label="SDNN", marker='.')
plt.gca().set(xlabel='Recall', ylabel='Precision', title='Precision and Recall Curve ')
plt.legend()
plt.savefig("LR_Ave_PR.png")
plt.show()

In [None]:
# Per-threshold F1 along the logistic-regression PR curve.
# Points where precision and recall are both zero evaluate to 0/0; silence the
# warning there and count F1 as 0 so the maximum is never affected by NaN.
with np.errstate(divide='ignore', invalid='ignore'):
    lr_f1 = 2*lr_precision*lr_recall/(lr_precision+lr_recall)
lr_f1 = np.nan_to_num(lr_f1)
print("LR Max f1: ",np.max(lr_f1))

In [None]:
# Per-threshold F1 along the Siamese average model's PR curve.
# Points where precision and recall are both zero evaluate to 0/0; silence the
# warning there and count F1 as 0 so the maximum is never affected by NaN.
with np.errstate(divide='ignore', invalid='ignore'):
    ave_f1 = 2*ave_precision*ave_recall/(ave_precision+ave_recall)
ave_f1 = np.nan_to_num(ave_f1)
np.max(ave_f1)


In [None]:
# Threshold that maximises F1 on the test set.
# NOTE(review): precision / recall / thresholds are not defined anywhere else
# in the notebook, so they are derived here from the Siamese average model's
# scores - confirm this is the intended model.
precision, recall, thresholds = precision_recall_curve(test_labels, np.ravel(ave_prob))
# 0/0 occurs where precision and recall are both zero; count F1 as 0 there.
with np.errstate(divide='ignore', invalid='ignore'):
    f1_scores = 2*recall*precision/(recall+precision)
f1_scores = np.nan_to_num(f1_scores)
# precision / recall have one more entry than thresholds (the final point has
# no threshold), so only the aligned entries are searched for the best one.
print('Best threshold: ', thresholds[np.argmax(f1_scores[:-1])])
print('Best F1-Score: ', np.max(f1_scores))

In [None]:
# Precision-recall curves of all three models on one figure.
plt.plot(lr_recall, lr_precision, marker='.', label="Logistic Regression")
plt.plot(ave_recall, ave_precision, marker='.', label="Siamese Average NN")
plt.plot(lst_recall, lst_precision, marker='.', label="Siamese LSTM NN")
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision and Recall Curve ')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.8))
# The legend is anchored outside the axes; bbox_inches='tight' expands the
# saved area so it is not cut off in the PNG.
plt.savefig("PR_LR_Ave.png", bbox_inches='tight')
plt.show()

In [None]:
def plot_loss_acc(history):
    '''
    Plot training / validation loss and accuracy side by side.

    Input: object whose `.history` dict holds 'loss', 'val_loss' and the
    accuracy series. Accuracy is read from 'acc' / 'val_acc' when present and
    from 'accuracy' / 'val_accuracy' otherwise, since the logged key depends
    on how the metric was named when the model was compiled.
    Output: nothing is returned; the figure is left for the notebook to render.
    Raises KeyError if none of the expected keys is present.
    '''
    logs = history.history
    # Fall back to the long metric names instead of failing with KeyError.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'
    val_acc_key = 'val_acc' if 'val_acc' in logs else 'val_accuracy'

    fig, axes = plt.subplots(1, 2, figsize=(16,6))
    axes[0].plot(logs['loss'], label='loss')
    axes[0].plot(logs['val_loss'], label='val_loss')
    axes[1].plot(logs[acc_key], label='acc')
    axes[1].plot(logs[val_acc_key], label='val_acc')

    for ax in axes:
        ax.legend()
        ax.grid(True)
        ax.set(xlabel='epoch')

In [None]:
# Summary metrics for the two Siamese variants, one row per model in the
# column order given below (method, AUC, F1, Recall, Precision).
# NOTE(review): the numbers are hard-coded - presumably copied by hand from
# earlier outputs; confirm they match the current run.
nn_results = [["average",0.365,0.313,0.776,0.196],["lstm",0.37,0.347,0.293,0.425]]

df = pd.DataFrame(nn_results, columns=["method","AUC","F1","Recall","Precision"])

# Display the table.
df

In [None]:
# Bar chart of the AUC column for the two Siamese variants.
recall_bar = sns.barplot(data=df, x="method", y="AUC")
recall_bar.set_xlabel('')
recall_bar.set_ylabel('Area Under Curve')
plt.savefig("ave_lst_auc.png")
plt.show()

In [None]:
# Bar chart of the F1 column for the two Siamese variants.
recall_bar = sns.barplot(data=df, x="method", y="F1")
recall_bar.set_xlabel('')
recall_bar.set_ylabel('F1 Score')
plt.savefig("ave_lst_f1.png")
plt.show()

In [None]:
# Bar chart of the Recall column for the two Siamese variants.
recall_bar = sns.barplot(data=df, x="method", y="Recall")
recall_bar.set_xlabel('')
recall_bar.set_ylabel('Recall')
plt.savefig("ave_lst_recall.png")
plt.show()

In [None]:
# Bar chart of the Precision column for the two Siamese variants.
recall_bar = sns.barplot(data=df, x="method", y="Precision")
recall_bar.set_xlabel('')
recall_bar.set_ylabel('Precision')
plt.savefig("ave_lst_precision.png")
plt.show()