In [4]:
import spacy
import pandas as pd
import csv

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd

# Column names of the statistics CSV consumed by plot_stat.
RELEVANCE = "relevance"
LENGTH_DIFFERENCE = "length_difference"
JACCARD_DISTANCE = "jaccard_distance"
COSINE_SIMILARITY = "cosine_similarity"
# CSV file that plot_stat reads the statistics from.
STATS_FILE = "basic_stats.csv"

def plot_stat(stat, title, output_dir="plots"):
    """Save side-by-side box plots of one statistic, split by relevance label.

    Reads STATS_FILE, takes the ``stat`` column for the rows whose RELEVANCE
    value is 1 ("Good") and 0 ("Bad"), draws one box plot per group on a
    shared y-axis (outliers hidden) and writes the figure to
    ``<output_dir>/<title>.png``.

    Args:
        stat: Name of the column in STATS_FILE to plot.
        title: Figure title; it is also used as the PNG file name.
        output_dir: Directory the PNG is written to. It is created when it
            does not exist yet. Defaults to "plots".
    """
    data_set = pd.read_csv(STATS_FILE)
    panels = (
        ('Relevance: Good', data_set[data_set[RELEVANCE] == 1][[stat]].values),
        ('Relevance: Bad', data_set[data_set[RELEVANCE] == 0][[stat]].values),
    )

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(6, 6), sharey=True)
    for ax, (panel_title, values) in zip(axes, panels):
        ax.boxplot(values, showfliers=False)
        ax.set_title(panel_title)
        # A single box needs no x-axis ticks.
        ax.get_xaxis().set_visible(False)

    fig.suptitle(title)
    if stat == LENGTH_DIFFERENCE:
        # The y-axis is shared, so setting the lower bound on one panel
        # applies to both of them.
        axes[0].set_ylim(bottom=-100)

    # savefig does not create missing directories; make sure the target exists.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    fig.savefig(os.path.join(output_dir, "{}.png".format(title)))


# Render one box-plot figure per statistic, in a fixed order.
for _stat, _title in (
    (LENGTH_DIFFERENCE, "Length difference"),
    (COSINE_SIMILARITY, "Cosine similarity"),
    (JACCARD_DISTANCE, "Jaccard distance"),
):
    plot_stat(_stat, _title)


In [5]:
# functions
from bs4 import BeautifulSoup
import re
import nltk

def length_difference(original, related):
    """Return the absolute difference between the lengths of both inputs.

    Args:
        original: First sized object (the callers in this notebook pass text).
        related: Second sized object.

    Returns:
        ``|len(original) - len(related)|`` as a non-negative integer.
    """
    original_size = len(original)
    related_size = len(related)
    return abs(original_size - related_size)


def jaccard_distance(original, related):
    """Return the Jaccard distance between the token sets of two texts.

    Both texts are tokenized with ``nltk.word_tokenize`` and reduced to sets
    of distinct tokens before ``nltk.jaccard_distance`` is applied.

    Args:
        original: Text of the first question.
        related: Text of the second question.

    Returns:
        The value computed by ``nltk.jaccard_distance`` for the two token
        sets, or 0.0 when neither text produces any token.
    """
    org_tokens = set(nltk.word_tokenize(original))
    rel_tokens = set(nltk.word_tokenize(related))

    # nltk.jaccard_distance divides by the size of the union, so two texts
    # without tokens would raise ZeroDivisionError. Two empty token sets are
    # identical, hence the distance is defined as 0 here.
    if not org_tokens and not rel_tokens:
        return 0.0

    return nltk.jaccard_distance(org_tokens, rel_tokens)


def cosine_similarity(model, original, related):
    """Return the similarity of two texts as reported by ``model``.

    Args:
        model: Callable that turns a text into an object exposing a
            ``similarity(other)`` method (the notebook passes a spaCy
            pipeline; presumably the score is a cosine similarity of the
            document vectors -- confirm against the loaded model).
        original: Text of the first question.
        related: Text of the second question.

    Returns:
        Whatever ``model(original).similarity(model(related))`` yields.
    """
    original_doc, related_doc = model(original), model(related)
    return original_doc.similarity(related_doc)

In [14]:
DATA_SET_FILE = 'train_and_validation.csv'

# NOTE(review): 'en' is a shortcut-link style model name; confirm that the
# installed spaCy version still resolves it.
model = spacy.load('en')

# Read the input before the output file is opened: opening with 'w' truncates
# it, so a failing read must not wipe previously generated statistics.
data_set = pd.read_csv(DATA_SET_FILE)
total_rows = len(data_set)

# newline='' is what the csv module requires for files it writes to; without
# it the '\n' line terminator may be translated by the platform.
# The file name has to match the one the plotting cell reads (STATS_FILE).
with open('basic_stats.csv', 'w', newline='') as csvfile:
    fieldnames = ['id', 'question1_id', 'question2_id', 'jaccard_distance', 'length_difference',
                  'cosine_similarity', 'relevance']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, lineterminator='\n')
    writer.writeheader()

    for index, data_set_row in data_set.iterrows():
        if index % 100 == 0:
            print("processing data_set_row {}/{}".format(index, total_rows))
        question_1_text = data_set_row['question1']
        question_2_text = data_set_row['question2']
        # One output row per question pair: identifiers, the three pairwise
        # statistics (ratios rounded to 3 decimals) and the relevance label.
        row = {
            'id': index,
            'question1_id': data_set_row['question1_id'],
            'question2_id': data_set_row['question2_id'],
            'jaccard_distance': round(jaccard_distance(question_1_text, question_2_text), 3),
            'length_difference': length_difference(question_1_text, question_2_text),
            'cosine_similarity': round(cosine_similarity(model, question_1_text, question_2_text), 3),
            'relevance': data_set_row['relevance'],
        }
        writer.writerow(row)

processing data_set_row 0/3169
processing data_set_row 100/3169
processing data_set_row 200/3169
processing data_set_row 300/3169
processing data_set_row 400/3169
processing data_set_row 500/3169
processing data_set_row 600/3169
processing data_set_row 700/3169
processing data_set_row 800/3169
processing data_set_row 900/3169
processing data_set_row 1000/3169
processing data_set_row 1100/3169
processing data_set_row 1200/3169
processing data_set_row 1300/3169
processing data_set_row 1400/3169
processing data_set_row 1500/3169
processing data_set_row 1600/3169
processing data_set_row 1700/3169
processing data_set_row 1800/3169
processing data_set_row 1900/3169
processing data_set_row 2000/3169
processing data_set_row 2100/3169
processing data_set_row 2200/3169
processing data_set_row 2300/3169
processing data_set_row 2400/3169
processing data_set_row 2500/3169
processing data_set_row 2600/3169
processing data_set_row 2700/3169
processing data_set_row 2800/3169
processing data_set_row 29

In [11]:
# Display the raw question-pair data set read from DATA_SET_FILE.
pd.read_csv(DATA_SET_FILE)

Unnamed: 0,id,question1_id,question2_id,question1,question2,relevance
0,0,Q268,Q268_R4,Good Bank. Which is a good bank as per your ex...,Best Bank. Hi Guys; I need to open a new bank ...,1
1,1,Q268,Q268_R5,Good Bank. Which is a good bank as per your ex...,What is the best bank to open an account?. See...,1
2,2,Q268,Q268_R10,Good Bank. Which is a good bank as per your ex...,Which Bank to use in Qatar?. Hi Does anyone ha...,1
3,3,Q268,Q268_R13,Good Bank. Which is a good bank as per your ex...,Which is the best bank around??. Hi everybody;...,1
4,4,Q268,Q268_R14,Good Bank. Which is a good bank as per your ex...,Best Credit Card in Doha. I would like to appl...,1
5,5,Q268,Q268_R16,Good Bank. Which is a good bank as per your ex...,Best Bank.. Hi ti all QL's; What bank you are ...,1
6,6,Q268,Q268_R19,Good Bank. Which is a good bank as per your ex...,Islamic Banks in Qatar. Any idea about good Is...,1
7,7,Q268,Q268_R27,Good Bank. Which is a good bank as per your ex...,PERSONAL LOAN AND WORK TERMINATION. I am curre...,0
8,8,Q268,Q268_R29,Good Bank. Which is a good bank as per your ex...,Best bank in Qatar?. Greetings everybody. I wi...,1
9,9,Q268,Q268_R31,Good Bank. Which is a good bank as per your ex...,what is the best bank to open a savings accoun...,1
