In [5]:
import os
from collections import Counter
from itertools import chain, tee
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
import warnings
warnings.filterwarnings('ignore')

In [6]:
import os

import sys
sys.path.append(os.path.abspath('../../'))
from nlp.tokenizer import *

In [7]:
filename = "../maria/train_dup.csv"

In [12]:
lines = !wc -l {filename}
words = !wc -w {filename}
lines, words

(['404299 ../maria/train_dup.csv'], ['8540919 ../maria/train_dup.csv'])

In [23]:
df = pd.read_csv(filename)
df.shape

(404287, 11)

In [25]:
df[df["question1"].isna() | df["question2"].isna()]

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_all,q2_all,q1_ps,q2_ps


In [26]:
df = df.dropna().drop("Unnamed: 0", axis=1)

In [6]:
def size_iter(iterable):
    return len(list(iterable))

### Character counts

In [20]:
df["len1"] = df["question1"].str.len()
df["len2"] = df["question2"].str.len()
df.tail(1)

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_all,q2_all,q1_ps,q2_ps,len1,len2
404286,404289,404289,537932,537933,What is like to have sex with cousin?,What is it like to have sex with your cousin?,0,9,11,4,4,37,45


In [16]:
stats = df[["len1", "len2"]].describe().T[["count","mean", "min", "max"]]
stats.columns = ["lines", "average number of characters", "minimum number of characters", "maximum number of characters"]
stats

Unnamed: 0,lines,average number of characters,minimum number of characters,maximum number of characters
len1,404287.0,59.536997,1.0,623.0
len2,404287.0,60.108507,1.0,1169.0


### count words

In [81]:
import numpy as np
def count_tokens(tokenizer, in_array):
    return np.fromiter((len(tokenizer(s)) for s in in_array), dtype=np.int16)

In [45]:
nlp = nlp_parser()
tokenizer = nlp.tokenizer

In [82]:
count_tokens(tokenizer, df['question1'].values[:5])

array([15, 11, 15, 13, 16], dtype=int16)

In [83]:
df["q1_all"] = count_tokens(tokenizer, df['question1'].values)
print("that's all")

that's all


In [52]:
df["q2_all"] = count_tokens(tokenizer, df['question2'].values)
print("that's all")

that's all


In [None]:
df.to_csv(filename)

### count words without punct and stop words

In [None]:
def count_without_punct_stop_words(col):
    return list(t.remove_all(punct, stop).count() for t in SpacyTokens(list(col.values)))

In [None]:
df["q1_ps"] = count_without_punct_stop_words(df['question1'])
df["q2_ps"] = count_without_punct_stop_words(df['question2'])
print("that's all")

In [None]:
df.to_csv(filename)

### creating questions words

In [42]:

questions = {"who":1<<1, "whom":1<<2, "whose":1<<3, "what":1<<4, "when":1<<5, "where":1<<6, "why":1<<7, "how":1<<8,
                                    "there":1<<9, "that":1<<10, "which":1<<11, "whither":1<<12, "whence":1<<13, "whether":1<<14, "whatsoever":1<<15}

def question_words(col):
    return list(list(t.filter(question)) for t in SpacyTokens(list(col.values)))

def get_questions(q):
    return sum(SpacyTokens(q).lower().map(lambda t:questions.get(t.text.lower(), 0)))


def create_wh_ds(df, target_column, out_column, filename):
    dfw = df[[target_column]][:].reset_index(drop=True)
    dfw[out_column] = np.vectorize(get_questions)(dfw[target_column])
    for q, mask in questions.items():
        dfw[q] = (np.bitwise_and(dfw[out_column], mask)!=0).astype(int)
    dfw.to_csv(filename)

create_wh_ds(df, "question1", "wh1", "./q1_question_word.csv")
create_wh_ds(df, "question2", "wh2", "./q2_question_word.csv")
print("that's all")

that's all


### create freq dict

In [18]:
def get_tokens(col):
    d = {}
    for t in SpacyTokens(list(col.values)).lower().flatten():
        d[t.text] = d.get(t.text,0) + 1
    return d

In [21]:
d_freq1 = get_tokens(df["question1"])
utils.to_pickle(d_freq1, "./question1_freq.pkl")
print("that's all")


that's all


In [20]:
d_freq2 = get_tokens(df["question2"])
utils.to_pickle(d_freq2, "./question2_freq.pkl")
print("that's all")

that's all


In [39]:
df1_freq = pd.DataFrame.from_dict(d_freq1, orient='index', columns=["count"]).reset_index()
df1_freq.columns = ["word","count"]
df2_freq = pd.DataFrame.from_dict(d_freq2, orient='index', columns=["count"]).reset_index()
df2_freq.columns = ["word","count"]

In [42]:
def get_token(q):
    return next(SpacyTokens(q))
tokens1 = np.vectorize(get_token)(df1_freq["word"].values)
tokens2 = np.vectorize(get_token)(df2_freq["word"].values)

In [58]:
df1_freq["token"] = tokens1
df2_freq["token"] = tokens2


df1_freq["is_digit"] = df1_freq["token"].apply(lambda t:t.is_digit)
df1_freq["is_oov"] = df1_freq["token"].apply(lambda t:t.is_oov)
df1_freq["is_punct"] = df1_freq["token"].apply(lambda t:t.is_punct)
df1_freq["is_stop"] = df1_freq["token"].apply(lambda t:t.is_stop)

df2_freq["is_digit"] = df2_freq["token"].apply(lambda t:t.is_digit)
df2_freq["is_oov"] = df2_freq["token"].apply(lambda t:t.is_oov)
df2_freq["is_punct"] = df2_freq["token"].apply(lambda t:t.is_punct)
df2_freq["is_stop"] = df2_freq["token"].apply(lambda t:t.is_stop)

df1_freq = df1_freq.drop("token", axis=1)
df2_freq = df2_freq.drop("token", axis=1)

df1_freq.to_csv("./q1_freq.csv")
df2_freq.to_csv("./q2_freq.csv")

### levenshtein distance

In [54]:
!pip install fuzzywuzzy



In [57]:
from fuzzywuzzy import fuzz
q1,q2 = df.sample(1)[["question1", "question2"]].values[0]
print(q1,'\n',q2)
print(fuzz.token_sort_ratio(q1,q2))

What is an Average ethnic composition of Sicilians? 
 What's the average ethnic composition of a Sicilian?
89


In [58]:
df["token_sort_ratio"] = np.vectorize(fuzz.token_sort_ratio)(df["question1"], df["question2"])

In [None]:
df.sort_values(by="token_sort_ratio", ascending=False).head(10)

### common and not common words

In [86]:
def get_tokens(s):
    return set(t.text for t in tokenizer(s))

In [87]:
df["q1_tokens"] = np.vectorize(get_tokens)(df["question1"].values)
df["q2_tokens"] = np.vectorize(get_tokens)(df["question2"].values)

In [93]:
def q1_q2_intersect(tokens1, tokens2):
    return " ".join(tokens1.intersection(tokens2))

def q1_q2_difference(tokens1, tokens2):
    return " ".join(tokens1.difference(tokens2))

In [96]:
df["intersect"] = np.vectorize(q1_q2_intersect)(df["q1_tokens"].values, df["q2_tokens"].values)
df["difference"] = np.vectorize(q1_q2_difference)(df["q1_tokens"].values, df["q2_tokens"].values)

In [97]:
df.tail(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_all,q2_all,q1_ps,q2_ps,token_sort_ratio,intersect,q1_tokens,q2_tokens,difference
404285,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0,19,28,10,14,35,", ? of","{Chicago, studying, for, ,, ?, of, UIC, approx...","{one, but, ,, ., ?, of, wax, use, styling, cla...",Chicago annual Indian an the is studying while...
404286,404289,537932,537933,What is like to have sex with cousin?,What is it like to have sex with your cousin?,0,9,11,4,4,90,like with is have cousin ? to sex What,"{like, with, is, have, cousin, ?, to, sex, What}","{like, with, is, your, have, cousin, it, ?, to...",


### Lemmatization

In [104]:
def get_lemmas(s):
    return set(t.lemma_ for t in tokenizer(s))

In [105]:
df["lemmas1"] = np.vectorize(get_lemmas)(df["question1"].values)
df["lemmas2"] = np.vectorize(get_lemmas)(df["question2"].values)
df.tail(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_all,q2_all,q1_ps,q2_ps,token_sort_ratio,intersect,q1_tokens,q2_tokens,difference,lemmas1,lemmas2
404285,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0,19,28,10,14,35,", ? of","{Chicago, studying, for, ,, ?, of, UIC, approx...","{one, but, ,, ., ?, of, wax, use, styling, cla...",Chicago annual Indian an the is studying while...,"{Chicago, for, ,, ?, of, UIC, approx, annual, ...","{one, but, ,, ., ?, of, wax, use, clay, I, pro..."
404286,404289,537932,537933,What is like to have sex with cousin?,What is it like to have sex with your cousin?,0,9,11,4,4,90,like with is have cousin ? to sex What,"{like, with, is, have, cousin, ?, to, sex, What}","{like, with, is, your, have, cousin, it, ?, to...",,"{like, with, have, cousin, ?, to, sex, What, be}","{like, with, your, have, cousin, it, ?, to, se..."


In [106]:
df.tail(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_all,q2_all,q1_ps,q2_ps,token_sort_ratio,intersect,q1_tokens,q2_tokens,difference,lemmas1,lemmas2
404285,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0,19,28,10,14,35,", ? of","{Chicago, studying, for, ,, ?, of, UIC, approx...","{one, but, ,, ., ?, of, wax, use, styling, cla...",Chicago annual Indian an the is studying while...,"{Chicago, for, ,, ?, of, UIC, approx, annual, ...","{one, but, ,, ., ?, of, wax, use, clay, I, pro..."
404286,404289,537932,537933,What is like to have sex with cousin?,What is it like to have sex with your cousin?,0,9,11,4,4,90,like with is have cousin ? to sex What,"{like, with, is, have, cousin, ?, to, sex, What}","{like, with, is, your, have, cousin, it, ?, to...",,"{like, with, have, cousin, ?, to, sex, What, be}","{like, with, your, have, cousin, it, ?, to, se..."
