In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import metrics
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import spacy
from spacy.lang.en import English
spacy.load('en')
nlp = English()
import nlp
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from string import punctuation as PUNCTS
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maryp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maryp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maryp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# CONSTANTS
PUNCTS = set(punct for punct in string.punctuation)
STOP_WORDS = set(stopwords.words("english")) - {'who', 'whom', 'what', 'when', 'where', 'why', 'how'}

In [3]:
df = pd.read_csv('train_dup.csv').drop(columns='id', axis=1)
df = df.dropna()
df.head()

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
duplicates = df[df.is_duplicate==1]
not_duplicates = df[df.is_duplicate == 0]
print(f'The proportion of classes pairs/not pairs is: {len(duplicates)}/{len(not_duplicates)}')
print(f'An example of duplicated questions: \n{duplicates.question1[5]} \nAND \n{duplicates.question2[5]}')
duplicates.head()

The proportion of classes pairs/not pairs is: 149263/255024
An example of duplicated questions: 
Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me? 
AND 
I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?


Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


In [5]:
# This is an example, will be later removed
count_vectorizer = CountVectorizer()
print(duplicates.question1[5], duplicates.question2[5])
# Create the Bag-of-Words Model
bag_of_words = count_vectorizer.fit_transform([duplicates.question1[5], duplicates.question2[5]])

# Show the Bag-of-Words Model as a pandas DataFrame
feature_names = count_vectorizer.get_feature_names()
pd.DataFrame(bag_of_words.toarray(), columns = feature_names)

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me? I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?


Unnamed: 0,about,am,and,ascendant,astrology,cap,capricorn,does,in,me,moon,rising,say,sun,that,this,triple,what
0,1,1,1,0,1,2,1,1,0,1,1,1,1,1,1,0,0,1
1,1,0,1,1,0,0,2,1,1,1,1,0,1,1,0,1,1,1


In [6]:
# nltk cleaning: very slow!
lemmatizer = WordNetLemmatizer()
def clean_sentence_lemmatizer(sentence):
    """
    Receives a raw sentence and clean it using the following steps:  # BETTER
    1. Remove all non-words
    2. Transform the review in lower case
    3. Remove all stop words
    4. Perform lemmatizer

    Args:
        sentence: the sentence that will be cleaned
    Returns:
        a clean sentence using the mentioned steps above.
    """
    
    sentence = re.sub("[^A-Za-z]", " ", sentence)
    sentence = sentence.lower()
    sentence = word_tokenize(sentence)
    sentence = [lemmatizer.lemmatize(word) for word in sentence if word not in STOP_WORDS]
    sentence = " ".join(sentence)
    return sentence

print(duplicates.question1[5])
clean_sentence_lemmatizer(duplicates.question1[5])

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?


'astrology capricorn sun cap moon cap rising what say'

## SPACY

Tokenization, lemmatization, removing stop words (except question opening words like who, why, etc) and puctuation and lowering

In [4]:
# 1st way to clean q = re.sub("[^A-Za-z]", " ", q)?
nlp = English()

clean_question = lambda sentence: ' '.join([word.lemma_.lower() for word in nlp(sentence) 
                                            if word.lemma_ not in STOP_WORDS if word.lemma_ not in PUNCTS])
df_clean = df.copy()
df_clean['question1'] = df.apply(lambda row: clean_question(row['question1']), axis=1)
df_clean['question2'] = df.apply(lambda row: clean_question(row['question2']), axis=1)
df_clean.head()

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
0,1,2,what step step guide invest share market india,what step step guide invest share market,0
1,3,4,what story kohinoor koh noor diamond,what would happen indian government stole kohi...,0
2,5,6,how i increase speed internet connection using...,how internet speed increased hacking dns,0
3,7,8,why i mentally lonely how i solve,find remainder when math]23^{24}[/math divided...,0
4,9,10,which one dissolve water quikly sugar salt met...,which fish would survive salt water,0


In [5]:
resub = lambda q: re.sub("[^A-Za-z]", " ", q)
df_clean['question1'] = df_clean.apply(lambda row: resub(row['question1']), axis=1)
df_clean['question2'] = df_clean.apply(lambda row: resub(row['question2']), axis=1)

In [6]:
df_clean.to_csv('df_clean.csv', index=False)

In [10]:
# 2nd way to clean
# I didn't use it
def tok_stop_lem_punct(q, tokenize=True, stopwordize=True, punctuanize=True, lemmatize=True, lowerize=True):
    nlp = English()
    if stopwordize:
        q = ' '.join([words.text for words in nlp(q) if words.text not in STOP_WORDS])
    if lemmatize:
        q = ' '.join([words.lemma_ for words in nlp(q)])
    if punctuanize:
        q = re.sub("[^A-Za-z]", " ", q)
        q = ' '.join([words.strip() for words in q.split() if words not in PUNCTS])
    if lowerize:
        q = ' '.join([words.lower() for words in q.split()])
    if tokenize:
        tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab)
        q = tokenizer(q)
    return q

print(tok_stop_lem_punct(duplicates.question1[5]))

astrology i capricorn sun cap moon cap rising what say


In [None]:
df_clean2 = df.copy()
df_clean2['question1'] = df.apply(lambda row: tok_stop_lem_punct(row['question1']), axis=1)
df_clean2['question2'] = df.apply(lambda row: tok_stop_lem_punct(row['question2']), axis=1)

Some basic stats

In [136]:
unigramm_count_dict_clean, unigramm_count_dict_initial = {}, {}
bigramm_count_dict_initial, bigramm_count_dict_clean = {}, {}
number_of_words_inq_initial, number_of_words_inq_clean = [], []
lengths_of_questions_initial, lengths_of_questions_clean = [], []


def count_basic_stats(df):
    unigramm_count_dict = {}
    bigramm_count_dict = {} 
    number_of_words_inq = []
    lengths_of_questions = []
    
    for q1, q2 in zip(df.question1, df.question2):
        unigramms_q1 = [word.text for word in nlp(q1)]
        unigramms_q2 = [word.text for word in nlp(q2)]
        number_of_words_inq.append(len(unigramms_q1))
        number_of_words_inq.append(len(unigramms_q2))
        lengths_of_questions.append(len(q1))
        lengths_of_questions.append(len(q2))

        for unigramm in unigramms_q1 + unigramms_q2:
            if unigramm in unigramm_count_dict:
                unigramm_count_dict[unigramm] += 1
            else:
                unigramm_count_dict[unigramm] = 1

        bigramms_q1 = [' '.join(q1.split()[i:i+2]) for i in range(len(q1.split())) if i < len(q1.split()) - 1]
        bigramms_q2 = [' '.join(q2.split()[i:i+2]) for i in range(len(q2.split())) if i < len(q2.split()) - 1]

        for bigramm in bigramms_q1 + bigramms_q2:
            if bigramm in bigramm_count_dict:
                bigramm_count_dict[bigramm] += 1
            else:
                bigramm_count_dict[bigramm] = 1
    return unigramm_count_dict, bigramm_count_dict, number_of_words_inq, lengths_of_questions


unigramm_count_dict_initial, bigramm_count_dict_initial, number_of_words_inq_initial, \
    lengths_of_questions_initial = count_basic_stats(df)

unigramm_count_dict_clean, bigramm_count_dict_clean, number_of_words_inq_clean, \
    lengths_of_questions_clean = count_basic_stats(df_clean)



In [153]:
# TF(term) = term_frequency / sum(all terms frequences)

tf_dict_initial = {unigramm : unigramm_count_dict_initial[unigramm] / sum(unigramm_count_dict_initial.values()) 
                  for unigramm in unigramm_count_dict_initial}
tf_dict_clean = {unigramm : unigramm_count_dict_clean[unigramm] / sum(unigramm_count_dict_clean.values())
                for unigramm in unigramm_count_dict_clean}

In [68]:
from collections import Counter

def word_usage_count(df, column1, column2, ignore_register = False):
    """
    Receives a df with raw sentences and counts word usage
    Requires collections.Counter  
    Args:
        df: dataframe with some questions
        column1, column2: columns to unite and count word usage
        ignore_register: boolean parameter to ignore register or not, defaults to False
    Returns:
        Counter object {word: number}.
    """
    questions1 = set(df[column1])
    questions2 = set(df[column2])
    unique_question_union = questions1.union(questions2)
    words_usage_count = Counter()
    for q in unique_question_union:
        for word in q.split():
            if ignore_register:
                words_usage_count[word.lower()]+= 1
            else:
                words_usage_count[word]+= 1
    return words_usage_count

def word_existence_in_corpus(df,column1, column2, ignore_register=False):
    """
    Receives a df with raw sentences and counts word usage
    Requires collections.Counter  
    Args:
        df: dataframe with some questions
        column1, column2: columns to unite and get words
        ignore_register: boolean parameter to ignore register or not, defaults to False
    Returns:
        set of words occured in corpus from df.
    """
    return set(word_usage_count(df, column1, column2, ignore_register = ignore_register).elements())


import time
t0 = time.time()
total_word_dict = word_usage_count(df,'question1', 'question2',)
t1 = time.time()
print(len(total_word_dict))
print(t1 - t0)
print(len(word_existence_in_corpus(df,'question1', 'question2',)))

232531
2.1073827743530273
232531


Playing with vectorizer

first cells are just rubbish


In [11]:
# This is an example, will be later removed
count_vectorizer = CountVectorizer()
print(duplicates.question1[5], duplicates.question2[5])
# Create the Bag-of-Words Model

bag_of_words = count_vectorizer.fit_transform([duplicates.question1[5], duplicates.question2[5]])
# Show the Bag-of-Words Model as a pandas DataFrame
feature_names = count_vectorizer.get_feature_names()
pd.DataFrame(bag_of_words.toarray(), columns = feature_names)

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me? I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?


Unnamed: 0,about,am,and,ascendant,astrology,cap,capricorn,does,in,me,moon,rising,say,sun,that,this,triple,what
0,1,1,1,0,1,2,1,1,0,1,1,1,1,1,1,0,0,1
1,1,0,1,1,0,0,2,1,1,1,1,0,1,1,0,1,1,1


In [27]:
pd.DataFrame(count_vectorizer.transform(['hi sun', 'moon rising asdf']).toarray(), columns = feature_names)

Unnamed: 0,about,am,and,ascendant,astrology,cap,capricorn,does,in,me,moon,rising,say,sun,that,this,triple,what
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0


In [30]:
dfhead5 = df_clean.head()
dfhead5_bow = count_vectorizer.transform(dfhead5)
dfhead5_cv = pd.DataFrame(dfhead5_bow.toarray(), columns = feature_names)
dfhead5_cv['target'] = df_clean.is_duplicate
dfhead5_cv

Unnamed: 0,about,am,and,ascendant,astrology,cap,capricorn,does,in,me,moon,rising,say,sun,that,this,triple,what,target
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Now i'm trying out vectorizers for cleaned df

how df was cleaned:
    
    first lambda func:
        nlp = English()
        clean_question = lambda sentence: ' '.join([word.lemma_.lower() for word in nlp(sentence)                                           if word.lemma_ not in STOP_WORDS if word.lemma_ not in PUNCTS])
        df_clean = df.copy()
        df_clean['question1'] = df.apply(lambda row: clean_question(row['question1']), axis=1)
        df_clean['question2'] = df.apply(lambda row: clean_question(row['question2']), axis=1)
        
    second lambda func:
        resub = lambda q: re.sub("[^A-Za-z]", " ", q)
        df_clean['question1'] = df_clean.apply(lambda row: resub(row['question1']), axis=1)
        df_clean['question2'] = df_clean.apply(lambda row: resub(row['question2']), axis=1)

## 1. Vectorization from full corpus

In [7]:
print(f'Initial cleaned data shape {df_clean.shape}')
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = set(df_clean.question1).union(set(df_clean.question2))
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(f'TfIdfVectorizer shape {X.shape}')

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X2 = cv.fit_transform(corpus)
print(f'Count Vectorizer/bag of words shape {X2.shape}')

Initial cleaned data shape (404287, 5)
TfIdfVectorizer shape (523984, 79161)
Count Vectorizer/bag of words shape (523984, 79161)


## 2. Transform first 10000 rows with CountVectorizer

In [9]:
# let's try Count Vectorizer on first 10000 rows - transform q1 and q2 to vectors
import time
t0 = time.time()
df_clean_vectorized_q1 = cv.transform(df_clean.head(10000).question1)
df_clean_vectorized_q2 = cv.transform(df_clean.head(10000).question2)
t1 = time.time()
df_clean_vectorized_cv_q1 = pd.DataFrame(df_clean_vectorized_q1.toarray(), columns = cv.get_feature_names())
df_clean_vectorized_cv_q2 = pd.DataFrame(df_clean_vectorized_q2.toarray(), columns = cv.get_feature_names())
df_clean_vectorized_cv_q1.shape, df_clean_vectorized_cv_q2.shape, t1-t0

((10000, 79161), (10000, 79161), 0.13861083984375)

## 3. Finding cosine distance between q1[i] from question1 and q2[i] from question2 
diagonal of pairwise distances between to matrices

In [37]:
# counting cosine similarity between first 2000 questions
from sklearn.metrics.pairwise import cosine_similarity

def count_cosine_dist_between_two_dfs(df1, df2):
    """
    Receives a dfs with vectors/matrices and counts pairwise cosine distancer  
    Args:
        df: dataframe with some questions
        
    Returns:
        pd.Series with diagoanal els with added column of pairwise distances
    """
    cs = cosine_similarity(df1.values, df2.values)
    return [cs[i, i] for i in range(len(cs))]

count_cosine_dist_between_two_dfs(df_clean_vectorized_cv_q1.iloc[:2000, :], df_clean_vectorized_cv_q2.iloc[:2000, :])

[0.9486832980505138,
 0.6154574548966638,
 0.4629100498862757,
 0.0,
 0.36927447293799825,
 0.5454545454545454,
 0.0,
 0.3333333333333334,
 1.0000000000000002,
 0.4629100498862757,
 0.0,
 0.6708203932499369,
 0.7999999999999999,
 0.8944271909999159,
 0.9375,
 0.27386127875258315,
 0.6666666666666669,
 0.0,
 0.5892556509887895,
 0.6666666666666669,
 0.6708203932499369,
 0.75,
 0.5,
 0.0,
 0.14285714285714282,
 0.8017837257372731,
 0.8660254037844388,
 0.4082482904638631,
 0.8333333333333336,
 0.6324555320336758,
 0.6666666666666669,
 0.4330127018922194,
 0.8017837257372731,
 0.0,
 0.8944271909999159,
 0.4714045207910316,
 0.6396021490668312,
 0.0668153104781061,
 0.5773502691896258,
 0.12499999999999997,
 0.0,
 0.9999999999999999,
 0.9999999999999999,
 0.5669467095138407,
 0.9090909090909092,
 0.5477225575051662,
 0.0,
 0.0,
 0.7302967433402215,
 1.0000000000000002,
 0.5714285714285713,
 0.8432740427115678,
 0.0,
 0.6681531047810609,
 0.0,
 0.16666666666666666,
 0.2041241452319315,
 0.6

In [39]:
distancesq1iq2i = count_cosine_dist_between_two_dfs(df_clean_vectorized_cv_q1.iloc[:, :], df_clean_vectorized_cv_q2.iloc[:, :])
print(f'Array of distances between q1_i and q2_i has length {len(distancesq1iq2i)} - should be {len(df_clean_vectorized_cv_q1)}')

Array of distances between q1_i and q2_i has length 10000 - should be 10000


## 4. Creating DF with cosine distance and target

In [49]:
#distance_target = pd.DataFrame([distancesq1iq2i, df_clean.is_duplicate[:10000].values], columns= ['dist', 'target'])
distance_target = pd.DataFrame({'distance': distancesq1iq2i, 'target': df_clean.is_duplicate[:10000].values})
distance_target.describe()

Unnamed: 0,distance,target
count,10000.0,10000.0
mean,0.525458,0.3711
std,0.275435,0.483123
min,0.0,0.0
25%,0.333333,0.0
50%,0.547723,0.0
75%,0.75,1.0
max,1.0,1.0


In [64]:
X = distance_target.distance
y = distance_target.target
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
print('data shapes', X_train.shape, X_test.shape, y_train.shape)
print(f'Distribution of classes in target: \n{y_train.value_counts()/len(y_train)*100}, \n{y_test.value_counts()/len(y_test)*100}')
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train.values.reshape(-1, 1), y_train)
y_pred = lr.predict(X_test.values.reshape(-1, 1))

from sklearn.metrics import confusion_matrix, classification_report
print(f'Classification report after PR: \n{classification_report(y_test, y_pred)}')
print(f'Confusion matrix after LR: \n{confusion_matrix(y_test, y_pred)}')

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train.values.reshape(-1, 1), y_train)
y_predrf = rf.predict(X_test.values.reshape(-1, 1))

from sklearn.metrics import confusion_matrix, classification_report
print(f'Classification report after PR: \n{classification_report(y_test, y_predrf)}')
print(f'Confusion matrix after LR: \n{confusion_matrix(y_test, y_predrf)}')

data shapes (7500,) (2500,) (7500,)
Distribution of classes in target: 
0    62.64
1    37.36
Name: target, dtype: float64, 
0    63.64
1    36.36
Name: target, dtype: float64
Classification report after PR: 
              precision    recall  f1-score   support

           0       0.72      0.78      0.75      1591
           1       0.56      0.48      0.51       909

    accuracy                           0.67      2500
   macro avg       0.64      0.63      0.63      2500
weighted avg       0.66      0.67      0.67      2500

Confusion matrix after LR: 
[[1244  347]
 [ 474  435]]
Classification report after PR: 
              precision    recall  f1-score   support

           0       0.77      0.72      0.75      1591
           1       0.56      0.63      0.59       909

    accuracy                           0.69      2500
   macro avg       0.67      0.68      0.67      2500
weighted avg       0.70      0.69      0.69      2500

Confusion matrix after LR: 
[[1149  442]
 [ 338  

In [12]:
# transforming that 10000 rows
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
import time

pca_tsne = Pipeline([
    ("tsvd", TruncatedSVD(n_components=1500, random_state=42)),
    ("tsne", TSNE(n_components=3, random_state=42)),
])

tsvd = TruncatedSVD(n_components=1500, random_state=42)
t0 = time.time()
df_clean_vectorized_cv_q1_reduced = tsvd.fit_transform(df_clean_vectorized_cv_q1)
df_clean_vectorized_cv_q2_reduced = tsvd.transform(df_clean_vectorized_cv_q2)
t1 = time.time()
print(f'Our new data will have shape {df_clean_vectorized_cv_q2_reduced.shape}')
print(f"Reduction in pipeline with PCA and t-SNE took {round(t1-t0, 2)}s")

Our new data will have shape (10000, 1500)
Reduction in pipeline with PCA and t-SNE took 398.54s


In [72]:
df_clean.head()

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
0,1,2,what step step guide invest share market india,what step step guide invest share market,0
1,3,4,what story kohinoor koh noor diamond,what would happen indian government stole kohi...,0
2,5,6,how i increase speed internet connection using...,how internet speed increased hacking dns,0
3,7,8,why i mentally lonely how i solve,find remainder when math math divided...,0
4,9,10,which one dissolve water quikly sugar salt met...,which fish would survive salt water,0
