In [2]:
import pandas as pd

In [3]:
import nltk

In [4]:
# Read the training questions and discard the "is_duplicate" column straight away.
train = pd.read_csv("data/train_data.csv").drop(columns="is_duplicate")

In [5]:
test = pd.read_csv("data/test_data.csv")

In [6]:
import sklearn

In [7]:
train.head()

Unnamed: 0,id,question1,question2
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?


# Remove NaNs

In [8]:
# Keep only the rows in which both question1 and question2 are present.
train = train.dropna(subset=["question1", "question2"])

# Spell Check (JamSpell)

In [13]:
import jamspell

# Create the JamSpell corrector and load the language model stored at
# spell_checker/en.bin.
corrector = jamspell.TSpellCorrector()
# Last expression of the cell, so its return value is what the cell displays.
corrector.LoadLangModel('spell_checker/en.bin')

True

In [24]:
q1 = train.question1.values
q2 = train.question2.values

In [25]:
import numpy as np

In [26]:
v_corrector = np.vectorize(corrector.FixFragment)

In [27]:
q1_corrected = v_corrector(q1)

In [29]:
q2_corrected = v_corrector(q2)

In [31]:
q2_corrected.shape

(323160,)

In [34]:
# Show the corrected questions side by side. The frame is only displayed, not
# assigned to a name.
# NOTE(review): no visible cell writes q1_corrected / q2_corrected back into
# `train` - confirm whether the cleaning steps below are meant to use them.
pd.DataFrame({"q1":q1_corrected,"q2":q2_corrected})

Unnamed: 0,q1,q2
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...
4,"Which one dissolve in water quickly sugar, sal...",Which fish would survive in salt water?
5,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan..."
6,Should I buy tiago?,What keeps children active and far from phone ...
7,How can I be a good geologist?,What should I do to be a great geologist?
8,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?"
9,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?


# Basic Cleaning

In [64]:
# Convert both question columns to lower case.
for column in ("question1", "question2"):
    train[column] = train[column].str.lower()

In [65]:
# Remove punctuation
import string

# Translation table that deletes each listed character (every key maps to None).
punctuation_tbl = str.maketrans({char: None for char in "'!,?()"})
table = punctuation_tbl  # second name bound to the same table

# Apply the table to the string form of every value in both question columns.
for column in ("question1", "question2"):
    train[column] = train[column].apply(lambda text: str(text).translate(punctuation_tbl))

# NOTE: questions that contain "%" or "&" still need to be accounted for.

In [136]:
train.iloc[84646]

id                                      105780
question1       How can I develop android app?
question2                                  NaN
is_duplicate                               NaN
Name: 84646, dtype: object

In [67]:
# remove stop words
# The corpus reader is imported under its own alias. Previously the name
# `stopwords` was rebound from the reader to the word list, so running this
# cell a second time failed with AttributeError ('list' has no 'words').
from nltk.corpus import stopwords as stopwords_corpus

stopwords = stopwords_corpus.words('english')  # still the list of English stop words
stopword_set = frozenset(stopwords)  # constant-time lookups for the filter below

# Split each question on whitespace and keep the tokens that are not stop words.
for column in ("question1", "question2"):
    train[column] = train[column].apply(
        lambda x: [item for item in str(x).split() if item not in stopword_set]
    )

In [68]:
train.head()

Unnamed: 0,id,question1,question2,is_duplicate
0,0,"[step, step, guide, invest, share, market, india]","[step, step, guide, invest, share, market]",
1,1,"[story, kohinoor, koh-i-noor, diamond]","[would, happen, indian, government, stole, koh...",
2,2,"[increase, speed, internet, connection, using,...","[internet, speed, increased, hacking, dns]",
3,3,"[mentally, lonely, solve]","[find, remainder, [math]23^{24}[/math], divide...",
4,4,"[one, dissolve, water, quikly, sugar, salt, me...","[fish, would, survive, salt, water]",


In [105]:
from nltk.stem.wordnet import WordNetLemmatizer
wn = WordNetLemmatizer()

In [122]:
from nltk import word_tokenize
text = ("And now for cooked something completely different").split()
nltk.pos_tag(["and"])

[('and', 'CC')]

In [115]:
from nltk import word_tokenize
text = word_tokenize("And now for cooked something completely different")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('cooked', 'VBN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [126]:
from nltk.corpus import wordnet

from functools import lru_cache


@lru_cache(maxsize=None)
def get_wordnet_pos(text):
    """Return the WordNet part-of-speech constant for a single token.

    The token is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the constant: J -> ADJ, V -> VERB, N -> NOUN,
    R -> ADV. Any other tag falls back to NOUN.

    Results are memoised per distinct token: the answer depends only on the
    token, and the same words recur across many questions, so the tagger is
    run once per distinct word instead of once per occurrence.

    Parameters
    ----------
    text : str
        One token. It is used as the cache key, so it must be hashable.

    Returns
    -------
    One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN``, ``wordnet.ADV``.
    """
    tag = nltk.pos_tag([text])[0][1]

    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Unrecognised (or empty) tags are treated as nouns.
    return prefix_to_pos.get(tag[:1], wordnet.NOUN)

In [128]:

wn.lemmatize("loving",wordnet.ADV)

'loving'

In [141]:
# Lemmatise every question1 token using the part of speech inferred for that token.
train["question1"] = train["question1"].apply(
    lambda tokens: [wn.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
)

In [148]:
# Lemmatise every question2 token using the part of speech inferred for that token.
train["question2"] = train["question2"].apply(
    lambda tokens: [wn.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
)

# Other Features

In [69]:
# does it have math?
def math_flag(row):
    """Return 1 if the string form of ``row`` contains "[math]", otherwise -1."""
    return 1 if "[math]" in str(row) else -1
    

In [28]:
# train2 is a second name for the same DataFrame object as train (no copy is
# made), so the "math" column added below lands on that shared frame.
# NOTE(review): use train.copy() here if train is meant to stay unchanged.
train2 = train
# Flag each question2 value directly on the column; a row-wise
# DataFrame.apply(axis=1) would build a Series per row just to read one field.
train2["math"] = train2["question2"].apply(math_flag)


# Spell Check (pyspellchecker)

In [196]:
from spellchecker import SpellChecker

# Shared pyspellchecker instance, created on first use so that the checker is
# built once instead of on every spell_check() call.
_SPELL_CHECKER = None


def spell_check(line, spell=None, verbose=True):
    """Spell-correct a list of tokens, leaving "[math]" tokens untouched.

    Parameters
    ----------
    line : iterable of str
        Tokens of one question.
    spell : object, optional
        Checker exposing ``unknown(words)`` and ``correction(word)``. When
        omitted, a single ``SpellChecker()`` is created lazily and reused by
        every later call.
    verbose : bool, default True
        Print each token the checker reports as unknown. Pass False when
        processing many rows to avoid flooding the cell output.

    Returns
    -------
    list
        The tokens in input order. Tokens reported as unknown are replaced by
        the checker's correction, or kept as-is when it returns None.
    """
    global _SPELL_CHECKER
    if spell is None:
        if _SPELL_CHECKER is None:
            _SPELL_CHECKER = SpellChecker()
        spell = _SPELL_CHECKER

    output = []
    for word in line:
        # Tokens carrying math markup are passed through without checking.
        if "[math]" in word:
            output.append(word)
            continue

        # Tokens the checker already knows are kept unchanged.
        if not spell.unknown([word]):
            output.append(word)
            continue

        if verbose:
            print(word)
        correction = spell.correction(word)
        # correction() may return None when it has no candidate; keep the
        # original token instead of inserting None into the token list.
        output.append(word if correction is None else correction)

    return output

In [197]:
# Run the spell checker over question2 of the first ten rows as a quick preview.
train["question2"].head(10).apply(spell_check)

kohinoor
koh-i-noor
childern
"&"
"and"
dcx3400


0           [step, step, guide, invest, share, market]
1    [would, happen, indian, government, stole, koh...
2               [internet, speed, increase, hack, dns]
3    [find, remainder, [math]23^{24}[/math], divide...
4                  [fish, would, survive, salt, water]
5    [im, triple, capricorn, sun, moon, ascendant, ...
6    [keep, children, active, far, phone, video, game]
7                                   [great, geologist]
8                               [use, &, instead, and]
9            [hack, motorola, dcx3400, free, internet]
Name: question2, dtype: object

In [194]:
8*400000/10/60

5333.333333333333

In [192]:
spell.correction("can't,")

"can't"

In [188]:
word = "123word123"
word.find("シ")

-1

In [154]:
spell.candidates(word,)

{'happening', 'henning', 'penning'}

In [156]:
train.head(20)

Unnamed: 0,id,question1,question2,is_duplicate
0,0,"[step, step, guide, invest, share, market, india]","[step, step, guide, invest, share, market]",
1,1,"[story, kohinoor, koh-i-noor, diamond]","[would, happen, indian, government, stole, koh...",
2,2,"[increase, speed, internet, connection, use, vpn]","[internet, speed, increase, hack, dns]",
3,3,"[mentally, lonely, solve]","[find, remainder, [math]23^{24}[/math], divide...",
4,4,"[one, dissolve, water, quikly, sugar, salt, me...","[fish, would, survive, salt, water]",
5,5,"[astrology:, capricorn, sun, cap, moon, cap, r...","[im, triple, capricorn, sun, moon, ascendant, ...",
6,6,"[buy, tiago]","[keep, childern, active, far, phone, video, game]",
7,7,"[good, geologist]","[great, geologist]",
8,8,"[use, シ, instead, し]","[use, ""&"", instead, ""and""]",
9,9,"[motorola, company:, hack, charter, motorolla,...","[hack, motorola, dcx3400, free, internet]",


In [199]:
import jamspell

ModuleNotFoundError: No module named 'jamspell'