In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.tokenize.casual import casual_tokenize
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
import joblib

[nltk_data] Downloading package punkt to /home/damian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv('./data/originals/train.csv')

In [3]:
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


In [4]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def custom_tokenize(text, stemming=True, lemmatizing=True):
        tokens = casual_tokenize(text)
        if stemming:
            tokens = [stemmer.stem(token) for token in tokens]
        if lemmatizing:
            tokens = [lemmatizer.lemmatize(token) for token in tokens]
        tokens = [token.lower() for token in tokens]
        return tokens

In [5]:
tokenized_questions = [custom_tokenize(str(question).lower()) for question in df['question1'] + df['question2']]
vector_size = 100
window_size = 5

model = Word2Vec(sentences=tokenized_questions, vector_size=vector_size, window=window_size, min_count=1, workers=4)

In [6]:
def pad_vector(vector, target_length):
    if not vector:
        return [[0] * vector_size] * target_length
    elif len(vector) < target_length:
        return vector + [[0] * len(vector[0])] * (target_length - len(vector))
    elif len(vector) > target_length:
        return vector[:target_length]
    else:
        return vector

def question_to_vector(question, model, max_length):
    tokens = word_tokenize(str(question).lower())
    vector = [model.wv[word] for word in tokens if word in model.wv]
    padded_vector = pad_vector(vector, target_length=max_length)
    return padded_vector

MAX_LENGTH = 15
df['question1'] = df['question1'].apply(lambda x: question_to_vector(x, model, MAX_LENGTH))
df['question2'] = df['question2'].apply(lambda x: question_to_vector(x, model, MAX_LENGTH))
df = df[['question1', 'question2', 'is_duplicate']]

In [7]:
df

Unnamed: 0,question1,question2,is_duplicate
0,"[[-1.1824719, 0.31452188, -0.3146478, 1.228863...","[[-1.1824719, 0.31452188, -0.3146478, 1.228863...",0
1,"[[-1.1824719, 0.31452188, -0.3146478, 1.228863...","[[-1.1824719, 0.31452188, -0.3146478, 1.228863...",0
2,"[[-1.724134, -1.0519515, 1.2529267, 1.7144535,...","[[-1.724134, -1.0519515, 1.2529267, 1.7144535,...",0
3,"[[-3.757544, 2.1232448, 0.6163597, -1.8996361,...","[[-2.3684523, 0.6073207, 0.3200795, 3.7134771,...",0
4,"[[-3.0175834, 1.2167847, -1.8335198, 0.5148477...","[[-3.0175834, 1.2167847, -1.8335198, 0.5148477...",0
...,...,...,...
404285,"[[-1.724134, -1.0519515, 1.2529267, 1.7144535,...","[[-1.724134, -1.0519515, 1.2529267, 1.7144535,...",0
404286,"[[0.20052868, -0.08304975, -0.6880667, -1.4290...","[[0.68149215, 1.1006297, 0.92523146, -2.283600...",1
404287,"[[-1.1824719, 0.31452188, -0.3146478, 1.228863...","[[-1.1824719, 0.31452188, -0.3146478, 1.228863...",0
404288,"[[-1.1824719, 0.31452188, -0.3146478, 1.228863...","[[0.94553715, -2.6689708, 1.4066093, -0.302593...",0


In [8]:
del model
del tokenized_questions

In [9]:
joblib.dump(df, './data/word2vec_15_100.pkl')

['./data/word2vec_15_100.pkl']