In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sent2vec.vectorizer import Vectorizer

In [2]:
# Load the preprocessed query/product table (comma-separated, pandas' default).
df_preprocessed = pd.read_csv('./Dataset/dataframes/df_preprocessed2.csv')

In [3]:
# Preview the first three rows to sanity-check the loaded columns.
print(df_preprocessed.head(3))

   Unnamed: 0  id                     query  median_relevance  \
0           0   1  bridal shower decoration                 1   
1           1   2      lead christmas light                 4   
2           2   4                 projector                 4   

   relevance_variance                                            product  
0               0.000  accent pillow heart design red black red satin...  
1               0.000  set 10 battery operated multi led train christ...  
2               0.471         viewsonic pro8200 dlp multimedia projector  


In [4]:
# Pull out the two text columns of the freshly loaded frame.
queries, product = (df_preprocessed[name] for name in ('query', 'product'))

In [5]:
# Whitespace-separated word count of every query and product text.
# Computed with the vectorized string accessor directly on the frame's own
# columns, so the result is independent of the index labels (looking rows up
# with `series[x]` for x in range(len) breaks as soon as the index has gaps,
# e.g. when this cell is re-run after rows were filtered out).
# Missing texts produce NaN here instead of raising.
df_preprocessed['len_q'] = df_preprocessed['query'].str.split().str.len()
df_preprocessed['len_p'] = df_preprocessed['product'].str.split().str.len()

In [6]:
# Keep only rows whose query AND product both contain fewer than 512 words.
# NOTE(review): 512 presumably mirrors the encoder's input limit — confirm;
# the count here is in whitespace-separated words.
df_preprocessed = df_preprocessed[
    (df_preprocessed[['len_q', 'len_p']] < 512).all(axis=1)
]

In [7]:
# Re-select the text columns and the target from the filtered frame.
queries, product, classes = (
    df_preprocessed[name] for name in ('query', 'product', 'median_relevance')
)

In [8]:
# Row counts of the three columns, one per line (they should all match).
print(len(queries), len(product), len(classes), sep='\n')

6342
6342
6342


In [9]:
# Materialize the pandas columns as plain Python lists.
sents1 = list(queries)
sents2 = list(product)
# NOTE(review): the name `classses` (three s) is kept exactly as it was; no
# cell visible here reads it — confirm before renaming.
classses = list(classes)

In [54]:
# Encode every query sentence with a default sent2vec Vectorizer; the result
# is read back from its `vectors` attribute after `run` has finished.
# NOTE(review): the recorded output of this cell reports
# "distilbert-base-uncased" and "Vectorization done on cpu".
vectorizer = Vectorizer()
vectorizer.run(sents1)
vectors = vectorizer.vectors

Initializing Bert distilbert-base-uncased
Vectorization done on cpu


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# Encode the first 3000 product texts with a fresh Vectorizer instance
# (this rebinds the global `vectorizer`).
# NOTE(review): the product list is cut at index 3000 — presumably to keep
# each run manageable; confirm the reason.
vectorizer = Vectorizer()
vectorizer.run(sents2[:3000])
vectors2 = vectorizer.vectors

Initializing Bert distilbert-base-uncased
Vectorization done on cpu


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Encode the product texts from index 3000 onwards with a fresh Vectorizer
# instance (this rebinds the global `vectorizer`).
# NOTE(review): the result is kept in `vectors22`; no cell visible here
# joins it with another array — confirm whether that is intended.
vectorizer = Vectorizer()
vectorizer.run(sents2[3000:])
vectors22 = vectorizer.vectors

Initializing Bert distilbert-base-uncased
Vectorization done on cpu


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
from podium import Vocab, Field, LabelField
from podium.datasets import TabularDataset
from podium.vectorizers import GloVe

In [16]:
# Assemble the training frame from purely positional data.
# `classes` is a pandas Series that still carries the index labels of the
# filtered source frame; assigning it as a column would align on those labels
# against this frame's fresh 0..n-1 index, which can misplace labels or
# introduce NaN. Converting it to a plain array keeps row i of every column
# referring to the same example.
train = pd.DataFrame({
    'query': sents1,
    'product': sents2,
    'label': np.asarray(classes),
})

                      query  \
0  bridal shower decoration   
1      lead christmas light   

                                             product  label  
0  accent pillow heart design red black red satin...    1.0  
1  set 10 battery operated multi led train christ...    4.0  


In [17]:
def lowercase(text: str) -> str:
    """Return `text` converted to lower case.

    Registered below as a pre-tokenization hook for both text fields.
    """
    lowered = text.lower()
    return lowered

# One vocabulary shared by both text fields, created with max_size=10,000 and
# min_freq=2.
# NOTE(review): the exact cut-off semantics of these two arguments are
# podium's — confirm against its documentation.
max_vocab_size = 10_000
vocab = Vocab(max_size=max_vocab_size, min_freq=2)

# Both text fields numericalize through the same `vocab` and register
# `lowercase` as a pre-tokenization hook; the label gets its own LabelField.
S1 = Field('query', numericalizer=vocab, pretokenize_hooks=[lowercase])
S2 = Field('product', numericalizer=vocab, pretokenize_hooks=[lowercase])
LABEL = LabelField('label')

fields = [
    S1,
    S2,
    LABEL,
]

# NOTE(review): `train` is rebound here from the pandas DataFrame to the
# podium dataset, so re-running this cell without rebuilding the DataFrame
# first passes a dataset (not a DataFrame) to `from_pandas`.
train = TabularDataset.from_pandas(train, fields)
train.finalize_fields()

# NOTE(review): the recorded output of this cell shows an 862M download.
glove = GloVe()
# Load only the vectors of vocab words.
embeddings = glove.load_vocab(vocab)

# Generate padded batch.
# NOTE(review): because padding is requested, later per-sample averages over
# the token axis will include the padded positions — confirm this is intended.
train_batch = train.batch(add_padding=True)

100%|███████████████████████████████████████████████████████████████████████████████| 862M/862M [03:01<00:00, 4.75MB/s]


In [20]:
def cosine_similarity(a, b):
    """
    Compute the row-wise cosine similarity between two 2D arrays.

    Entry `i` of the result is the cosine of the angle between `a[i]` and `b[i]`.
    For example, if `a` and `b` have shape (32, 10), the result has shape (32,).

    Args:
        a: 2D array-like of shape (n, d).
        b: 2D array-like with the same shape as `a`.

    Returns:
        1D numpy array of length n with the cosine similarities. A row in which
        either vector has zero norm yields `nan` (0/0), with numpy's usual
        RuntimeWarning.

    Raises:
        ValueError: if the inputs are not 2D or their shapes differ.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if a.ndim != 2 or a.shape != b.shape:
        raise ValueError(
            f"expected two 2D arrays of identical shape, got {a.shape} and {b.shape}"
        )

    # Row-wise dot products and norm products, computed without a Python loop.
    dots = np.sum(a * b, axis=1)
    norms = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
    return dots / norms

In [21]:
# Look up the embedding row of every token id in a single vectorized step.
# Indexing the embedding matrix with the (samples, tokens) id matrix yields a
# (samples, tokens, embedding_dim) array: the same values the nested Python
# loops produced, without per-token interpreter overhead.
# assumes `embeddings` is a 2D numpy array and the padded batch fields are
# rectangular integer id matrices — TODO confirm
query_train = embeddings[np.asarray(train_batch.query)]
product_train = embeddings[np.asarray(train_batch.product)]

In [22]:
# Mean-pool the token embeddings of every sample: reducing the token axis
# (axis 1) of the (samples, tokens, dim) arrays gives one vector per sample,
# computed in a single vectorized reduction instead of a per-sample loop.
# NOTE(review): every token position of the batch takes part in the average,
# including positions added by padding — confirm this is intended.
query_train_mean = np.mean(query_train, axis=1)
product_train_mean = np.mean(product_train, axis=1)

In [24]:
# Shapes of the pooled query and product matrices, one per line.
print(query_train_mean.shape, product_train_mean.shape, sep='\n')

(10112, 300)
(10112, 300)


In [61]:
import json

In [72]:
# Convert everything to plain Python lists so it can be serialized as JSON.
query_vector = query_train_mean.tolist()
product_vector = product_train_mean.tolist()
labels = np.array(classes).tolist()

# The three lists are stored side by side and matched up purely by position,
# so a length mismatch (for example embeddings left over from a different run
# than `classes`) would silently pair vectors with the wrong labels.
if not (len(query_vector) == len(product_vector) == len(labels)):
    raise ValueError(
        "embedding/label length mismatch: "
        f"{len(query_vector)} query vectors, "
        f"{len(product_vector)} product vectors, "
        f"{len(labels)} labels"
    )

In [73]:
# Persist [query vectors, product vectors, labels] as one JSON array.
with open('embedding.json', mode='w') as outfile:
    json.dump(
        obj=[query_vector, product_vector, labels],
        fp=outfile,
        indent=3,
    )