In [1]:
import fasttext
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import os
import random
from itertools import combinations
from tqdm import tqdm
import matplotlib as plt

import tensorflow as tf
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Input, Dropout, Lambda, Concatenate

# Have to download the stopwords
# nltk.download('stopwords')

In [48]:
# Get the fasttext model (we are using the largest one they offer [600B tokens])
fasttext_model = fasttext.load_model('models/crawl-300d-2M-subword.bin')



## General Useful Function
Functions that are continually used throughout this project

In [2]:
"""
Definitions of some sizes in the training set
"""
MAX_LEN = 44
EMBEDDING_SHAPE = (300,)
print('MAX_LEN: ' + str(MAX_LEN), 'EMBEDDING_SHAPE: ' + str(EMBEDDING_SHAPE))

MAX_LEN: 44 EMBEDDING_SHAPE: (300,)


In [3]:
def get_max_len(df):
    max_len = 0
    for row in df.itertuples():
        if len(row.title_one.split(' ')) > max_len:
            max_len = len(row.title_one.split(' '))
            
        if len(row.title_two.split(' ')) > max_len:
            max_len = len(row.title_two.split(' '))
    
    return max_len

In [4]:
def print_dataframe(df):
    for idx in range(len(df)):
        print(df.iloc[idx].title_one + '\n' + df.iloc[idx].title_two)
        print('________________________________________________________________')

In [5]:
def create_final_data(pos_df, neg_df):
    pos_df.sample(frac=1)
    neg_df.sample(frac=1)
    final_df = pd.concat([pos_df[:min(len(pos_df), len(neg_df))], neg_df[:min(len(pos_df), len(neg_df))]])
    final_df = final_df.sample(frac=1)
    return final_df

In [6]:
def remove_stop_words(phrase):
    # Creates the stopwords
    to_stop = stopwords.words('english')
    punctuation = "!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~ "
    for c in punctuation:
        to_stop.append(c)

    to_stop.append('null')
    
    for punc in punctuation:
        phrase = phrase.replace(punc, ' ')
    
    return ' '.join((' '.join([x for x in phrase.split(' ') if x not in to_stop])).split())


## Data Processsing and Organization
Here, all we really want to do is prepare the data for training. This is **only** the data from **Gold Standard** This includes:
* Simplifying the original data
* Normalizing the data 
* Balancing the positive and negative examples
* Creating the embedding representations that will actually get fed into the neural network

In [7]:
# Organizing and normalizing the data
"""
Essentially, we want to only have three attributes for each training example: title_one, title_two, label
For normalization, we are just going to use the nltk stopwords and punctuation
"""

def preprocessing(orig_data):
    """
    Normalizes the data by getting rid of stopwords and punctuation
    """
    
    # The new names of the columns
    column_names = ['title_one', 'title_two', 'label']
    # A new dataframe for the data we are going to be creating
    norm_data = pd.DataFrame(columns = column_names)
    # Iterate over the original dataframe (I know it is slow and there are probably better ways to do it)
    iloc_data = orig_data.iloc
    for idx in tqdm(range(len(orig_data))):
        row = iloc_data[idx]
        title_left = remove_stop_words(row.title_left)
        title_right = remove_stop_words(row.title_right)
        
        # Append the newly created row (title_left, title_right, label) to the new dataframe
        norm_data = norm_data.append(pd.DataFrame([[title_left, title_right, row.label]], columns=column_names))
    
    return norm_data
        

In [8]:
def create_train_df(df):
    """
    Returns a shuffled dataframe with an equal amount of positive and negative examples
    """
    # Get the positive and negative examples
    pos_df = df.loc[df['label'] == 1]
    neg_df = df.loc[df['label'] == 0]
    
    # Shuffle the data
    pos_df = pos_df.sample(frac=1)
    neg_df = neg_df.sample(frac=1)
    
    # Concatenate the positive and negative examples and 
    # make sure there are only as many negative examples as positive examples
    final_df = pd.concat([pos_df[:min(len(pos_df), len(neg_df))], neg_df[:min(len(pos_df), len(neg_df))]])
    
    # Shuffle the final data once again
    final_df.sample(frac=1)
    
    return final_df

In [9]:
def create_training_data(df, path):
    """
    Creates and saves a simpler version of the original data that only contains the the two titles and the label.
    """
    
    norm_bal_data = create_train_df(preprocessing(df))
    
    # Save the new normalized and simplified data to a CSV file to load later
    norm_bal_data.to_csv(path, index=False)

In [6]:
# Load the data
computer_df = pd.read_json('data/train/computers_train_xlarge_normalized.json.gz', compression='gzip', lines=True)

In [None]:
# See some of the data. There is clearly a separation between the positive and negative examples
computer_df

In [None]:
# Create and save the data if the simple and normalized data does not exist
computer_data_path = 'data/train/computers_train_bal_shuffle.csv'

# If the computer data has not been made yet, make it
if not os.path.exists(computer_data_path):
    create_training_data(computer_df, computer_data_path)

In [None]:
# Load cameras data
camera_df = pd.read_json('data/train/cameras_train_xlarge_normalized.json.gz', compression='gzip', lines=True)

In [None]:
camera_df

In [None]:
# Create and save the data if the simple and normalized data does not exist
camera_data_path = 'data/train/cameras_train_bal_shuffle.csv'

# If the computer data has not been made yet, make it
if not os.path.exists(camera_data_path):
    create_training_data(camera_df, camera_data_path)

In [86]:
final_computer_df = pd.read_csv('data/train/computers_train_bal_shuffle.csv')

In [87]:
final_computer_df

Unnamed: 0,title_one,title_two,label
0,intel nuc kit nuc5i5ryh i5 5250u boxnuc5i5ryh ...,intel nuc nuc5i5ryh i5 5250u pccomponentes,1
1,crucial 4gb 1x4gb 2133mhz ddr4 ecc rdimm 1 2v ...,crucial ddr4 4 gb dimm 288 pin ct4g4rfs8213 se...,1
2,hdd gold 6tb sata 128mb 3 5 smartphones tu tie...,wd gold 6tb 3 5 7200rpm 128mb cache datacenter...,1
3,corsair vengeance led 32gb 2 x 16gb ddr4 dram ...,corsair vengeance led 32gb 2x16gb ddr4 pc4 240...,1
4,data 75icc60010 cbl 16gb compactflash card 133...,gigaram cf 16gb 120x cbl 50p r21mb w9mb bulk b...,1
...,...,...,...
19375,seagate st1000lm014 1tb sshd 2 5 sata 3 pc mac...,wd blue pc ssd wds100t1b0a solid state drive 1...,0
19376,kingston technology fury 8gb 2133mhz ddr4 hype...,hyperx fury ddr3 8 gb dimm 240 pin hx318c10f s...,0
19377,gigabyte geforce gtx 1060 mini itx oc 6gb gddr5,asus gtx750ti oc 2gd5 geforce gtx 750 ti graph...,0
19378,apple smart keyboard folio case english mptl2l...,apple 12 9 inch ipad pro wi fi 32 gb gold,0


In [None]:
final_camera_df = pd.read_csv('data/train/cameras_train_bal_shuffle.csv')

In [None]:
final_camera_df

## Laptop Data Preprocessing
* Normalize the data
* Create negative examples that represent when only a couple of attributes of the laptop data changes

In [88]:
# Load the laptop data
laptop_df = pd.read_csv('data/train/laptops.csv', encoding='latin-1')

In [89]:
laptop_df

Unnamed: 0.1,Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.00
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1316,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,638.00
1299,1317,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,1499.00
1300,1318,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,229.00
1301,1319,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,764.00


In [90]:
# This class will be used in order to exchange the different attributes
# to create negative examples
class Attributes():
    company = {'Apple'}
    product = {'MacBook Pro'}
    inches = {'13.3'}
    cpu = {'Intel Core i5 2.3GHz'}
    ram = {'4GB'}
    memory = {'256GB SSD'}
    gpu = {'Intel HD Graphics 520'}
    screen = {'1440x900'}
    
    def get_all_data():
        return {
            'company': Attributes.company,
            'product': Attributes.product,
            'inches': Attributes.inches,
            'cpu': Attributes.cpu,
            'ram': Attributes.ram,
            'memory': Attributes.memory,
            'gpu': Attributes.gpu,
            'screen': Attributes.screen
        }

In [91]:
# Create attribute sets
def create_attribute_sets(df):
    Attributes.company.update([row.Company for row in laptop_df[['Company']].itertuples()])
    Attributes.product.update([row.Product for row in laptop_df[['Product']].itertuples()])
    Attributes.inches.update([str(row.Inches) for row in laptop_df[['Inches']].itertuples()])
    Attributes.cpu.update([row.Cpu for row in laptop_df[['Cpu']].itertuples()])
    Attributes.ram.update([row.Ram for row in laptop_df[['Ram']].itertuples()])
    Attributes.memory.update([row.Memory for row in laptop_df[['Memory']].itertuples()])
    Attributes.gpu.update([row.Gpu for row in laptop_df[['Gpu']].itertuples()])
    Attributes.screen.update([row.ScreenResolution for row in laptop_df[['ScreenResolution']].itertuples()])

create_attribute_sets(laptop_df)

In [92]:
def concatenate_row(row):
    # Note: got rid of everything after the '(' because it has info about the actual specs of the laptop
    # so if we change the specs, we need to fix that too
    
    # Special tags at the end of the amount of inches of the laptop and the RAM to simulate real data
    inch_attr = str(row['Inches']) + random.choice([' inch', '', '"'])
    ram_attr = row['Ram'] + random.choice([' ram', ' memory', ''])
    
    # These are words that commonly come up with laptops
    modifiers = ['premium', 'new', 'fast', 'latest model']
    add_ins = ['USB 3.0', 'USB 3.1 Type-C', 'USB Type-C', 'Bluetooth', 'WIFI', 'Webcam', 'FP Reader',
               'HDMI', '802.11ac', '802.11 ac', 'home', 'flagship', 'business', 'GbE LAN', 'DVD-RW', 'DVD', 'Windows 10']
    
    cpu_attr = row['Cpu']
    if random.choice([0, 1]):
        cpu_attr = cpu_attr.split(' ')
        if random.choice([0, 1]):
            if 'Intel' in cpu_attr:
                cpu_attr.remove('Intel')
        if random.choice([0, 1]):
            if 'Core' in cpu_attr:
                cpu_attr.remove('Core')
        if random.choice([0, 1]):
            if 'AMD' in cpu_attr:
                cpu_attr.remove('AMD')
    
        cpu_attr = ' '.join(cpu_attr)

    # Create a list for all the product attributes
    order_attrs = [random.choice(modifiers),
                   row['Company'],
                   row['Product'].split('(')[0],
                   row['TypeName'],
                   inch_attr,
                   row['ScreenResolution'],
                   cpu_attr,
                   ram_attr,
                   row['Memory'],
                   row['Gpu']]
    
    order_attrs = order_attrs + random.sample(add_ins, random.choice([1, 2, 3, 4]))
    
    # Shuffle the data because in real data, it does not really matter what order the attributes are in
    random.shuffle(order_attrs)
    
    return ' '.join(order_attrs)

In [93]:
# Creates the negative examples for the laptop data
# The laptop_df is the original data, the new_df is the dataframe to append the new data to
# and the attributes are the attributes to swap for the new data
def create_neg_laptop_data(laptop_df, attributes):
    new_column_names = ['title_one', 'title_two', 'label']
    negative_df = pd.DataFrame(columns = new_column_names)
    for row in tqdm(range(len(laptop_df))):
        # Create a copy of the row for the negative example
        neg_row = laptop_df.iloc[row]
        for attribute_class in attributes:
            # Get the row in the laptop_data
            orig_row = laptop_df.iloc[row]
            
            # Get the attribute that we are trying to change
            attribute_val = orig_row[attribute_class]
            
            # Temporarily value for the new value
            new_val = attribute_val
            
            # Make sure we really get a new attribute
            while new_val == attribute_val:
                new_val = random.sample(Attributes.get_all_data()[attribute_class.lower()], 1)[0]
            
            # Change the value in the neg_row to the new value
            neg_row[attribute_class] = new_val
            
            # Concatenate and normalize the data
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(neg_row).lower())
            
            # Append the data to the new df
            negative_df = negative_df.append(pd.DataFrame([[title_one, title_two, 0]], columns=new_column_names))
    
    return negative_df

In [94]:
neg_df = create_neg_laptop_data(laptop_df, attributes=['Cpu', 'Memory', 'Ram', 'Inches', 'Product'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|███████████████████████████████████████████████████████████████████████████| 1303/1303 [00:17<00:00, 73.64it/s]


In [None]:
print_dataframe(neg_df)

In [95]:
# Creates the postive examples for the laptop data
# The laptop_df is the original data, the new_df is the dataframe to append the new data to
# and the attributes are the attributes to swap or delete for the new data
def create_pos_laptop_data(laptop_df, rm_attrs, add_attrs):
    new_column_names = ['title_one', 'title_two', 'label']
    pos_df = pd.DataFrame(columns = new_column_names)
    for row in tqdm(range(len(laptop_df))):
        # Remove the attribute from the new title
        for attr_list in rm_attrs:
            # Create a copy of the row for the negative example
            new_row = laptop_df.iloc[row]
            orig_row = laptop_df.iloc[row]
            for attr in attr_list:
                new_row[attr] = ''
        
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(new_row).lower())

            pos_df = pos_df.append(pd.DataFrame([[title_one, title_two, 1]], columns=new_column_names))

    return pos_df

In [None]:
pos_df = create_pos_laptop_data(laptop_df, rm_attrs = [['Company'], ['TypeName'], ['ScreenResolution'], ['Product'], ['TypeName', 'ScreenResolution']], add_attrs = [])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
 22%|████████████████▍                                                           | 282/1303 [00:03<00:14, 72.25it/s]

In [None]:
print_dataframe(pos_df)

In [None]:
final_laptop_df = create_final_data(pos_df, neg_df)

In [None]:
final_laptop_df = final_laptop_df.sample(frac=1)

## PCPartPicker Data
* Organize the data
* Preprocess the data
* Create negative and positive data

In [None]:
ram_df = pd.read_csv('data/train/pos_ram_titles.csv')
cpu_df = pd.read_csv('data/train/pos_cpu_titles.csv')
hard_drive_df = pd.read_csv('data/train/pos_hard_drive_titles.csv')

In [None]:
ram_df

In [None]:
cpu_df

In [None]:
hard_drive_df

In [None]:
# Drop the Unnamed: 0 column and drop any row where it is all NaN
def remove_misc(df):
    columns = list(df.columns)[1:]
    df = df.drop(columns=['Unnamed: 0'])
    df = df.dropna(how='all')
    print(len(df))
    return df


In [None]:
ram_df = remove_misc(ram_df)
cpu_df = remove_misc(cpu_df)
hard_drive_df = remove_misc(hard_drive_df)

In [None]:
def generate_pos_pcpartpicker_data(df):
    columns = list(df.columns)
    pos_df = pd.DataFrame(columns=['title_one', 'title_two', 'label'])
    for idx in tqdm(range(len(df))):
        row = df.iloc()[idx]
        titles = []
        for col in columns:
            if not pd.isnull(row[col]): titles.append(row[col])
        if len(titles) > 1:
            combs = combinations(titles, 2)
            for comb in combs:
                comb = list(comb)
                comb.append(1)
                pos_df = pos_df.append(pd.DataFrame([comb], columns=['title_one', 'title_two', 'label']))
    
    return pos_df


In [None]:
pos_ram_data = generate_pos_pcpartpicker_data(ram_df)

pos_cpu_data = generate_pos_pcpartpicker_data(cpu_df)

pos_hard_drive_data = generate_pos_pcpartpicker_data(hard_drive_df)


In [None]:
def generate_neg_pcpartpicker_data(df):
    columns = list(df.columns)
    neg_df = pd.DataFrame(columns=['title_one', 'title_two', 'label'])
    df_list = df.iloc()
    for idx in tqdm(range(len(df))):
        row = df_list[idx]
        for col in columns:
            if not pd.isnull(row[col]):
                neg_idx = None
                while neg_idx == idx or neg_idx is None:
                    neg_idx = random.randint(0, len(df) - 1)
                
                neg_title = None
                while neg_title == None or pd.isnull(neg_title):
                    neg_title = df_list[neg_idx][random.choice(columns)]
                
                neg_df = neg_df.append(pd.DataFrame([[row[col], neg_title, 0]], columns=['title_one', 'title_two', 'label']))
    
    return neg_df

In [None]:
neg_ram_data = generate_neg_pcpartpicker_data(ram_df)

neg_cpu_data = generate_neg_pcpartpicker_data(cpu_df)

neg_hard_drive_data = generate_neg_pcpartpicker_data(hard_drive_df)

final_ram_data = create_final_data(pos_ram_data, neg_ram_data)

final_cpu_data = create_final_data(pos_cpu_data, neg_cpu_data)

final_hard_drive_data = create_final_data(pos_hard_drive_data, neg_hard_drive_data)

print(len(final_cpu_data), len(final_ram_data), len(final_hard_drive_data))

## Custom Computer Data Generation
* Using the PCPartPicker data, we combine computer parts (e.g. CPU, hard drive, RAM, etc.) and create positive and negative examples

## Embeddings Creation Functions
Generates the embeddings and saves them

In [7]:
"""
Create the numpy files of all the training embedddings
We will have two numpy files:
1. The training/validation/test sets
2. The labels
"""

def create_embeddings(df):
    # Create the numpy arrays for storing the embeddings and labels
    total_embeddings = np.zeros(shape=(len(df), 2, MAX_LEN, EMBEDDING_SHAPE[0]))
    labels = np.zeros(shape=(len(df)))
    
    # I know this is a terrible way of doing this, but iterate over the dataframe
    # and generate the embeddings to add to the numpy array
    for idx, row in enumerate(tqdm(df.itertuples())):
        for word_idx, word in enumerate(row.title_one.split()):
            total_embeddings[idx, 0, word_idx] = fasttext_model[word]
            
        for word_idx, word in enumerate(row.title_two.split()):
            total_embeddings[idx, 1, word_idx] = fasttext_model[word]
            
        labels[idx] = row.label
        
    return total_embeddings, labels


In [8]:
def save_embeddings(df, embeddings_name, labels_name):
    """
    Saves the embeddings given the embeddings file name and labels file name
    """
    if not os.path.exists('data/numpy_data/' + embeddings_name + '.npy'):
        embeddings, labels = create_embeddings(df)
        with open('data/numpy_data/' + embeddings_name + '.npy', 'wb') as f:
            np.save(f, embeddings)

        with open('data/numpy_data/' + labels_name + '.npy', 'wb') as f:
            np.save(f, labels)

In [9]:
def load_embeddings_and_labels(embeddings_name, labels_name):
    loaded_embeddings = None
    labels = None
    with open('data/numpy_data/' + embeddings_name + '.npy', 'rb') as f:
        loaded_embeddings = np.load(f)
        loaded_embeddings = np.transpose(loaded_embeddings, (1, 0, 2, 3))
    
    with open('data/numpy_data/' + labels_name + '.npy', 'rb') as f:
        labels = np.load(f)
    
    return loaded_embeddings, labels

## Saving and Loading Embeddings
Save the embeddings for the different types of data we have

In [None]:
# Concatenate everything
total_data = pd.concat([final_computer_df, final_laptop_df, final_hard_drive_data, final_cpu_data, final_ram_data])
total_data = total_data.sample(frac=1)
MAX_LEN = get_max_len(total_data)
save_embeddings(total_data, 'all_embeddings', 'all_labels')

In [10]:
embeddings, labels = load_embeddings_and_labels('all_embeddings', 'all_labels')

In [11]:
len(embeddings[0,:])

35078

In [None]:
total_data

In [12]:
X_train1 = embeddings[0, :len(labels) - 4000]
X_train2 = embeddings[1, :len(labels) - 4000]
X_train = np.stack((X_train1, X_train2))
print('Training shape: ' + str(X_train.shape))

X_val1 = embeddings[0, len(labels) - 4000:len(labels) - 2000]
X_val2 = embeddings[1, len(labels) - 4000:len(labels) - 2000]
X_val = np.stack((X_val1, X_val2))
print('Val shape: ' + str(X_val.shape))


X_test1 = embeddings[0, len(labels) - 2000:]
X_test2 = embeddings[1, len(labels) - 2000:]
X_test = np.stack((X_test1, X_test2))
print('Test shape: ' + str(X_test.shape))

Training shape: (2, 31078, 44, 300)
Val shape: (2, 2000, 44, 300)
Test shape: (2, 2000, 44, 300)


In [13]:
Y_train = labels[:len(labels) - 4000]
print('Training labels shape:', str(Y_train.shape))

Y_val = labels[len(labels) - 4000:len(labels) - 2000]
print('Val shape:', str(Y_val.shape))

Y_test = labels[len(labels) - 2000:]
print('Test shape:', str(Y_test.shape))

Training labels shape: (31078,)
Val shape: (2000,)
Test shape: (2000,)


In [14]:
def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

In [15]:
Y_train = convert_to_one_hot(Y_train.astype(np.int32), 2)
Y_val = convert_to_one_hot(Y_val.astype(np.int32), 2)
Y_test = convert_to_one_hot(Y_test.astype(np.int32), 2)

In [16]:
Y_train

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

## Model Info

For the model, we are going to use LSTMs with a Constrastive Loss Function 
that will also be used to predict whether the two products are the same 

First, we have to convert the titles to embeddings through FastText before feeding into the LSTM.
The embedding part of this model will not be a layer because:
* The fasttext model would be time consuming and annoying to get to work with an embedding layer in Keras
* The fasttext model is not going to be getting its embeddings optimized, so there is really no point in adding it as an embedding layer

In [36]:
def square_distance(vectors):
    x, y = vectors
    return tf.square(x - y)

def euclidean_dist_out_shape(shapes):
    # Both inputs are fed in, so just use one of them and get the first value in the shape
    shape1, shape2 = shapes
    return (shape1[0],)

def siamese_network(input_shape):
    # Defines our inputs
    left_title = Input(input_shape, dtype='float32')
    right_title = Input(input_shape, dtype='float32')
    
    # The LSTM units
    model = tf.keras.Sequential(name='siamese_model')
    model.add(Bidirectional(LSTM(units=256, name='lstm_1')))
    model.add(Dropout(rate=0.6))
#     model.add(LSTM(units=128, return_sequences=True, name='lstm_2'))
#     model.add(Dropout(rate=0.6))
#     model.add(LSTM(units=128, name='lstm_3'))
#     model.add(Dropout(rate=0.6))
    
    # The dense layers
    model.add(Dense(units=512, activation='elu', name='dense_1'))
    model.add(Dropout(rate=0.6))
    model.add(Dense(units=256, activation='elu', name='dense_2'))
    
    # Forward propagate through the model to generate the encodings
    encoded_left_title = model(left_title)
    encoded_right_title = model(right_title)

    SquareDistanceLayer = Lambda(square_distance)
    distance = SquareDistanceLayer([encoded_left_title, encoded_right_title])
    
    prediction = Dense(units=2, activation='softmax')(distance)
    # Create and return the network
    siamese_net = tf.keras.Model(inputs=[left_title, right_title], outputs=prediction, name='siamese_network')
    return siamese_net

In [37]:
# Note: for the constrastive loss, because 0 denotes that they are from the same class
# and one denotes they are from a different class, I swaped the (Y) and (1 - Y) terms

def constrastive_loss(y_true, y_pred):
    margin = 2.0
    d = y_pred
    d_sqrt = tf.sqrt(d)
    #tf.print('\nY Pred: ', d, 'Shape: ', tf.shape(d))
    #tf.print('\nY True: ', y_true, 'Shape: ', tf.shape(y_true))
    
    loss = (y_true * d) + ((1 - y_true) * tf.square(tf.maximum(0., margin - d_sqrt)))
    
    #tf.print('\n Constrastive Loss: ', loss, 'Shape: ', tf.shape(loss))
    loss = 0.5 * tf.reduce_mean(loss)
    
    return loss

In [38]:
# Accuracy metric for constrastive loss because values close to 0 are equal and values high are different
# 0.5 is the threshold here
def constrastive_accuracy(y_true, y_pred):
    return tf.reduce_mean(tf.cast(tf.equal(y_true, tf.cast(y_pred < 0.5, y_true.dtype)), y_true.dtype))

In [39]:
def save_model(model, name):
    """
    Saves a model with a particular name
    """
    model.save('models/' + name + '.h5')

In [40]:
model = siamese_network((MAX_LEN, EMBEDDING_SHAPE[0],))
model.summary()

Model: "siamese_network"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 44, 300)]    0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 44, 300)]    0                                            
__________________________________________________________________________________________________
siamese_model (Sequential)      (None, 256)          1534720     input_7[0][0]                    
                                                                 input_8[0][0]                    
__________________________________________________________________________________________________
lambda_3 (Lambda)               (None, 256)          0           siamese_model[0][0]

In [41]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the model
history = model.fit(x=[X_train1, X_train2], y=Y_train, batch_size=128, epochs=80, validation_data=([X_val[0], X_val[1]], Y_val))

In [43]:
# Test the model
results = model.evaluate([X_test1, X_test2], Y_test, batch_size=16)
print('test loss, test acc: ', results)

test loss, test acc:  [0.343930117726326, 0.883]


In [161]:
# Set the model's name
model_name = '0.1.2_Softmax-LSTM-128_batch_80_epochs'

In [162]:
# Save the model
save_model(model, model_name)

## Manual Testing
Converts titles into embeddings arrays and allow the model to make a prediction

In [163]:
model.load_weights('models/' + model_name + '.h5')

In [164]:
title_one = '2020 Dell XPS 13.3" FHD Laptop Computer, Intel Core i5-10210U Processor, 8GB RAM, 512GB PCIe SSD, Baklit Keyboard, MaxxAudio, HD Webcam, Win 10, Platinum Silver, 32GB Snow Bell USB Card'
title_two = '2020 Dell XPS 13.3" Ultrabook, Intel Core i5-10230U, HD Webcam, (16GB RAM, 512GB PCIe SSD) Backlit, Windows 10, Silver'
#title_one = '128GB ram'
#title_two = '12gb ram'
title_one_arr = np.zeros((1, MAX_LEN, 300))
title_two_arr = np.zeros((1, MAX_LEN, 300))
title_one = remove_stop_words(title_one.lower())
title_two = remove_stop_words(title_two.lower())

for idx, word in enumerate(title_one.split(' ')):
    title_one_arr[0, idx] = fasttext_model[word]
    
for idx, word in enumerate(title_two.split(' ')):
    title_two_arr[0, idx] = fasttext_model[word]

In [165]:
model.predict([title_one_arr, title_two_arr])

array([[0.57954806, 0.420452  ]], dtype=float32)