In [1]:
import fasttext
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import os
import random

import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout, Lambda, Concatenate

# Have to download the stopwords
# nltk.download('stopwords')

In [None]:
# Get the fasttext model (we are using the largest one they offer [600B tokens])
fasttext_model = fasttext.load_model('models/crawl-300d-2M-subword.bin')

## Data Processsing and Organization
Here, all we really want to do is prepare the data for training. This is **only** the data from **Gold Standard** This includes:
* Simplifying the original data
* Normalizing the data 
* Balancing the positive and negative examples
* Creating the embedding representations that will actually get fed into the neural network

In [2]:
def remove_stop_words(phrase):
    # Creates the stopwords
    to_stop = stopwords.words('english')
    punctuation = "!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~ "
    for c in punctuation:
        to_stop.append(c)

    to_stop.append('null')
    
    for punc in punctuation:
        phrase = phrase.replace(punc, ' ')
    
    return ' '.join((' '.join([x for x in phrase.split(' ') if x not in to_stop])).split())


In [35]:
# Organizing and normalizing the data
"""
Essentially, we want to only have three attributes for each training example: title_one, title_two, label
For normalization, we are just going to use the nltk stopwords and punctuation
"""

def preprocessing(orig_data):
    """
    Normalizes the data by getting rid of stopwords and punctuation
    """
    
    # The new names of the columns
    column_names = ['title_one', 'title_two', 'label']
    # A new dataframe for the data we are going to be creating
    norm_computers = pd.DataFrame(columns = column_names)
    # Iterate over the original dataframe (I know it is slow and there are probably better ways to do it)
    for row in orig_data.itertuples():
        title_left = remove_stop_words(row.title_left)
        title_right = remove_stop_words(row.title_right)
        
        # Append the newly created row (title_left, title_right, label) to the new dataframe
        norm_computers = norm_computers.append(pd.DataFrame([[title_left, title_right, row.label]], columns=column_names))
        
    return norm_computers
        

In [36]:
def create_simple_data():
    """
    Creates and saves a simpler version of the original data that only contains the the two titles and the label.
    """
    
    # Get the dataset of computer parts
    computers_df = pd.read_json('data/computers_train/computers_train_xlarge_normalized.json.gz',compression='gzip', lines=True)
    norm_computers = preprocessing(computers_df)
    
    # Save the new normalized and simplified data to a CSV file to load later
    norm_computers.to_csv('data/computers_train/computers_train_xlarge_norm_simple.csv', index=False)

In [4]:
# Create and save the data if the simple and normalized data does not exist
if not os.path.exists('data/computers_train/computers_train_xlarge_norm_simple.csv'):
    create_simple_data()

In [None]:
# Load the data
computer_df = pd.read_csv('data/computers_train/computers_train_xlarge_norm_simple.csv')

In [None]:
# See some of the data. There is clearly a separation between the positive and negative examples
computer_df

In [None]:
def create_train_df(df):
    """
    Returns a shuffled dataframe with an equal amount of positive and negative examples
    """
    # Get the positive and negative examples
    pos_df = df.loc[df['label'] == 1]
    neg_df = df.loc[df['label'] == 0]
    
    # Shuffle the data
    pos_df = pos_df.sample(frac=1)
    neg_df = neg_df.sample(frac=1)
    
    # Concatenate the positive and negative examples and 
    # make sure there are only as many negative examples as positive examples
    final_df = pd.concat([pos_df, neg_df[:len(pos_df)]])
    
    # Shuffle the final data once again
    final_df.sample(frac=1)
    return final_df

In [5]:
# Create and save the dataframe with equal numbers of positive and negative examples
# and is shuffled
if not os.path.exists('data/computers_train/computers_train_bal_shuffle.csv'):
    create_train_df(computer_df).to_csv('data/computers_train/computers_train_bal_shuffle.csv', index=False)

In [11]:
df = pd.read_csv('data/computers_train/computers_train_bal_shuffle.csv')

In [12]:
df

Unnamed: 0,title_one,title_two,label
0,corsair carbide air 240 windowed,corsair carbide series air 240 cube micro atx ...,1
1,a8 7670k black edition quad core amd cpu fan h...,amd a8 7650k 3 3ghz pccomponentes,1
2,amazonbasics 13 3 inch laptop sleeve black acc...,amazonbasics 13 3 inch laptop sleeve black car...,1
3,eg0146fartr hp 146 gb 6g 10k 2 5 dp sas hdd ne...,eg0146fartr hp 146 gb 6g 10k 2 5 dp sas hdd,1
4,usb 3 0 external adapter cable 2 5 inch hard d...,transcend ssd370 solid state drive ssd 2 5 sat...,0
...,...,...,...
19375,356816 001 ml350t g4p xeon 3 2 2mb 512mb whole...,409159 b21 hp xeon e5345 2 33ghz dl160 g3 new ...,0
19376,buy online samsung 750 evo series 120gb ssd mz...,ssd 750 basic 120 gb tradineur com,1
19377,628061 s21 hp g8 g9 3 tb 6g 7 2k 5 sata sc new...,628061 s21 hp g8 g9 3 tb 6g 7 2k 5 sata sc new...,1
19378,buy online zotac gtx 1060 6gb amp edition grap...,msi nvidia geforce gtx 1080 8gb gaming x rgb g...,0


## Laptop Data Preprocessing
* Normalize the data
* Create negative examples that represent when only a couple of attributes of the laptop data changes

In [3]:
# Load the laptop data
laptop_df = pd.read_csv('data/computers_train/laptops.csv', encoding='latin-1')

In [4]:
laptop_df

Unnamed: 0.1,Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.00
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1316,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,638.00
1299,1317,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,1499.00
1300,1318,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,229.00
1301,1319,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,764.00


In [5]:
# This class will be used in order to exchange the different attributes
# to create negative examples
class Attributes():
    company = {'Apple'}
    product = {'MacBook Pro'}
    inches = {'13.3'}
    cpu = {'Intel Core i5 2.3GHz'}
    ram = {'4GB'}
    memory = {'256GB SSD'}
    gpu = {'Intel HD Graphics 520'}
    screen = {'1440x900'}
    
    def get_all_data():
        return {
            'company': Attributes.company,
            'product': Attributes.product,
            'inches': Attributes.inches,
            'cpu': Attributes.cpu,
            'ram': Attributes.ram,
            'memory': Attributes.memory,
            'gpu': Attributes.gpu,
            'screen': Attributes.screen
        }

In [6]:
# Create attribute sets
def create_attribute_sets(df):
    Attributes.company.update([row.Company for row in laptop_df[['Company']].itertuples()])
    Attributes.product.update([row.Product for row in laptop_df[['Product']].itertuples()])
    Attributes.inches.update([str(row.Inches) for row in laptop_df[['Inches']].itertuples()])
    Attributes.cpu.update([row.Cpu for row in laptop_df[['Cpu']].itertuples()])
    Attributes.ram.update([row.Ram for row in laptop_df[['Ram']].itertuples()])
    Attributes.memory.update([row.Memory for row in laptop_df[['Memory']].itertuples()])
    Attributes.gpu.update([row.Gpu for row in laptop_df[['Gpu']].itertuples()])
    Attributes.screen.update([row.ScreenResolution for row in laptop_df[['ScreenResolution']].itertuples()])

create_attribute_sets(laptop_df)

In [7]:
Attributes.get_all_data()['product']

{'110-15ACL (A6-7310/4GB/500GB/W10)',
 '14-am079na (N3710/8GB/2TB/W10)',
 '15-AC110nv (i7-6500U/6GB/1TB/Radeon',
 '15-AY023na (N3710/8GB/2TB/W10)',
 '15-BA015wm (E2-7110/4GB/500GB/W10)',
 '15-BS026nv (i5-7200U/8GB/256GB/Radeon',
 '15-BS028nv (i3-6006U/4GB/1TB/Radeon',
 '15-BS078nr (i7-7500U/8GB/1TB/W10)',
 '15-BS101nv (i7-8550U/8GB/256GB/FHD/W10)',
 '15-BS103nv (i5-8250U/6GB/256GB/Radeon',
 '15-BW004nv (A9-9420/4GB/256GB/Radeon',
 '15-BW037na (A9-9420/4GB/1TB/Radeon',
 '15-BW091ND (A9-9420/6GB/1TB',
 '15-BW094nd (A6-9220/8GB/128GB/W10)',
 '15-ay047nv (i3-6006U/6GB/1TB/Radeon',
 '15-ba043na (A12-9700P/8GB/2TB/W10)',
 '15-bs002nv (i3-6006U/4GB/128GB/FHD/W10)',
 '15-bs005nv (i3-6006U/4GB/1TB',
 '15-bs011nv (i7-7500U/4GB/500GB/Radeon',
 '15-bs012nv (i7-7500U/8GB/1TB/Radeon',
 '15-bs015dx (i5-7200U/8GB/1TB/W10)',
 '15-bs017nv (i7-7500U/8GB/256GB/Radeon',
 '15-bs018nq (i3-6006U/4GB/500GB/FHD/No',
 '15-bs023nv (i3-6006U/4GB/1TB/FHD/W10)',
 '15-bs024nv (i5-7200U/8GB/128GB/W10)',
 '15-bs025nv (

In [8]:
def concatenate_row(row):
    # Note: got rid of everything after the '(' because it has info about the actual specs of the laptop
    # so if we change the specs, we need to fix that too
    return ' '.join([row['Company'], row['Product'].split('(')[0], row['TypeName'], str(row['Inches']), 'inch',  row['ScreenResolution'], row['Cpu'], row['Ram'], 'ram', row['Memory'], 'ram', row['Gpu']])

In [9]:
# Creates the negative examples for the laptop data
# The laptop_df is the original data, the new_df is the dataframe to append the new data to
# and the attributes are the attributes to swap for the new data
def create_neg_laptop_data(laptop_df, attributes):
    new_column_names = ['title_one', 'title_two', 'label']
    negative_df = pd.DataFrame(columns = new_column_names)
    for row in range(len(laptop_df)):
        # Create a copy of the row for the negative example
        neg_row = laptop_df.iloc[row]
        for attribute_class in attributes:
            # Get the row in the laptop_data
            orig_row = laptop_df.iloc[row]
            
            # Get the attribute that we are trying to change
            attribute_val = orig_row[attribute_class]
            
            # Temporarily value for the new value
            new_val = attribute_val
            
            # Make sure we really get a new attribute
            while new_val == attribute_val:
                new_val = random.sample(Attributes.get_all_data()[attribute_class.lower()], 1)[0]
            
            # Change the value in the neg_row to the new value
            neg_row[attribute_class] = new_val
            
            # Concatenate and normalize the data
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(neg_row).lower())
            
            # Append the data to the new df
            negative_df = negative_df.append(pd.DataFrame([[title_one, title_two, 0]], columns=new_column_names))
    
    return negative_df

In [10]:
neg_df = create_neg_laptop_data(laptop_df, attributes=['Cpu', 'Memory', 'Ram', 'Inches', 'Product'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [27]:
for i in range(len(neg_df)):
    print(neg_df.iloc()[i].title_one)

apple macbook pro ultrabook 13 3 inch ips panel retina display 2560x1600 intel core i5 2 3ghz 8gb ram 128gb ssd ram intel iris plus graphics 640
apple macbook pro ultrabook 13 3 inch ips panel retina display 2560x1600 intel core i5 2 3ghz 8gb ram 128gb ssd ram intel iris plus graphics 640
apple macbook pro ultrabook 13 3 inch ips panel retina display 2560x1600 intel core i5 2 3ghz 8gb ram 128gb ssd ram intel iris plus graphics 640
apple macbook pro ultrabook 13 3 inch ips panel retina display 2560x1600 intel core i5 2 3ghz 8gb ram 128gb ssd ram intel iris plus graphics 640
apple macbook pro ultrabook 13 3 inch ips panel retina display 2560x1600 intel core i5 2 3ghz 8gb ram 128gb ssd ram intel iris plus graphics 640
apple macbook air ultrabook 13 3 inch 1440x900 intel core i5 1 8ghz 8gb ram 128gb flash storage ram intel hd graphics 6000
apple macbook air ultrabook 13 3 inch 1440x900 intel core i5 1 8ghz 8gb ram 128gb flash storage ram intel hd graphics 6000
apple macbook air ultrabook 1

msi gt73evr 7re gaming 17 3 inch full hd 1920x1080 intel core i7 7700hq 2 8ghz 16gb ram 512gb ssd 1tb hdd ram nvidia geforce gtx 1070
msi gt73evr 7re gaming 17 3 inch full hd 1920x1080 intel core i7 7700hq 2 8ghz 16gb ram 512gb ssd 1tb hdd ram nvidia geforce gtx 1070
msi gt73evr 7re gaming 17 3 inch full hd 1920x1080 intel core i7 7700hq 2 8ghz 16gb ram 512gb ssd 1tb hdd ram nvidia geforce gtx 1070
msi gt73evr 7re gaming 17 3 inch full hd 1920x1080 intel core i7 7700hq 2 8ghz 16gb ram 512gb ssd 1tb hdd ram nvidia geforce gtx 1070
lenovo yoga 720 15ikb 2 1 convertible 15 6 inch ips panel full hd touchscreen 1920x1080 intel core i7 7700hq 2 8ghz 8gb ram 512gb ssd ram nvidia geforce gtx 1050m
lenovo yoga 720 15ikb 2 1 convertible 15 6 inch ips panel full hd touchscreen 1920x1080 intel core i7 7700hq 2 8ghz 8gb ram 512gb ssd ram nvidia geforce gtx 1050m
lenovo yoga 720 15ikb 2 1 convertible 15 6 inch ips panel full hd touchscreen 1920x1080 intel core i7 7700hq 2 8ghz 8gb ram 512gb ssd ram 

mediacom flexbook edge 2 1 convertible 11 6 inch ips panel full hd touchscreen 1920x1080 intel celeron dual core n3350 1 1ghz 4gb ram 32gb ssd ram intel hd graphics 500
mediacom flexbook edge 2 1 convertible 11 6 inch ips panel full hd touchscreen 1920x1080 intel celeron dual core n3350 1 1ghz 4gb ram 32gb ssd ram intel hd graphics 500
mediacom flexbook edge 2 1 convertible 11 6 inch ips panel full hd touchscreen 1920x1080 intel celeron dual core n3350 1 1ghz 4gb ram 32gb ssd ram intel hd graphics 500
samsung chromebook 3 netbook 11 6 inch 1366x768 intel celeron dual core n3060 1 6ghz 4gb ram 16gb flash storage ram intel hd graphics 400
samsung chromebook 3 netbook 11 6 inch 1366x768 intel celeron dual core n3060 1 6ghz 4gb ram 16gb flash storage ram intel hd graphics 400
samsung chromebook 3 netbook 11 6 inch 1366x768 intel celeron dual core n3060 1 6ghz 4gb ram 16gb flash storage ram intel hd graphics 400
samsung chromebook 3 netbook 11 6 inch 1366x768 intel celeron dual core n3060 1

lenovo thinkpad yoga 2 1 convertible 13 3 inch ips panel full hd touchscreen 1920x1080 intel core i7 7500u 2 7ghz 8gb ram 256gb ssd ram intel hd graphics 620
lenovo thinkpad yoga 2 1 convertible 13 3 inch ips panel full hd touchscreen 1920x1080 intel core i7 7500u 2 7ghz 8gb ram 256gb ssd ram intel hd graphics 620
lenovo thinkpad yoga 2 1 convertible 13 3 inch ips panel full hd touchscreen 1920x1080 intel core i7 7500u 2 7ghz 8gb ram 256gb ssd ram intel hd graphics 620
dell xps 15 notebook 15 6 inch 4k ultra hd touchscreen 3840x2160 intel core i7 7700hq 2 8ghz 16gb ram 1tb ssd ram nvidia geforce gtx 1050
dell xps 15 notebook 15 6 inch 4k ultra hd touchscreen 3840x2160 intel core i7 7700hq 2 8ghz 16gb ram 1tb ssd ram nvidia geforce gtx 1050
dell xps 15 notebook 15 6 inch 4k ultra hd touchscreen 3840x2160 intel core i7 7700hq 2 8ghz 16gb ram 1tb ssd ram nvidia geforce gtx 1050
dell xps 15 notebook 15 6 inch 4k ultra hd touchscreen 3840x2160 intel core i7 7700hq 2 8ghz 16gb ram 1tb ssd ra

dell alienware 17 gaming 17 3 inch ips panel full hd 1920x1080 intel core i7 7700hq 2 8ghz 32gb ram 512gb ssd 1tb hdd ram nvidia geforce gtx 1070
dell alienware 17 gaming 17 3 inch ips panel full hd 1920x1080 intel core i7 7700hq 2 8ghz 32gb ram 512gb ssd 1tb hdd ram nvidia geforce gtx 1070
dell alienware 17 gaming 17 3 inch ips panel full hd 1920x1080 intel core i7 7700hq 2 8ghz 32gb ram 512gb ssd 1tb hdd ram nvidia geforce gtx 1070
hp probook 470 notebook 17 3 inch 1600x900 intel core i5 7200u 2 5ghz 8gb ram 1tb hdd ram nvidia geforce 930mx
hp probook 470 notebook 17 3 inch 1600x900 intel core i5 7200u 2 5ghz 8gb ram 1tb hdd ram nvidia geforce 930mx
hp probook 470 notebook 17 3 inch 1600x900 intel core i5 7200u 2 5ghz 8gb ram 1tb hdd ram nvidia geforce 930mx
hp probook 470 notebook 17 3 inch 1600x900 intel core i5 7200u 2 5ghz 8gb ram 1tb hdd ram nvidia geforce 930mx
hp probook 470 notebook 17 3 inch 1600x900 intel core i5 7200u 2 5ghz 8gb ram 1tb hdd ram nvidia geforce 930mx
dell vo

lenovo thinkpad l560 notebook 15 6 inch full hd 1920x1080 intel core i5 6200u 2 3ghz 8gb ram 256gb ssd ram intel hd graphics 520
lenovo thinkpad l560 notebook 15 6 inch full hd 1920x1080 intel core i5 6200u 2 3ghz 8gb ram 256gb ssd ram intel hd graphics 520
lenovo thinkpad l560 notebook 15 6 inch full hd 1920x1080 intel core i5 6200u 2 3ghz 8gb ram 256gb ssd ram intel hd graphics 520
lenovo thinkpad l560 notebook 15 6 inch full hd 1920x1080 intel core i5 6200u 2 3ghz 8gb ram 256gb ssd ram intel hd graphics 520
hp elitebook 840 notebook 14 0 inch full hd 1920x1080 intel core i5 6200u 2 3ghz 4gb ram 500gb hdd ram intel hd graphics 520
hp elitebook 840 notebook 14 0 inch full hd 1920x1080 intel core i5 6200u 2 3ghz 4gb ram 500gb hdd ram intel hd graphics 520
hp elitebook 840 notebook 14 0 inch full hd 1920x1080 intel core i5 6200u 2 3ghz 4gb ram 500gb hdd ram intel hd graphics 520
hp elitebook 840 notebook 14 0 inch full hd 1920x1080 intel core i5 6200u 2 3ghz 4gb ram 500gb hdd ram intel 

acer aspire es1 572 notebook 15 6 inch 1366x768 intel core i3 6006u 2 0ghz 4gb ram 500gb hdd ram intel hd graphics 520
acer aspire es1 572 notebook 15 6 inch 1366x768 intel core i3 6006u 2 0ghz 4gb ram 500gb hdd ram intel hd graphics 520
dell inspiron 7779 2 1 convertible 17 3 inch full hd touchscreen 1920x1080 intel core i7 7500u 2 7ghz 16gb ram 512gb ssd ram nvidia geforce 940mx
dell inspiron 7779 2 1 convertible 17 3 inch full hd touchscreen 1920x1080 intel core i7 7500u 2 7ghz 16gb ram 512gb ssd ram nvidia geforce 940mx
dell inspiron 7779 2 1 convertible 17 3 inch full hd touchscreen 1920x1080 intel core i7 7500u 2 7ghz 16gb ram 512gb ssd ram nvidia geforce 940mx
dell inspiron 7779 2 1 convertible 17 3 inch full hd touchscreen 1920x1080 intel core i7 7500u 2 7ghz 16gb ram 512gb ssd ram nvidia geforce 940mx
dell inspiron 7779 2 1 convertible 17 3 inch full hd touchscreen 1920x1080 intel core i7 7500u 2 7ghz 16gb ram 512gb ssd ram nvidia geforce 940mx
hp elitebook 840 notebook 14 0 i

In [138]:
# Creates the postive examples for the laptop data
# The laptop_df is the original data, the new_df is the dataframe to append the new data to
# and the attributes are the attributes to swap or delete for the new data
def create_pos_laptop_data(laptop_df, rm_attrs, add_attrs):
    new_column_names = ['title_one', 'title_two', 'label']
    pos_df = pd.DataFrame(columns = new_column_names)
    for row in range(len(laptop_df)):
        # Remove the attribute from the new title
        for attr_list in rm_attrs:
            # Create a copy of the row for the negative example
            new_row = laptop_df.iloc[row]
            orig_row = laptop_df.iloc[row]
            for attr in attr_list:
                new_row[attr] = ''
        
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(new_row).lower())

            # Occassionally add in the operating system just to switch it up
            if (random.sample([0, 1], 1)):
                for attr in add_attrs:
                    title_two += ' ' + orig_row[attr].lower()

            pos_df = pos_df.append(pd.DataFrame([[title_one, title_two, 1]], columns=new_column_names))

    return pos_df

In [139]:
pos_df = create_pos_laptop_data(laptop_df, rm_attrs = [['Company'], ['TypeName'], ['ScreenResolution'], ['Product'], ['TypeName', 'ScreenResolution']], add_attrs=['OpSys'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [160]:
pos_df

Unnamed: 0,title_one,title_two,label
0,apple macbook pro ultrabook 13 3 inch ips pane...,macbook pro ultrabook 13 3 inch ips panel reti...,1
0,apple macbook pro ultrabook 13 3 inch ips pane...,apple macbook pro 13 3 inch ips panel retina d...,1
0,apple macbook pro ultrabook 13 3 inch ips pane...,apple macbook pro ultrabook 13 3 inch intel co...,1
0,apple macbook pro ultrabook 13 3 inch ips pane...,apple ultrabook 13 3 inch ips panel retina dis...,1
0,apple macbook pro ultrabook 13 3 inch ips pane...,apple macbook pro 13 3 inch intel core i5 2 3g...,1
...,...,...,...
0,asus x553sa xx031t n3050 4gb 500gb w10 noteboo...,x553sa xx031t n3050 4gb 500gb w10 notebook 15 ...,1
0,asus x553sa xx031t n3050 4gb 500gb w10 noteboo...,asus x553sa xx031t n3050 4gb 500gb w10 15 6 in...,1
0,asus x553sa xx031t n3050 4gb 500gb w10 noteboo...,asus x553sa xx031t n3050 4gb 500gb w10 noteboo...,1
0,asus x553sa xx031t n3050 4gb 500gb w10 noteboo...,asus notebook 15 6 inch 1366x768 intel celeron...,1


## Embeddings Creation
Generates the embeddings and saves them

In [6]:
"""
Definitions of some sizes in the training set
"""
MAX_LEN = 42
EMBEDDING_SHAPE = (300,)
m = 19380
print('MAX_LEN: ' + str(MAX_LEN), 'EMBEDDING_SHAPE: ' + str(EMBEDDING_SHAPE), 'm: ' + str(m))

MAX_LEN: 42 EMBEDDING_SHAPE: (300,) m: 19380


In [44]:
"""
Create the numpy files of all the training embedddings
We will have two numpy files:
1. The training/validation/test sets
2. The labels
"""

def create_embeddings(df):
    # Create the numpy arrays for storing the embeddings and labels
    total_embeddings = np.zeros(shape=(m, 2, MAX_LEN, EMBEDDING_SHAPE[0]))
    labels = np.zeros(shape=(m))
    
    # I know this is a terrible way of doing this, but iterate over the dataframe
    # and generate the embeddings to add to the numpy array
    for idx, row in enumerate(df.itertuples()):
        for word_idx, word in enumerate(row.title_one.split()):
            total_embeddings[idx, 0, word_idx] = fasttext_model[word]
            
        for word_idx, word in enumerate(row.title_two.split()):
            total_embeddings[idx, 1, word_idx] = fasttext_model[word]
            
        labels[idx] = row.label
        
    return total_embeddings, labels


In [14]:
def save_embeddings(df, embeddings_name, labels_name):
    """
    Saves the embeddings given the embeddings file name and labels file name
    """
    if not os.path.exists('data/computers_numpy/' + embeddings_name + '.npy'):
        embeddings, labels = create_embeddings(df)
        with open('data/computers_numpy/' + embeddings_name + '.npy', 'wb') as f:
            np.save(f, embeddings)

        with open('data/computers_numpy/' + labels_name + '.npy', 'wb') as f:
            np.save(f, labels)

In [15]:
def load_embeddings_and_labels(embeddings_name, labels_name):
    loaded_embeddings = None
    labels = None
    with open('data/computers_numpy/' + embeddings_name + '.npy', 'rb') as f:
        loaded_embeddings = np.load(f)
        loaded_embeddings = np.transpose(loaded_embeddings, (1, 0, 2, 3))
    
    with open('data/computers_numpy/' + labels_name + '.npy', 'rb') as f:
        labels = np.load(f)
    
    return loaded_embeddings, labels

In [16]:
def get_max_len(df):
    max_len = 0
    for row in df.itertuples():
        if len(row.title_one.split(' ')) > max_len:
            max_len = len(row.title_one.split(' '))
            
        if len(row.title_two.split(' ')) > max_len:
            max_len = len(row.title_two.split(' '))
    
    return max_len


In [17]:
save_embeddings(df, 'bal_embeddings', 'bal_labels')

In [5]:
embeddings, labels = load_embeddings_and_labels('bal_embeddings', 'bal_labels')

In [6]:
X_train1 = embeddings[0, :15000]
X_train2 = embeddings[1, :15000]
X_train = np.stack((X_train1, X_train2))
print('Training shape: ' + str(X_train.shape))

X_val1 = embeddings[0, 15000:17000]
X_val2 = embeddings[1, 15000:17000]
X_val = np.stack((X_val1, X_val2))
print('Val shape: ' + str(X_val.shape))

X_test1 = embeddings[0, 17000:]
X_test2 = embeddings[1, 17000:]
X_test = np.stack((X_test1, X_test2))
print('Test shape: ' + str(X_test.shape))

Training shape: (2, 15000, 42, 300)
Val shape: (2, 2000, 42, 300)
Test shape: (2, 2380, 42, 300)


In [7]:
Y_train = labels[:15000]
Y_val = labels[15000:17000]
Y_test = labels[17000:]

In [8]:
def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

In [9]:
Y_train = convert_to_one_hot(Y_train.astype(np.int32), 2)
Y_val = convert_to_one_hot(Y_val.astype(np.int32), 2)
Y_test = convert_to_one_hot(Y_test.astype(np.int32), 2)

## Model Info

For the model, we are going to use LSTMs with a Constrastive Loss Function 
that will also be used to predict whether the two products are the same 

First, we have to convert the titles to embeddings through FastText before feeding into the LSTM.
The embedding part of this model will not be a layer because:
* The fasttext model would be time consuming and annoying to get to work with an embedding layer in Keras
* The fasttext model is not going to be getting its embeddings optimized, so there is really no point in adding it as an embedding layer

In [7]:
def square_distance(vectors):
    x, y = vectors
    return tf.square(x - y)

def euclidean_dist_out_shape(shapes):
    # Both inputs are fed in, so just use one of them and get the first value in the shape
    shape1, shape2 = shapes
    return (shape1[0],)

def siamese_network(input_shape):
    # Defines our inputs
    left_title = Input(input_shape, dtype='float32')
    right_title = Input(input_shape, dtype='float32')
    
    # The LSTM units
    model = tf.keras.Sequential(name='siamese_model')
    model.add(LSTM(units=256, return_sequences=True, name='lstm_1'))
    model.add(Dropout(rate=0.5))
    model.add(LSTM(units=128, return_sequences=True, name='lstm_2'))
    model.add(Dropout(rate=0.5))
    model.add(LSTM(units=128, name='lstm_3'))
    model.add(Dropout(rate=0.5))
    
    # The dense layers
    model.add(Dense(units=1024, activation='elu', name='dense_1'))
    model.add(Dropout(rate=0.5))
    model.add(Dense(units=512, activation='elu', name='dense_2'))
    
    # Forward propagate through the model to generate the encodings
    encoded_left_title = model(left_title)
    encoded_right_title = model(right_title)

    SquareDistanceLayer = Lambda(square_distance)
    distance = SquareDistanceLayer([encoded_left_title, encoded_right_title])
    
    prediction = Dense(units=2, activation='softmax')(distance)
    # Create and return the network
    siamese_net = tf.keras.Model(inputs=[left_title, right_title], outputs=prediction, name='siamese_network')
    return siamese_net

In [37]:
# Note: for the constrastive loss, because 0 denotes that they are from the same class
# and one denotes they are from a different class, I swaped the (Y) and (1 - Y) terms

def constrastive_loss(y_true, y_pred):
    margin = 2.0
    d = y_pred
    d_sqrt = tf.sqrt(d)
    #tf.print('\nY Pred: ', d, 'Shape: ', tf.shape(d))
    #tf.print('\nY True: ', y_true, 'Shape: ', tf.shape(y_true))
    
    loss = (y_true * d) + ((1 - y_true) * tf.square(tf.maximum(0., margin - d_sqrt)))
    
    #tf.print('\n Constrastive Loss: ', loss, 'Shape: ', tf.shape(loss))
    loss = 0.5 * tf.reduce_mean(loss)
    
    return loss

In [38]:
# Accuracy metric for constrastive loss because values close to 0 are equal and values high are different
# 0.5 is the threshold here
def constrastive_accuracy(y_true, y_pred):
    return tf.reduce_mean(tf.cast(tf.equal(y_true, tf.cast(y_pred < 0.5, y_true.dtype)), y_true.dtype))

In [2]:
def save_model(model, name):
    """
    Saves a model with a particular name
    """
    model.save('models/' + name + '.h5')

In [8]:
model = siamese_network((MAX_LEN, EMBEDDING_SHAPE[0],))
model.summary()

Model: "siamese_network"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 42, 300)]    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 42, 300)]    0                                            
__________________________________________________________________________________________________
siamese_model (Sequential)      (None, 512)          1555968     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 512)          0           siamese_model[0][0]

In [None]:
# Compile the model
lr = 0.001
opt = tf.keras.optimizers.Adam(learning_rate=lr)
model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])

In [44]:
# Train the model
model.fit(x=[X_train1, X_train2], y=Y_train, batch_size=64, epochs=50, validation_data=([X_val[0], X_val[1]], Y_val))

Train on 15000 samples, validate on 2000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x1d34eb2a188>

In [None]:
# Test the model
results = model.evaluate([X_test1, X_test2], Y_test, batch_size=16)
print('test loss, test acc: ', results)

In [47]:
# Save the model
model_name = 'Softmax-LSTM-_epochs_loss'
save_model(model, model_name)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: models/assets


## Manual Testing
Converts titles into embeddings arrays and allow the model to make a prediction

In [9]:
model.load_weights('models/' + model_name + '.h5')

In [51]:
title_one = 'True Wireless Earbuds VANKYO X200 Bluetooth 5 0 Earbuds in Ear TWS Stereo Headphones Smart LED Display Charging Case IPX8 Waterproof 120H Playtime Built Mic Deep Bass Sports Work'
title_two = 'TOZO T10 Bluetooth 5 0 Wireless Earbuds Wireless Charging Case IPX8 Waterproof TWS Stereo Headphones Ear Built Mic Headset Premium Sound Deep Bass Sport Black'
title_one_arr = np.zeros((1, 42, 300))
title_two_arr = np.zeros((1, 42, 300))
title_one.lower()
title_two.lower()
for idx, word in enumerate(title_one.split(' ')):
    title_one_arr[0, idx] = fasttext_model[word]
    
for idx, word in enumerate(title_two.split(' ')):
    title_two_arr[0, idx] = fasttext_model[word]

In [52]:
model.predict([title_one_arr, title_two_arr])

array([[0.27092224, 0.7290778 ]], dtype=float32)