In [7]:
import fasttext
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import os
import random
from itertools import combinations

import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout, Lambda, Concatenate

# Have to download the stopwords
# nltk.download('stopwords')

In [2]:
# Get the fasttext model (we are using the largest one they offer [600B tokens])
fasttext_model = fasttext.load_model('models/crawl-300d-2M-subword.bin')



## General Useful Function
Functions that are continually used throughout this project

In [2]:
"""
Definitions of some sizes in the training set
"""
MAX_LEN = 50
EMBEDDING_SHAPE = (300,)
print('MAX_LEN: ' + str(MAX_LEN), 'EMBEDDING_SHAPE: ' + str(EMBEDDING_SHAPE))

MAX_LEN: 50 EMBEDDING_SHAPE: (300,)


In [3]:
def get_max_len(df):
    max_len = 0
    for row in df.itertuples():
        if len(row.title_one.split(' ')) > max_len:
            max_len = len(row.title_one.split(' '))
            
        if len(row.title_two.split(' ')) > max_len:
            max_len = len(row.title_two.split(' '))
    
    return max_len

In [4]:
def print_dataframe(df):
    for idx in range(len(df)):
        print(df.iloc[idx].title_one + '\n' + df.iloc[idx].title_two)
        print('________________________________________________________________')

In [5]:
def create_final_data(pos_df, neg_df):
    pos_df.sample(frac=1)
    neg_df.sample(frac=1)
    final_df = pd.concat([pos_df[:min(len(pos_df), len(neg_df))], neg_df[:min(len(pos_df), len(neg_df))]])
    final_df = final_df.sample(frac=1)
    return final_df

In [6]:
def remove_stop_words(phrase):
    # Creates the stopwords
    to_stop = stopwords.words('english')
    punctuation = "!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~ "
    for c in punctuation:
        to_stop.append(c)

    to_stop.append('null')
    
    for punc in punctuation:
        phrase = phrase.replace(punc, ' ')
    
    return ' '.join((' '.join([x for x in phrase.split(' ') if x not in to_stop])).split())


## Data Processsing and Organization
Here, all we really want to do is prepare the data for training. This is **only** the data from **Gold Standard** This includes:
* Simplifying the original data
* Normalizing the data 
* Balancing the positive and negative examples
* Creating the embedding representations that will actually get fed into the neural network

In [32]:
# Organizing and normalizing the data
"""
Essentially, we want to only have three attributes for each training example: title_one, title_two, label
For normalization, we are just going to use the nltk stopwords and punctuation
"""

def preprocessing(orig_data):
    """
    Normalizes the data by getting rid of stopwords and punctuation
    """
    
    # The new names of the columns
    column_names = ['title_one', 'title_two', 'label']
    # A new dataframe for the data we are going to be creating
    norm_computers = pd.DataFrame(columns = column_names)
    # Iterate over the original dataframe (I know it is slow and there are probably better ways to do it)
    for row in orig_data.itertuples():
        title_left = remove_stop_words(row.title_left)
        title_right = remove_stop_words(row.title_right)
        
        # Append the newly created row (title_left, title_right, label) to the new dataframe
        norm_computers = norm_computers.append(pd.DataFrame([[title_left, title_right, row.label]], columns=column_names))
        
    return norm_computers
        

In [33]:
def create_simple_data():
    """
    Creates and saves a simpler version of the original data that only contains the the two titles and the label.
    """
    
    # Get the dataset of computer parts
    computers_df = pd.read_json('data/train/computers_train_xlarge_normalized.json.gz',compression='gzip', lines=True)
    norm_computers = preprocessing(computers_df)
    
    # Save the new normalized and simplified data to a CSV file to load later
    norm_computers.to_csv('data/train/computers_train_xlarge_norm_simple.csv', index=False)

In [34]:
# Create and save the data if the simple and normalized data does not exist
if not os.path.exists('data/train/computers_train_xlarge_norm_simple.csv'):
    create_simple_data()

In [None]:
# Load the data
computer_df = pd.read_csv('data/train/computers_train_xlarge_norm_simple.csv')

In [None]:
# See some of the data. There is clearly a separation between the positive and negative examples
computer_df

In [64]:
def create_train_df(df):
    """
    Returns a shuffled dataframe with an equal amount of positive and negative examples
    """
    # Get the positive and negative examples
    pos_df = df.loc[df['label'] == 1]
    neg_df = df.loc[df['label'] == 0]
    
    # Shuffle the data
    pos_df = pos_df.sample(frac=1)
    neg_df = neg_df.sample(frac=1)
    
    # Concatenate the positive and negative examples and 
    # make sure there are only as many negative examples as positive examples
    final_df = pd.concat([pos_df[:min(len(pos_df), len(neg_df))], neg_df[:min(len(pos_df), len(neg_df))]])
    
    # Shuffle the final data once again
    final_df.sample(frac=1)
    return final_df

In [36]:
# Create and save the dataframe with equal numbers of positive and negative examples
# and is shuffled
if not os.path.exists('data/train/computers_train_bal_shuffle.csv'):
    create_train_df(computer_df).to_csv('data/train/computers_train_bal_shuffle.csv', index=False)

In [37]:
final_computer_df = pd.read_csv('data/train/computers_train_bal_shuffle.csv')

In [76]:
final_computer_df

Unnamed: 0,title_one,title_two,label
0,corsair carbide air 240 windowed,corsair carbide series air 240 cube micro atx ...,1
1,a8 7670k black edition quad core amd cpu fan h...,amd a8 7650k 3 3ghz pccomponentes,1
2,amazonbasics 13 3 inch laptop sleeve black acc...,amazonbasics 13 3 inch laptop sleeve black car...,1
3,eg0146fartr hp 146 gb 6g 10k 2 5 dp sas hdd ne...,eg0146fartr hp 146 gb 6g 10k 2 5 dp sas hdd,1
4,usb 3 0 external adapter cable 2 5 inch hard d...,transcend ssd370 solid state drive ssd 2 5 sat...,0
...,...,...,...
19375,356816 001 ml350t g4p xeon 3 2 2mb 512mb whole...,409159 b21 hp xeon e5345 2 33ghz dl160 g3 new ...,0
19376,buy online samsung 750 evo series 120gb ssd mz...,ssd 750 basic 120 gb tradineur com,1
19377,628061 s21 hp g8 g9 3 tb 6g 7 2k 5 sata sc new...,628061 s21 hp g8 g9 3 tb 6g 7 2k 5 sata sc new...,1
19378,buy online zotac gtx 1060 6gb amp edition grap...,msi nvidia geforce gtx 1080 8gb gaming x rgb g...,0


## Laptop Data Preprocessing
* Normalize the data
* Create negative examples that represent when only a couple of attributes of the laptop data changes

In [8]:
# Load the laptop data
laptop_df = pd.read_csv('data/train/laptops.csv', encoding='latin-1')

In [9]:
laptop_df

Unnamed: 0.1,Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.00
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1316,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,638.00
1299,1317,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,1499.00
1300,1318,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,229.00
1301,1319,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,764.00


In [16]:
# This class will be used in order to exchange the different attributes
# to create negative examples
class Attributes():
    company = {'Apple'}
    product = {'MacBook Pro'}
    inches = {'13.3'}
    cpu = {'Intel Core i5 2.3GHz'}
    ram = {'4GB'}
    memory = {'256GB SSD'}
    gpu = {'Intel HD Graphics 520'}
    screen = {'1440x900'}
    
    def get_all_data():
        return {
            'company': Attributes.company,
            'product': Attributes.product,
            'inches': Attributes.inches,
            'cpu': Attributes.cpu,
            'ram': Attributes.ram,
            'memory': Attributes.memory,
            'gpu': Attributes.gpu,
            'screen': Attributes.screen
        }

In [17]:
# Create attribute sets
def create_attribute_sets(df):
    Attributes.company.update([row.Company for row in laptop_df[['Company']].itertuples()])
    Attributes.product.update([row.Product for row in laptop_df[['Product']].itertuples()])
    Attributes.inches.update([str(row.Inches) for row in laptop_df[['Inches']].itertuples()])
    Attributes.cpu.update([row.Cpu for row in laptop_df[['Cpu']].itertuples()])
    Attributes.ram.update([row.Ram for row in laptop_df[['Ram']].itertuples()])
    Attributes.memory.update([row.Memory for row in laptop_df[['Memory']].itertuples()])
    Attributes.gpu.update([row.Gpu for row in laptop_df[['Gpu']].itertuples()])
    Attributes.screen.update([row.ScreenResolution for row in laptop_df[['ScreenResolution']].itertuples()])

create_attribute_sets(laptop_df)

In [28]:
def concatenate_row(row):
    # Note: got rid of everything after the '(' because it has info about the actual specs of the laptop
    # so if we change the specs, we need to fix that too
    
    # Special tags at the end of the amount of inches of the laptop and the RAM to simulate real data
    inch_attr = str(row['Inches']) + random.choice([' inch', '', '"'])
    ram_attr = row['Ram'] + random.choice([' ram', ' memory', ''])
    
    # These are words that commonly come up with laptops
    modifiers = ['premium', 'new', 'fast', 'latest model']
    add_ins = ['USB 3.0', 'USB 3.1 Type-C', 'USB Type-C', 'Bluetooth', 'WIFI', 'Webcam', 'FP Reader',
               'HDMI', '802.11ac', '802.11 ac', 'home', 'flagship', 'business', 'GbE LAN', 'DVD-RW', 'DVD', 'Windows 10']
    
    cpu_attr = row['Cpu']
    if random.choice([0, 1]):
        cpu_attr = cpu_attr.split(' ')
        if random.choice([0, 1]):
            if 'Intel' in cpu_attr:
                cpu_attr.remove('Intel')
        if random.choice([0, 1]):
            if 'Core' in cpu_attr:
                cpu_attr.remove('Core')
        if random.choice([0, 1]):
            if 'AMD' in cpu_attr:
                cpu_attr.remove('AMD')
    
        cpu_attr = ' '.join(cpu_attr)

    # Create a list for all the product attributes
    order_attrs = [random.choice(modifiers),
                   row['Company'],
                   row['Product'].split('(')[0],
                   row['TypeName'],
                   inch_attr,
                   row['ScreenResolution'],
                   cpu_attr,
                   ram_attr,
                   row['Memory'],
                   row['Gpu']]
    
    order_attrs = order_attrs + random.sample(add_ins, random.choice([1, 2, 3, 4]))
    
    # Shuffle the data because in real data, it does not really matter what order the attributes are in
    random.shuffle(order_attrs)
    
    return ' '.join(order_attrs)

In [29]:
# Creates the negative examples for the laptop data
# The laptop_df is the original data, the new_df is the dataframe to append the new data to
# and the attributes are the attributes to swap for the new data
def create_neg_laptop_data(laptop_df, attributes):
    new_column_names = ['title_one', 'title_two', 'label']
    negative_df = pd.DataFrame(columns = new_column_names)
    for row in range(len(laptop_df)):
        # Create a copy of the row for the negative example
        neg_row = laptop_df.iloc[row]
        for attribute_class in attributes:
            # Get the row in the laptop_data
            orig_row = laptop_df.iloc[row]
            
            # Get the attribute that we are trying to change
            attribute_val = orig_row[attribute_class]
            
            # Temporarily value for the new value
            new_val = attribute_val
            
            # Make sure we really get a new attribute
            while new_val == attribute_val:
                new_val = random.sample(Attributes.get_all_data()[attribute_class.lower()], 1)[0]
            
            # Change the value in the neg_row to the new value
            neg_row[attribute_class] = new_val
            
            # Concatenate and normalize the data
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(neg_row).lower())
            
            # Append the data to the new df
            negative_df = negative_df.append(pd.DataFrame([[title_one, title_two, 0]], columns=new_column_names))
    
    return negative_df

In [30]:
neg_df = create_neg_laptop_data(laptop_df, attributes=['Cpu', 'Memory', 'Ram', 'Inches', 'Product'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [31]:
print_dataframe(neg_df)

13 3 inch 8gb intel core i5 2 3ghz premium ultrabook apple 128gb ssd ips panel retina display 2560x1600 macbook pro wifi intel iris plus graphics 640
ultrabook 802 11 ac ips panel retina display 2560x1600 macbook pro apple usb 3 1 type c intel iris plus graphics 640 premium intel xeon e3 1535m v5 2 9ghz 13 3 128gb ssd 8gb
________________________________________________________________
13 3" ultrabook ips panel retina display 2560x1600 apple intel iris plus graphics 640 128gb ssd intel core i5 2 3ghz macbook pro 8gb ram new 802 11 ac
intel xeon e3 1535m v5 2 9ghz usb 3 0 premium 32gb ssd usb type c macbook pro intel iris plus graphics 640 ips panel retina display 2560x1600 apple ultrabook dvd 8gb memory 13 3 inch
________________________________________________________________
usb 3 1 type c ultrabook ips panel retina display 2560x1600 intel core i5 2 3ghz macbook pro bluetooth gbe lan intel iris plus graphics 640 fast apple 8gb ram wifi 13 3 inch 128gb ssd
4gb intel iris plus graphics

8gb ram latitude 5590 intel uhd graphics 620 new intel core i5 8250u 1 6ghz ips panel full hd 1920x1080 802 11ac ultrabook dell 15 6" dvd 256gb ssd
flagship ultrabook latitude 5590 8gb ram usb 3 0 ips panel full hd 1920x1080 premium dell usb 3 1 type c intel core i5 6200u 2 3ghz 15 6" 802 11 ac intel uhd graphics 620 16gb flash storage
________________________________________________________________
core i5 8250u 1 6ghz 8gb ram ips panel full hd 1920x1080 802 11 ac latitude 5590 latest model 15 6" 256gb ssd ultrabook intel uhd graphics 620 dell
dell 16gb flash storage ultrabook webcam latitude 5590 ips panel full hd 1920x1080 flagship intel uhd graphics 620 4gb latest model 15 6" intel core i5 6200u 2 3ghz
________________________________________________________________
8gb memory 15 6 dell bluetooth 256gb ssd intel uhd graphics 620 webcam latest model intel core i5 8250u 1 6ghz latitude 5590 ips panel full hd 1920x1080 usb type c ultrabook
webcam latitude 5590 new 4gb ram 12 5 inch 16

elitebook 840 ultrabook full hd 1920x1080 intel hd graphics 620 4gb ram latest model 256gb ssd intel core i5 7500u 2 7ghz hp wifi 14 0
elitebook 840 4gb ram intel celeron dual core 3855u 1 6ghz ultrabook new bluetooth full hd 1920x1080 14 0" intel hd graphics 620 32gb flash storage hp
________________________________________________________________
gbe lan intel hd graphics 620 hp fast ultrabook full hd 1920x1080 elitebook 840 256gb ssd intel core i5 7500u 2 7ghz bluetooth 14 0" 4gb ram
16gb 14 0" elitebook 840 ultrabook intel celeron dual core 3855u 1 6ghz usb type c 32gb flash storage fast full hd 1920x1080 dvd intel hd graphics 620 usb 3 1 type c hdmi hp
________________________________________________________________
elitebook 840 gbe lan latest model intel hd graphics 620 256gb ssd hp usb 3 1 type c full hd 1920x1080 4gb ram bluetooth 14 0 i5 7500u 2 7ghz ultrabook
802 11 ac celeron dual core 3855u 1 6ghz hp 802 11ac ultrabook intel hd graphics 620 full hd 1920x1080 fp reader 13 0

15 6 1tb hdd gbe lan 802 11ac 8gb ram amd firepro w4190m webcam notebook new full hd 1920x1080 hp windows 10 i7 7500u 2 7ghz zbook 15u
zbook 15u windows 10 1tb hdd home amd firepro w4190m new 8gb memory hp amd a6 series a6 9220 2 5ghz notebook 15 6 full hd 1920x1080
________________________________________________________________
8gb ram full hd 1920x1080 amd firepro w4190m notebook intel core i7 7500u 2 7ghz 1tb hdd hp zbook 15u usb 3 1 type c new 15 6
128gb ssd 1tb hdd 802 11 ac 802 11ac notebook full hd 1920x1080 amd a6 series a6 9220 2 5ghz flagship hp amd firepro w4190m home 15 6" premium zbook 15u 8gb memory
________________________________________________________________
1tb hdd windows 10 full hd 1920x1080 zbook 15u hp gbe lan new notebook intel i7 7500u 2 7ghz 15 6 8gb amd firepro w4190m
zbook 15u premium full hd 1920x1080 amd firepro w4190m notebook hp gbe lan 15 6 inch 128gb ssd 1tb hdd 24gb amd a6 series a6 9220 2 5ghz 802 11 ac
_____________________________________________

________________________________________________________________
802 11 ac premium rog gl702vs gc095t home gaming intel core i7 7700hq 2 8ghz 256gb ssd 1tb hdd nvidia geforce gtx 1070 full hd 1920x1080 17 3" 16gb asus
gaming 512gb ssd 256gb ssd rog gl702vs gc095t 802 11ac intel 6y30 0 9ghz asus nvidia geforce gtx 1070 17 3 inch 2gb memory usb type c bluetooth latest model full hd 1920x1080
________________________________________________________________
premium usb 3 0 256gb ssd 1tb hdd rog gl702vs gc095t webcam intel core i7 7700hq 2 8ghz 17 3 inch nvidia geforce gtx 1070 16gb ram asus gaming wifi full hd 1920x1080
webcam 512gb ssd 256gb ssd gaming home 2gb ram flagship usb 3 0 nvidia geforce gtx 1070 15 6" rog gl702vs gc095t asus intel core 6y30 0 9ghz full hd 1920x1080 fast
________________________________________________________________
intel core i7 7700hq 2 8ghz 17 3" asus gaming full hd 1920x1080 dvd rog gl702vs gc095t 256gb ssd 1tb hdd fast nvidia geforce gtx 1070 16gb ram
full

premium 16gb memory notebook webcam fp reader intel hd graphics 400 15 6" dell inspiron 3552 802 11 ac intel core i7 2 7ghz 32gb ssd 1366x768
________________________________________________________________
intel hd graphics 400 windows 10 dell 500gb hdd 4gb memory inspiron 3552 latest model intel celeron dual n3060 1 60ghz 15 6" 1366x768 notebook bluetooth
usb 3 0 1366x768 dell 17 0 intel i7 2 7ghz windows 10 32gb ssd inspiron 3552 notebook intel hd graphics 400 16gb memory new webcam
________________________________________________________________
inspiron 3552 15 6 inch 500gb hdd 4gb memory latest model notebook 1366x768 business intel hd graphics 400 celeron dual core n3060 1 60ghz webcam dell dvd rw
premium 802 11 ac 32gb ssd intel i7 2 7ghz intel hd graphics 400 dell 1366x768 notebook 16gb 17 0 inch portege z30 c 16z
________________________________________________________________
intel hd graphics 500 wifi flagship dvd celeron dual n3350 1 1ghz premium 1366x768 802 11ac notebook

intel hd graphics 620 14 0" lenovo premium 8gb ram business notebook thinkpad t470 intel i7 6700hq 2 6ghz 256gb ssd fp reader full hd 1920x1080 hdmi
________________________________________________________________
14 0 inch intel hd graphics 620 full hd 1920x1080 8gb dvd rw lenovo 256gb ssd fast notebook thinkpad t470 intel i5 7200u 2 5ghz
lenovo 14 0 128gb ssd 1tb hdd hdmi thinkpad t470 intel core i7 6700hq 2 6ghz 802 11ac 802 11 ac intel hd graphics 620 new 8gb notebook full hd 1920x1080
________________________________________________________________
full hd 1920x1080 14 0" flagship 802 11 ac thinkpad t470 intel hd graphics 620 8gb home new notebook intel core i5 7200u 2 5ghz lenovo 256gb ssd
notebook lenovo 128gb ssd 1tb hdd 12gb ram dvd intel core i7 6700hq 2 6ghz thinkpad t470 intel hd graphics 620 latest model 14 0 inch usb type c full hd 1920x1080
________________________________________________________________
new full hd 1920x1080 intel hd graphics 620 lenovo notebook flagshi

________________________________________________________________
hp intel core i7 7700hq 2 8ghz fast omen 15 ce006nv gaming 17 3 inch 12gb memory nvidia geforce gtx 1060 full hd 1920x1080 home 1tb hdd
256gb ssd 500gb hdd home gaming new 12gb nvidia geforce gtx 1060 usb 3 1 type c fp reader hp omen 15 ce006nv business amd a6 series a6 9220 2 5ghz full hd 1920x1080 17 3"
________________________________________________________________
gaming 1tb hdd hp nvidia geforce gtx 1060 omen 15 ce006nv full hd 1920x1080 12gb ram 17 3" fast dvd rw intel core i7 7700hq 2 8ghz
full hd 1920x1080 amd a6 series a6 9220 2 5ghz nvidia geforce gtx 1060 17 3 new hp windows 10 gaming omen 15 ce006nv 24gb ram 256gb ssd 500gb hdd
________________________________________________________________
nvidia geforce gtx 1060 home 1tb hdd hp intel i7 7700hq 2 8ghz flagship full hd 1920x1080 omen 15 ce006nv 12gb ram fast 17 3" gaming webcam
18 4 inch hp nvidia geforce gtx 1060 a6 series a6 9220 2 5ghz gaming 256gb ssd 50

thinkpad p51 xeon e3 1535m v6 3 1ghz nvidia quadro m2200m usb type c 15 6" 32gb ram lenovo premium ips panel 4k ultra hd 3840x2160 802 11 ac notebook 1tb ssd
notebook nvidia quadro m2200m 32gb intel 1 2ghz bluetooth dvd rw fast home thinkpad p51 dvd lenovo 1tb ssd 15 6 inch ips panel 4k ultra hd 3840x2160
________________________________________________________________
usb type c ips panel 4k ultra hd 3840x2160 15 6" notebook thinkpad p51 lenovo usb 3 1 type c nvidia quadro m2200m usb 3 0 new home intel xeon e3 1535m v6 3 1ghz 32gb 1tb ssd
intel core 1 2ghz thinkpad p51 premium lenovo ips panel 4k ultra hd 3840x2160 nvidia quadro m2200m 15 6" windows 10 hdmi usb type c 32gb ssd notebook 32gb ram
________________________________________________________________
home hdmi 32gb ram 1tb ssd nvidia quadro m2200m 15 6 inch gbe lan thinkpad p51 lenovo xeon e3 1535m v6 3 1ghz premium webcam notebook ips panel 4k ultra hd 3840x2160
nvidia quadro m2200m usb type c thinkpad p51 notebook intel core

________________________________________________________________
chromebook x360 hp touchscreen 1366x768 11 6 2 1 convertible 802 11 ac fast wifi intel hd graphics 500 intel celeron dual core n3350 1 1ghz 64gb flash storage 8gb
business intel hd graphics 500 intel core i5 7500u 2 7ghz home usb 3 0 premium hp 17 3" 508gb hybrid 64gb ram 2 1 convertible touchscreen 1366x768 gbe lan chromebook x360
________________________________________________________________
11 6 intel celeron dual core n3350 1 1ghz 2 1 convertible intel hd graphics 500 chromebook x360 touchscreen 1366x768 webcam 8gb 64gb flash storage new hp
touchscreen 1366x768 intel hd graphics 500 new 64gb hp 2 1 convertible thinkpad t470 hdmi 17 3" 508gb hybrid intel i5 7500u 2 7ghz windows 10
________________________________________________________________
15 6 802 11ac home intel core i7 8550u 1 8ghz ultrabook 802 11 ac ips panel full hd 1920x1080 intel hd graphics 620 gram 15z975 fast lg 8gb ram usb 3 0 512gb ssd
intel core i5

16gb ssd hdmi 250 g6 2gb memory full hd 1920x1080 premium windows 10 802 11 ac gbe lan intel hd graphics 520 notebook 15 6 inch hp celeron dual n3060 1 60ghz
________________________________________________________________
notebook new dvd rw intel hd graphics 520 gbe lan 250 g6 15 6 hp 256gb ssd 8gb i3 6006u 2ghz full hd 1920x1080
2gb business notebook 12 3" full hd 1920x1080 250 g6 hp celeron dual core n3060 1 60ghz intel hd graphics 520 latest model 16gb ssd
________________________________________________________________
hp 250 g6 notebook business 15 6 full hd 1920x1080 256gb ssd flagship 8gb memory usb 3 0 new 802 11 ac intel core i3 6006u 2ghz intel hd graphics 520
2gb gbe lan 16gb ssd full hd 1920x1080 v310 15ikb 12 3 inch new intel hd graphics 520 hp intel celeron dual core n3060 1 60ghz windows 10 notebook
________________________________________________________________
core i5 7200u 2 5ghz premium notebook 1366x768 14 0 inch usb 3 0 256gb ssd latitude 5480 8gb memory dell in

________________________________________________________________
vostro 5568 intel core i7 7500u 2 7ghz full hd 1920x1080 1tb hdd fast 8gb memory dell notebook usb 3 1 type c nvidia geforce gt 940mx 15 6
nvidia geforce gt 940mx 8gb 15 6 inch dvd rw 802 11ac dell full hd 1920x1080 flagship premium 32gb ssd samsung cortex a72 a53 2 0ghz windows 10 notebook vostro 5568
________________________________________________________________
webcam business 8gb intel core i7 7500u 2 7ghz vostro 5568 full hd 1920x1080 nvidia geforce gt 940mx 15 6" dell notebook dvd rw 1tb hdd latest model
webcam nvidia geforce gt 940mx vostro 5568 fast 32gb ssd 15 6 inch wifi 12gb memory samsung cortex a72 a53 2 0ghz notebook full hd 1920x1080 dell
________________________________________________________________
usb 3 1 type c vostro 5568 15 6" dell 1tb hdd notebook latest model intel core i7 7500u 2 7ghz full hd 1920x1080 8gb memory dvd rw nvidia geforce gt 940mx
webcam dell samsung cortex a72 a53 2 0ghz notebook 

ips panel full hd 1920x1080 intel core i7 6500u 2 5ghz 8gb ram lenovo 1tb hdd ideapad 510 15isk 15 6 notebook new home nvidia geforce 940mx
notebook 12 5 amd a6 series 7310 2ghz lenovo ips panel full hd 1920x1080 1tb hdd 1tb hdd 24gb flagship ideapad 510 15isk webcam nvidia geforce 940mx premium
________________________________________________________________
notebook ideapad 510 15isk ips panel full hd 1920x1080 bluetooth 15 6 inch nvidia geforce 940mx 8gb ram lenovo core i7 6500u 2 5ghz dvd rw 1tb hdd usb 3 1 type c fast
nvidia geforce 940mx premium v110 15iap 1tb hdd 1tb hdd 12 5 inch 24gb memory notebook amd a6 series 7310 2ghz lenovo ips panel full hd 1920x1080 usb type c
________________________________________________________________
notebook 500gb hdd dell pentium quad n3710 1 6ghz bluetooth 4gb ram 15 6" dvd premium inspiron 3552 1366x768 802 11 ac intel hd graphics
intel hd graphics 1366x768 dell fast i5 6260u 1 8ghz 4gb ram bluetooth 15 6 inch windows 10 notebook inspiron 35

dvd rw 4gb memory 802 11 ac 32gb flash storage 11 6 intel hd graphics 400 latest model home intel core i5 2 0ghz acer chromebook c738t c2ej ips panel touchscreen 1366x768 2 1 convertible 802 11ac
________________________________________________________________
intel celeron dual n3060 1 6ghz ips panel touchscreen 1366x768 fast acer chromebook c738t c2ej intel hd graphics 400 2 1 convertible 32gb flash storage windows 10 11 6 4gb memory
ips panel touchscreen 1366x768 usb 3 1 type c 2 1 convertible 4gb ram hdmi 256gb flash storage intel core i5 2 0ghz business latest model acer chromebook c738t c2ej 11 6 intel hd graphics 400
________________________________________________________________
4gb acer 32gb flash storage gbe lan intel celeron dual core n3060 1 6ghz 2 1 convertible wifi 11 6" intel hd graphics 400 chromebook c738t c2ej latest model ips panel touchscreen 1366x768
intel hd graphics 400 latest model dvd rw wifi i5 2 0ghz chromebook c738t c2ej 2gb memory 2 1 convertible 256gb fla

intel hd graphics 520 dvd intel core i5 2 3ghz latest model 12 3 24gb 802 11ac hp 128gb hdd full hd 1920x1080 business elitebook 850 notebook
________________________________________________________________
new full hd 1920x1080 elitebook 850 8gb business usb type c notebook hdmi core i7 6500u 2 5ghz 15 6 intel hd graphics 520 256gb ssd hp
full hd 1920x1080 128gb hdd vostro 5568 new 802 11ac 24gb ram notebook i5 2 3ghz hp intel hd graphics 520 12 3 inch
________________________________________________________________
full hd 1920x1080 webcam 256gb ssd elitebook 820 intel core i7 6500u 2 5ghz intel hd graphics 520 hp 8gb ram home 12 5 gbe lan premium ultrabook bluetooth
ultrabook full hd 1920x1080 elitebook 820 webcam hdmi 802 11 ac fp reader intel pentium dual core 4405u 2 1ghz 8gb ram 256gb ssd 12 5 premium hp intel hd graphics 520
________________________________________________________________
full hd 1920x1080 premium usb 3 1 type c fp reader 256gb ssd ultrabook 8gb memory eliteboo

ips panel full hd 1920x1080 15 6" core i7 6700hq 2 6ghz 802 11 ac gaming fast 16gb rog gl552vw cn470t 128gb ssd 1tb hdd asus nvidia geforce gtx 960m usb 3 1 type c
fp reader asus gaming 802 11 ac usb type c ips panel full hd 1920x1080 15 6 nvidia geforce gtx 960m rog gl552vw cn470t 24gb memory fast intel pentium dual core n4200 1 1ghz flagship 508gb hybrid
________________________________________________________________
gaming intel core i7 6700hq 2 6ghz ips panel full hd 1920x1080 15 6 fast nvidia geforce gtx 960m 128gb ssd 1tb hdd usb type c 802 11ac windows 10 asus rog gl552vw cn470t gbe lan 16gb ram
24gb rog gl552vw cn470t webcam asus premium gaming gbe lan intel pentium dual core n4200 1 1ghz ips panel full hd 1920x1080 fp reader 508gb hybrid nvidia geforce gtx 960m usb 3 0 15 0 inch
________________________________________________________________
usb type c gbe lan 16gb ram gaming asus 15 6 bluetooth dvd nvidia geforce gtx 960m ips panel full hd 1920x1080 premium 128gb ssd 1tb hd

________________________________________________________________
intel hd graphics 500 4gb ram 15 6 acer aspire 3 802 11 ac 500gb hdd notebook latest model 1366x768 dvd intel celeron dual core n3350 2ghz
fast intel hd graphics 500 amd e series 6110 1 5ghz 802 11ac notebook 15 6 4gb bluetooth aspire 3 1366x768 500gb hdd acer
________________________________________________________________
15 6 home 500gb hdd intel celeron dual core n3350 2ghz business aspire 3 notebook 1366x768 4gb memory flagship intel hd graphics 500 new acer
aspire 3 usb 3 0 32gb flash storage usb type c amd e series 6110 1 5ghz 1366x768 premium acer 15 6" intel hd graphics 500 gbe lan fp reader 4gb notebook
________________________________________________________________
15 6" wifi intel hd graphics 500 aspire 3 4gb intel celeron dual core n3350 2ghz latest model home 1366x768 acer 500gb hdd 802 11 ac notebook usb 3 1 type c
32gb flash storage dvd aspire 3 notebook intel hd graphics 500 acer usb 3 0 bluetooth dvd rw

intel core i7 7560u 2 4ghz 2 1 convertible quad hd touchscreen 3200x1800 13 9 inch dell 1tb ssd 1tb hdd 14 am079na 12gb ram premium hdmi intel hd graphics 615
________________________________________________________________
inspiron 3552 intel hd graphics 4gb wifi premium 500gb hdd pentium quad n3710 1 6ghz 1366x768 notebook 15 6 dell
dell 1366x768 dvd hdmi 802 11 ac intel core i7 6700hq 2 6ghz inspiron 3552 15 6 inch intel hd graphics notebook fast 4gb ram 500gb hdd
________________________________________________________________
dell 15 6 intel pentium quad n3710 1 6ghz hdmi inspiron 3552 1366x768 notebook intel hd graphics 4gb ram premium 500gb hdd dvd
intel core i7 6700hq 2 6ghz 15 6 premium dell 1366x768 intel hd graphics usb type c 128gb flash storage hdmi 4gb 802 11ac notebook inspiron 3552
________________________________________________________________
dell 15 6 inspiron 3552 4gb intel pentium quad core n3710 1 6ghz business notebook 500gb hdd dvd flagship fast intel hd graphi

In [32]:
# Creates the postive examples for the laptop data
# The laptop_df is the original data, the new_df is the dataframe to append the new data to
# and the attributes are the attributes to swap or delete for the new data
def create_pos_laptop_data(laptop_df, rm_attrs, add_attrs):
    new_column_names = ['title_one', 'title_two', 'label']
    pos_df = pd.DataFrame(columns = new_column_names)
    for row in range(len(laptop_df)):
        # Remove the attribute from the new title
        for attr_list in rm_attrs:
            # Create a copy of the row for the negative example
            new_row = laptop_df.iloc[row]
            orig_row = laptop_df.iloc[row]
            for attr in attr_list:
                new_row[attr] = ''
        
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(new_row).lower())

            # Occassionally add in the operating system just to switch it up
#             if (random.sample([0, 1], 1)):
#                 for attr in add_attrs:
#                     title_two += ' ' + orig_row[attr].lower()

            pos_df = pos_df.append(pd.DataFrame([[title_one, title_two, 1]], columns=new_column_names))

    return pos_df

In [33]:
pos_df = create_pos_laptop_data(laptop_df, rm_attrs = [['Company'], ['TypeName'], ['ScreenResolution'], ['Product'], ['TypeName', 'ScreenResolution']], add_attrs = [])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [34]:
print_dataframe(pos_df)

apple intel core i5 2 3ghz ips panel retina display 2560x1600 13 3" dvd ultrabook macbook pro wifi 8gb latest model intel iris plus graphics 640 128gb ssd bluetooth
13 3" dvd rw new intel iris plus graphics 640 ultrabook webcam 8gb memory 128gb ssd windows 10 macbook pro intel core i5 2 3ghz ips panel retina display 2560x1600
________________________________________________________________
intel iris plus graphics 640 apple 13 3 inch hdmi ips panel retina display 2560x1600 gbe lan 128gb ssd ultrabook premium 8gb ram i5 2 3ghz macbook pro
new webcam macbook pro intel iris plus graphics 640 business usb 3 1 type c 8gb memory 13 3 inch ips panel retina display 2560x1600 apple intel core i5 2 3ghz 128gb ssd
________________________________________________________________
intel iris plus graphics 640 dvd rw ips panel retina display 2560x1600 intel core i5 2 3ghz 13 3" 128gb ssd bluetooth ultrabook new apple windows 10 8gb ram macbook pro usb 3 1 type c
ultrabook 8gb memory apple new dvd rw 

latest model apple intel core i5 1 3ghz ips panel retina display 2304x1440 hdmi macbook 12" 512gb ssd intel hd graphics 615 8gb memory 12 0 inch
________________________________________________________________
12 0" usb 3 0 apple ips panel retina display 2304x1440 windows 10 intel hd graphics 615 8gb memory 512gb ssd macbook 12" wifi ultrabook usb 3 1 type c intel core i5 1 3ghz new
fast intel hd graphics 615 intel core i5 1 3ghz ultrabook 512gb ssd 12 0" 8gb macbook 12" usb 3 1 type c windows 10 webcam apple
________________________________________________________________
fast business 8gb memory intel core i5 1 3ghz 12 0 ultrabook usb type c 512gb ssd ips panel retina display 2304x1440 macbook 12" apple intel hd graphics 615 dvd
apple ultrabook new ips panel retina display 2304x1440 512gb ssd dvd rw 12 0 8gb ram intel hd graphics 615 intel core i5 1 3ghz
________________________________________________________________
business intel hd graphics 615 8gb ram macbook 12" new usb 3 1 typ

ips panel full hd 1920x1080 fast acer notebook webcam aspire a517 51g usb type c nvidia geforce mx150 windows 10 intel core i5 8250u 1 6ghz 17 3 inch 4gb memory 256gb ssd
notebook premium i5 8250u 1 6ghz acer 256gb ssd 802 11ac aspire a517 51g gbe lan nvidia geforce mx150 4gb ram dvd 17 3 inch
________________________________________________________________
intel i5 8250u 1 6ghz 17 3 802 11ac usb 3 1 type c 256gb ssd fast nvidia geforce mx150 aspire a517 51g hdmi notebook acer 4gb memory ips panel full hd 1920x1080
4gb memory 17 3 webcam bluetooth fast intel core i5 8250u 1 6ghz acer ips panel full hd 1920x1080 nvidia geforce mx150 256gb ssd notebook
________________________________________________________________
17 3" notebook acer nvidia geforce mx150 intel i5 8250u 1 6ghz new 256gb ssd ips panel full hd 1920x1080 4gb ram aspire a517 51g dvd rw
fast nvidia geforce mx150 802 11 ac 17 3 inch 256gb ssd aspire a517 51g 4gb memory intel core i5 8250u 1 6ghz bluetooth acer
_______________

________________________________________________________________
new bluetooth 802 11ac 1tb hdd usb 3 1 type c intel i3 6006u 2ghz intel hd graphics 520 acer 1366x768 notebook aspire a315 51 15 6 inch 4gb home
premium 1tb hdd intel hd graphics 520 15 6 inch 1366x768 flagship aspire a315 51 4gb intel i3 6006u 2ghz acer
________________________________________________________________
intel core i3 6006u 2ghz usb 3 0 15 6 4gb memory gbe lan acer home notebook 1366x768 aspire a315 51 1tb hdd fast intel hd graphics 520
flagship notebook acer 4gb 15 6" intel hd graphics 520 intel core i3 6006u 2ghz aspire a315 51 1tb hdd new
________________________________________________________________
bluetooth 15 6 1tb hdd intel core i3 6006u 2ghz usb 3 0 1366x768 acer intel hd graphics 520 home 4gb memory premium aspire a315 51 notebook
802 11ac 4gb ram hdmi intel core i3 6006u 2ghz notebook 15 6 usb type c usb 3 0 acer intel hd graphics 520 new 1tb hdd 1366x768
________________________________________

________________________________________________________________
hp 4gb memory 250 g6 full hd 1920x1080 fast intel hd graphics 520 1tb hdd intel core i3 6006u 2ghz wifi 15 6 notebook
802 11 ac 250 g6 full hd 1920x1080 latest model dvd rw i3 6006u 2ghz 15 6" intel hd graphics 520 4gb memory 1tb hdd notebook
________________________________________________________________
windows 10 1tb hdd notebook home 4gb ram usb 3 1 type c intel core i3 6006u 2ghz hp full hd 1920x1080 250 g6 bluetooth intel hd graphics 520 premium 15 6
4gb 250 g6 1tb hdd latest model full hd 1920x1080 bluetooth intel hd graphics 520 i3 6006u 2ghz 15 6" hp
________________________________________________________________
1tb hdd 15 6 inch intel hd graphics 520 fp reader premium notebook hdmi 4gb full hd 1920x1080 250 g6 core i3 6006u 2ghz hp
intel core i3 6006u 2ghz intel hd graphics 520 dvd rw notebook 1tb hdd 15 6 inch 250 g6 premium hp 4gb memory
________________________________________________________________
full 

full hd touchscreen 1920x1080 intel hd graphics 620 512gb ssd ultrabook core i7 7500u 2 7ghz fast usb 3 1 type c 16gb wifi 13 3" zenbook flip
________________________________________________________________
asus full hd touchscreen 1920x1080 intel hd graphics 620 16gb ram ultrabook fp reader zenbook flip intel core i7 7500u 2 7ghz premium 512gb ssd 13 3 inch
intel core i7 7500u 2 7ghz zenbook flip latest model 802 11 ac 13 3 home intel hd graphics 620 asus full hd touchscreen 1920x1080 512gb ssd 16gb
________________________________________________________________
asus latest model zenbook flip i7 7500u 2 7ghz 13 3 intel hd graphics 620 16gb ultrabook 512gb ssd usb type c full hd touchscreen 1920x1080
intel core i7 7500u 2 7ghz 13 3 802 11 ac intel hd graphics 620 512gb ssd home fast 16gb zenbook flip usb 3 0 ultrabook asus fp reader
________________________________________________________________
ultrabook 16gb 512gb ssd full hd touchscreen 1920x1080 windows 10 latest model 13 3 zenbo

13 5 inch surface laptop 128gb ssd intel core m3 7y30 2 2ghz ultrabook microsoft flagship 4gb intel hd graphics 615 usb 3 1 type c dvd rw premium touchscreen 2256x1504
128gb ssd 802 11ac ultrabook intel core m3 7y30 2 2ghz 13 5 microsoft usb 3 1 type c surface laptop new intel hd graphics 615 4gb
________________________________________________________________
128gb ssd gbe lan premium surface laptop ultrabook touchscreen 2256x1504 intel hd graphics 615 hdmi 4gb intel core m3 7y30 2 2ghz usb type c dvd rw 13 5 microsoft
premium 128gb ssd microsoft ultrabook 4gb memory dvd touchscreen 2256x1504 intel hd graphics 615 core m3 7y30 2 2ghz 13 5" usb type c
________________________________________________________________
bluetooth usb type c microsoft fast surface laptop intel hd graphics 615 touchscreen 2256x1504 4gb ram 128gb ssd ultrabook intel core m3 7y30 2 2ghz 13 5"
13 5" surface laptop 4gb ram fast intel hd graphics 615 windows 10 128gb ssd hdmi microsoft core m3 7y30 2 2ghz
________

________________________________________________________________
full hd 1920x1080 amd radeon 530 802 11 ac notebook home 15 6 dell 256gb ssd intel core i5 8250u 1 6ghz inspiron 5570 8gb memory new
15 6" 256gb ssd dell amd radeon 530 8gb new full hd 1920x1080 inspiron 5570 intel core i5 8250u 1 6ghz usb 3 1 type c
________________________________________________________________
15 6 inch notebook dell webcam 256gb ssd 8gb ram usb 3 0 inspiron 5570 new full hd 1920x1080 i5 8250u 1 6ghz amd radeon 530
256gb ssd inspiron 5570 usb 3 0 fast 15 6" gbe lan amd radeon 530 8gb hdmi dell notebook intel core i5 8250u 1 6ghz wifi
________________________________________________________________
core i5 8250u 1 6ghz 256gb ssd flagship 8gb memory full hd 1920x1080 inspiron 5570 amd radeon 530 gbe lan premium notebook 15 6" dell bluetooth webcam
256gb ssd full hd 1920x1080 15 6 amd radeon 530 8gb notebook dell intel core i5 8250u 1 6ghz new home
________________________________________________________

14 0 inch swift sf114 31 p5hy 1366x768 acer 4gb usb 3 1 type c pentium quad core n3710 1 6ghz latest model 128gb flash storage intel hd graphics 405 notebook
intel pentium quad core n3710 1 6ghz notebook premium intel hd graphics 405 swift sf114 31 p5hy acer 14 0 128gb flash storage business webcam 4gb
________________________________________________________________
swift sf114 31 p5hy 1366x768 premium acer intel hd graphics 405 home 128gb flash storage notebook 4gb ram pentium quad core n3710 1 6ghz 14 0
14 0" home usb type c notebook fp reader acer new 1366x768 intel hd graphics 405 intel pentium quad core n3710 1 6ghz 128gb flash storage gbe lan 4gb memory
________________________________________________________________
intel pentium quad core n3710 1 6ghz 128gb flash storage business 1366x768 4gb memory notebook usb 3 0 14 0 latest model usb 3 1 type c intel hd graphics 405 swift sf114 31 p5hy acer
dvd usb 3 0 802 11 ac usb type c intel hd graphics 405 128gb flash storage intel pen

intel hd graphics 500 intel celeron dual core n3350 1 1ghz touchscreen 1366x768 chromebook x360 flagship hp usb 3 1 type c 64gb flash storage 11 6 new 8gb dvd
________________________________________________________________
8gb hp chromebook x360 fp reader touchscreen 1366x768 intel hd graphics 500 celeron dual core n3350 1 1ghz 64gb flash storage new 11 6" 2 1 convertible
chromebook x360 11 6 64gb flash storage hp 8gb memory fast intel celeron dual core n3350 1 1ghz 2 1 convertible business intel hd graphics 500
________________________________________________________________
touchscreen 1366x768 11 6 inch 8gb memory 2 1 convertible dvd rw 802 11 ac chromebook x360 premium intel hd graphics 500 hp intel celeron dual core n3350 1 1ghz 64gb flash storage
11 6 64gb flash storage celeron dual n3350 1 1ghz new intel hd graphics 500 hp 2 1 convertible touchscreen 1366x768 8gb windows 10
________________________________________________________________
hp 11 6 inch 8gb ram intel hd graphics 5

________________________________________________________________
usb type c notebook new 128gb ssd 1tb hdd 8gb ram nvidia geforce gtx 1050 ips panel full hd 1920x1080 hp intel i5 7300hq 2 5ghz webcam 15 6 dvd rw 15 cb003na hdmi
128gb ssd 1tb hdd nvidia geforce gtx 1050 gbe lan 8gb intel i5 7300hq 2 5ghz 15 6" hp ips panel full hd 1920x1080 notebook usb 3 0 latest model
________________________________________________________________
usb 3 1 type c 15 6" core i5 7300hq 2 5ghz 128gb ssd 1tb hdd home 15 cb003na webcam 8gb ips panel full hd 1920x1080 fp reader notebook premium nvidia geforce gtx 1050 hp
new 15 6 gbe lan hp 128gb ssd 1tb hdd 15 cb003na intel core i5 7300hq 2 5ghz 8gb nvidia geforce gtx 1050
________________________________________________________________
business zbook 15 nvidia quadro m1000m latest model 15 6" hp intel i7 6700hq 2 6ghz 256gb ssd full hd 1920x1080 8gb ram workstation
full hd 1920x1080 802 11 ac 8gb ram intel core i7 6700hq 2 6ghz workstation nvidia quadro m

dvd new 512gb ssd ultrabook thinkpad x1 16gb ram wifi 14 0 inch ips panel quad hd 2560x1440 intel hd graphics 520 lenovo intel core i7 6600u 2 6ghz
intel hd graphics 520 thinkpad x1 i7 6600u 2 6ghz 16gb memory 14 0 inch ips panel quad hd 2560x1440 new gbe lan ultrabook 512gb ssd 802 11ac
________________________________________________________________
ultrabook 16gb ram intel hd graphics 520 512gb ssd new dvd ips panel quad hd 2560x1440 lenovo core i7 6600u 2 6ghz 14 0 inch thinkpad x1
thinkpad x1 ips panel quad hd 2560x1440 intel hd graphics 520 16gb latest model 512gb ssd core i7 6600u 2 6ghz lenovo 14 0 inch business
________________________________________________________________
thinkpad x1 intel core i7 6600u 2 6ghz lenovo ips panel quad hd 2560x1440 latest model intel hd graphics 520 usb 3 0 gbe lan 16gb ram usb 3 1 type c 512gb ssd wifi ultrabook 14 0 inch
intel hd graphics 520 webcam thinkpad x1 windows 10 802 11ac 16gb 14 0" new ultrabook 512gb ssd lenovo gbe lan core i7 6600

netbook intel i7 7500u 2 7ghz intel hd graphics 620 802 11 ac latest model bluetooth 12 5 inch 8gb elitebook 820 full hd 1920x1080 512gb ssd hp
gbe lan intel core i7 7500u 2 7ghz netbook 12 5 latest model hp hdmi 8gb ram intel hd graphics 620 full hd 1920x1080 usb 3 1 type c 512gb ssd usb 3 0
________________________________________________________________
12 5 inch netbook full hd 1920x1080 intel hd graphics 620 512gb ssd dvd premium intel core i7 7500u 2 7ghz 8gb memory elitebook 820 hp
802 11 ac hp 8gb elitebook 820 intel core i7 7500u 2 7ghz intel hd graphics 620 512gb ssd 12 5" latest model
________________________________________________________________
flagship 16gb flash storage wifi 15 6 inch new intel hd graphics intel celeron dual core 3205u 1 5ghz 4gb memory notebook chromebook cb5 571 c1dz hdmi ips panel full hd 1920x1080 acer
webcam celeron dual 3205u 1 5ghz latest model 16gb flash storage 15 6 inch notebook chromebook cb5 571 c1dz 4gb ram intel hd graphics windows 10 usb

________________________________________________________________
802 11ac notebook premium 15 6" 8gb gbe lan usb type c dell inspiron 5567 1366x768 amd radeon r7 m445 1tb hdd intel core i7 7500u 2 7ghz
webcam 15 6" dell gbe lan amd radeon r7 m445 8gb ram inspiron 5567 intel i7 7500u 2 7ghz latest model 1tb hdd 802 11ac bluetooth
________________________________________________________________
windows 10 latitude 5480 8gb memory 128gb ssd full hd 1920x1080 802 11ac intel core i5 7200u 2 5ghz 14 0" dell intel hd graphics 620 notebook fast
premium 14 0 inch notebook intel hd graphics 620 bluetooth dvd rw 8gb memory dvd 128gb ssd intel core i5 7200u 2 5ghz latitude 5480 full hd 1920x1080
________________________________________________________________
8gb memory latitude 5480 notebook fast full hd 1920x1080 intel hd graphics 620 intel core i5 7200u 2 5ghz dvd business dell 128gb ssd 14 0
windows 10 business intel hd graphics 620 8gb ram 128gb ssd full hd 1920x1080 intel i5 7200u 2 5ghz 14 

new 2tb hdd notebook 16gb memory intel core i7 7500u 2 7ghz home amd radeon r7 m445 inspiron 5567 dvd 15 6 full hd 1920x1080 bluetooth 802 11ac
________________________________________________________________
dell i7 7500u 2 7ghz 15 6 inch 2tb hdd inspiron 5567 premium notebook home gbe lan amd radeon r7 m445 usb 3 0 16gb memory full hd 1920x1080
usb 3 1 type c inspiron 5567 hdmi 16gb ram full hd 1920x1080 intel core i7 7500u 2 7ghz flagship 15 6 inch premium 2tb hdd amd radeon r7 m445 dell
________________________________________________________________
premium gbe lan dell 2tb hdd 16gb intel core i7 7500u 2 7ghz 15 6" amd radeon r7 m445 inspiron 5567 notebook full hd 1920x1080
dell inspiron 5567 amd radeon r7 m445 16gb ram usb 3 1 type c i7 7500u 2 7ghz hdmi usb 3 0 2tb hdd notebook new 15 6" dvd rw
________________________________________________________________
inspiron 5567 dell 2tb hdd 15 6 16gb amd radeon r7 m445 usb type c premium full hd 1920x1080 notebook intel core i7 7500u 

16gb ram notebook ideapad 500 15isk 15 6" amd radeon r7 m360 1 0tb hybrid lenovo full hd 1920x1080 dvd fast intel i7 6500u 2 5ghz
premium lenovo ideapad 500 15isk intel core i7 6500u 2 5ghz 16gb ram 15 6 inch amd radeon r7 m360 1 0tb hybrid hdmi full hd 1920x1080
________________________________________________________________
lenovo dvd rw notebook usb type c 1 0tb hybrid full hd 1920x1080 fast amd radeon r7 m360 ideapad 500 15isk 15 6 inch 16gb memory intel core i7 6500u 2 5ghz
amd radeon r7 m360 usb 3 0 1 0tb hybrid lenovo i7 6500u 2 5ghz bluetooth 16gb ram latest model 15 6 notebook 802 11 ac usb type c ideapad 500 15isk
________________________________________________________________
lenovo 1 0tb hybrid notebook amd radeon r7 m360 flagship 15 6 i7 6500u 2 5ghz new 16gb ram full hd 1920x1080 ideapad 500 15isk
premium 16gb ram intel core i7 6500u 2 5ghz notebook usb 3 1 type c lenovo 15 6 1 0tb hybrid full hd 1920x1080 amd radeon r7 m360
_____________________________________________

15 6 inch 256gb ssd 1tb hdd core i7 7700hq 2 8ghz 802 11ac dvd rog strix 16gb full hd 1920x1080 gaming nvidia geforce gtx 1070 asus premium usb 3 0
asus nvidia geforce gtx 1070 rog strix 256gb ssd 1tb hdd 802 11ac gaming 16gb ram dvd i7 7700hq 2 8ghz fast usb 3 1 type c 15 6
________________________________________________________________
16gb nvidia geforce gtx 1070 256gb ssd 1tb hdd rog strix 802 11 ac home intel i7 7700hq 2 8ghz gbe lan 15 6 fast bluetooth full hd 1920x1080 gaming asus
asus premium full hd 1920x1080 256gb ssd 1tb hdd 16gb 15 6" intel core i7 7700hq 2 8ghz gaming hdmi nvidia geforce gtx 1070
________________________________________________________________
nvidia geforce gtx 1070 15 6 asus fast core i7 7700hq 2 8ghz 16gb 256gb ssd 1tb hdd full hd 1920x1080 gaming rog strix flagship
nvidia geforce gtx 1070 fast intel core i7 7700hq 2 8ghz webcam 256gb ssd 1tb hdd rog strix dvd usb 3 0 16gb memory asus 15 6"
______________________________________________________________

latest model home nvidia geforce 920m x556uj xo044t fp reader intel core i7 6500u 2 5ghz 4gb ram 1366x768 asus notebook 500gb hdd 15 6" webcam
nvidia geforce 920m 4gb memory x556uj xo044t premium bluetooth 15 6 usb 3 1 type c asus 500gb hdd intel core i7 6500u 2 5ghz 1366x768
________________________________________________________________
latest model nvidia geforce 920m wifi 1366x768 notebook intel core i7 6500u 2 5ghz 15 6" 4gb memory asus x556uj xo044t 500gb hdd
15 6" nvidia geforce 920m asus intel core i7 6500u 2 5ghz 4gb memory 500gb hdd latest model x556uj xo044t fp reader notebook
________________________________________________________________
notebook 802 11 ac 4gb ram nvidia geforce 920m 1366x768 500gb hdd core i7 6500u 2 5ghz x556uj xo044t latest model asus 15 6 inch
asus intel i7 6500u 2 5ghz 4gb memory notebook new 15 6 500gb hdd 1366x768 usb 3 0 wifi nvidia geforce 920m
________________________________________________________________
notebook 500gb hdd 1366x768 core i7 6

In [38]:
final_laptop_df = create_final_data(pos_df, neg_df)

In [39]:
final_laptop_df = final_laptop_df.sample(frac=1)

## PCPartPicker Data
* Organize the data
* Preprocess the data
* Create negative and positive data

In [40]:
ram_df = pd.read_csv('data/train/pos_ram_titles.csv')
cpu_df = pd.read_csv('data/train/pos_cpu_titles.csv')
hard_drive_df = pd.read_csv('data/train/pos_hard_drive_titles.csv')

In [13]:
ram_df

Unnamed: 0.1,Unnamed: 0,amazon,bestbuy,newegg,walmart,memoryc
0,0,Corsair Vengeance LPX 16GB (2x8GB) DDR4 DRAM 3...,CORSAIR - Vengeance LPX 16GB (2PK x 8GB) 3.2 G...,CORSAIR Vengeance LPX 16GB (2 x 8GB) 288-Pin D...,Corsair CMK16GX4M2B3200C16 Vengeance LPX 16GB ...,16GB Corsair Vengeance LPX PC4-25600 3200MHz D...
1,0,Corsair Vengeance RGB PRO 16GB (2x8GB) DDR4 32...,CORSAIR - Vengeance RGB PRO 16GB (2PK 8GB) 3.2...,CORSAIR Vengeance RGB Pro 16GB (2 x 8GB) 288-P...,,16GB Corsair Vengeance RGB Pro DDR4 3200MHz CL...
2,0,G.Skill RipJaws V Series 16GB (2 x 8GB) 288-Pi...,,G.SKILL Ripjaws V Series 16GB (2 x 8GB) 288-Pi...,,
3,0,Corsair Vengeance RGB Pro 32GB (2x16GB) DDR4 3...,CORSAIR - Vengeance RGB PRO 32GB (2PK 16GB) 3....,CORSAIR Vengeance RGB Pro 32GB (2 x 16GB) 288-...,,32GB Corsair Vengeance Pro RGB DDR4 3200MHz CL...
4,0,,,G.SKILL Trident Z RGB (For AMD) 16GB (2 x 8GB)...,,16GB G.Skill DDR4 TridentZ RGB 3600Mhz PC4-288...
...,...,...,...,...,...,...
218,0,Corsair Vengeance LPX 32GB (4x8GB) DDR4 3600 (...,,CORSAIR Vengeance LPX 32GB (4 x 8GB) 288-Pin D...,,32GB Corsair Vengeance LPX DDR4 3600MHz PC4-28...
219,0,,,,,
220,0,Corsair Vengeance LPX 16GB (2x8GB) DDR4 DRAM 3...,CORSAIR - VENGEANCE LPX Series 16GB (2PK 8GB) ...,CORSAIR Vengeance LPX 16GB (2 x 8GB) 288-Pin D...,,16GB Corsair Vengeance LPX DDR4 3000MHz PC4-24...
221,0,CORSAIR VENGEANCELPX32GB (1x 32GB) DDR43000(PC...,,CORSAIR Vengeance LPX 32GB 288-Pin DDR4 SDRAM ...,,32GB Corsair Vengeance LPX DDR4 3000MHz CL16 M...


In [14]:
cpu_df

Unnamed: 0.1,Unnamed: 0,amazon,bestbuy,newegg,walmart,memoryc,bhphotovideo
0,0,"AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked De...",AMD - Ryzen 5 3600 3rd Generation 6-Core - 12-...,AMD RYZEN 5 3600 6-Core 3.6 GHz (4.2 GHz Max B...,"AMD Ryzen 5 3600 6-Core, 12-Thread 4.2 GHz AM4...",AMD Ryzen 5 3600 AM4 3.6GHZ 32MB CPU Desktop P...,AMD Ryzen 5 3600 3.6 GHz Six-Core AM4 Processor
1,0,"AMD Ryzen 7 3700X 8-Core, 16-Thread Unlocked D...",AMD - Ryzen 7 3700X 3rd Generation 8-Core - 16...,AMD RYZEN 7 3700X 8-Core 3.6 GHz (4.4 GHz Max ...,"AMD Ryzen 7 3700X 8-Core, 16-Thread 4.4 GHz AM...",AMD Ryzen 7 3700x 3.6GHz 32MB AM4 CPU Desktop ...,AMD Ryzen 7 3700X 3.6 GHz Eight-Core AM4 Proce...
2,0,AMD Ryzen 5 2600 Processor with Wraith Stealth...,,,,AMD Ryzen 5 2600 Six-Core 3.4GHz Socket AM4 19...,
3,0,"AMD Ryzen 9 3900X 12-core, 24-thread unlocked ...",AMD - Ryzen 9 3900X 3rd Generation 12-core - 2...,AMD RYZEN 9 3900X 12-Core 3.8 GHz (4.6 GHz Max...,AMD RYZEN 9 3900X 12-Core 3.8 GHz (4.6 GHz Max...,AMD Ryzen 9 3900X 3.8GHz 64MB Desktop Processo...,AMD Ryzen 9 3900X 3.8 GHz 12-Core AM4 Processor
4,0,AMD Ryzen 3 3200G 4-Core Unlocked Desktop Proc...,AMD - Ryzen 3 3200G 3rd Generation 4-Core - 4-...,AMD RYZEN 3 3200G 4-Core 3.6 GHz (4.0 GHz Max ...,,AMD Ryzen 3 AM4 3.6GHZ 4MB Desktop Processor B...,
...,...,...,...,...,...,...,...
499,0,,,,,,
500,0,Intel Xeon E3-1220 V6 Processors BX80677E31220V6,,Intel Xeon E3-1220 V6 Kaby Lake 3.0 GHz (3.5 G...,XEON E3-1220 V6 FC-LGA14C 3G 8MB CACHE BOXED,Intel Xeon E3-1220 V6 3GHz Kaby Lake CPU LGA11...,
501,0,Intel - BX80684E2134 - Intel Xeon E-2134-3.5 G...,,,Intel BX80684E2134 Xeon Quad-core E-2134 3.5GH...,,
502,0,"Intel BX80662E31230V5 XEON E3-1230V5, 3.4 GHZ,...",,,,,


In [7]:
hard_drive_df

Unnamed: 0.1,Unnamed: 0,amazon,bestbuy,newegg,walmart,memoryc,bhphotovideo
0,0,"Seagate Barracuda ST2000DM008 2 TB 3.5"" Intern...",,Seagate BarraCuda ST2000DM008 2TB 7200 RPM 256...,Seagate ST2000DM008 BarraCuda 2TB 3.5 SATA HDD...,2TB Seagate Barracuda Serial ATA III 3.5-inch ...,
1,0,Samsung (MZ-V7E500BW) 970 EVO SSD 500GB - M.2...,Samsung - 970 EVO 500GB Internal PCI Express 3...,"SAMSUNG 970 EVO M.2 2280 500GB PCIe Gen3. X4, ...",SAMSUNG 970 EVO Series - 500GB PCIe NVMe - M.2...,,Samsung 500GB 970 EVO NVMe M.2 Internal SSD
2,0,Samsung (MZ-V7E1T0BW) 970 EVO SSD 1TB - M.2 NV...,Samsung - 970 EVO 1TB Internal PCI Express 3.0...,"SAMSUNG 970 EVO M.2 2280 1TB PCIe Gen3. X4, NV...",,,Samsung 1TB 970 EVO NVMe M.2 Internal SSD
3,0,"WD Blue 1TB PC Hard Drive - 7200 RPM Class, SA...",WD - Blue 1TB Internal SATA Hard Drive for Des...,WD Blue 1TB Desktop Hard Disk Drive - 7200 RPM...,,1TB Western Digital Blue 3.5-inch SATA III 6Gb...,
4,0,"Crucial P1 1TB 3D NAND NVMe PCIe Internal SSD,...",,"Crucial P1 1TB 3D NAND NVMe PCIe Internal SSD,...",,1TB Crucial P1 M.2 2280 PCI Express 3.0 x 4 So...,Crucial 1TB P1 NVMe M.2 2280 Internal SSD
...,...,...,...,...,...,...,...
317,0,,,,,,
318,0,,XPG - Ultimate Series SU800 2TB Internal SATA ...,,,,ADATA Technology 2TB Ultimate SU800 SATA III 2...
319,0,,,,,,
320,0,,SanDisk - Ultra 2TB Internal SATA Solid State ...,,"SanDisk Ultra 2TB 2.5"" SATA Internal Solid Sta...",2TB SanDisk Ultra 3D Serial ATA III 6GB 2.5-in...,"SanDisk 2TB 3D SATA III 2.5"" Internal SSD"


In [41]:
# Drop the Unnamed: 0 column and drop any row where it is all NaN
def remove_misc(df):
    columns = list(df.columns)[1:]
    df = df.drop(columns=['Unnamed: 0'])
    df = df.dropna(how='all')
    print(len(df))
    return df


In [42]:
ram_df = remove_misc(ram_df)
cpu_df = remove_misc(cpu_df)
hard_drive_df = remove_misc(hard_drive_df)

210
315
233


In [43]:
def generate_pos_pcpartpicker_data(df):
    columns = list(df.columns)
    pos_df = pd.DataFrame(columns=['title_one', 'title_two', 'label'])
    for idx in range(len(df)):
        row = df.iloc()[idx]
        titles = []
        for col in columns:
            if not pd.isnull(row[col]): titles.append(row[col])
        if len(titles) > 1:
            combs = combinations(titles, 2)
            for comb in combs:
                comb = list(comb)
                comb.append(1)
                pos_df = pos_df.append(pd.DataFrame([comb], columns=['title_one', 'title_two', 'label']))
    
    return pos_df


In [44]:
pos_ram_data = generate_pos_pcpartpicker_data(ram_df)

pos_cpu_data = generate_pos_pcpartpicker_data(cpu_df)

pos_hard_drive_data = generate_pos_pcpartpicker_data(hard_drive_df)


In [45]:
def generate_neg_pcpartpicker_data(df):
    columns = list(df.columns)
    neg_df = pd.DataFrame(columns=['title_one', 'title_two', 'label'])
    df_list = df.iloc()
    for idx in range(len(df)):
        row = df_list[idx]
        for col in columns:
            if not pd.isnull(row[col]):
                neg_idx = None
                while neg_idx == idx or neg_idx is None:
                    neg_idx = random.randint(0, len(df) - 1)
                
                neg_title = None
                while neg_title == None or pd.isnull(neg_title):
                    neg_title = df_list[neg_idx][random.choice(columns)]
                
                neg_df = neg_df.append(pd.DataFrame([[row[col], neg_title, 0]], columns=['title_one', 'title_two', 'label']))
    
    return neg_df

In [46]:
neg_ram_data = generate_neg_pcpartpicker_data(ram_df)

neg_cpu_data = generate_neg_pcpartpicker_data(cpu_df)

neg_hard_drive_data = generate_neg_pcpartpicker_data(hard_drive_df)

final_ram_data = create_final_data(pos_ram_data, neg_ram_data)

final_cpu_data = create_final_data(pos_cpu_data, neg_cpu_data)

final_hard_drive_data = create_final_data(pos_hard_drive_data, neg_hard_drive_data)

print(len(final_cpu_data), len(final_ram_data), len(final_hard_drive_data))

962 696 1010


## Embeddings Creation Functions
Generates the embeddings and saves them

In [47]:
"""
Create the numpy files of all the training embedddings
We will have two numpy files:
1. The training/validation/test sets
2. The labels
"""

def create_embeddings(df):
    # Create the numpy arrays for storing the embeddings and labels
    total_embeddings = np.zeros(shape=(len(df), 2, MAX_LEN, EMBEDDING_SHAPE[0]))
    labels = np.zeros(shape=(len(df)))
    
    # I know this is a terrible way of doing this, but iterate over the dataframe
    # and generate the embeddings to add to the numpy array
    for idx, row in enumerate(df.itertuples()):
        for word_idx, word in enumerate(row.title_one.split()):
            total_embeddings[idx, 0, word_idx] = fasttext_model[word]
            
        for word_idx, word in enumerate(row.title_two.split()):
            total_embeddings[idx, 1, word_idx] = fasttext_model[word]
            
        labels[idx] = row.label
        
    return total_embeddings, labels


In [48]:
def save_embeddings(df, embeddings_name, labels_name):
    """
    Saves the embeddings given the embeddings file name and labels file name
    """
    if not os.path.exists('data/numpy_data/' + embeddings_name + '.npy'):
        embeddings, labels = create_embeddings(df)
        with open('data/numpy_data/' + embeddings_name + '.npy', 'wb') as f:
            np.save(f, embeddings)

        with open('data/numpy_data/' + labels_name + '.npy', 'wb') as f:
            np.save(f, labels)

In [49]:
def load_embeddings_and_labels(embeddings_name, labels_name):
    loaded_embeddings = None
    labels = None
    with open('data/numpy_data/' + embeddings_name + '.npy', 'rb') as f:
        loaded_embeddings = np.load(f)
        loaded_embeddings = np.transpose(loaded_embeddings, (1, 0, 2, 3))
    
    with open('data/numpy_data/' + labels_name + '.npy', 'rb') as f:
        labels = np.load(f)
    
    return loaded_embeddings, labels

## Saving and Loading Embeddings
Save the embeddings for the different types of data we have

In [53]:
# Concatenate everything
total_data = pd.concat([final_computer_df, final_laptop_df, final_ram_data, final_cpu_data, final_hard_drive_data])
total_data = total_data.sample(frac=1)
save_embeddings(total_data, 'all_embeddings', 'all_labels')

In [None]:
embeddings, labels = load_embeddings_and_labels('all_embeddings', 'all_labels')

In [54]:
len(embeddings[0,:])

NameError: name 'embeddings' is not defined

In [115]:
total_data

Unnamed: 0,title_one,title_two,label
16663,acer aspire es1 132 p194 business notebook 331...,acer aspire es1 132 p194 business notebook len...,1
0,lenovo ideapad 310 15ikb notebook 15 6 inch fu...,lenovo ideapad 310 15ikb 15 6 inch intel core ...,1
0,hp 250 g6 ultrabook 15 6 inch full hd 1920x108...,hp 250 g6 ultrabook 15 6 inch full hd 1920x108...,0
3086,corsair vengeance led 16gb 2x8gb ddr4pc4 21300...,corsair vengeance red led 16gb 2x8gb ddr4 pc4 ...,1
15990,kingston datatraveler 100 g3 32 gb usb 3 0 dt1...,usb datatraveler 100 g3 3 0 stick 32 gb,1
...,...,...,...
11649,seagate laptop sshd 1 tb internal st1000lm014 ...,wd green wds240g1g0a ssd 240 go sata 6gb garan...,0
0,lenovo ideapad 320 17isk notebook 17 3 inch 16...,lenovo notebook 17 3 inch 1600x900 intel core ...,1
15592,sandisk extreme microsdhc 64gb type 10 acheter...,sandisk extreme microsdhc 64gb type 10 kopen e...,1
10730,dg0146famwl hp 146 gb 6g 10k 2 5 dp sas new pa...,dg0146famwl hp 146 gb 6g 10k 2 5 dp sas hdd ne...,1


In [116]:
X_train1 = embeddings[0, :len(labels) - 4000]
X_train2 = embeddings[1, :len(labels) - 4000]
X_train = np.stack((X_train1, X_train2))
print('Training shape: ' + str(X_train.shape))

X_val1 = embeddings[0, len(labels) - 4000:len(labels) - 2000]
X_val2 = embeddings[1, len(labels) - 4000:len(labels) - 2000]
X_val = np.stack((X_val1, X_val2))
print('Val shape: ' + str(X_val.shape))

X_test1 = embeddings[0, len(labels) - 2000:]
X_test2 = embeddings[1, len(labels) - 2000:]
X_test = np.stack((X_test1, X_test2))
print('Test shape: ' + str(X_test.shape))

Training shape: (2, 28410, 42, 300)
Val shape: (2, 2000, 42, 300)
Test shape: (2, 2000, 42, 300)


In [117]:
Y_train = labels[:len(labels) - 4000]
print('Training labels shape:', str(Y_train.shape))

Y_val = labels[len(labels) - 4000:len(labels) - 2000]
print('Val shape:', str(Y_val.shape))

Y_test = labels[len(labels) - 2000:]
print('Test shape:', str(Y_test.shape))

Training labels shape: (28410,)
Val shape: (2000,)
Test shape: (2000,)


In [118]:
def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

In [119]:
Y_train = convert_to_one_hot(Y_train.astype(np.int32), 2)
Y_val = convert_to_one_hot(Y_val.astype(np.int32), 2)
Y_test = convert_to_one_hot(Y_test.astype(np.int32), 2)

In [135]:
Y_train

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

## Model Info

For the model, we are going to use LSTMs with a Constrastive Loss Function 
that will also be used to predict whether the two products are the same 

First, we have to convert the titles to embeddings through FastText before feeding into the LSTM.
The embedding part of this model will not be a layer because:
* The fasttext model would be time consuming and annoying to get to work with an embedding layer in Keras
* The fasttext model is not going to be getting its embeddings optimized, so there is really no point in adding it as an embedding layer

In [7]:
def square_distance(vectors):
    x, y = vectors
    return tf.square(x - y)

def euclidean_dist_out_shape(shapes):
    # Both inputs are fed in, so just use one of them and get the first value in the shape
    shape1, shape2 = shapes
    return (shape1[0],)

def siamese_network(input_shape):
    # Defines our inputs
    left_title = Input(input_shape, dtype='float32')
    right_title = Input(input_shape, dtype='float32')
    
    # The LSTM units
    model = tf.keras.Sequential(name='siamese_model')
    model.add(LSTM(units=256, return_sequences=True, name='lstm_1'))
    model.add(Dropout(rate=0.5))
    model.add(LSTM(units=128, return_sequences=True, name='lstm_2'))
    model.add(Dropout(rate=0.5))
    model.add(LSTM(units=128, name='lstm_3'))
    model.add(Dropout(rate=0.6))
    
    # The dense layers
    model.add(Dense(units=1024, activation='elu', name='dense_1'))
    model.add(Dropout(rate=0.6))
    model.add(Dense(units=512, activation='elu', name='dense_2'))
    
    # Forward propagate through the model to generate the encodings
    encoded_left_title = model(left_title)
    encoded_right_title = model(right_title)

    SquareDistanceLayer = Lambda(square_distance)
    distance = SquareDistanceLayer([encoded_left_title, encoded_right_title])
    
    prediction = Dense(units=2, activation='softmax')(distance)
    # Create and return the network
    siamese_net = tf.keras.Model(inputs=[left_title, right_title], outputs=prediction, name='siamese_network')
    return siamese_net

In [121]:
# Note: for the constrastive loss, because 0 denotes that they are from the same class
# and one denotes they are from a different class, I swaped the (Y) and (1 - Y) terms

def constrastive_loss(y_true, y_pred):
    margin = 2.0
    d = y_pred
    d_sqrt = tf.sqrt(d)
    #tf.print('\nY Pred: ', d, 'Shape: ', tf.shape(d))
    #tf.print('\nY True: ', y_true, 'Shape: ', tf.shape(y_true))
    
    loss = (y_true * d) + ((1 - y_true) * tf.square(tf.maximum(0., margin - d_sqrt)))
    
    #tf.print('\n Constrastive Loss: ', loss, 'Shape: ', tf.shape(loss))
    loss = 0.5 * tf.reduce_mean(loss)
    
    return loss

In [122]:
# Accuracy metric for constrastive loss because values close to 0 are equal and values high are different
# 0.5 is the threshold here
def constrastive_accuracy(y_true, y_pred):
    return tf.reduce_mean(tf.cast(tf.equal(y_true, tf.cast(y_pred < 0.5, y_true.dtype)), y_true.dtype))

In [123]:
def save_model(model, name):
    """
    Saves a model with a particular name
    """
    model.save('models/' + name + '.h5')

In [8]:
model = siamese_network((MAX_LEN, EMBEDDING_SHAPE[0],))
model.summary()

Model: "siamese_network"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 43, 300)]    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 43, 300)]    0                                            
__________________________________________________________________________________________________
siamese_model (Sequential)      (None, 512)          1555968     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 512)          0           siamese_model[0][0]

In [128]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])

In [129]:
# Train the model
model.fit(x=[X_train1, X_train2], y=Y_train, batch_size=128, epochs=80, validation_data=([X_val[0], X_val[1]], Y_val))

Train on 28410 samples, validate on 2000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1c2d9bc9448>

In [130]:
# Test the model
results = model.evaluate([X_test1, X_test2], Y_test, batch_size=16)
print('test loss, test acc: ', results)

test loss, test acc:  [0.3420196931362152, 0.887]


In [9]:
# Set the model's name
model_name = '0.2_Softmax-LSTM-128_batch_80_epochs'

In [None]:
# Save the model
save_model(model, model_name)

## Manual Testing
Converts titles into embeddings arrays and allow the model to make a prediction

In [10]:
model.load_weights('models/' + model_name + '.h5')

In [135]:
title_one = 'ultrabook intel hd graphics 620 dell dvd rw full hd 1920x1080 8gb ram webcam latitude 7480 hdmi premium 14 0 256gb ssd intel core i7 7600u 2 8ghz'
title_two = 'fp reader 256gb ssd usb type c latitude 7480 intel core i7 7600u 2 8ghz ultrabook 14 0 premium dell intel hd graphics 620 home 8gb ram'
#title_one = 'Corsair 16GB ram'
#title_two = 'G Skill 32GB ram'
title_one_arr = np.zeros((1, MAX_LEN, 300))
title_two_arr = np.zeros((1, MAX_LEN, 300))
title_one = remove_stop_words(title_one.lower())
title_two = remove_stop_words(title_two.lower())

for idx, word in enumerate(title_one.split(' ')):
    title_one_arr[0, idx] = fasttext_model[word]
    
for idx, word in enumerate(title_two.split(' ')):
    title_two_arr[0, idx] = fasttext_model[word]

In [136]:
model.predict([title_one_arr, title_two_arr])

array([[9.9936146e-01, 6.3860090e-04]], dtype=float32)