In [1]:
import fasttext
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import os
import random
from itertools import combinations

import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout, Lambda, Concatenate

# Have to download the stopwords
# nltk.download('stopwords')

In [2]:
# Get the fasttext model (we are using the largest one they offer [600B tokens])
fasttext_model = fasttext.load_model('models/crawl-300d-2M-subword.bin')



## General Useful Function
Functions that are continually used throughout this project

In [3]:
"""
Definitions of some sizes in the training set
"""
MAX_LEN = 43
EMBEDDING_SHAPE = (300,)
print('MAX_LEN: ' + str(MAX_LEN), 'EMBEDDING_SHAPE: ' + str(EMBEDDING_SHAPE))

MAX_LEN: 43 EMBEDDING_SHAPE: (300,)


In [4]:
def get_max_len(df):
    max_len = 0
    for row in df.itertuples():
        if len(row.title_one.split(' ')) > max_len:
            max_len = len(row.title_one.split(' '))
            
        if len(row.title_two.split(' ')) > max_len:
            max_len = len(row.title_two.split(' '))
    
    return max_len

In [5]:
def print_dataframe(df):
    for idx in range(len(df)):
        print(df.iloc[idx].title_one + '\n' + df.iloc[idx].title_two)
        print('________________________________________________________________')

In [47]:
def create_final_data(pos_df, neg_df):
    pos_df.sample(frac=1)
    neg_df.sample(frac=1)
    final_df = pd.concat([pos_df[:min(len(pos_df), len(neg_df))], neg_df[:min(len(pos_df), len(neg_df))]])
    final_df = final_df.sample(frac=1)
    return final_df

In [6]:
def remove_stop_words(phrase):
    # Creates the stopwords
    to_stop = stopwords.words('english')
    punctuation = "!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~ "
    for c in punctuation:
        to_stop.append(c)

    to_stop.append('null')
    
    for punc in punctuation:
        phrase = phrase.replace(punc, ' ')
    
    return ' '.join((' '.join([x for x in phrase.split(' ') if x not in to_stop])).split())


## Data Processsing and Organization
Here, all we really want to do is prepare the data for training. This is **only** the data from **Gold Standard** This includes:
* Simplifying the original data
* Normalizing the data 
* Balancing the positive and negative examples
* Creating the embedding representations that will actually get fed into the neural network

In [32]:
# Organizing and normalizing the data
"""
Essentially, we want to only have three attributes for each training example: title_one, title_two, label
For normalization, we are just going to use the nltk stopwords and punctuation
"""

def preprocessing(orig_data):
    """
    Normalizes the data by getting rid of stopwords and punctuation
    """
    
    # The new names of the columns
    column_names = ['title_one', 'title_two', 'label']
    # A new dataframe for the data we are going to be creating
    norm_computers = pd.DataFrame(columns = column_names)
    # Iterate over the original dataframe (I know it is slow and there are probably better ways to do it)
    for row in orig_data.itertuples():
        title_left = remove_stop_words(row.title_left)
        title_right = remove_stop_words(row.title_right)
        
        # Append the newly created row (title_left, title_right, label) to the new dataframe
        norm_computers = norm_computers.append(pd.DataFrame([[title_left, title_right, row.label]], columns=column_names))
        
    return norm_computers
        

In [33]:
def create_simple_data():
    """
    Creates and saves a simpler version of the original data that only contains the the two titles and the label.
    """
    
    # Get the dataset of computer parts
    computers_df = pd.read_json('data/train/computers_train_xlarge_normalized.json.gz',compression='gzip', lines=True)
    norm_computers = preprocessing(computers_df)
    
    # Save the new normalized and simplified data to a CSV file to load later
    norm_computers.to_csv('data/train/computers_train_xlarge_norm_simple.csv', index=False)

In [34]:
# Create and save the data if the simple and normalized data does not exist
if not os.path.exists('data/train/computers_train_xlarge_norm_simple.csv'):
    create_simple_data()

In [None]:
# Load the data
computer_df = pd.read_csv('data/train/computers_train_xlarge_norm_simple.csv')

In [None]:
# See some of the data. There is clearly a separation between the positive and negative examples
computer_df

In [64]:
def create_train_df(df):
    """
    Returns a shuffled dataframe with an equal amount of positive and negative examples
    """
    # Get the positive and negative examples
    pos_df = df.loc[df['label'] == 1]
    neg_df = df.loc[df['label'] == 0]
    
    # Shuffle the data
    pos_df = pos_df.sample(frac=1)
    neg_df = neg_df.sample(frac=1)
    
    # Concatenate the positive and negative examples and 
    # make sure there are only as many negative examples as positive examples
    final_df = pd.concat([pos_df[:min(len(pos_df), len(neg_df))], neg_df[:min(len(pos_df), len(neg_df))]])
    
    # Shuffle the final data once again
    final_df.sample(frac=1)
    return final_df

In [36]:
# Create and save the dataframe with equal numbers of positive and negative examples
# and is shuffled
if not os.path.exists('data/train/computers_train_bal_shuffle.csv'):
    create_train_df(computer_df).to_csv('data/train/computers_train_bal_shuffle.csv', index=False)

In [31]:
final_computer_df = pd.read_csv('data/train/computers_train_bal_shuffle.csv')

In [76]:
final_computer_df

Unnamed: 0,title_one,title_two,label
0,corsair carbide air 240 windowed,corsair carbide series air 240 cube micro atx ...,1
1,a8 7670k black edition quad core amd cpu fan h...,amd a8 7650k 3 3ghz pccomponentes,1
2,amazonbasics 13 3 inch laptop sleeve black acc...,amazonbasics 13 3 inch laptop sleeve black car...,1
3,eg0146fartr hp 146 gb 6g 10k 2 5 dp sas hdd ne...,eg0146fartr hp 146 gb 6g 10k 2 5 dp sas hdd,1
4,usb 3 0 external adapter cable 2 5 inch hard d...,transcend ssd370 solid state drive ssd 2 5 sat...,0
...,...,...,...
19375,356816 001 ml350t g4p xeon 3 2 2mb 512mb whole...,409159 b21 hp xeon e5345 2 33ghz dl160 g3 new ...,0
19376,buy online samsung 750 evo series 120gb ssd mz...,ssd 750 basic 120 gb tradineur com,1
19377,628061 s21 hp g8 g9 3 tb 6g 7 2k 5 sata sc new...,628061 s21 hp g8 g9 3 tb 6g 7 2k 5 sata sc new...,1
19378,buy online zotac gtx 1060 6gb amp edition grap...,msi nvidia geforce gtx 1080 8gb gaming x rgb g...,0


## Laptop Data Preprocessing
* Normalize the data
* Create negative examples that represent when only a couple of attributes of the laptop data changes

In [75]:
# Load the laptop data
laptop_df = pd.read_csv('data/train/laptops.csv', encoding='latin-1')

In [76]:
laptop_df

Unnamed: 0.1,Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.00
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1316,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,638.00
1299,1317,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,1499.00
1300,1318,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,229.00
1301,1319,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,764.00


In [77]:
# This class will be used in order to exchange the different attributes
# to create negative examples
class Attributes():
    company = {'Apple'}
    product = {'MacBook Pro'}
    inches = {'13.3'}
    cpu = {'Intel Core i5 2.3GHz'}
    ram = {'4GB'}
    memory = {'256GB SSD'}
    gpu = {'Intel HD Graphics 520'}
    screen = {'1440x900'}
    
    def get_all_data():
        return {
            'company': Attributes.company,
            'product': Attributes.product,
            'inches': Attributes.inches,
            'cpu': Attributes.cpu,
            'ram': Attributes.ram,
            'memory': Attributes.memory,
            'gpu': Attributes.gpu,
            'screen': Attributes.screen
        }

In [78]:
# Create attribute sets
def create_attribute_sets(df):
    Attributes.company.update([row.Company for row in laptop_df[['Company']].itertuples()])
    Attributes.product.update([row.Product for row in laptop_df[['Product']].itertuples()])
    Attributes.inches.update([str(row.Inches) for row in laptop_df[['Inches']].itertuples()])
    Attributes.cpu.update([row.Cpu for row in laptop_df[['Cpu']].itertuples()])
    Attributes.ram.update([row.Ram for row in laptop_df[['Ram']].itertuples()])
    Attributes.memory.update([row.Memory for row in laptop_df[['Memory']].itertuples()])
    Attributes.gpu.update([row.Gpu for row in laptop_df[['Gpu']].itertuples()])
    Attributes.screen.update([row.ScreenResolution for row in laptop_df[['ScreenResolution']].itertuples()])

create_attribute_sets(laptop_df)

In [79]:
def concatenate_row(row):
    # Note: got rid of everything after the '(' because it has info about the actual specs of the laptop
    # so if we change the specs, we need to fix that too
    
    # Special tags at the end of the amount of inches of the laptop and the RAM to simulate real data
    inch_attr = str(row['Inches']) + random.choice([' inch', '', '"'])
    ram_attr = row['Ram'] + random.choice([' ram', ' memory', ''])
    
    # These are words that commonly come up with laptops
    modifiers = ['premium', 'new', 'fast', 'latest model']
    add_ins = ['USB 3.0', 'USB 3.1 Type-C', 'USB Type-C', 'Bluetooth', 'WIFI', 'Webcam', 'FP Reader',
               'HDMI', '802.11ac', '802.11 ac', 'home', 'flagship', 'business', 'GbE LAN', 'DVD-RW', 'DVD']
    
    cpu_attr = row['Cpu']
    if random.choice([0, 1]):
        cpu_attr = cpu_attr.split(' ')
        if random.choice([0, 1]):
            if 'Intel' in cpu_attr:
                cpu_attr.remove('Intel')
        if random.choice([0, 1]):
            if 'Core' in cpu_attr:
                cpu_attr.remove('Core')
        if random.choice([0, 1]):
            if 'AMD' in cpu_attr:
                cpu_attr.remove('AMD')
    
        cpu_attr = ' '.join(cpu_attr)

    # Create a list for all the product attributes
    order_attrs = [random.choice(modifiers),
                   row['Company'],
                   row['Product'].split('(')[0],
                   row['TypeName'],
                   inch_attr,
                   row['ScreenResolution'],
                   cpu_attr,
                   ram_attr,
                   row['Memory'],
                   row['Gpu']]
    
    order_attrs = order_attrs + random.sample(add_ins, 3)
    
    # Shuffle the data because in real data, it does not really matter what order the attributes are in
    random.shuffle(order_attrs)
    
    return ' '.join(order_attrs)

In [80]:
# Creates the negative examples for the laptop data
# The laptop_df is the original data, the new_df is the dataframe to append the new data to
# and the attributes are the attributes to swap for the new data
def create_neg_laptop_data(laptop_df, attributes):
    new_column_names = ['title_one', 'title_two', 'label']
    negative_df = pd.DataFrame(columns = new_column_names)
    for row in range(len(laptop_df)):
        # Create a copy of the row for the negative example
        neg_row = laptop_df.iloc[row]
        for attribute_class in attributes:
            # Get the row in the laptop_data
            orig_row = laptop_df.iloc[row]
            
            # Get the attribute that we are trying to change
            attribute_val = orig_row[attribute_class]
            
            # Temporarily value for the new value
            new_val = attribute_val
            
            # Make sure we really get a new attribute
            while new_val == attribute_val:
                new_val = random.sample(Attributes.get_all_data()[attribute_class.lower()], 1)[0]
            
            # Change the value in the neg_row to the new value
            neg_row[attribute_class] = new_val
            
            # Concatenate and normalize the data
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(neg_row).lower())
            
            # Append the data to the new df
            negative_df = negative_df.append(pd.DataFrame([[title_one, title_two, 0]], columns=new_column_names))
    
    return negative_df

In [81]:
neg_df = create_neg_laptop_data(laptop_df, attributes=['Cpu', 'Memory', 'Ram', 'Inches', 'Product'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [82]:
print_dataframe(neg_df)

macbook pro premium ultrabook intel iris plus graphics 640 bluetooth intel core i5 2 3ghz ips panel retina display 2560x1600 8gb 128gb ssd 802 11 ac 13 3" usb 3 1 type c apple
8gb memory 13 3 intel core i7 6560u 2 2ghz 128gb ssd business macbook pro webcam home apple ultrabook premium ips panel retina display 2560x1600 intel iris plus graphics 640
________________________________________________________________
13 3 128gb ssd dvd intel core i5 2 3ghz macbook pro ultrabook 802 11 ac ips panel retina display 2560x1600 home intel iris plus graphics 640 8gb ram apple latest model
new 8gb memory intel iris plus graphics 640 ultrabook apple intel core i7 6560u 2 2ghz usb 3 0 256gb ssd 1tb hdd dvd ips panel retina display 2560x1600 macbook pro usb 3 1 type c 13 3"
________________________________________________________________
13 3" 8gb ram macbook pro ips panel retina display 2560x1600 apple usb 3 1 type c intel core i5 2 3ghz new ultrabook business intel iris plus graphics 640 128gb ssd wi

full hd 1920x1080 usb type c intel uhd graphics 620 1tb hdd notebook 8gb hdmi latest model usb 3 1 type c intel core i5 8250u 1 6ghz 14 0 inch hp probook 440
2gb ram probook 440 intel core i7 6700hq 2 6ghz 16gb flash storage fast 802 11 ac hp notebook intel uhd graphics 620 full hd 1920x1080 802 11ac 15 4" flagship
________________________________________________________________
latest model 1tb hdd intel uhd graphics 620 intel core i5 8250u 1 6ghz 802 11 ac hp dvd probook 440 notebook 8gb ram 14 0" dvd rw full hd 1920x1080
thinkpad t460s fp reader dvd notebook 16gb flash storage 2gb webcam 15 4 inch intel uhd graphics 620 latest model full hd 1920x1080 hp intel core i7 6700hq 2 6ghz
________________________________________________________________
e402wa ga007t 4gb memory 1366x768 notebook amd e series 6110 1 5ghz 14 0 inch latest model 64gb ssd amd radeon r2 asus usb 3 1 type c 802 11 ac usb 3 0
notebook 14 0" 4gb home 64gb ssd asus 1366x768 amd a10 series 9600p 2 4ghz e402wa ga007t w

512gb ssd 512gb ssd usb 3 1 type c intel hd graphics 520 24gb dvd rw ultrabook 12 5 inch flagship portege z30 c 16l intel i5 6440hq 2 6ghz new full hd 1920x1080 toshiba
________________________________________________________________
usb type c new wifi 8gb ram 256gb ssd toshiba 13 3 full hd 1920x1080 portege z30 c 16l intel hd graphics 520 dvd rw intel core i7 6500u 2 5ghz ultrabook
intel i5 6440hq 2 6ghz dvd rw intel hd graphics 520 ultrabook full hd 1920x1080 toshiba precision 7720 802 11 ac 512gb ssd 512gb ssd 12 5" premium 24gb ram home
________________________________________________________________
fast notebook probook 450 full hd 1920x1080 8gb 1tb hdd 15 6 intel uhd graphics 620 intel i5 8250u 1 6ghz bluetooth dvd rw hp home
notebook hp 802 11ac new usb 3 1 type c atom x5 z8350 1 44ghz intel uhd graphics 620 wifi 1tb hdd probook 450 8gb full hd 1920x1080 15 6 inch
________________________________________________________________
full hd 1920x1080 15 6 intel core i5 8250u 1 6ghz

amd a12 series 9720p 3 6ghz fp reader notebook 15 6" bluetooth ideapad 320 15abr 2tb hdd amd radeon 530 802 11ac full hd 1920x1080 latest model lenovo 12gb ram
gbe lan ideapad 320 15abr lenovo webcam fast intel core i3 6006u 2 0ghz full hd 1920x1080 128gb hdd amd radeon 530 fp reader 13 3 64gb memory notebook
________________________________________________________________
2tb hdd ideapad 320 15abr 12gb ram full hd 1920x1080 fp reader amd radeon 530 15 6" fast notebook lenovo home dvd rw amd a12 series 9720p 3 6ghz
13 3 128gb hdd lenovo home latest model full hd 1920x1080 notebook hdmi 64gb webcam vivobook x540ya xx519t amd radeon 530 i3 6006u 2 0ghz
________________________________________________________________
premium probook 450 dvd rw 1tb hdd ips panel full hd 1920x1080 notebook 15 6 flagship hp nvidia geforce 930mx 8gb 802 11 ac i3 7100u 2 4ghz
nvidia geforce 930mx 15 6 inch intel core i7 6560u 2 2ghz gbe lan 802 11 ac dvd latest model hp 8gb memory 1tb hdd notebook ips panel fu

vivobook max 508gb hybrid fp reader full hd 1920x1080 notebook 15 6" premium dvd rw asus intel hd graphics 620 802 11 ac 4gb intel core i3 7130u 2 7ghz
________________________________________________________________
asus intel hd graphics 620 fast vivobook max full hd 1920x1080 dvd webcam notebook intel core i5 7200u 2 5ghz usb 3 0 256gb ssd 4gb memory 15 6"
latest model intel hd graphics 620 business bluetooth intel core i3 7130u 2 7ghz vivobook max asus notebook 15 6" fp reader 508gb hybrid 2gb full hd 1920x1080
________________________________________________________________
vivobook max business notebook gbe lan 4gb fast asus 256gb ssd intel core i5 7200u 2 5ghz intel hd graphics 620 15 6 inch full hd 1920x1080 usb 3 0
full hd 1920x1080 latest model vivobook max 2gb memory 10 1 508gb hybrid asus intel hd graphics 620 dvd rw intel core i3 7130u 2 7ghz notebook hdmi flagship
________________________________________________________________
notebook intel hd graphics 620 usb 3 0 gbe l

smartbook edge 4gb 13 3 inch 128gb flash storage fp reader dvd rw intel hd graphics 500 usb type c mediacom core i5 7200u 2 70ghz notebook ips panel full hd 1920x1080 premium
________________________________________________________________
hdmi notebook 13 3" intel celeron quad core n3450 1 1ghz mediacom ips panel full hd 1920x1080 gbe lan smartbook edge flagship 32gb ssd intel hd graphics 500 premium 4gb ram
fast 128gb flash storage notebook mediacom 32gb ram webcam intel core i5 7200u 2 70ghz smartbook edge wifi ips panel full hd 1920x1080 bluetooth 13 3 inch intel hd graphics 500
________________________________________________________________
intel celeron quad core n3450 1 1ghz 32gb ssd business smartbook edge ips panel full hd 1920x1080 802 11 ac 13 3 notebook 4gb ram usb 3 1 type c intel hd graphics 500 mediacom premium
128gb flash storage intel hd graphics 500 i5 7200u 2 70ghz flagship smartbook edge notebook fp reader premium mediacom 32gb memory ips panel full hd 1920x1080 gb

________________________________________________________________
amd radeon r5 fast 17 0 inch 1600x900 amd a9 series 9420 3ghz dvd notebook 8gb memory home usb 3 1 type c hp 1tb hdd 17 ak091nd
1600x900 hdmi notebook hp 17 ak091nd 15 4 a12 series 9720p 3 6ghz dvd rw premium amd radeon r5 home 4gb memory 500gb hdd
________________________________________________________________
17 ak091nd new hp 1tb hdd bluetooth amd radeon r5 dvd rw 8gb amd a9 series 9420 3ghz 17 0 fp reader notebook 1600x900
amd radeon r5 amd a12 series 9720p 3 6ghz vivobook flip 1600x900 500gb hdd 4gb ram hp notebook latest model 15 4 inch home flagship 802 11 ac
________________________________________________________________
802 11ac 16gb memory wifi lenovo intel i7 7500u 2 7ghz usb type c latest model ips panel full hd 1920x1080 thinkpad x1 ultrabook 14 0 inch 512gb ssd intel hd graphics 620
usb type c usb 3 1 type c thinkpad x1 lenovo intel hd graphics 620 new ips panel full hd 1920x1080 512gb ssd ultrabook 14 0 i

nvidia geforce gtx 1060 full hd 1920x1080 8gb 256gb flash storage msi wifi gaming premium 802 11ac 15 6 intel core i7 2 7ghz home gs63vr 7rf
________________________________________________________________
msi home premium 15 6" gaming 256gb ssd 1tb hdd nvidia geforce gtx 1060 full hd 1920x1080 gs63vr 7rf dvd rw 16gb ram core i7 7700hq 2 8ghz gbe lan
802 11ac premium 802 11 ac msi 8gb webcam gs63vr 7rf intel core i7 2 7ghz 256gb flash storage nvidia geforce gtx 1060 full hd 1920x1080 15 6" gaming
________________________________________________________________
gs63vr 7rf usb 3 0 home intel core i7 7700hq 2 8ghz msi fp reader new 256gb ssd 1tb hdd gaming 16gb full hd 1920x1080 nvidia geforce gtx 1060 15 6"
nvidia geforce gtx 1060 intel core i7 2 7ghz msi 8gb memory usb 3 1 type c 15 6" 256gb flash storage 802 11 ac v310 15isk hdmi full hd 1920x1080 gaming fast
________________________________________________________________
1366x768 15 6 inch aspire es1 572 acer notebook wifi 4gb memory

intel hd graphics 620 fp reader celeron quad n3450 1 1ghz usb type c toshiba premium 11 6 16gb flash storage gbe lan notebook 12gb 1366x768 satellite pro
________________________________________________________________
fast wifi 500gb hdd satellite pro 15 6 inch 4gb ram fp reader toshiba flagship notebook intel core i3 7100u 2 4ghz intel hd graphics 620 1366x768
home 1366x768 intel celeron quad core n3450 1 1ghz 12gb premium 16gb flash storage intel hd graphics 620 11 6" 802 11ac toshiba notebook wifi precision 5520
________________________________________________________________
lenovo fast fp reader 1366x768 bluetooth celeron quad n3160 1 6ghz 14 0 inch 4gb memory n42 20 chromebook 802 11ac intel hd graphics 400 notebook 16gb flash storage
n42 20 chromebook webcam new hdmi lenovo wifi 16gb flash storage 1366x768 14 0" intel hd graphics 400 notebook 4gb ram core m7 6y75 1 2ghz
________________________________________________________________
n42 20 chromebook 4gb memory lenovo intel ce

________________________________________________________________
15 6" intel hd graphics 520 8gb ram tecra a50 c 1zv usb 3 1 type c toshiba intel core i5 6200u 2 3ghz 1366x768 802 11ac flagship 256gb ssd premium notebook
intel hd graphics 520 flagship 1tb ssd 15 6 fp reader notebook core i5 7440hq 2 8ghz 1366x768 fast 8gb 802 11ac toshiba tecra a50 c 1zv
________________________________________________________________
business fast 15 6" notebook 1366x768 intel hd graphics 520 intel core i5 6200u 2 3ghz usb 3 0 tecra a50 c 1zv 256gb ssd 8gb memory dvd rw toshiba
toshiba 1366x768 1tb ssd intel hd graphics 520 tecra a50 c 1zv usb type c home 15 6" notebook core i5 7440hq 2 8ghz fast usb 3 0 32gb
________________________________________________________________
256gb ssd business hdmi fast 15 6 inch toshiba intel core i5 6200u 2 3ghz tecra a50 c 1zv 1366x768 intel hd graphics 520 8gb usb type c notebook
fast 802 11ac 32gb toshiba 1tb ssd 10 1 intel hd graphics 520 notebook tecra a50 c 1zv 

usb 3 1 type c 11 6 inch 128gb hdd full hd 1920x1080 fast gaming usb type c 32gb ram gt62vr 6rd nvidia geforce gtx 1060 intel 1 1ghz msi wifi
________________________________________________________________
256gb ssd 1tb hdd 15 6 inch 16gb new nvidia geforce gtx 1060 gaming dvd webcam full hd 1920x1080 business intel core i7 6700hq 2 6ghz msi gt62vr 6rd
full hd 1920x1080 spin 3 premium intel core 1 1ghz msi 128gb hdd 802 11 ac nvidia geforce gtx 1060 11 6 gaming webcam 32gb memory 802 11ac
________________________________________________________________
i7 7500u 2 7ghz 15 6 hp intel hd graphics 620 webcam notebook latest model probook 450 256gb ssd dvd rw home full hd 1920x1080 8gb
notebook bluetooth full hd 1920x1080 new intel celeron quad core n3450 1 1ghz 15 6 flagship 8gb hp intel hd graphics 620 256gb ssd probook 450 usb type c
________________________________________________________________
i7 7500u 2 7ghz gbe lan intel hd graphics 620 802 11 ac notebook hp 256gb ssd full hd 1920

16gb ram ips panel touchscreen 2560x1440 2 1 convertible 14 0" fast lenovo intel hd graphics 520 thinkpad x1 hdmi 512gb ssd intel pentium quad core n3710 1 6ghz fp reader flagship
________________________________________________________________
512gb ssd ips panel touchscreen 2560x1440 flagship gbe lan thinkpad x1 2 1 convertible lenovo intel hd graphics 520 premium 16gb ram intel core i7 6600u 2 6ghz 14 0 bluetooth
2 1 convertible 802 11 ac thinkpad x1 802 11ac 180gb ssd intel hd graphics 520 ips panel touchscreen 2560x1440 16gb intel pentium quad core n3710 1 6ghz lenovo fast 14 0 gbe lan
________________________________________________________________
512gb ssd intel hd graphics 520 16gb 14 0" thinkpad x1 2 1 convertible ips panel touchscreen 2560x1440 webcam lenovo intel i7 6600u 2 6ghz home bluetooth premium
ips panel touchscreen 2560x1440 180gb ssd lenovo intel pentium quad n3710 1 6ghz business premium bluetooth intel hd graphics 520 2 1 convertible dvd rw 14 0" 32gb memory thin

15 6 notebook bluetooth fast intel core i3 7100u 2 4ghz touchscreen 1366x768 webcam intel hd graphics 620 dell 8gb ram inspiron 3567 hdmi 1tb hdd
home 15 6 inch 802 11 ac 24gb ram notebook intel hd graphics 620 usb 3 1 type c intel core i5 7y57 1 2ghz inspiron 3567 16gb ssd dell fast touchscreen 1366x768
________________________________________________________________
dell latest model dvd touchscreen 1366x768 notebook 15 6" intel hd graphics 620 802 11ac usb 3 1 type c 1tb hdd 8gb ram intel core i3 7100u 2 4ghz inspiron 3567
24gb memory intel hd graphics 620 intel core i5 7y57 1 2ghz inspiron 3567 16gb ssd dell notebook 12 3 touchscreen 1366x768 fp reader fast hdmi flagship
________________________________________________________________
inspiron 3567 1tb hdd core i3 7100u 2 4ghz dell usb type c fp reader 15 6 inch premium 8gb memory touchscreen 1366x768 notebook intel hd graphics 620 usb 3 1 type c
yoga 520 14ikb intel hd graphics 620 notebook 24gb ram touchscreen 1366x768 16gb ssd 1

In [83]:
# Creates the postive examples for the laptop data
# The laptop_df is the original data, the new_df is the dataframe to append the new data to
# and the attributes are the attributes to swap or delete for the new data
def create_pos_laptop_data(laptop_df, rm_attrs, add_attrs):
    new_column_names = ['title_one', 'title_two', 'label']
    pos_df = pd.DataFrame(columns = new_column_names)
    for row in range(len(laptop_df)):
        # Remove the attribute from the new title
        for attr_list in rm_attrs:
            # Create a copy of the row for the negative example
            new_row = laptop_df.iloc[row]
            orig_row = laptop_df.iloc[row]
            for attr in attr_list:
                new_row[attr] = ''
        
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(new_row).lower())

            # Occassionally add in the operating system just to switch it up
            if (random.sample([0, 1], 1)):
                for attr in add_attrs:
                    title_two += ' ' + orig_row[attr].lower()

            pos_df = pos_df.append(pd.DataFrame([[title_one, title_two, 1]], columns=new_column_names))

    return pos_df

In [84]:
pos_df = create_pos_laptop_data(laptop_df, rm_attrs = [['Company'], ['TypeName'], ['ScreenResolution'], ['Product'], ['TypeName', 'ScreenResolution']], add_attrs=['OpSys'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [85]:
print_dataframe(pos_df)

128gb ssd apple 13 3 hdmi home 8gb ips panel retina display 2560x1600 intel iris plus graphics 640 new intel core i5 2 3ghz macbook pro usb 3 1 type c ultrabook
8gb ram macbook pro latest model ultrabook usb 3 0 128gb ssd ips panel retina display 2560x1600 gbe lan intel core i5 2 3ghz 13 3" intel iris plus graphics 640 bluetooth macos
________________________________________________________________
hdmi business fp reader intel core i5 2 3ghz macbook pro apple 8gb fast intel iris plus graphics 640 ips panel retina display 2560x1600 13 3" 128gb ssd ultrabook
ips panel retina display 2560x1600 intel iris plus graphics 640 13 3 intel core i5 2 3ghz hdmi macbook pro usb 3 1 type c 8gb ram bluetooth 128gb ssd apple premium macos
________________________________________________________________
intel iris plus graphics 640 hdmi ultrabook apple usb 3 0 ips panel retina display 2560x1600 dvd rw premium 8gb memory macbook pro 13 3 inch intel core i5 2 3ghz 128gb ssd
intel iris plus graphics 640 

usb 3 1 type c 256gb ssd zenbook ux430ua intel hd graphics 620 bluetooth usb 3 0 fast 14 0" intel core i7 7500u 2 7ghz asus 8gb memory windows 10
________________________________________________________________
intel uhd graphics 620 802 11 ac spin 5 8gb ips panel full hd touchscreen 1920x1080 13 3" 2 1 convertible acer home intel core i5 8250u 1 6ghz latest model bluetooth 256gb ssd
256gb ssd intel core i5 8250u 1 6ghz 2 1 convertible ips panel full hd touchscreen 1920x1080 bluetooth spin 5 intel uhd graphics 620 13 3 dvd rw new 8gb memory gbe lan windows 10
________________________________________________________________
intel uhd graphics 620 intel core i5 8250u 1 6ghz usb 3 1 type c latest model 2 1 convertible 13 3" ips panel full hd touchscreen 1920x1080 802 11 ac 8gb memory 256gb ssd spin 5 home acer
ips panel full hd touchscreen 1920x1080 dvd rw 8gb memory 13 3" acer spin 5 intel uhd graphics 620 gbe lan premium intel core i5 8250u 1 6ghz usb 3 0 256gb ssd windows 10
__________

hp 802 11ac intel uhd graphics 620 spectre x360 intel i5 8250u 1 6ghz ips panel 4k ultra hd touchscreen 3840x2160 gbe lan 13 3" fast 8gb dvd 256gb ssd windows 10
________________________________________________________________
13 3 intel uhd graphics 620 intel core i5 8250u 1 6ghz premium fp reader hp spectre x360 2 1 convertible 802 11ac home 8gb ram 256gb ssd ips panel 4k ultra hd touchscreen 3840x2160
webcam intel uhd graphics 620 spectre x360 intel core i5 8250u 1 6ghz bluetooth 256gb ssd 8gb 2 1 convertible 13 3" new hp 802 11 ac windows 10
________________________________________________________________
256gb ssd intel core i5 8250u 1 6ghz ips panel 4k ultra hd touchscreen 3840x2160 spectre x360 hp intel uhd graphics 620 webcam 13 3 inch hdmi wifi 8gb ram 2 1 convertible latest model
intel i5 8250u 1 6ghz 256gb ssd hp 8gb ram home intel uhd graphics 620 dvd ips panel 4k ultra hd touchscreen 3840x2160 2 1 convertible premium 13 3" bluetooth windows 10
_____________________________

full hd 1920x1080 4gb intel hd graphics 520 notebook 256gb ssd fast home gbe lan usb 3 0 lenovo 15 6 inch intel i3 6006u 2ghz no os
________________________________________________________________
core i3 6006u 2ghz 15 6" full hd 1920x1080 notebook 4gb intel hd graphics 520 ideapad 320 15isk lenovo hdmi business fast wifi 256gb ssd
lenovo dvd intel hd graphics 520 802 11ac fast ideapad 320 15isk 4gb ram intel core i3 6006u 2ghz 15 6 256gb ssd usb 3 0 no os
________________________________________________________________
intel hd graphics 620 gbe lan 500gb hdd 15 6 hdmi fp reader ips panel full hd 1920x1080 hp notebook intel core i5 8250u 1 6ghz premium 4gb memory probook 450
500gb hdd 4gb memory usb type c probook 450 15 6" intel core i5 8250u 1 6ghz latest model notebook intel hd graphics 620 dvd business ips panel full hd 1920x1080 windows 10
________________________________________________________________
hp webcam notebook hdmi usb 3 0 probook 450 500gb hdd fast 4gb memory intel co

15 6" amd a9 series 9420 3ghz hp latest model 4gb ram usb 3 1 type c 256gb ssd amd radeon r5 15 bw003nv full hd 1920x1080 802 11ac dvd rw windows 10
________________________________________________________________
full hd 1920x1080 4gb memory amd a9 series 9420 3ghz notebook flagship home amd radeon r5 15 bw003nv usb 3 0 15 6 hp 256gb ssd premium
notebook usb 3 1 type c 15 bw003nv dvd rw hdmi 256gb ssd amd radeon r5 4gb memory amd a9 series 9420 3ghz 15 6 fast hp windows 10
________________________________________________________________
15 bw003nv amd radeon r5 256gb ssd premium amd a9 series 9420 3ghz 4gb 15 6" home gbe lan notebook hp business full hd 1920x1080
full hd 1920x1080 amd radeon r5 4gb memory hp business a9 series 9420 3ghz 15 6 inch fast usb type c dvd rw 256gb ssd notebook windows 10
________________________________________________________________
new 802 11ac hdmi 256gb ssd 15 6 inch 4gb ram amd a9 series 9420 3ghz notebook usb type c hp 15 bw003nv amd radeon r5 full h

flagship 4gb ram 1tb hdd core i3 6006u 2ghz new nvidia geforce 940mx webcam acer business aspire e5 576g 15 6 inch full hd 1920x1080 notebook
15 6 inch notebook 802 11 ac 4gb memory fast business acer 1tb hdd nvidia geforce 940mx fp reader core i3 6006u 2ghz full hd 1920x1080 windows 10
________________________________________________________________
15 6 premium aspire e5 576g notebook intel core i3 6006u 2ghz acer 4gb memory 1tb hdd usb 3 1 type c full hd 1920x1080 wifi nvidia geforce 940mx bluetooth
i3 6006u 2ghz 15 6 inch 1tb hdd nvidia geforce 940mx 4gb memory dvd aspire e5 576g fast acer flagship dvd rw windows 10
________________________________________________________________
acer intel hd graphics 400 1366x768 travelmate b 4gb celeron dual core n3060 1 6ghz latest model netbook dvd rw 11 6 inch 802 11 ac 128gb ssd usb 3 1 type c
intel celeron dual core n3060 1 6ghz 1366x768 usb type c netbook 11 6" 4gb ram flagship fast intel hd graphics 400 travelmate b 128gb ssd usb 3 1 type

usb 3 0 premium intel hd graphics 620 fp reader v330 15ikb 15 6 bluetooth i5 8250u 1 6ghz full hd 1920x1080 500gb hdd lenovo 4gb windows 10
________________________________________________________________
15 6 v330 15ikb latest model 4gb memory intel hd graphics 620 hdmi usb type c intel core i5 8250u 1 6ghz lenovo notebook full hd 1920x1080 500gb hdd webcam
wifi 15 6" home intel hd graphics 620 v330 15ikb i5 8250u 1 6ghz 4gb memory premium 500gb hdd notebook lenovo business windows 10
________________________________________________________________
4gb ram 500gb hdd notebook 15 6 lenovo full hd 1920x1080 premium flagship v330 15ikb intel i5 8250u 1 6ghz intel hd graphics 620 usb 3 0 802 11 ac
notebook intel hd graphics 620 hdmi usb 3 0 lenovo 500gb hdd intel core i5 8250u 1 6ghz full hd 1920x1080 802 11 ac 4gb ram premium 15 6 windows 10
________________________________________________________________
intel hd graphics 620 4gb memory full hd 1920x1080 802 11ac intel core i5 8250u 1 6g

amd radeon 520 wifi hp intel core i5 7200u 2 5ghz notebook fp reader 500gb hdd gbe lan latest model 250 g6 1366x768 15 6 4gb
250 g6 intel core i5 7200u 2 5ghz fast flagship 15 6 amd radeon 520 webcam gbe lan 4gb notebook 1366x768 500gb hdd windows 10
________________________________________________________________
802 11ac core i5 7200u 2 5ghz 1366x768 premium dvd rw notebook 15 6" hp 4gb 500gb hdd amd radeon 520 250 g6 hdmi
500gb hdd fast flagship 1366x768 hp 4gb i5 7200u 2 5ghz 250 g6 webcam usb 3 0 15 6 amd radeon 520 windows 10
________________________________________________________________
latest model intel core i5 7200u 2 5ghz 250 g6 amd radeon 520 500gb hdd webcam hp 1366x768 4gb ram notebook 15 6 inch 802 11ac hdmi
4gb 500gb hdd 250 g6 amd radeon 520 notebook new webcam usb 3 0 hp dvd intel core i5 7200u 2 5ghz 15 6 windows 10
________________________________________________________________
fp reader 4gb ram intel core i5 7200u 2 5ghz hp 15 6" 250 g6 home notebook 1366x768 50

intel core i7 7700hq 2 8ghz new 4gb legion y520 15ikbn 1tb hdd 802 11 ac home 15 6" flagship lenovo nvidia geforce gtx 1050 windows 10
________________________________________________________________
acer dvd rw gbe lan nvidia geforce gtx 1060 256gb ssd 1tb hdd home 17 3 inch gaming predator g9 793 intel i7 7700hq 2 8ghz premium ips panel full hd 1920x1080 16gb ram
ips panel full hd 1920x1080 256gb ssd 1tb hdd 802 11ac intel core i7 7700hq 2 8ghz 16gb nvidia geforce gtx 1060 new 17 3 inch gaming gbe lan 802 11 ac predator g9 793 windows 10
________________________________________________________________
16gb memory 17 3" acer nvidia geforce gtx 1060 802 11 ac gaming predator g9 793 hdmi 256gb ssd 1tb hdd new intel i7 7700hq 2 8ghz ips panel full hd 1920x1080 usb type c
17 3 inch predator g9 793 intel core i7 7700hq 2 8ghz ips panel full hd 1920x1080 nvidia geforce gtx 1060 wifi acer 256gb ssd 1tb hdd 802 11 ac 16gb ram usb 3 1 type c fast windows 10
____________________________________

fp reader 4gb 802 11ac celeron dual n3350 1 1ghz intel hd graphics 500 usb 3 1 type c asus 32gb flash storage 14 1" vivobook l402na new windows 10
________________________________________________________________
1tb hdd 8gb ips panel full hd 1920x1080 15 6" ideapad 510 15isk nvidia geforce 940mx notebook webcam lenovo flagship bluetooth latest model intel core i7 6500u 2 5ghz
15 6 inch fast usb type c fp reader 1tb hdd intel core i7 6500u 2 5ghz notebook dvd ideapad 510 15isk 8gb ram nvidia geforce 940mx ips panel full hd 1920x1080 windows 10
________________________________________________________________
dvd rw 15 6 inch 8gb memory nvidia geforce 940mx lenovo ips panel full hd 1920x1080 business ideapad 510 15isk core i7 6500u 2 5ghz latest model notebook 1tb hdd fp reader
ips panel full hd 1920x1080 nvidia geforce 940mx core i7 6500u 2 5ghz ideapad 510 15isk 1tb hdd lenovo home 802 11 ac 15 6 hdmi new 8gb ram windows 10
_______________________________________________________________

premium 256gb ssd fp reader 15 6 inch nvidia geforce 920mx webcam usb 3 1 type c ideapad 310 15ikb 6gb full hd 1920x1080 notebook intel core i5 7200u 2 5ghz windows 10
________________________________________________________________
wifi 6gb memory lenovo fp reader usb 3 1 type c 256gb ssd notebook intel core i5 7200u 2 5ghz nvidia geforce 920mx new full hd 1920x1080 ideapad 310 15ikb 15 6 inch
ideapad 310 15ikb premium 6gb 256gb ssd intel core i5 7200u 2 5ghz nvidia geforce 920mx hdmi dvd full hd 1920x1080 fp reader 15 6" lenovo windows 10
________________________________________________________________
6gb usb 3 0 lenovo intel core i5 7200u 2 5ghz 256gb ssd nvidia geforce 920mx usb 3 1 type c full hd 1920x1080 15 6" ideapad 310 15ikb notebook new hdmi
notebook wifi lenovo 256gb ssd ideapad 310 15ikb intel core i5 7200u 2 5ghz 15 6 dvd hdmi fast 6gb ram nvidia geforce 920mx windows 10
________________________________________________________________
ideapad 310 15ikb full hd 1920x1080 

802 11 ac 14 0" 4gb ram notebook intel hd graphics 520 full hd 1920x1080 hp usb 3 1 type c elitebook 840 intel core i5 6200u 2 3ghz flagship 500gb hdd new
4gb ram bluetooth webcam 802 11 ac intel hd graphics 520 intel i5 6200u 2 3ghz elitebook 840 14 0 inch new hp 500gb hdd windows 7
________________________________________________________________
full hd 1920x1080 zbook 15u 8gb amd firepro w4190m 15 6 inch 256gb ssd premium 802 11 ac home usb type c intel core i7 6500u 2 5ghz workstation hp
8gb ram dvd premium usb 3 1 type c amd firepro w4190m full hd 1920x1080 intel core i7 6500u 2 5ghz dvd rw 15 6" zbook 15u 256gb ssd workstation windows 7
________________________________________________________________
workstation zbook 15u 802 11 ac full hd 1920x1080 dvd hp 256gb ssd new 8gb memory 15 6 inch webcam core i7 6500u 2 5ghz amd firepro w4190m
intel core i7 6500u 2 5ghz 256gb ssd premium 8gb zbook 15u business hdmi gbe lan hp amd firepro w4190m full hd 1920x1080 15 6 windows 7
_________

256gb ssd 1tb hdd 16gb acer nvidia geforce gtx 1070 fast intel core i7 7700hq 2 8ghz gbe lan predator g9 793 wifi 17 3 inch flagship windows 10
________________________________________________________________
gaming legion y520 15ikbn latest model lenovo webcam 15 6 1tb hdd dvd intel core i5 7300hq 2 5ghz nvidia geforce gtx 1050 8gb memory ips panel full hd 1920x1080 wifi
dvd fast intel i5 7300hq 2 5ghz legion y520 15ikbn nvidia geforce gtx 1050 8gb home 1tb hdd 15 6" ips panel full hd 1920x1080 gbe lan gaming windows 10
________________________________________________________________
1tb hdd intel core i5 7300hq 2 5ghz latest model lenovo gaming legion y520 15ikbn 8gb wifi 15 6" ips panel full hd 1920x1080 usb 3 1 type c nvidia geforce gtx 1050 bluetooth
new ips panel full hd 1920x1080 intel core i5 7300hq 2 5ghz lenovo 8gb 802 11ac usb 3 1 type c wifi 1tb hdd 15 6 inch legion y520 15ikbn nvidia geforce gtx 1050 windows 10
______________________________________________________________

In [57]:
final_laptop_df = create_final_data(pos_df, neg_df)

In [59]:
final_laptop_df = final_laptop_df.sample(frac=1)

Unnamed: 0,title_one,title_two,label
0,intel uhd graphics 620 fast home usb 3 1 type ...,512gb ssd usb 3 1 type c wifi intel uhd graphi...,1
0,"802 11 ac 4gb ram notebook e5 774g 17 3"" nvidi...",core i3 6006u 2ghz hdmi acer e5 774g 802 11 ac...,1
0,bluetooth 13 3 inch fast full hd 1920x1080 int...,webcam v131 vero premium intel atom x5 z8350 1...,1
0,notebook a9 series 9420 2 9ghz dvd 256gb ssd l...,usb type c 4gb 1tb ssd notebook amd radeon 530...,0
0,"13 3"" full hd touchscreen 1920x1080 premium 2 ...",elitebook x360 32gb hdd hp 16gb ram 2 1 conver...,0
...,...,...,...
0,notebook 1tb hdd bluetooth 15 ac110nv 1366x768...,intel core i7 6500u 2 5ghz 15 ac110nv 1tb hdd ...,1
0,"full hd 1920x1080 zbook 15u 15 6"" amd firepro ...",workstation 256gb ssd 8gb ram bluetooth amd fi...,1
0,gp72m 7rex gaming 16gb ram nvidia geforce gtx ...,full hd 1920x1080 fp reader 16gb flash storage...,0
0,512gb ssd intel core i7 7660u 2 5ghz quad hd t...,fp reader 802 11ac dvd rw quad hd touchscreen ...,0


## PCPartPicker Data
* Organize the data
* Preprocess the data
* Create negative and positive data

In [6]:
ram_df = pd.read_csv('data/train/pos_ram_titles.csv')
cpu_df = pd.read_csv('data/train/pos_cpu_titles.csv')
hard_drive_df = pd.read_csv('data/train/pos_hard_drive_titles.csv')

In [13]:
ram_df

Unnamed: 0.1,Unnamed: 0,amazon,bestbuy,newegg,walmart,memoryc
0,0,Corsair Vengeance LPX 16GB (2x8GB) DDR4 DRAM 3...,CORSAIR - Vengeance LPX 16GB (2PK x 8GB) 3.2 G...,CORSAIR Vengeance LPX 16GB (2 x 8GB) 288-Pin D...,Corsair CMK16GX4M2B3200C16 Vengeance LPX 16GB ...,16GB Corsair Vengeance LPX PC4-25600 3200MHz D...
1,0,Corsair Vengeance RGB PRO 16GB (2x8GB) DDR4 32...,CORSAIR - Vengeance RGB PRO 16GB (2PK 8GB) 3.2...,CORSAIR Vengeance RGB Pro 16GB (2 x 8GB) 288-P...,,16GB Corsair Vengeance RGB Pro DDR4 3200MHz CL...
2,0,G.Skill RipJaws V Series 16GB (2 x 8GB) 288-Pi...,,G.SKILL Ripjaws V Series 16GB (2 x 8GB) 288-Pi...,,
3,0,Corsair Vengeance RGB Pro 32GB (2x16GB) DDR4 3...,CORSAIR - Vengeance RGB PRO 32GB (2PK 16GB) 3....,CORSAIR Vengeance RGB Pro 32GB (2 x 16GB) 288-...,,32GB Corsair Vengeance Pro RGB DDR4 3200MHz CL...
4,0,,,G.SKILL Trident Z RGB (For AMD) 16GB (2 x 8GB)...,,16GB G.Skill DDR4 TridentZ RGB 3600Mhz PC4-288...
...,...,...,...,...,...,...
218,0,Corsair Vengeance LPX 32GB (4x8GB) DDR4 3600 (...,,CORSAIR Vengeance LPX 32GB (4 x 8GB) 288-Pin D...,,32GB Corsair Vengeance LPX DDR4 3600MHz PC4-28...
219,0,,,,,
220,0,Corsair Vengeance LPX 16GB (2x8GB) DDR4 DRAM 3...,CORSAIR - VENGEANCE LPX Series 16GB (2PK 8GB) ...,CORSAIR Vengeance LPX 16GB (2 x 8GB) 288-Pin D...,,16GB Corsair Vengeance LPX DDR4 3000MHz PC4-24...
221,0,CORSAIR VENGEANCELPX32GB (1x 32GB) DDR43000(PC...,,CORSAIR Vengeance LPX 32GB 288-Pin DDR4 SDRAM ...,,32GB Corsair Vengeance LPX DDR4 3000MHz CL16 M...


In [14]:
cpu_df

Unnamed: 0.1,Unnamed: 0,amazon,bestbuy,newegg,walmart,memoryc,bhphotovideo
0,0,"AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked De...",AMD - Ryzen 5 3600 3rd Generation 6-Core - 12-...,AMD RYZEN 5 3600 6-Core 3.6 GHz (4.2 GHz Max B...,"AMD Ryzen 5 3600 6-Core, 12-Thread 4.2 GHz AM4...",AMD Ryzen 5 3600 AM4 3.6GHZ 32MB CPU Desktop P...,AMD Ryzen 5 3600 3.6 GHz Six-Core AM4 Processor
1,0,"AMD Ryzen 7 3700X 8-Core, 16-Thread Unlocked D...",AMD - Ryzen 7 3700X 3rd Generation 8-Core - 16...,AMD RYZEN 7 3700X 8-Core 3.6 GHz (4.4 GHz Max ...,"AMD Ryzen 7 3700X 8-Core, 16-Thread 4.4 GHz AM...",AMD Ryzen 7 3700x 3.6GHz 32MB AM4 CPU Desktop ...,AMD Ryzen 7 3700X 3.6 GHz Eight-Core AM4 Proce...
2,0,AMD Ryzen 5 2600 Processor with Wraith Stealth...,,,,AMD Ryzen 5 2600 Six-Core 3.4GHz Socket AM4 19...,
3,0,"AMD Ryzen 9 3900X 12-core, 24-thread unlocked ...",AMD - Ryzen 9 3900X 3rd Generation 12-core - 2...,AMD RYZEN 9 3900X 12-Core 3.8 GHz (4.6 GHz Max...,AMD RYZEN 9 3900X 12-Core 3.8 GHz (4.6 GHz Max...,AMD Ryzen 9 3900X 3.8GHz 64MB Desktop Processo...,AMD Ryzen 9 3900X 3.8 GHz 12-Core AM4 Processor
4,0,AMD Ryzen 3 3200G 4-Core Unlocked Desktop Proc...,AMD - Ryzen 3 3200G 3rd Generation 4-Core - 4-...,AMD RYZEN 3 3200G 4-Core 3.6 GHz (4.0 GHz Max ...,,AMD Ryzen 3 AM4 3.6GHZ 4MB Desktop Processor B...,
...,...,...,...,...,...,...,...
499,0,,,,,,
500,0,Intel Xeon E3-1220 V6 Processors BX80677E31220V6,,Intel Xeon E3-1220 V6 Kaby Lake 3.0 GHz (3.5 G...,XEON E3-1220 V6 FC-LGA14C 3G 8MB CACHE BOXED,Intel Xeon E3-1220 V6 3GHz Kaby Lake CPU LGA11...,
501,0,Intel - BX80684E2134 - Intel Xeon E-2134-3.5 G...,,,Intel BX80684E2134 Xeon Quad-core E-2134 3.5GH...,,
502,0,"Intel BX80662E31230V5 XEON E3-1230V5, 3.4 GHZ,...",,,,,


In [7]:
hard_drive_df

Unnamed: 0.1,Unnamed: 0,amazon,bestbuy,newegg,walmart,memoryc,bhphotovideo
0,0,"Seagate Barracuda ST2000DM008 2 TB 3.5"" Intern...",,Seagate BarraCuda ST2000DM008 2TB 7200 RPM 256...,Seagate ST2000DM008 BarraCuda 2TB 3.5 SATA HDD...,2TB Seagate Barracuda Serial ATA III 3.5-inch ...,
1,0,Samsung (MZ-V7E500BW) 970 EVO SSD 500GB - M.2...,Samsung - 970 EVO 500GB Internal PCI Express 3...,"SAMSUNG 970 EVO M.2 2280 500GB PCIe Gen3. X4, ...",SAMSUNG 970 EVO Series - 500GB PCIe NVMe - M.2...,,Samsung 500GB 970 EVO NVMe M.2 Internal SSD
2,0,Samsung (MZ-V7E1T0BW) 970 EVO SSD 1TB - M.2 NV...,Samsung - 970 EVO 1TB Internal PCI Express 3.0...,"SAMSUNG 970 EVO M.2 2280 1TB PCIe Gen3. X4, NV...",,,Samsung 1TB 970 EVO NVMe M.2 Internal SSD
3,0,"WD Blue 1TB PC Hard Drive - 7200 RPM Class, SA...",WD - Blue 1TB Internal SATA Hard Drive for Des...,WD Blue 1TB Desktop Hard Disk Drive - 7200 RPM...,,1TB Western Digital Blue 3.5-inch SATA III 6Gb...,
4,0,"Crucial P1 1TB 3D NAND NVMe PCIe Internal SSD,...",,"Crucial P1 1TB 3D NAND NVMe PCIe Internal SSD,...",,1TB Crucial P1 M.2 2280 PCI Express 3.0 x 4 So...,Crucial 1TB P1 NVMe M.2 2280 Internal SSD
...,...,...,...,...,...,...,...
317,0,,,,,,
318,0,,XPG - Ultimate Series SU800 2TB Internal SATA ...,,,,ADATA Technology 2TB Ultimate SU800 SATA III 2...
319,0,,,,,,
320,0,,SanDisk - Ultra 2TB Internal SATA Solid State ...,,"SanDisk Ultra 2TB 2.5"" SATA Internal Solid Sta...",2TB SanDisk Ultra 3D Serial ATA III 6GB 2.5-in...,"SanDisk 2TB 3D SATA III 2.5"" Internal SSD"


In [9]:
# Drop the Unnamed: 0 column and drop any row where it is all NaN
def remove_misc(df):
    columns = list(df.columns)[1:]
    df = df.drop(columns=['Unnamed: 0'])
    df = df.dropna(how='all')
    print(len(df))
    return df


In [10]:
ram_df = remove_misc(ram_df)
cpu_df = remove_misc(cpu_df)
hard_drive_df = remove_misc(hard_drive_df)

210
315
233


In [12]:
def generate_pos_pcpartpicker_data(df):
    columns = list(df.columns)
    pos_df = pd.DataFrame(columns=['title_one', 'title_two', 'label'])
    for idx in range(len(df)):
        row = df.iloc()[idx]
        titles = []
        for col in columns:
            if not pd.isnull(row[col]): titles.append(row[col])
        if len(titles) > 1:
            combs = combinations(titles, 2)
            for comb in combs:
                comb = list(comb)
                comb.append(1)
                pos_df = pos_df.append(pd.DataFrame([comb], columns=['title_one', 'title_two', 'label']))
    
    return pos_df


In [None]:
pos_ram_data = generate_pos_pcpartpicker_data(ram_df)

pos_cpu_data = generate_pos_pcpartpicker_data(cpu_df)

pos_hard_drive_data = generate_pos_pcpartpicker_data(hard_drive_df)


In [15]:
def generate_neg_pcpartpicker_data(df):
    columns = list(df.columns)
    neg_df = pd.DataFrame(columns=['title_one', 'title_two', 'label'])
    df_list = df.iloc()
    for idx in range(len(df)):
        row = df_list[idx]
        for col in columns:
            if not pd.isnull(row[col]):
                neg_idx = None
                while neg_idx == idx or neg_idx is None:
                    neg_idx = random.randint(0, len(df) - 1)
                
                neg_title = None
                while neg_title == None or pd.isnull(neg_title):
                    neg_title = df_list[neg_idx][random.choice(columns)]
                
                neg_df = neg_df.append(pd.DataFrame([[row[col], neg_title, 0]], columns=['title_one', 'title_two', 'label']))
    
    return neg_df

In [25]:
neg_ram_data = generate_neg_pcpartpicker_data(ram_df)

neg_cpu_data = generate_neg_pcpartpicker_data(cpu_df)

neg_hard_drive_data = generate_neg_pcpartpicker_data(hard_drive_df)

final_ram_data = create_final_data(pos_ram_data, neg_ram_data)

final_cpu_data = create_final_data(pos_cpu_data, neg_cpu_data)

final_hard_drive_data = create_final_data(pos_hard_drive_data, neg_hard_drive_data)

print(len(final_cpu_data), len(final_ram_data), len(final_hard_drive_data))

962 696 1010


## Embeddings Creation Functions
Generates the embeddings and saves them

In [60]:
"""
Create the numpy files of all the training embedddings
We will have two numpy files:
1. The training/validation/test sets
2. The labels
"""

def create_embeddings(df):
    # Create the numpy arrays for storing the embeddings and labels
    total_embeddings = np.zeros(shape=(len(df), 2, MAX_LEN, EMBEDDING_SHAPE[0]))
    labels = np.zeros(shape=(len(df)))
    
    # I know this is a terrible way of doing this, but iterate over the dataframe
    # and generate the embeddings to add to the numpy array
    for idx, row in enumerate(df.itertuples()):
        for word_idx, word in enumerate(row.title_one.split()):
            total_embeddings[idx, 0, word_idx] = fasttext_model[word]
            
        for word_idx, word in enumerate(row.title_two.split()):
            total_embeddings[idx, 1, word_idx] = fasttext_model[word]
            
        labels[idx] = row.label
        
    return total_embeddings, labels


In [61]:
def save_embeddings(df, embeddings_name, labels_name):
    """
    Saves the embeddings given the embeddings file name and labels file name
    """
    if not os.path.exists('data/numpy_data/' + embeddings_name + '.npy'):
        embeddings, labels = create_embeddings(df)
        with open('data/numpy_data/' + embeddings_name + '.npy', 'wb') as f:
            np.save(f, embeddings)

        with open('data/numpy_data/' + labels_name + '.npy', 'wb') as f:
            np.save(f, labels)

In [62]:
def load_embeddings_and_labels(embeddings_name, labels_name):
    loaded_embeddings = None
    labels = None
    with open('data/numpy_data/' + embeddings_name + '.npy', 'rb') as f:
        loaded_embeddings = np.load(f)
        loaded_embeddings = np.transpose(loaded_embeddings, (1, 0, 2, 3))
    
    with open('data/numpy_data/' + labels_name + '.npy', 'rb') as f:
        labels = np.load(f)
    
    return loaded_embeddings, labels

## Saving and Loading Embeddings
Save the embeddings for the different types of data we have

In [75]:
# Concatenate everything
total_data = pd.concat([final_computer_df, final_laptop_df, final_ram_data, final_cpu_data, final_hard_drive_data])
total_data = total_data.sample(frac=1)
#save_embeddings(final_computer_df, 'bal_computers_embeddings', 'bal_computers_labels')
save_embeddings(total_data, 'all_embeddings', 'all_labels')

In [None]:
embeddings, labels = load_embeddings_and_labels('all_embeddings', 'all_labels')

In [114]:
len(embeddings[0,:])

32410

In [115]:
total_data

Unnamed: 0,title_one,title_two,label
16663,acer aspire es1 132 p194 business notebook 331...,acer aspire es1 132 p194 business notebook len...,1
0,lenovo ideapad 310 15ikb notebook 15 6 inch fu...,lenovo ideapad 310 15ikb 15 6 inch intel core ...,1
0,hp 250 g6 ultrabook 15 6 inch full hd 1920x108...,hp 250 g6 ultrabook 15 6 inch full hd 1920x108...,0
3086,corsair vengeance led 16gb 2x8gb ddr4pc4 21300...,corsair vengeance red led 16gb 2x8gb ddr4 pc4 ...,1
15990,kingston datatraveler 100 g3 32 gb usb 3 0 dt1...,usb datatraveler 100 g3 3 0 stick 32 gb,1
...,...,...,...
11649,seagate laptop sshd 1 tb internal st1000lm014 ...,wd green wds240g1g0a ssd 240 go sata 6gb garan...,0
0,lenovo ideapad 320 17isk notebook 17 3 inch 16...,lenovo notebook 17 3 inch 1600x900 intel core ...,1
15592,sandisk extreme microsdhc 64gb type 10 acheter...,sandisk extreme microsdhc 64gb type 10 kopen e...,1
10730,dg0146famwl hp 146 gb 6g 10k 2 5 dp sas new pa...,dg0146famwl hp 146 gb 6g 10k 2 5 dp sas hdd ne...,1


In [116]:
X_train1 = embeddings[0, :len(labels) - 4000]
X_train2 = embeddings[1, :len(labels) - 4000]
X_train = np.stack((X_train1, X_train2))
print('Training shape: ' + str(X_train.shape))

X_val1 = embeddings[0, len(labels) - 4000:len(labels) - 2000]
X_val2 = embeddings[1, len(labels) - 4000:len(labels) - 2000]
X_val = np.stack((X_val1, X_val2))
print('Val shape: ' + str(X_val.shape))

X_test1 = embeddings[0, len(labels) - 2000:]
X_test2 = embeddings[1, len(labels) - 2000:]
X_test = np.stack((X_test1, X_test2))
print('Test shape: ' + str(X_test.shape))

Training shape: (2, 28410, 42, 300)
Val shape: (2, 2000, 42, 300)
Test shape: (2, 2000, 42, 300)


In [117]:
Y_train = labels[:len(labels) - 4000]
print('Training labels shape:', str(Y_train.shape))

Y_val = labels[len(labels) - 4000:len(labels) - 2000]
print('Val shape:', str(Y_val.shape))

Y_test = labels[len(labels) - 2000:]
print('Test shape:', str(Y_test.shape))

Training labels shape: (28410,)
Val shape: (2000,)
Test shape: (2000,)


In [118]:
def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

In [119]:
Y_train = convert_to_one_hot(Y_train.astype(np.int32), 2)
Y_val = convert_to_one_hot(Y_val.astype(np.int32), 2)
Y_test = convert_to_one_hot(Y_test.astype(np.int32), 2)

In [135]:
Y_train

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

## Model Info

For the model, we are going to use LSTMs with a Constrastive Loss Function 
that will also be used to predict whether the two products are the same 

First, we have to convert the titles to embeddings through FastText before feeding into the LSTM.
The embedding part of this model will not be a layer because:
* The fasttext model would be time consuming and annoying to get to work with an embedding layer in Keras
* The fasttext model is not going to be getting its embeddings optimized, so there is really no point in adding it as an embedding layer

In [7]:
def square_distance(vectors):
    x, y = vectors
    return tf.square(x - y)

def euclidean_dist_out_shape(shapes):
    # Both inputs are fed in, so just use one of them and get the first value in the shape
    shape1, shape2 = shapes
    return (shape1[0],)

def siamese_network(input_shape):
    # Defines our inputs
    left_title = Input(input_shape, dtype='float32')
    right_title = Input(input_shape, dtype='float32')
    
    # The LSTM units
    model = tf.keras.Sequential(name='siamese_model')
    model.add(LSTM(units=256, return_sequences=True, name='lstm_1'))
    model.add(Dropout(rate=0.5))
    model.add(LSTM(units=128, return_sequences=True, name='lstm_2'))
    model.add(Dropout(rate=0.5))
    model.add(LSTM(units=128, name='lstm_3'))
    model.add(Dropout(rate=0.6))
    
    # The dense layers
    model.add(Dense(units=1024, activation='elu', name='dense_1'))
    model.add(Dropout(rate=0.6))
    model.add(Dense(units=512, activation='elu', name='dense_2'))
    
    # Forward propagate through the model to generate the encodings
    encoded_left_title = model(left_title)
    encoded_right_title = model(right_title)

    SquareDistanceLayer = Lambda(square_distance)
    distance = SquareDistanceLayer([encoded_left_title, encoded_right_title])
    
    prediction = Dense(units=2, activation='softmax')(distance)
    # Create and return the network
    siamese_net = tf.keras.Model(inputs=[left_title, right_title], outputs=prediction, name='siamese_network')
    return siamese_net

In [121]:
# Note: for the constrastive loss, because 0 denotes that they are from the same class
# and one denotes they are from a different class, I swaped the (Y) and (1 - Y) terms

def constrastive_loss(y_true, y_pred):
    margin = 2.0
    d = y_pred
    d_sqrt = tf.sqrt(d)
    #tf.print('\nY Pred: ', d, 'Shape: ', tf.shape(d))
    #tf.print('\nY True: ', y_true, 'Shape: ', tf.shape(y_true))
    
    loss = (y_true * d) + ((1 - y_true) * tf.square(tf.maximum(0., margin - d_sqrt)))
    
    #tf.print('\n Constrastive Loss: ', loss, 'Shape: ', tf.shape(loss))
    loss = 0.5 * tf.reduce_mean(loss)
    
    return loss

In [122]:
# Accuracy metric for constrastive loss because values close to 0 are equal and values high are different
# 0.5 is the threshold here
def constrastive_accuracy(y_true, y_pred):
    return tf.reduce_mean(tf.cast(tf.equal(y_true, tf.cast(y_pred < 0.5, y_true.dtype)), y_true.dtype))

In [123]:
def save_model(model, name):
    """
    Saves a model with a particular name
    """
    model.save('models/' + name + '.h5')

In [8]:
model = siamese_network((MAX_LEN, EMBEDDING_SHAPE[0],))
model.summary()

Model: "siamese_network"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 43, 300)]    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 43, 300)]    0                                            
__________________________________________________________________________________________________
siamese_model (Sequential)      (None, 512)          1555968     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 512)          0           siamese_model[0][0]

In [128]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])

In [129]:
# Train the model
model.fit(x=[X_train1, X_train2], y=Y_train, batch_size=128, epochs=80, validation_data=([X_val[0], X_val[1]], Y_val))

Train on 28410 samples, validate on 2000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1c2d9bc9448>

In [130]:
# Test the model
results = model.evaluate([X_test1, X_test2], Y_test, batch_size=16)
print('test loss, test acc: ', results)

test loss, test acc:  [0.3420196931362152, 0.887]


In [9]:
# Set the model's name
model_name = '0.2_Softmax-LSTM-128_batch_80_epochs'

In [None]:
# Save the model
save_model(model, model_name)

## Manual Testing
Converts titles into embeddings arrays and allow the model to make a prediction

In [10]:
model.load_weights('models/' + model_name + '.h5')

In [135]:
title_one = 'ultrabook intel hd graphics 620 dell dvd rw full hd 1920x1080 8gb ram webcam latitude 7480 hdmi premium 14 0 256gb ssd intel core i7 7600u 2 8ghz'
title_two = 'fp reader 256gb ssd usb type c latitude 7480 intel core i7 7600u 2 8ghz ultrabook 14 0 premium dell intel hd graphics 620 home 8gb ram'
#title_one = 'Corsair 16GB ram'
#title_two = 'G Skill 32GB ram'
title_one_arr = np.zeros((1, MAX_LEN, 300))
title_two_arr = np.zeros((1, MAX_LEN, 300))
title_one = remove_stop_words(title_one.lower())
title_two = remove_stop_words(title_two.lower())

for idx, word in enumerate(title_one.split(' ')):
    title_one_arr[0, idx] = fasttext_model[word]
    
for idx, word in enumerate(title_two.split(' ')):
    title_two_arr[0, idx] = fasttext_model[word]

In [136]:
model.predict([title_one_arr, title_two_arr])

array([[9.9936146e-01, 6.3860090e-04]], dtype=float32)