In [13]:
import fasttext
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import os
import random
from itertools import combinations

import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout, Lambda, Concatenate

# Have to download the stopwords
# nltk.download('stopwords')

In [308]:
# Get the fasttext model (we are using the largest one they offer [600B tokens])
fasttext_model = fasttext.load_model('models/crawl-300d-2M-subword.bin')



## Data Processsing and Organization
Here, all we really want to do is prepare the data for training. This is **only** the data from **Gold Standard** This includes:
* Simplifying the original data
* Normalizing the data 
* Balancing the positive and negative examples
* Creating the embedding representations that will actually get fed into the neural network

In [2]:
def remove_stop_words(phrase):
    # Creates the stopwords
    to_stop = stopwords.words('english')
    punctuation = "!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~ "
    for c in punctuation:
        to_stop.append(c)

    to_stop.append('null')
    
    for punc in punctuation:
        phrase = phrase.replace(punc, ' ')
    
    return ' '.join((' '.join([x for x in phrase.split(' ') if x not in to_stop])).split())


In [32]:
# Organizing and normalizing the data
"""
Essentially, we want to only have three attributes for each training example: title_one, title_two, label
For normalization, we are just going to use the nltk stopwords and punctuation
"""

def preprocessing(orig_data):
    """
    Normalizes the data by getting rid of stopwords and punctuation
    """
    
    # The new names of the columns
    column_names = ['title_one', 'title_two', 'label']
    # A new dataframe for the data we are going to be creating
    norm_computers = pd.DataFrame(columns = column_names)
    # Iterate over the original dataframe (I know it is slow and there are probably better ways to do it)
    for row in orig_data.itertuples():
        title_left = remove_stop_words(row.title_left)
        title_right = remove_stop_words(row.title_right)
        
        # Append the newly created row (title_left, title_right, label) to the new dataframe
        norm_computers = norm_computers.append(pd.DataFrame([[title_left, title_right, row.label]], columns=column_names))
        
    return norm_computers
        

In [33]:
def create_simple_data():
    """
    Creates and saves a simpler version of the original data that only contains the the two titles and the label.
    """
    
    # Get the dataset of computer parts
    computers_df = pd.read_json('data/train/computers_train_xlarge_normalized.json.gz',compression='gzip', lines=True)
    norm_computers = preprocessing(computers_df)
    
    # Save the new normalized and simplified data to a CSV file to load later
    norm_computers.to_csv('data/train/computers_train_xlarge_norm_simple.csv', index=False)

In [34]:
# Create and save the data if the simple and normalized data does not exist
if not os.path.exists('data/train/computers_train_xlarge_norm_simple.csv'):
    create_simple_data()

In [None]:
# Load the data
computer_df = pd.read_csv('data/train/computers_train_xlarge_norm_simple.csv')

In [None]:
# See some of the data. There is clearly a separation between the positive and negative examples
computer_df

In [64]:
def create_train_df(df):
    """
    Returns a shuffled dataframe with an equal amount of positive and negative examples
    """
    # Get the positive and negative examples
    pos_df = df.loc[df['label'] == 1]
    neg_df = df.loc[df['label'] == 0]
    
    # Shuffle the data
    pos_df = pos_df.sample(frac=1)
    neg_df = neg_df.sample(frac=1)
    
    # Concatenate the positive and negative examples and 
    # make sure there are only as many negative examples as positive examples
    final_df = pd.concat([pos_df[:min(len(pos_df), len(neg_df))], neg_df[:min(len(pos_df), len(neg_df))]])
    
    # Shuffle the final data once again
    final_df.sample(frac=1)
    return final_df

In [36]:
# Create and save the dataframe with equal numbers of positive and negative examples
# and is shuffled
if not os.path.exists('data/train/computers_train_bal_shuffle.csv'):
    create_train_df(computer_df).to_csv('data/train/computers_train_bal_shuffle.csv', index=False)

In [75]:
final_computer_df = pd.read_csv('data/train/computers_train_bal_shuffle.csv')

In [76]:
final_computer_df

Unnamed: 0,title_one,title_two,label
0,corsair carbide air 240 windowed,corsair carbide series air 240 cube micro atx ...,1
1,a8 7670k black edition quad core amd cpu fan h...,amd a8 7650k 3 3ghz pccomponentes,1
2,amazonbasics 13 3 inch laptop sleeve black acc...,amazonbasics 13 3 inch laptop sleeve black car...,1
3,eg0146fartr hp 146 gb 6g 10k 2 5 dp sas hdd ne...,eg0146fartr hp 146 gb 6g 10k 2 5 dp sas hdd,1
4,usb 3 0 external adapter cable 2 5 inch hard d...,transcend ssd370 solid state drive ssd 2 5 sat...,0
...,...,...,...
19375,356816 001 ml350t g4p xeon 3 2 2mb 512mb whole...,409159 b21 hp xeon e5345 2 33ghz dl160 g3 new ...,0
19376,buy online samsung 750 evo series 120gb ssd mz...,ssd 750 basic 120 gb tradineur com,1
19377,628061 s21 hp g8 g9 3 tb 6g 7 2k 5 sata sc new...,628061 s21 hp g8 g9 3 tb 6g 7 2k 5 sata sc new...,1
19378,buy online zotac gtx 1060 6gb amp edition grap...,msi nvidia geforce gtx 1080 8gb gaming x rgb g...,0


## Laptop Data Preprocessing
* Normalize the data
* Create negative examples that represent when only a couple of attributes of the laptop data changes

In [4]:
# Load the laptop data
laptop_df = pd.read_csv('data/train/laptops.csv', encoding='latin-1')

In [5]:
laptop_df

Unnamed: 0.1,Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.00
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1316,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,638.00
1299,1317,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,1499.00
1300,1318,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,229.00
1301,1319,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,764.00


In [6]:
# This class will be used in order to exchange the different attributes
# to create negative examples
class Attributes():
    company = {'Apple'}
    product = {'MacBook Pro'}
    inches = {'13.3'}
    cpu = {'Intel Core i5 2.3GHz'}
    ram = {'4GB'}
    memory = {'256GB SSD'}
    gpu = {'Intel HD Graphics 520'}
    screen = {'1440x900'}
    
    def get_all_data():
        return {
            'company': Attributes.company,
            'product': Attributes.product,
            'inches': Attributes.inches,
            'cpu': Attributes.cpu,
            'ram': Attributes.ram,
            'memory': Attributes.memory,
            'gpu': Attributes.gpu,
            'screen': Attributes.screen
        }

In [7]:
# Create attribute sets
def create_attribute_sets(df):
    Attributes.company.update([row.Company for row in laptop_df[['Company']].itertuples()])
    Attributes.product.update([row.Product for row in laptop_df[['Product']].itertuples()])
    Attributes.inches.update([str(row.Inches) for row in laptop_df[['Inches']].itertuples()])
    Attributes.cpu.update([row.Cpu for row in laptop_df[['Cpu']].itertuples()])
    Attributes.ram.update([row.Ram for row in laptop_df[['Ram']].itertuples()])
    Attributes.memory.update([row.Memory for row in laptop_df[['Memory']].itertuples()])
    Attributes.gpu.update([row.Gpu for row in laptop_df[['Gpu']].itertuples()])
    Attributes.screen.update([row.ScreenResolution for row in laptop_df[['ScreenResolution']].itertuples()])

create_attribute_sets(laptop_df)

In [62]:
def concatenate_row(row):
    # Note: got rid of everything after the '(' because it has info about the actual specs of the laptop
    # so if we change the specs, we need to fix that too
    
    # Special tags at the end of the amount of inches of the laptop and the RAM to simulate real data
    inch_attr = str(row['Inches']) + random.choice([' inch', '', '"'])
    ram_attr = row['Ram'] + random.choice([' ram', ' memory', ''])
    
    # These are words that commonly come up with laptops
    modifiers = ['premium', 'new', 'fast', 'latest model']
    add_ins = ['USB 3.0', 'USB 3.1 Type-C', 'USB Type-C', 'Bluetooth', 'WIFI', 'Webcam', 'FP Reader',
               'HDMI', '802.11ac', '802.11 ac', 'home', 'flagship', 'business', 'GbE LAN', 'DVD-RW', 'DVD']
    
    cpu_attr = row['Cpu']
    if random.choice([0, 1]):
        cpu_attr = cpu_attr.split(' ')
        if random.choice([0, 1]):
            if 'Intel' in cpu_attr:
                cpu_attr.remove('Intel')
        if random.choice([0, 1]):
            if 'Core' in cpu_attr:
                cpu_attr.remove('Core')
        if random.choice([0, 1]):
            if 'AMD' in cpu_attr:
                cpu_attr.remove('AMD')
    
        cpu_attr = ' '.join(cpu_attr)

    # Create a list for all the product attributes
    order_attrs = [random.choice(modifiers),
                   row['Company'],
                   row['Product'].split('(')[0],
                   row['TypeName'],
                   inch_attr,
                   row['ScreenResolution'],
                   cpu_attr,
                   ram_attr,
                   row['Memory'],
                   row['Gpu']]
    
    order_attrs = order_attrs + random.sample(add_ins, 3)
    
    # Shuffle the data because in real data, it does not really matter what order the attributes are in
    random.shuffle(order_attrs)
    
    return ' '.join(order_attrs)

In [63]:
def print_dataframe(df):
    for idx in range(len(df)):
        print(df.iloc[idx].title_one + '\n' + df.iloc[idx].title_two)
        print('________________________________________________________________')

In [64]:
# Creates the negative examples for the laptop data
# The laptop_df is the original data, the new_df is the dataframe to append the new data to
# and the attributes are the attributes to swap for the new data
def create_neg_laptop_data(laptop_df, attributes):
    new_column_names = ['title_one', 'title_two', 'label']
    negative_df = pd.DataFrame(columns = new_column_names)
    for row in range(len(laptop_df)):
        # Create a copy of the row for the negative example
        neg_row = laptop_df.iloc[row]
        for attribute_class in attributes:
            # Get the row in the laptop_data
            orig_row = laptop_df.iloc[row]
            
            # Get the attribute that we are trying to change
            attribute_val = orig_row[attribute_class]
            
            # Temporarily value for the new value
            new_val = attribute_val
            
            # Make sure we really get a new attribute
            while new_val == attribute_val:
                new_val = random.sample(Attributes.get_all_data()[attribute_class.lower()], 1)[0]
            
            # Change the value in the neg_row to the new value
            neg_row[attribute_class] = new_val
            
            # Concatenate and normalize the data
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(neg_row).lower())
            
            # Append the data to the new df
            negative_df = negative_df.append(pd.DataFrame([[title_one, title_two, 0]], columns=new_column_names))
    
    return negative_df

In [65]:
neg_df = create_neg_laptop_data(laptop_df, attributes=['Cpu', 'Memory', 'Ram', 'Inches', 'Product'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [66]:
print_dataframe(neg_df)

webcam 13 3" 802 11ac wifi 8gb fast apple ips panel retina display 2560x1600 intel iris plus graphics 640 ultrabook macbook pro 128gb ssd i5 2 3ghz
ultrabook intel iris plus graphics 640 apple gbe lan webcam flagship intel celeron dual n3350 2ghz macbook pro 13 3" latest model 128gb ssd ips panel retina display 2560x1600 8gb ram
________________________________________________________________
intel iris plus graphics 640 macbook pro 8gb ultrabook 802 11 ac dvd rw 13 3 latest model apple ips panel retina display 2560x1600 i5 2 3ghz home 128gb ssd
macbook pro gbe lan ultrabook 32gb flash storage webcam intel iris plus graphics 640 apple ips panel retina display 2560x1600 celeron dual core n3350 2ghz usb 3 1 type c 13 3 inch 8gb ram new
________________________________________________________________
macbook pro new dvd rw business 13 3 ips panel retina display 2560x1600 intel iris plus graphics 640 apple 128gb ssd bluetooth ultrabook 8gb ram intel core i5 2 3ghz
ips panel retina display 

notebook new 15 6 inch 128gb ssd 1tb hdd full hd 1920x1080 usb 3 0 nvidia geforce 940mx webcam intel xeon e3 1535m v5 2 9ghz asus vivobook s15 hdmi 24gb memory
________________________________________________________________
8gb memory 15 6 inch 256gb ssd notebook full hd 1920x1080 latest model asus i7 8550u 1 8ghz fp reader nvidia geforce 940mx usb 3 1 type c hdmi vivobook s15
usb 3 0 full hd 1920x1080 intel xeon e3 1535m v5 2 9ghz nvidia geforce 940mx 15 6" new asus 128gb ssd 1tb hdd 24gb memory notebook vivobook s15 dvd rw bluetooth
________________________________________________________________
8gb full hd 1920x1080 vivobook s15 asus dvd rw notebook wifi 15 6" new hdmi 256gb ssd intel core i7 8550u 1 8ghz nvidia geforce 940mx
15 6 24gb ram fp reader premium full hd 1920x1080 notebook probook 470 bluetooth 128gb ssd 1tb hdd home asus intel xeon e3 1535m v5 2 9ghz nvidia geforce 940mx
________________________________________________________________
ips panel full hd touchscreen 1920

intel xeon e3 1535m v5 2 9ghz 13 3 inch business inspiron 5379 dell 32gb flash storage full hd touchscreen 1920x1080 16gb memory 2 1 convertible dvd usb type c intel uhd graphics 620 premium
________________________________________________________________
802 11 ac 13 3" intel uhd graphics 620 512gb ssd 16gb memory core i7 8550u 1 8ghz dell inspiron 5379 usb type c wifi new full hd touchscreen 1920x1080 2 1 convertible
6gb ram webcam gbe lan intel uhd graphics 620 inspiron 5379 dell 2 1 convertible 32gb flash storage intel xeon e3 1535m v5 2 9ghz wifi full hd touchscreen 1920x1080 premium 13 3 inch
________________________________________________________________
full hd touchscreen 1920x1080 flagship dell intel uhd graphics 620 13 3 16gb new 2 1 convertible intel i7 8550u 1 8ghz inspiron 5379 wifi home 512gb ssd
business inspiron 5379 home full hd touchscreen 1920x1080 premium 6gb intel xeon e3 1535m v5 2 9ghz dvd rw 2 1 convertible 17 3 inch dell intel uhd graphics 620 32gb flash stor

ips panel full hd 1366x768 18 4 inch fp reader m3 6y30 0 9ghz usb 3 0 usb 3 1 type c hp 64gb ram notebook intel uhd graphics 620 probook 450 64gb flash storage fast
________________________________________________________________
intel i7 8550u 1 8ghz wifi new notebook usb 3 1 type c hp ips panel full hd 1366x768 1tb hdd 15 6 inch intel uhd graphics 620 bluetooth probook 450 8gb ram
hp 18 4" new usb type c 802 11ac ips panel full hd 1366x768 intel core m3 6y30 0 9ghz 64gb gbe lan 64gb flash storage latitude 7480 notebook intel uhd graphics 620
________________________________________________________________
amd radeon rx 540 usb type c acer 1366x768 15 6" bluetooth amd a12 series 9720p 2 7ghz notebook hdmi aspire 5 8gb ram new 256gb ssd
intel core 6y75 1 2ghz 15 6 usb type c 1366x768 fp reader acer 8gb ram notebook dvd aspire 5 256gb ssd latest model amd radeon rx 540
________________________________________________________________
amd radeon rx 540 amd a12 series 9720p 2 7ghz 15 6 136

home intel i7 6820hk 2 7ghz nvidia geforce gtx 1070 hp 8gb memory 12 0" 32gb flash storage gaming new aspire f5 573g 510l full hd 1920x1080 bluetooth usb 3 0
________________________________________________________________
intel celeron dual core n3350 1 1ghz 32gb ssd flexbook edge fast 2 1 convertible 11 6 inch hdmi 4gb memory webcam intel hd graphics 500 ips panel full hd touchscreen 1920x1080 mediacom 802 11 ac
dvd 2 1 convertible wifi flexbook edge amd fx 9830p 3ghz ips panel full hd touchscreen 1920x1080 mediacom 11 6 inch 4gb memory business fast 32gb ssd intel hd graphics 500
________________________________________________________________
mediacom intel celeron dual core n3350 1 1ghz 2 1 convertible webcam ips panel full hd touchscreen 1920x1080 intel hd graphics 500 business 11 6 inch 32gb ssd hdmi premium flexbook edge 4gb memory
2 1 convertible mediacom 512gb ssd 2tb hdd dvd premium 4gb 802 11ac ips panel full hd touchscreen 1920x1080 11 6 bluetooth amd fx 9830p 3ghz flexboo

dell fast amd radeon r5 m430 500gb hdd intel core i5 7200u 2 5ghz 4gb ram notebook 15 6" 802 11ac full hd 1920x1080 dvd rw usb type c inspiron 3567
new amd e series 9000e 1 5ghz business 8gb ram gbe lan rog g701vo full hd 1920x1080 2tb hdd home amd radeon r5 m430 notebook dell 15 4"
________________________________________________________________
usb type c full hd 1920x1080 i5 7440hq 2 8ghz 8gb intel hd graphics 620 dell new notebook flagship 15 6" usb 3 0 256gb ssd latitude 5580
wifi latitude 5580 8gb memory full hd 1920x1080 intel hd graphics 620 premium hdmi notebook 15 6 256gb ssd dell intel xeon e3 1535m v6 3 1ghz 802 11 ac
________________________________________________________________
premium business full hd 1920x1080 usb 3 0 8gb ram dell 15 6 802 11ac 256gb ssd intel hd graphics 620 latitude 5580 notebook intel core i5 7440hq 2 8ghz
512gb ssd latitude 5580 dvd rw dell latest model notebook xeon e3 1535m v6 3 1ghz 8gb hdmi flagship 15 6 inch full hd 1920x1080 intel hd graphic

fast home i7 7660u 2 5ghz 256gb ssd 1tb hdd 8gb memory 15 6 inch usb type c 1366x768 fp reader x541na go414t intel hd graphics 500 notebook asus
________________________________________________________________
1tb hdd asus x541na go414t wifi 8gb 15 6 inch intel celeron dual core n3350 1 1ghz 1366x768 latest model notebook intel hd graphics 500 dvd home
notebook 15 6" fast usb type c 1366x768 gbe lan 4gb memory x541na go414t 256gb ssd 1tb hdd intel hd graphics 500 asus fp reader intel core i7 7660u 2 5ghz
________________________________________________________________
8gb gbe lan flagship intel hd graphics 500 fast 1tb hdd x541na go414t fp reader 15 6 notebook 1366x768 intel celeron dual n3350 1 1ghz asus
notebook 4gb x541na go414t intel core i7 7660u 2 5ghz usb 3 0 256gb ssd 1tb hdd dvd rw 11 6" asus intel hd graphics 500 business 1366x768 latest model
________________________________________________________________
x541na go414t 15 6 usb 3 0 fast notebook 1366x768 dvd rw intel hd gra

32gb ram dell 256gb ssd 256gb ssd fp reader inspiron 7560 nvidia geforce 940mx 13 9 inch flagship amd fx 8800p 2 1ghz full hd 1920x1080 notebook 802 11ac new
________________________________________________________________
premium bluetooth 802 11ac nvidia geforce 940mx flagship dell full hd 1920x1080 core i7 7500u 2 7ghz inspiron 7560 notebook 128gb ssd 1tb hdd 15 6" 8gb
full hd 1920x1080 business usb 3 1 type c dell gt62vr 6rd nvidia geforce 940mx 32gb new 256gb ssd 256gb ssd amd fx 8800p 2 1ghz notebook 13 9 802 11 ac
________________________________________________________________
dell intel hd graphics 520 gbe lan 8gb vostro 3568 15 6 hdmi notebook 1366x768 dvd rw 256gb ssd core i3 6006u 2ghz latest model
flagship 15 6 notebook home hdmi 8gb memory intel core i5 3 1ghz fast 1366x768 intel hd graphics 520 vostro 3568 256gb ssd dell
________________________________________________________________
webcam vostro 3568 1366x768 dell 802 11 ac latest model intel hd graphics 520 core i3 6

intel i7 2 2ghz intel hd graphics 520 elitebook 840 4gb hp ultrabook 64gb flash storage 14 0 inch dvd full hd 1920x1080 dvd rw webcam fast
________________________________________________________________
intel hd graphics 520 flagship elitebook 840 full hd 1920x1080 ultrabook gbe lan 512gb ssd hp 14 0 intel core i7 6500u 2 5ghz bluetooth premium 8gb memory
64gb flash storage intel hd graphics 520 18 4 inch fp reader hp 802 11ac 4gb memory intel core i7 2 2ghz full hd 1920x1080 ultrabook premium elitebook 840 home
________________________________________________________________
premium fp reader elitebook 840 ultrabook wifi full hd 1920x1080 intel hd graphics 520 14 0 inch 8gb ram intel core i7 6500u 2 5ghz usb 3 1 type c hp 512gb ssd
core i7 2 2ghz ultrabook latest model 18 4" hp gbe lan omen 17 w006na flagship intel hd graphics 520 64gb flash storage full hd 1920x1080 4gb webcam
________________________________________________________________
full hd 1920x1080 vostro 3568 home 802 11a

1600x900 dvd rw thinkpad e480 fast gbe lan bluetooth dell 256gb ssd 256gb ssd notebook 24gb ram amd radeon r7 m445 12 5 inch intel core i3 6006u 2 2ghz
________________________________________________________________
1366x768 chromebook 14 hdmi 802 11ac 14 0 intel celeron dual core n3060 1 6ghz acer 32gb flash storage notebook intel hd graphics 400 4gb memory new bluetooth
14 0 intel hd graphics 400 4gb ram chromebook 14 wifi acer usb type c fast notebook intel pentium dual core n4200 1 1ghz 32gb flash storage 1366x768 bluetooth
________________________________________________________________
usb 3 0 chromebook 14 notebook 14 0" intel celeron dual core n3060 1 6ghz 1366x768 32gb flash storage 4gb intel hd graphics 400 webcam acer new usb 3 1 type c
4gb ram pentium dual core n4200 1 1ghz wifi chromebook 14 usb 3 0 acer 1366x768 notebook 256gb ssd 1tb hdd 14 0 intel hd graphics 400 new webcam
________________________________________________________________
acer intel hd graphics 400 inte

intel hd graphics 620 premium business hdmi notebook flagship intel celeron dual n3060 1 6ghz 11 6" 1366x768 16gb ssd 16gb 250 g5 hp
________________________________________________________________
usb 3 1 type c intel core i5 7200u 2 5ghz fast hp notebook 1366x768 250 g5 flagship business 500gb hdd 15 6 inch 4gb memory intel hd graphics 620
latest model notebook 802 11ac webcam 16gb memory 1366x768 hp intel celeron dual n3060 1 6ghz 16gb ssd 11 6 intel hd graphics 620 bluetooth 15 bs053od
________________________________________________________________
hdmi 1366x768 usb type c notebook amd radeon r5 aspire es1 523 15 6 8gb ram 1tb hdd fast wifi amd a8 series 7410 2 2ghz acer
aspire es1 523 business 8gb ram notebook 1366x768 usb 3 0 new acer amd radeon r5 15 6" fp reader intel core i7 2 9ghz 1tb hdd
________________________________________________________________
15 6 notebook amd a8 series 7410 2 2ghz 8gb memory home gbe lan usb 3 1 type c aspire es1 523 amd radeon r5 1366x768 acer pr

intel hd graphics 520 bluetooth 250 g4 hp dvd rw 1366x768 premium intel core i7 7700hq 2 8ghz 13 9 business 32gb flash storage notebook 8gb memory
________________________________________________________________
hp notebook usb type c 250 g4 wifi 1366x768 intel core i5 6200u 2 3ghz intel hd graphics 520 new 4gb 802 11 ac 15 6" 500gb hdd
1366x768 home gp62 7rdx fp reader 32gb flash storage 8gb ram notebook intel hd graphics 520 latest model 13 9 dvd hp intel core i7 7700hq 2 8ghz
________________________________________________________________
15 6" 128gb ssd 1tb hdd intel core i7 7700hq 2 8ghz gaming inspiron 7567 nvidia geforce gtx 1050 ti fast bluetooth usb 3 1 type c dell 4k ultra hd 3840x2160 8gb wifi
15 6" latest model gaming 128gb ssd 1tb hdd dvd gbe lan intel core i5 7500u 2 7ghz 4k ultra hd 3840x2160 dell flagship nvidia geforce gtx 1050 ti inspiron 7567 8gb ram
________________________________________________________________
core i7 7700hq 2 8ghz 15 6 premium gbe lan dvd gamin

webcam netbook fp reader latest model intel hd graphics 400 hp dvd 11 6 inch 2gb core i7 6560u 2 2ghz stream 11 y000na 32gb flash storage 1366x768
________________________________________________________________
intel celeron dual core n3060 1 6ghz 2gb netbook 11 6 inch stream 11 y000na intel hd graphics 400 latest model 802 11ac 1366x768 dvd rw business 32gb flash storage hp
business hp 1tb ssd intel core i7 6560u 2 2ghz usb type c netbook intel hd graphics 400 2gb ram 11 6 1366x768 new wifi stream 11 y000na
________________________________________________________________
webcam 32gb flash storage 802 11ac netbook intel hd graphics 400 2gb memory hp stream 11 y000na intel celeron dual n3060 1 6ghz flagship 1366x768 fast 11 6
netbook 11 6" hdmi fast 1366x768 usb type c hp intel core i7 6560u 2 2ghz intel hd graphics 400 webcam 1tb ssd 6gb ram stream 11 y000na
________________________________________________________________
hp 1366x768 2gb ram business intel hd graphics 400 netbook inte

In [67]:
# Creates the postive examples for the laptop data
# The laptop_df is the original data, the new_df is the dataframe to append the new data to
# and the attributes are the attributes to swap or delete for the new data
def create_pos_laptop_data(laptop_df, rm_attrs, add_attrs):
    new_column_names = ['title_one', 'title_two', 'label']
    pos_df = pd.DataFrame(columns = new_column_names)
    for row in range(len(laptop_df)):
        # Remove the attribute from the new title
        for attr_list in rm_attrs:
            # Create a copy of the row for the negative example
            new_row = laptop_df.iloc[row]
            orig_row = laptop_df.iloc[row]
            for attr in attr_list:
                new_row[attr] = ''
        
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(new_row).lower())

            # Occassionally add in the operating system just to switch it up
            if (random.sample([0, 1], 1)):
                for attr in add_attrs:
                    title_two += ' ' + orig_row[attr].lower()

            pos_df = pos_df.append(pd.DataFrame([[title_one, title_two, 1]], columns=new_column_names))

    return pos_df

In [68]:
pos_df = create_pos_laptop_data(laptop_df, rm_attrs = [['Company'], ['TypeName'], ['ScreenResolution'], ['Product'], ['TypeName', 'ScreenResolution']], add_attrs=['OpSys'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [70]:
print_dataframe(pos_df)

intel core i5 2 3ghz intel iris plus graphics 640 apple hdmi 13 3 inch premium business ultrabook 8gb ips panel retina display 2560x1600 128gb ssd macbook pro usb 3 1 type c
13 3 8gb ips panel retina display 2560x1600 intel iris plus graphics 640 wifi ultrabook i5 2 3ghz latest model fp reader dvd 128gb ssd macbook pro macos
________________________________________________________________
gbe lan 13 3 inch ultrabook business dvd rw 8gb apple macbook pro intel iris plus graphics 640 intel core i5 2 3ghz ips panel retina display 2560x1600 premium 128gb ssd
flagship 13 3 128gb ssd 8gb ram 802 11ac macbook pro usb type c premium apple ips panel retina display 2560x1600 intel core i5 2 3ghz intel iris plus graphics 640 macos
________________________________________________________________
intel iris plus graphics 640 apple macbook pro 8gb dvd fast ultrabook 128gb ssd 13 3 business 802 11 ac intel core i5 2 3ghz ips panel retina display 2560x1600
8gb memory new apple dvd rw intel iris plus g

8gb nvidia geforce 940mx new 15 6 inch fp reader intel core i7 8550u 1 8ghz asus home full hd 1920x1080 256gb ssd business vivobook s15 notebook
256gb ssd usb type c notebook vivobook s15 nvidia geforce 940mx 8gb ram asus intel core i7 8550u 1 8ghz usb 3 1 type c 15 6 gbe lan latest model windows 10
________________________________________________________________
15 6" bluetooth intel core i7 8550u 1 8ghz flagship usb 3 0 vivobook s15 asus full hd 1920x1080 nvidia geforce 940mx 256gb ssd latest model 8gb memory notebook
latest model notebook nvidia geforce 940mx wifi 256gb ssd intel core i7 8550u 1 8ghz usb 3 0 asus full hd 1920x1080 8gb memory hdmi 15 6 inch windows 10
________________________________________________________________
256gb ssd business 802 11 ac dvd vivobook s15 intel i7 8550u 1 8ghz 15 6 full hd 1920x1080 asus 8gb ram nvidia geforce 940mx notebook new
latest model nvidia geforce 940mx 256gb ssd 15 6" hdmi vivobook s15 asus webcam intel core i7 8550u 1 8ghz 8gb memory 

________________________________________________________________
2 1 convertible 16gb ram home full hd touchscreen 1920x1080 dvd rw new 13 3 inspiron 5379 dell usb 3 0 intel i7 8550u 1 8ghz 512gb ssd intel uhd graphics 620
2 1 convertible home inspiron 5379 13 3" intel core i7 8550u 1 8ghz 512gb ssd new 16gb memory intel uhd graphics 620 dell hdmi dvd windows 10
________________________________________________________________
inspiron 5379 full hd touchscreen 1920x1080 16gb ram 512gb ssd intel uhd graphics 620 premium fp reader dell i7 8550u 1 8ghz 2 1 convertible usb 3 0 home 13 3
full hd touchscreen 1920x1080 dvd rw dell gbe lan intel uhd graphics 620 16gb ram 802 11 ac 512gb ssd 13 3 inch latest model intel core i7 8550u 1 8ghz 2 1 convertible windows 10
________________________________________________________________
dvd premium inspiron 5379 dell 802 11 ac 16gb 13 3 inch full hd touchscreen 1920x1080 2 1 convertible intel uhd graphics 620 512gb ssd wifi intel core i7 8550u 1 8ghz


intel core i7 7500u 2 7ghz latest model elitebook 840 14 0" hp flagship intel hd graphics 620 home notebook 8gb gbe lan 256gb ssd windows 10
________________________________________________________________
hp new elitebook 840 intel hd graphics 620 full hd 1920x1080 home 14 0 inch 8gb ram intel core i7 7500u 2 7ghz 256gb ssd hdmi notebook gbe lan
intel hd graphics 620 usb 3 1 type c latest model 8gb intel i7 7500u 2 7ghz 14 0 inch bluetooth business hp full hd 1920x1080 notebook 256gb ssd windows 10
________________________________________________________________
256gb ssd intel core i7 7500u 2 7ghz intel hd graphics 620 elitebook 840 8gb bluetooth usb 3 0 14 0" notebook full hd 1920x1080 802 11 ac hp fast
14 0 inch 8gb memory home premium elitebook 840 intel core i7 7500u 2 7ghz hp dvd 256gb ssd business intel hd graphics 620 windows 10
________________________________________________________________
1tb hdd business lenovo 1600x900 dvd 802 11 ac nvidia geforce 920mx notebook ideapad 

dvd gbe lan x542uq dm117 802 11 ac asus 15 6 fast 1tb hdd notebook intel i3 7100u 2 4ghz 8gb ram nvidia geforce 940mx linux
________________________________________________________________
asus 15 6 intel core i3 7100u 2 4ghz dvd 8gb memory notebook x542uq dm117 new full hd 1920x1080 gbe lan nvidia geforce 940mx hdmi 1tb hdd
notebook bluetooth asus 8gb memory flagship 15 6 intel core i3 7100u 2 4ghz nvidia geforce 940mx business 1tb hdd latest model full hd 1920x1080 linux
________________________________________________________________
asus nvidia geforce 940mx x542uq dm117 8gb ram home full hd 1920x1080 1tb hdd intel core i3 7100u 2 4ghz notebook 15 6 inch latest model business fp reader
fp reader 8gb x542uq dm117 asus fast webcam intel core i3 7100u 2 4ghz nvidia geforce 940mx 15 6" wifi 1tb hdd linux
________________________________________________________________
16gb usb type c gaming home bluetooth intel core i7 7820hk 2 9ghz alienware 17 256gb ssd 1tb hdd ips panel 2560x1440 de

________________________________________________________________
15 6" dell premium full hd 1920x1080 256gb ssd notebook 8gb memory intel hd graphics 620 business gbe lan intel i5 7440hq 2 8ghz dvd latitude 5580
flagship dvd rw full hd 1920x1080 home premium 15 6 intel core i5 7440hq 2 8ghz 256gb ssd 8gb ram intel hd graphics 620 dell notebook windows 10
________________________________________________________________
full hd 1920x1080 usb 3 1 type c intel core i5 7440hq 2 8ghz intel hd graphics 620 8gb latitude 5580 home 802 11 ac notebook dell new 15 6 inch 256gb ssd
8gb ram bluetooth dell intel i5 7440hq 2 8ghz 256gb ssd flagship hdmi 15 6 inch premium intel hd graphics 620 latitude 5580 windows 10
________________________________________________________________
gaming home alienware 17 latest model 16gb memory intel core i7 7700hq 2 8ghz dell nvidia geforce gtx 1070 usb type c 128gb ssd 1tb hdd 17 3 inch ips panel full hd 1920x1080 gbe lan
ips panel full hd 1920x1080 128gb ssd 1tb 

15 6 inch 1tb hdd usb type c full hd 1920x1080 4gb ram usb 3 0 notebook home nvidia geforce 920mx core i3 6006u 2ghz fast lenovo windows 10
________________________________________________________________
4gb gbe lan nvidia geforce 920mx intel core i3 6006u 2ghz flagship 1tb hdd full hd 1920x1080 ideapad 320 15isk notebook latest model webcam 15 6" lenovo
4gb memory fast bluetooth dvd ideapad 320 15isk 15 6 inch 1tb hdd lenovo intel core i3 6006u 2ghz wifi nvidia geforce 920mx windows 10
________________________________________________________________
15 6 intel hd graphics 500 x541na go414t 1tb hdd premium usb type c flagship intel celeron dual core n3350 1 1ghz 1366x768 asus notebook dvd rw 8gb memory
flagship x541na go414t 15 6 new 1tb hdd webcam 8gb notebook intel hd graphics 500 1366x768 intel celeron dual core n3350 1 1ghz wifi windows 10
________________________________________________________________
802 11ac usb type c asus 8gb x541na go414t 15 6" intel hd graphics 500 1366x76

usb 3 0 wifi 128gb ssd 1tb hdd new 15 6 inch inspiron 7560 notebook intel i7 7500u 2 7ghz nvidia geforce 940mx home dell 8gb memory windows 10
________________________________________________________________
notebook 15 6 intel i7 7500u 2 7ghz dell dvd rw nvidia geforce 940mx 8gb memory wifi full hd 1920x1080 latest model inspiron 7560 gbe lan 128gb ssd 1tb hdd
15 6 dvd nvidia geforce 940mx premium 8gb dell webcam intel core i7 7500u 2 7ghz 128gb ssd 1tb hdd full hd 1920x1080 notebook 802 11 ac windows 10
________________________________________________________________
15 6" 8gb memory intel core i7 7500u 2 7ghz nvidia geforce 940mx premium 128gb ssd 1tb hdd wifi notebook bluetooth full hd 1920x1080 inspiron 7560 dell usb 3 1 type c
premium 8gb 15 6 dell gbe lan 128gb ssd 1tb hdd nvidia geforce 940mx inspiron 7560 bluetooth intel core i7 7500u 2 7ghz dvd windows 10
________________________________________________________________
hdmi 8gb memory vostro 3568 webcam notebook 15 6" new 256

128gb ssd 1tb hdd dvd rw gaming 15 6 16gb ram intel core i7 7700hq 2 8ghz premium flagship nvidia geforce gtx 1060 full hd 1920x1080 home rog gl502vm ds74 windows 10
________________________________________________________________
gaming bluetooth nvidia geforce gtx 1060 128gb ssd 1tb hdd fast full hd 1920x1080 16gb intel i7 7700hq 2 8ghz rog gl502vm ds74 asus 802 11 ac 15 6" usb 3 0
128gb ssd 1tb hdd 802 11 ac full hd 1920x1080 latest model nvidia geforce gtx 1060 intel core i7 7700hq 2 8ghz 16gb ram fp reader asus 15 6 rog gl502vm ds74 dvd windows 10
________________________________________________________________
802 11ac full hd 1920x1080 802 11 ac rog gl502vm ds74 16gb ram dvd rw asus 15 6" gaming intel core i7 7700hq 2 8ghz 128gb ssd 1tb hdd fast nvidia geforce gtx 1060
16gb ram 15 6" gaming premium intel core i7 7700hq 2 8ghz asus nvidia geforce gtx 1060 gbe lan flagship dvd rw rog gl502vm ds74 128gb ssd 1tb hdd windows 10
________________________________________________________

ultrabook 256gb ssd webcam 13 3 inch quad hd 3200x1800 flagship wifi premium 8gb intel hd graphics 520 intel core i7 6500u 2 5ghz dell windows 10
________________________________________________________________
xps 13 256gb ssd dell 8gb ultrabook usb 3 1 type c intel i7 6500u 2 5ghz wifi quad hd 3200x1800 latest model 13 3" dvd intel hd graphics 520
8gb ram intel hd graphics 520 256gb ssd usb 3 0 802 11 ac 13 3" dell home premium xps 13 intel core i7 6500u 2 5ghz windows 10
________________________________________________________________
webcam 4gb 2 1 convertible intel hd graphics 400 32gb flash storage ips panel touchscreen 1366x768 chromebook c738t c2ej premium intel celeron dual core n3060 1 6ghz flagship acer fp reader 11 6 inch
bluetooth chromebook c738t c2ej 4gb memory celeron dual n3060 1 6ghz ips panel touchscreen 1366x768 11 6 business 2 1 convertible fast 32gb flash storage usb 3 0 intel hd graphics 400 chrome os
______________________________________________________________

intel hd graphics 620 usb 3 1 type c intel core i7 7500u 2 7ghz 256gb ssd webcam dell usb type c premium 13 3" ultrabook xps 13 8gb memory windows 10
________________________________________________________________
13 3" 256gb ssd intel hd graphics 620 dell full hd 1920x1080 premium intel core i7 7500u 2 7ghz xps 13 dvd home 8gb fp reader ultrabook
intel i7 7500u 2 7ghz dvd 13 3 inch home full hd 1920x1080 wifi 8gb ultrabook dell 256gb ssd latest model intel hd graphics 620 windows 10
________________________________________________________________
xps 13 13 3 inch ultrabook dell 802 11 ac usb 3 0 256gb ssd new 8gb memory home intel i7 7500u 2 7ghz intel hd graphics 620 full hd 1920x1080
business flagship latest model 8gb ram usb 3 1 type c dell 13 3 inch xps 13 intel core i7 7500u 2 7ghz 256gb ssd intel hd graphics 620 windows 10
________________________________________________________________
nvidia geforce gtx 980 1tb ssd ips panel full hd 1920x1080 flagship asus 64gb ram new gbe la

nvidia geforce 920mx intel core i3 6006u 2ghz bluetooth lenovo premium hdmi 320 15isk usb 3 1 type c notebook 15 6 inch 4gb memory 1tb hdd windows 10
________________________________________________________________
dvd 320 15isk core i3 6006u 2ghz hdmi 802 11ac nvidia geforce 920mx lenovo full hd 1920x1080 15 6 notebook 1tb hdd latest model 4gb ram
15 6" full hd 1920x1080 fast lenovo webcam notebook intel core i3 6006u 2ghz nvidia geforce 920mx 1tb hdd 4gb dvd rw business windows 10
________________________________________________________________
lenovo 15 6 inch 802 11ac full hd 1920x1080 320 15isk nvidia geforce 920mx 1tb hdd notebook intel core i3 6006u 2ghz gbe lan usb 3 0 4gb memory fast
nvidia geforce 920mx 320 15isk intel i3 6006u 2ghz 15 6 inch usb 3 1 type c 1tb hdd premium usb type c 4gb ram lenovo 802 11 ac windows 10
________________________________________________________________
14 0 wifi 2gb ram dvd hp stream 14 ax000nv latest model 32gb flash storage 802 11 ac notebook 

512gb ssd intel hd graphics 515 intel core 6y30 0 9ghz ultrabook 8gb ram gbe lan flagship premium zenbook ux305ca ubm1 dvd ips panel full hd 1920x1080 13 3" windows 10
________________________________________________________________
zenbook ux305ca ubm1 dvd ultrabook fast fp reader ips panel full hd 1920x1080 core 6y30 0 9ghz 13 3" 512gb ssd usb 3 1 type c asus intel hd graphics 515 8gb
ips panel full hd 1920x1080 home 13 3" premium intel core 6y30 0 9ghz intel hd graphics 515 512gb ssd gbe lan 8gb memory asus hdmi zenbook ux305ca ubm1 windows 10
________________________________________________________________
gbe lan hdmi 8gb memory wifi premium 512gb ssd ips panel full hd 1920x1080 asus intel core 6y30 0 9ghz intel hd graphics 515 zenbook ux305ca ubm1 ultrabook 13 3
intel 6y30 0 9ghz bluetooth 8gb ram 512gb ssd hdmi asus dvd intel hd graphics 515 13 3" zenbook ux305ca ubm1 latest model ultrabook windows 10
________________________________________________________________
intel hd grap

## Laptop Data Concatenation
Create on dataframe and shuffle the data

In [68]:
def create_laptop_data(pos_df, neg_df):
    pos_df.sample(frac=1)
    neg_df.sample(frac=1)
    final_laptop_df = pd.concat([pos_df[:min(len(pos_df), len(neg_df))], neg_df[:min(len(pos_df), len(neg_df))]])
    final_laptop_df = final_laptop_df.sample(frac=1)
    return final_laptop_df

In [69]:
final_laptop_df = create_laptop_data(pos_df, neg_df)

In [70]:
final_laptop_df

Unnamed: 0,title_one,title_two,label
0,acer chromebook c731 c78g netbook 11 6 inch ip...,acer vivobook max netbook 14 0 inch ips panel ...,0
0,dell vostro 5471 ultrabook 14 0 inch full hd 1...,dell vostro 5471 14 0 inch intel core i5 8250u...,1
0,acer aspire a517 51g notebook 15 6 inch ips pa...,acer aspire a517 51g notebook 15 6 inch intel ...,1
0,asus zenbook ux310uq gl026t ultrabook 13 3 inc...,asus zenbook ux310uq gl026t ultrabook 11 3 inc...,0
0,dell inspiron 7577 gaming 15 6 inch ips panel ...,dell inspiron 7577 gaming 15 6 inch ips panel ...,0
...,...,...,...
0,lenovo ideapad 320 15ikbn notebook 15 6 inch f...,lenovo notebook 15 6 inch full hd 1920x1080 in...,1
0,dell vostro 5568 notebook 15 6 inch full hd 19...,dell vostro 5568 notebook 15 6 inch full hd 19...,0
0,dell latitude 3570 notebook 15 6 inch 1366x768...,dell latitude 3570 notebook 15 6 inch 1366x768...,0
0,dell xps 13 ultrabook 13 3 inch quad hd touchs...,dell xps 13 13 3 inch quad hd touchscreen 3200...,1


## PCPartPicker Data
* Organize the data
* Preprocess the data
* Create negative and positive data

In [72]:
ram_df = pd.read_csv('data/train/pos_ram_titles.csv')
cpu_df = pd.read_csv('data/train/pos_cpu_titles.csv')

In [73]:
ram_df

Unnamed: 0.1,Unnamed: 0,amazon,bestbuy,newegg,walmart,memoryc
0,0,Corsair Vengeance LPX 16GB (2x8GB) DDR4 DRAM 3...,CORSAIR - Vengeance LPX 16GB (2PK x 8GB) 3.2 G...,CORSAIR Vengeance LPX 16GB (2 x 8GB) 288-Pin D...,Corsair CMK16GX4M2B3200C16 Vengeance LPX 16GB ...,16GB Corsair Vengeance LPX PC4-25600 3200MHz D...
1,0,Corsair Vengeance RGB PRO 16GB (2x8GB) DDR4 32...,CORSAIR - Vengeance RGB PRO 16GB (2PK 8GB) 3.2...,CORSAIR Vengeance RGB Pro 16GB (2 x 8GB) 288-P...,,16GB Corsair Vengeance RGB Pro DDR4 3200MHz CL...
2,0,G.Skill RipJaws V Series 16GB (2 x 8GB) 288-Pi...,,G.SKILL Ripjaws V Series 16GB (2 x 8GB) 288-Pi...,,
3,0,Corsair Vengeance RGB Pro 32GB (2x16GB) DDR4 3...,CORSAIR - Vengeance RGB PRO 32GB (2PK 16GB) 3....,CORSAIR Vengeance RGB Pro 32GB (2 x 16GB) 288-...,,32GB Corsair Vengeance Pro RGB DDR4 3200MHz CL...
4,0,,,G.SKILL Trident Z RGB (For AMD) 16GB (2 x 8GB)...,,16GB G.Skill DDR4 TridentZ RGB 3600Mhz PC4-288...
...,...,...,...,...,...,...
218,0,Corsair Vengeance LPX 32GB (4x8GB) DDR4 3600 (...,,CORSAIR Vengeance LPX 32GB (4 x 8GB) 288-Pin D...,,32GB Corsair Vengeance LPX DDR4 3600MHz PC4-28...
219,0,,,,,
220,0,Corsair Vengeance LPX 16GB (2x8GB) DDR4 DRAM 3...,CORSAIR - VENGEANCE LPX Series 16GB (2PK 8GB) ...,CORSAIR Vengeance LPX 16GB (2 x 8GB) 288-Pin D...,,16GB Corsair Vengeance LPX DDR4 3000MHz PC4-24...
221,0,CORSAIR VENGEANCELPX32GB (1x 32GB) DDR43000(PC...,,CORSAIR Vengeance LPX 32GB 288-Pin DDR4 SDRAM ...,,32GB Corsair Vengeance LPX DDR4 3000MHz CL16 M...


In [74]:
cpu_df

Unnamed: 0.1,Unnamed: 0,amazon,bestbuy,newegg,walmart,memoryc,bhphotovideo
0,0,"AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked De...",AMD - Ryzen 5 3600 3rd Generation 6-Core - 12-...,AMD RYZEN 5 3600 6-Core 3.6 GHz (4.2 GHz Max B...,"AMD Ryzen 5 3600 6-Core, 12-Thread 4.2 GHz AM4...",AMD Ryzen 5 3600 AM4 3.6GHZ 32MB CPU Desktop P...,AMD Ryzen 5 3600 3.6 GHz Six-Core AM4 Processor
1,0,"AMD Ryzen 7 3700X 8-Core, 16-Thread Unlocked D...",AMD - Ryzen 7 3700X 3rd Generation 8-Core - 16...,AMD RYZEN 7 3700X 8-Core 3.6 GHz (4.4 GHz Max ...,"AMD Ryzen 7 3700X 8-Core, 16-Thread 4.4 GHz AM...",AMD Ryzen 7 3700x 3.6GHz 32MB AM4 CPU Desktop ...,AMD Ryzen 7 3700X 3.6 GHz Eight-Core AM4 Proce...
2,0,AMD Ryzen 5 2600 Processor with Wraith Stealth...,,,,AMD Ryzen 5 2600 Six-Core 3.4GHz Socket AM4 19...,
3,0,"AMD Ryzen 9 3900X 12-core, 24-thread unlocked ...",AMD - Ryzen 9 3900X 3rd Generation 12-core - 2...,AMD RYZEN 9 3900X 12-Core 3.8 GHz (4.6 GHz Max...,AMD RYZEN 9 3900X 12-Core 3.8 GHz (4.6 GHz Max...,AMD Ryzen 9 3900X 3.8GHz 64MB Desktop Processo...,AMD Ryzen 9 3900X 3.8 GHz 12-Core AM4 Processor
4,0,AMD Ryzen 3 3200G 4-Core Unlocked Desktop Proc...,AMD - Ryzen 3 3200G 3rd Generation 4-Core - 4-...,AMD RYZEN 3 3200G 4-Core 3.6 GHz (4.0 GHz Max ...,,AMD Ryzen 3 AM4 3.6GHZ 4MB Desktop Processor B...,
...,...,...,...,...,...,...,...
499,0,,,,,,
500,0,Intel Xeon E3-1220 V6 Processors BX80677E31220V6,,Intel Xeon E3-1220 V6 Kaby Lake 3.0 GHz (3.5 G...,XEON E3-1220 V6 FC-LGA14C 3G 8MB CACHE BOXED,Intel Xeon E3-1220 V6 3GHz Kaby Lake CPU LGA11...,
501,0,Intel - BX80684E2134 - Intel Xeon E-2134-3.5 G...,,,Intel BX80684E2134 Xeon Quad-core E-2134 3.5GH...,,
502,0,"Intel BX80662E31230V5 XEON E3-1230V5, 3.4 GHZ,...",,,,,


In [75]:
# Drop the Unnamed: 0 column and drop any row where it is all NaN
def remove_misc(df):
    columns = list(df.columns)[1:]
    df = df.drop(columns=['Unnamed: 0'])
    df = df.dropna(how='all')
    print(len(df))
    return df


In [76]:
ram_df = remove_misc(ram_df)
cpu_df = remove_misc(cpu_df)

210
315


In [77]:
ram_df

Unnamed: 0,amazon,bestbuy,newegg,walmart,memoryc
0,Corsair Vengeance LPX 16GB (2x8GB) DDR4 DRAM 3...,CORSAIR - Vengeance LPX 16GB (2PK x 8GB) 3.2 G...,CORSAIR Vengeance LPX 16GB (2 x 8GB) 288-Pin D...,Corsair CMK16GX4M2B3200C16 Vengeance LPX 16GB ...,16GB Corsair Vengeance LPX PC4-25600 3200MHz D...
1,Corsair Vengeance RGB PRO 16GB (2x8GB) DDR4 32...,CORSAIR - Vengeance RGB PRO 16GB (2PK 8GB) 3.2...,CORSAIR Vengeance RGB Pro 16GB (2 x 8GB) 288-P...,,16GB Corsair Vengeance RGB Pro DDR4 3200MHz CL...
2,G.Skill RipJaws V Series 16GB (2 x 8GB) 288-Pi...,,G.SKILL Ripjaws V Series 16GB (2 x 8GB) 288-Pi...,,
3,Corsair Vengeance RGB Pro 32GB (2x16GB) DDR4 3...,CORSAIR - Vengeance RGB PRO 32GB (2PK 16GB) 3....,CORSAIR Vengeance RGB Pro 32GB (2 x 16GB) 288-...,,32GB Corsair Vengeance Pro RGB DDR4 3200MHz CL...
4,,,G.SKILL Trident Z RGB (For AMD) 16GB (2 x 8GB)...,,16GB G.Skill DDR4 TridentZ RGB 3600Mhz PC4-288...
...,...,...,...,...,...
217,Team 8GB T-Force Vulcan Z DDR4 PC4-25600 3200M...,,Team T-FORCE VULCAN Z 8GB 288-Pin DDR4 SDRAM D...,,
218,Corsair Vengeance LPX 32GB (4x8GB) DDR4 3600 (...,,CORSAIR Vengeance LPX 32GB (4 x 8GB) 288-Pin D...,,32GB Corsair Vengeance LPX DDR4 3600MHz PC4-28...
220,Corsair Vengeance LPX 16GB (2x8GB) DDR4 DRAM 3...,CORSAIR - VENGEANCE LPX Series 16GB (2PK 8GB) ...,CORSAIR Vengeance LPX 16GB (2 x 8GB) 288-Pin D...,,16GB Corsair Vengeance LPX DDR4 3000MHz PC4-24...
221,CORSAIR VENGEANCELPX32GB (1x 32GB) DDR43000(PC...,,CORSAIR Vengeance LPX 32GB 288-Pin DDR4 SDRAM ...,,32GB Corsair Vengeance LPX DDR4 3000MHz CL16 M...


In [79]:
def generate_pos_pcpartpicker_data(df):
    pos_df = pd.DataFrame(columns=['title_one', 'title_two', 'label'])
    for idx in range(len(df)):
        row = df.iloc()[idx]
        titles = []
        for col in columns:
            if not pd.isnull(row[col]): titles.append(row[col])
        if len(titles) > 1:
            combs = combinations(titles, 2)
            for comb in combs:
                comb = list(comb)
                comb.append(1)
                pos_df = pos_df.append(pd.DataFrame([comb], columns=['title_one', 'title_two', 'label']))
    
    return pos_df


In [80]:
generate_pos_pcpartpicker_data(ram_df)

Unnamed: 0,title_one,title_two,label
0,Corsair Vengeance LPX 16GB (2x8GB) DDR4 DRAM 3...,CORSAIR - Vengeance LPX 16GB (2PK x 8GB) 3.2 G...,1
0,Corsair Vengeance LPX 16GB (2x8GB) DDR4 DRAM 3...,CORSAIR Vengeance LPX 16GB (2 x 8GB) 288-Pin D...,1
0,Corsair Vengeance LPX 16GB (2x8GB) DDR4 DRAM 3...,Corsair CMK16GX4M2B3200C16 Vengeance LPX 16GB ...,1
0,Corsair Vengeance LPX 16GB (2x8GB) DDR4 DRAM 3...,16GB Corsair Vengeance LPX PC4-25600 3200MHz D...,1
0,CORSAIR - Vengeance LPX 16GB (2PK x 8GB) 3.2 G...,CORSAIR Vengeance LPX 16GB (2 x 8GB) 288-Pin D...,1
...,...,...,...
0,Corsair Dominator Platinum RGB 16GB (2x8GB) DD...,CORSAIR Dominator Platinum RGB 16GB (2 x 8GB) ...,1
0,Corsair Dominator Platinum RGB 16GB (2x8GB) DD...,16GB Corsair Dominator Platinum RGB 3200MHz CL...,1
0,CORSAIR - Dominator Platinum RGB 16GB (2PK 8GB...,CORSAIR Dominator Platinum RGB 16GB (2 x 8GB) ...,1
0,CORSAIR - Dominator Platinum RGB 16GB (2PK 8GB...,16GB Corsair Dominator Platinum RGB 3200MHz CL...,1


In [81]:
generate_pos_pcpartpicker_data(cpu_df)

Unnamed: 0,title_one,title_two,label
0,"AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked De...",AMD - Ryzen 5 3600 3rd Generation 6-Core - 12-...,1
0,"AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked De...",AMD RYZEN 5 3600 6-Core 3.6 GHz (4.2 GHz Max B...,1
0,"AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked De...","AMD Ryzen 5 3600 6-Core, 12-Thread 4.2 GHz AM4...",1
0,"AMD Ryzen 5 3600 6-Core, 12-Thread Unlocked De...",AMD Ryzen 5 3600 AM4 3.6GHZ 32MB CPU Desktop P...,1
0,AMD - Ryzen 5 3600 3rd Generation 6-Core - 12-...,AMD RYZEN 5 3600 6-Core 3.6 GHz (4.2 GHz Max B...,1
...,...,...,...
0,Intel Xeon E3-1220 V6 Processors BX80677E31220V6,Intel Xeon E3-1220 V6 3GHz Kaby Lake CPU LGA11...,1
0,Intel Xeon E3-1220 V6 Kaby Lake 3.0 GHz (3.5 G...,XEON E3-1220 V6 FC-LGA14C 3G 8MB CACHE BOXED,1
0,Intel Xeon E3-1220 V6 Kaby Lake 3.0 GHz (3.5 G...,Intel Xeon E3-1220 V6 3GHz Kaby Lake CPU LGA11...,1
0,XEON E3-1220 V6 FC-LGA14C 3G 8MB CACHE BOXED,Intel Xeon E3-1220 V6 3GHz Kaby Lake CPU LGA11...,1


## Embeddings Creation Functions
Generates the embeddings and saves them

In [7]:
"""
Definitions of some sizes in the training set
"""
MAX_LEN = 42
EMBEDDING_SHAPE = (300,)
print('MAX_LEN: ' + str(MAX_LEN), 'EMBEDDING_SHAPE: ' + str(EMBEDDING_SHAPE))

MAX_LEN: 42 EMBEDDING_SHAPE: (300,)


In [84]:
"""
Create the numpy files of all the training embedddings
We will have two numpy files:
1. The training/validation/test sets
2. The labels
"""

def create_embeddings(df):
    # Create the numpy arrays for storing the embeddings and labels
    total_embeddings = np.zeros(shape=(len(df), 2, MAX_LEN, EMBEDDING_SHAPE[0]))
    labels = np.zeros(shape=(len(df)))
    
    # I know this is a terrible way of doing this, but iterate over the dataframe
    # and generate the embeddings to add to the numpy array
    for idx, row in enumerate(df.itertuples()):
        for word_idx, word in enumerate(row.title_one.split()):
            total_embeddings[idx, 0, word_idx] = fasttext_model[word]
            
        for word_idx, word in enumerate(row.title_two.split()):
            total_embeddings[idx, 1, word_idx] = fasttext_model[word]
            
        labels[idx] = row.label
        
    return total_embeddings, labels


In [85]:
def save_embeddings(df, embeddings_name, labels_name):
    """
    Saves the embeddings given the embeddings file name and labels file name
    """
    if not os.path.exists('data/numpy_data/' + embeddings_name + '.npy'):
        embeddings, labels = create_embeddings(df)
        with open('data/numpy_data/' + embeddings_name + '.npy', 'wb') as f:
            np.save(f, embeddings)

        with open('data/numpy_data/' + labels_name + '.npy', 'wb') as f:
            np.save(f, labels)

In [86]:
def load_embeddings_and_labels(embeddings_name, labels_name):
    loaded_embeddings = None
    labels = None
    with open('data/numpy_data/' + embeddings_name + '.npy', 'rb') as f:
        loaded_embeddings = np.load(f)
        loaded_embeddings = np.transpose(loaded_embeddings, (1, 0, 2, 3))
    
    with open('data/numpy_data/' + labels_name + '.npy', 'rb') as f:
        labels = np.load(f)
    
    return loaded_embeddings, labels

In [87]:
def get_max_len(df):
    max_len = 0
    for row in df.itertuples():
        if len(row.title_one.split(' ')) > max_len:
            max_len = len(row.title_one.split(' '))
            
        if len(row.title_two.split(' ')) > max_len:
            max_len = len(row.title_two.split(' '))
    
    return max_len

## Saving and Loading Embeddings
Save the embeddings for the different types of data we have

In [112]:
# Concatenate everything
total_data = pd.concat([final_computer_df, final_laptop_df])
total_data = total_data.sample(frac=1)
save_embeddings(final_computer_df, 'bal_computers_embeddings', 'bal_computers_labels')
save_embeddings(final_laptop_df, 'laptop_embeddings', 'laptop_labels')
save_embeddings(total_data, 'all_embeddings', 'all_labels')

In [113]:
embeddings, labels = load_embeddings_and_labels('all_embeddings', 'all_labels')

In [114]:
len(embeddings[0,:])

32410

In [115]:
total_data

Unnamed: 0,title_one,title_two,label
16663,acer aspire es1 132 p194 business notebook 331...,acer aspire es1 132 p194 business notebook len...,1
0,lenovo ideapad 310 15ikb notebook 15 6 inch fu...,lenovo ideapad 310 15ikb 15 6 inch intel core ...,1
0,hp 250 g6 ultrabook 15 6 inch full hd 1920x108...,hp 250 g6 ultrabook 15 6 inch full hd 1920x108...,0
3086,corsair vengeance led 16gb 2x8gb ddr4pc4 21300...,corsair vengeance red led 16gb 2x8gb ddr4 pc4 ...,1
15990,kingston datatraveler 100 g3 32 gb usb 3 0 dt1...,usb datatraveler 100 g3 3 0 stick 32 gb,1
...,...,...,...
11649,seagate laptop sshd 1 tb internal st1000lm014 ...,wd green wds240g1g0a ssd 240 go sata 6gb garan...,0
0,lenovo ideapad 320 17isk notebook 17 3 inch 16...,lenovo notebook 17 3 inch 1600x900 intel core ...,1
15592,sandisk extreme microsdhc 64gb type 10 acheter...,sandisk extreme microsdhc 64gb type 10 kopen e...,1
10730,dg0146famwl hp 146 gb 6g 10k 2 5 dp sas new pa...,dg0146famwl hp 146 gb 6g 10k 2 5 dp sas hdd ne...,1


In [116]:
X_train1 = embeddings[0, :len(labels) - 4000]
X_train2 = embeddings[1, :len(labels) - 4000]
X_train = np.stack((X_train1, X_train2))
print('Training shape: ' + str(X_train.shape))

X_val1 = embeddings[0, len(labels) - 4000:len(labels) - 2000]
X_val2 = embeddings[1, len(labels) - 4000:len(labels) - 2000]
X_val = np.stack((X_val1, X_val2))
print('Val shape: ' + str(X_val.shape))

X_test1 = embeddings[0, len(labels) - 2000:]
X_test2 = embeddings[1, len(labels) - 2000:]
X_test = np.stack((X_test1, X_test2))
print('Test shape: ' + str(X_test.shape))

Training shape: (2, 28410, 42, 300)
Val shape: (2, 2000, 42, 300)
Test shape: (2, 2000, 42, 300)


In [117]:
Y_train = labels[:len(labels) - 4000]
print('Training labels shape:', str(Y_train.shape))

Y_val = labels[len(labels) - 4000:len(labels) - 2000]
print('Val shape:', str(Y_val.shape))

Y_test = labels[len(labels) - 2000:]
print('Test shape:', str(Y_test.shape))

Training labels shape: (28410,)
Val shape: (2000,)
Test shape: (2000,)


In [118]:
def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

In [119]:
Y_train = convert_to_one_hot(Y_train.astype(np.int32), 2)
Y_val = convert_to_one_hot(Y_val.astype(np.int32), 2)
Y_test = convert_to_one_hot(Y_test.astype(np.int32), 2)

In [135]:
Y_train

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

## Model Info

For the model, we are going to use LSTMs with a Constrastive Loss Function 
that will also be used to predict whether the two products are the same 

First, we have to convert the titles to embeddings through FastText before feeding into the LSTM.
The embedding part of this model will not be a layer because:
* The fasttext model would be time consuming and annoying to get to work with an embedding layer in Keras
* The fasttext model is not going to be getting its embeddings optimized, so there is really no point in adding it as an embedding layer

In [8]:
def square_distance(vectors):
    x, y = vectors
    return tf.square(x - y)

def euclidean_dist_out_shape(shapes):
    # Both inputs are fed in, so just use one of them and get the first value in the shape
    shape1, shape2 = shapes
    return (shape1[0],)

def siamese_network(input_shape):
    # Defines our inputs
    left_title = Input(input_shape, dtype='float32')
    right_title = Input(input_shape, dtype='float32')
    
    # The LSTM units
    model = tf.keras.Sequential(name='siamese_model')
    model.add(LSTM(units=256, return_sequences=True, name='lstm_1'))
    model.add(Dropout(rate=0.5))
    model.add(LSTM(units=128, return_sequences=True, name='lstm_2'))
    model.add(Dropout(rate=0.5))
    model.add(LSTM(units=128, name='lstm_3'))
    model.add(Dropout(rate=0.5))
    
    # The dense layers
    model.add(Dense(units=1024, activation='elu', name='dense_1'))
    model.add(Dropout(rate=0.5))
    model.add(Dense(units=512, activation='elu', name='dense_2'))
    
    # Forward propagate through the model to generate the encodings
    encoded_left_title = model(left_title)
    encoded_right_title = model(right_title)

    SquareDistanceLayer = Lambda(square_distance)
    distance = SquareDistanceLayer([encoded_left_title, encoded_right_title])
    
    prediction = Dense(units=2, activation='softmax')(distance)
    # Create and return the network
    siamese_net = tf.keras.Model(inputs=[left_title, right_title], outputs=prediction, name='siamese_network')
    return siamese_net

In [121]:
# Note: for the constrastive loss, because 0 denotes that they are from the same class
# and one denotes they are from a different class, I swaped the (Y) and (1 - Y) terms

def constrastive_loss(y_true, y_pred):
    margin = 2.0
    d = y_pred
    d_sqrt = tf.sqrt(d)
    #tf.print('\nY Pred: ', d, 'Shape: ', tf.shape(d))
    #tf.print('\nY True: ', y_true, 'Shape: ', tf.shape(y_true))
    
    loss = (y_true * d) + ((1 - y_true) * tf.square(tf.maximum(0., margin - d_sqrt)))
    
    #tf.print('\n Constrastive Loss: ', loss, 'Shape: ', tf.shape(loss))
    loss = 0.5 * tf.reduce_mean(loss)
    
    return loss

In [122]:
# Accuracy metric for constrastive loss because values close to 0 are equal and values high are different
# 0.5 is the threshold here
def constrastive_accuracy(y_true, y_pred):
    return tf.reduce_mean(tf.cast(tf.equal(y_true, tf.cast(y_pred < 0.5, y_true.dtype)), y_true.dtype))

In [123]:
def save_model(model, name):
    """
    Saves a model with a particular name
    """
    model.save('models/' + name + '.h5')

In [9]:
model = siamese_network((MAX_LEN, EMBEDDING_SHAPE[0],))
model.summary()

Model: "siamese_network"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 42, 300)]    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 42, 300)]    0                                            
__________________________________________________________________________________________________
siamese_model (Sequential)      (None, 512)          1555968     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 512)          0           siamese_model[0][0]

In [128]:
# Compile the model
lr = 0.001
opt = tf.keras.optimizers.Adam(learning_rate=lr)
model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])

In [129]:
# Train the model
model.fit(x=[X_train1, X_train2], y=Y_train, batch_size=64, epochs=50, validation_data=([X_val[0], X_val[1]], Y_val))

Train on 28410 samples, validate on 2000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1c2d9bc9448>

In [130]:
# Test the model
results = model.evaluate([X_test1, X_test2], Y_test, batch_size=16)
print('test loss, test acc: ', results)

test loss, test acc:  [0.3420196931362152, 0.887]


In [10]:
# Set the model's name
model_name = '0.2_Softmax-LSTM-50_epochs'

In [None]:
# Save the model
save_model(model, model_name)

## Manual Testing
Converts titles into embeddings arrays and allow the model to make a prediction

In [11]:
model.load_weights('models/' + model_name + '.h5')

In [346]:
title_one = 'LG OLED77CXPUA 77 Class HDR 4K UHD Smart OLED TV (2020 Model)'
title_two = 'LG OLED77CXPUA 77 Alexa Built-In CX Series 4K Ultra HD Smart OLED TV (2020)'
#title_one = 'Corsair 16GB ram'
#title_two = 'G Skill 32GB ram'
title_one_arr = np.zeros((1, 42, 300))
title_two_arr = np.zeros((1, 42, 300))
title_one = remove_stop_words(title_one.lower())
title_two = remove_stop_words(title_two.lower())

for idx, word in enumerate(title_one.split(' ')):
    title_one_arr[0, idx] = fasttext_model[word]
    
for idx, word in enumerate(title_two.split(' ')):
    title_two_arr[0, idx] = fasttext_model[word]

In [347]:
model.predict([title_one_arr, title_two_arr])

array([[0.34407118, 0.6559288 ]], dtype=float32)