In [1]:
import pandas as pd
import os
import numpy as np
import random
from tqdm import tqdm
from src.preprocessing import remove_stop_words
from src.common import create_final_data, Common

In [142]:
class LaptopAttributes():
    '''
    Pools of attribute values that are used for creating laptop spec data.
    The spec data was gathered from PCPartPicker and is used to create more laptop data.

    Everything lives at class level (no instance is needed). The values below are only the
    starting point: populate_spec() adds more entries to video_card, cpu, screen and inches.
    '''

    # Graphics card names
    video_card = {'GeForce RTX 2070'}

    # '2 GB', '4 GB', ... '128 GB'
    ram = ['{} GB'.format(size) for size in range(2, 130, 2)]

    # '120 GB', '128 GB', ... '512 GB' followed by '1 TB' ... '7 TB'
    hard_drive = (
        ['{} GB'.format(size) for size in range(120, 513, 8)]
        + ['{} TB'.format(size) for size in range(1, 8)]
    )

    # CPU name -> [cores, clock speed]; starts empty
    cpu = {}

    # "<company> <product line>" strings; the first word is treated as the company
    laptop_brands = [
        'Lenovo ThinkPad', 'Lenovo ThinkBook', 'Lenovo IdeaPad', 'Lenovo Yoga', 'Lenovo Legion',
        'HP Envy', 'HP Chromebook', 'HP Spectre', 'HP ZBook', 'HP Probook', 'HP Elitebook',
        'HP Pavilion', 'HP Omen',
        'Dell Alienware', 'Dell Vostro', 'Dell Inspiron', 'Dell Latitude', 'Dell XPS',
        'Dell G Series', 'Dell Precision',
        'Apple Macbook', 'Apple Macbook Air', 'Apple Mac',
        'Acer Aspire', 'Acer Swift', 'Acer Spin', 'Acer Switch', 'Acer Extensa',
        'Acer Travelmate', 'Acer Nitro', 'Acer Enduro', 'Acer Predator',
        'Asus ZenBook', 'Asus Vivobook', 'Asus Republic of Gamers', 'Asus ROG', 'Asus TUF GAMING',
    ]

    # Screen resolutions and screen sizes (both kept as strings)
    screen = {'1440x900'}
    inches = {'13.3'}

    @staticmethod
    def get_all_data():
        '''
        Returns a dictionary that maps each attribute name to its pool of values.
        The pools are the live class-level objects (for 'cpu' it is the key view of the cpu dictionary).
        '''

        attrs = LaptopAttributes
        return dict(
            cpu=attrs.cpu.keys(),
            ram=attrs.ram,
            hard_drive=attrs.hard_drive,
            video_card=attrs.video_card,
            brand=attrs.laptop_brands,
            screen=attrs.screen,
            inches=attrs.inches,
        )

In [143]:
def populate_spec():
    '''
    Fills the class-level pools of LaptopAttributes from the CSV files in data/train.

    Files that are read:
      data/train/cpu_data.csv          - every row becomes LaptopAttributes.cpu[name] = [cores, core_clock]
      data/train/video-cards-data.csv  - the chipset column is added to LaptopAttributes.video_card
      data/train/laptops.csv           - the Inches (as strings), ScreenResolution and Gpu columns are added
                                         to the inches, screen and video_card sets; the Cpu of every
                                         non-Apple row is added to LaptopAttributes.cpu

    LaptopAttributes is modified in place and nothing is returned. The targets are sets and a
    dictionary, so calling this more than once does not create duplicate entries.
    '''

    # Getting the CPU data into LaptopAttributes
    cpu_df = pd.read_csv('data/train/cpu_data.csv')
    for name, cores, core_clock in zip(cpu_df['name'], cpu_df['cores'], cpu_df['core_clock']):
        LaptopAttributes.cpu[name] = [cores, core_clock]

    # Getting the video card data into LaptopAttributes
    video_card_df = pd.read_csv('data/train/video-cards-data.csv')
    LaptopAttributes.video_card.update(video_card_df['chipset'].tolist())

    # Getting the inches, screen, video card, and CPU data from laptops.csv
    laptops_df = pd.read_csv('data/train/laptops.csv', encoding='latin-1')
    LaptopAttributes.inches.update(str(inches) for inches in laptops_df['Inches'].tolist())
    LaptopAttributes.screen.update(laptops_df['ScreenResolution'].tolist())
    LaptopAttributes.video_card.update(laptops_df['Gpu'].tolist())

    for company, cpu in zip(laptops_df['Company'], laptops_df['Cpu']):
        # Rows whose Company is Apple do not contribute a CPU
        if company != 'Apple':
            # The last space-separated token is stored as the clock speed, everything before it is
            # the CPU name. This file provides no separate core count, so None is stored for it.
            *name_parts, clock = cpu.split(' ')
            LaptopAttributes.cpu[' '.join(name_parts)] = [None, clock]
    
    

In [147]:
# Fill the LaptopAttributes pools from the CSV files. gen_spec_combos() reads LaptopAttributes.cpu,
# which starts out empty, so this has to run first.
populate_spec()

In [150]:
def gen_spec_combos():
    '''
    Writes every (cpu, hard_drive, ram) combination from LaptopAttributes, in shuffled row order,
    to data/train/spec_data_no_brand.csv
    (WARNING: THIS TAKES A VERY LONG TIME AND YOU MUST HAVE AT LEAST 16GB RAM TO DO THIS)
    '''

    column_names = ['cpu', 'hard_drive', 'ram']
    value_pools = [list(LaptopAttributes.cpu.keys()), LaptopAttributes.hard_drive, LaptopAttributes.ram]

    # One grid per attribute; transposing and flattening gives one row of three values per combination
    grids = np.meshgrid(*value_pools)
    rows = np.array(grids).T.reshape(-1, 3)

    # Shuffles the rows in place
    np.random.shuffle(rows)

    pd.DataFrame(data=rows, columns=column_names).to_csv('data/train/spec_data_no_brand.csv')

In [151]:
# Writes data/train/spec_data_no_brand.csv. Slow and memory hungry (see the warning in the docstring).
gen_spec_combos()

In [157]:
def concatenate_row(row):
    '''
    Creates a randomized, title-like string out of one row of product attributes.

    row is a single record (for example a Pandas Series taken from a DataFrame, not a whole
    DataFrame). It has to provide 'brand', 'inches', 'ram', 'cpu', 'hard_drive' and 'drive_type':
      - row['cpu'] has to be a key of LaptopAttributes.cpu
      - row['drive_type'] has to be 'ssd' or 'hdd'

    NOTE: row is modified in place ('company' and 'product' are added and 'hard_drive' gets the
    drive type appended), so pass a copy if the original has to stay untouched.

    The result starts with company, product and inches (optionally followed by "laptop") and ends
    with the hard drive, CPU and RAM in random order. Attributes that were randomly dropped are
    left out, so the string never contains leading or doubled spaces.
    '''

    # Split the brand: the first word is the company, the rest is the product
    company, _, product = row['brand'].partition(' ')
    row['company'] = company
    row['product'] = product

    # Create dictionary for drive types
    drive_options = {'ssd': Common.SSD_TYPES, 'hdd': Common.HARD_DRIVE_TYPES}

    # Special tags at the end of the amount of inches of the laptop and the RAM to simulate real data
    inch_attr = str(row['inches']) + random.choice([' inch', '"'])
    ram_attr = row['ram'] + random.choice([' ram', ' memory'])

    # This modifies the CPU attribute to sometimes have different types of elements to add some difference
    # Ex: Intel Core i7 7700k vs Core i7 7700k 4 Core 4.2 GHz CPU (Something like that)
    cpu_attr = row['cpu']
    cores = LaptopAttributes.cpu[cpu_attr][0]
    ghz = LaptopAttributes.cpu[cpu_attr][1]

    if random.random() > 0.5:
        cpu_words = cpu_attr.split(' ')
        # Each of these words has its own 50% chance of being dropped (when it is present)
        for word in ('Intel', 'Core', 'AMD'):
            if random.choice([0, 1]) and word in cpu_words:
                cpu_words.remove(word)
        cpu_attr = ' '.join(cpu_words)

    # Random chance of putting the cores in the CPU attribute.
    # pd.notna is False for None and for NaN (an empty CSV cell), so a missing value is never printed.
    if pd.notna(cores) and random.random() > 0.7:
        cpu_attr = '{} {} {}'.format(cpu_attr, cores, 'Core')

    # Random chance of putting the GHz in the CPU attribute
    if pd.notna(ghz) and random.random() > 0.7:
        cpu_attr = '{} {}'.format(cpu_attr, ghz)

    if random.random() > 0.55:
        cpu_attr = '{} {}'.format(cpu_attr, 'CPU')

    # Have random chance of getting rid of company or product attribute (never both)
    product_removed = random.random() > 0.55
    if product_removed:
        row['product'] = ''

    if random.random() > 0.65 and not product_removed:
        row['company'] = ''

    # Create a list for all the product attributes
    order_attrs = [row['company'], row['product'], inch_attr]

    # Have a chance of adding "laptop" to the title
    if random.random() > 0.45:
        order_attrs.append('laptop')

    # Add the type of drive to the hard drive attribute
    row['hard_drive'] = row['hard_drive'] + ' ' + random.choice(drive_options[row['drive_type']])

    # NOTE: row['screen'] is not included in the title
    spec_attrs = [row['hard_drive'], cpu_attr, ram_attr]

    # Shuffle only the spec attributes; company, product and inches keep their position
    random.shuffle(spec_attrs)

    # Skip the attributes that were removed above instead of joining empty strings
    return ' '.join(attr for attr in order_attrs + spec_attrs if attr)


In [173]:
def create_laptop_data(df, neg_attrs, sample_frac=0.001):
    '''
    Creates labelled pairs of laptop titles out of the spec data.

    df:          DataFrame of spec rows; only the first int(len(df) * sample_frac) rows are used
    neg_attrs:   names of the attributes that get changed for the negative examples. They are used
                 in turn (row 0 uses neg_attrs[0], row 1 uses neg_attrs[1], ... and then it wraps
                 around). Each name has to be present in the row and, lowercased, has to be a key
                 of LaptopAttributes.get_all_data()
    sample_frac: fraction of df that is turned into titles (the default keeps the previous 0.1%)

    For every used row a "positive" laptop is built (row + random brand, inches, screen and drive
    type) together with a "negative" laptop that differs in exactly one attribute. Four pairs are
    added per row: two titles of the same laptop (label 1, once for the positive and once for the
    negative laptop) and two positive/negative pairs (label 0).

    Returns a DataFrame with the columns Common.COLUMN_NAMES.
    Raises ValueError if an attribute pool has no value that differs from the current one.
    '''

    # Build the pools once as lists. random.choice needs a sequence, and sampling directly from a
    # set or a dict key view is no longer supported by the random module in newer Python versions.
    attribute_pool = {name: list(values) for name, values in LaptopAttributes.get_all_data().items()}

    # Never go past the end of df, even if sample_frac is larger than 1
    row_count = min(len(df), int(len(df) * sample_frac))

    temp = []
    for idx in tqdm(range(row_count)):
        base_row = df.iloc[idx]
        neg_attr = neg_attrs[idx % len(neg_attrs)]

        # Randomly choose the attributes that are not already in the row
        brand = random.choice(attribute_pool['brand'])
        inches = random.choice(attribute_pool['inches'])
        screen = random.choice(attribute_pool['screen'])
        drive_type = random.choice(['ssd', 'hdd'])

        pos = format_laptop_row(base_row.copy(), brand, inches, screen, drive_type)

        # The negative laptop needs a value that is different from the positive one
        alternatives = [value for value in attribute_pool[neg_attr.lower()] if value != pos[neg_attr]]
        if not alternatives:
            raise ValueError(
                'No alternative value available for attribute {!r} (current value: {!r})'.format(neg_attr, pos[neg_attr])
            )

        neg = pos.copy()
        neg[neg_attr] = random.choice(alternatives)

        # concatenate_row modifies its argument and is random, so every title gets its own copy
        pos_pair_1 = remove_stop_words(concatenate_row(pos.copy()))
        pos_pair_2 = remove_stop_words(concatenate_row(pos.copy()))
        neg_pair_1 = remove_stop_words(concatenate_row(neg.copy()))
        neg_pair_2 = remove_stop_words(concatenate_row(neg.copy()))

        # Same laptop -> 1, different laptops -> 0
        temp.append([pos_pair_1, pos_pair_2, 1])
        temp.append([neg_pair_1, neg_pair_2, 1])
        temp.append([pos_pair_1, neg_pair_1, 0])
        temp.append([pos_pair_2, neg_pair_2, 0])

    return pd.DataFrame(temp, columns=Common.COLUMN_NAMES)

In [172]:
def format_laptop_row(row, brand, inches, screen, drive_type):
    '''
    Stores brand, inches, screen and drive_type on the row (in place) and returns that same row.
    '''

    additions = (
        ('brand', brand),
        ('inches', inches),
        ('screen', screen),
        ('drive_type', drive_type),
    )
    for key, value in additions:
        row[key] = value
    return row

In [155]:
# Load the shuffled (cpu, hard_drive, ram) combinations that gen_spec_combos() wrote.
# NOTE: that file is written together with the DataFrame index, so expect an extra unnamed index column here.
spec_data = pd.read_csv('data/train/spec_data_no_brand.csv')

In [174]:
# Build the labelled title pairs. The attribute that is changed for the negative examples cycles
# through neg_attrs, one attribute per used spec row.
new_data = create_laptop_data(spec_data, neg_attrs=['brand', 'cpu', 'ram', 'inches', 'hard_drive'])

100%|█████████████████████████████████████████████████████████████| 678/678 [00:06<00:00, 107.87it/s]


In [177]:
# Print every generated pair together with its label for a manual check
for pair in new_data.itertuples(index=False):
    print('1. ', pair.title_one)
    print('2. ', pair.title_two)
    print('Label: ', pair.label)
    print('----------------------------------------------------')

1.  Acer Extensa 14 1 inch laptop Core i7 9700K 8 Core 3 6 GHz CPU 34 GB ram 392 GB Hard Drive
2.  Extensa 14 1 inch 392 GB Hard Drive 34 GB ram Intel Core i7 9700K
Label:  1
----------------------------------------------------
1.  Lenovo 14 1 inch laptop 34 GB ram Intel Core i7 9700K 392 GB Hard Drive
2.  Lenovo IdeaPad 14 1" 392 GB HDD 34 GB ram Core i7 9700K
Label:  1
----------------------------------------------------
1.  Acer Extensa 14 1 inch laptop Core i7 9700K 8 Core 3 6 GHz CPU 34 GB ram 392 GB Hard Drive
2.  Lenovo 14 1 inch laptop 34 GB ram Intel Core i7 9700K 392 GB Hard Drive
Label:  0
----------------------------------------------------
1.  Extensa 14 1 inch 392 GB Hard Drive 34 GB ram Intel Core i7 9700K
2.  Lenovo IdeaPad 14 1" 392 GB HDD 34 GB ram Core i7 9700K
Label:  0
----------------------------------------------------
1.  Lenovo Yoga 13 9" 108 GB ram 304 GB SSD Intel Celeron Dual Core 3205U
2.  Lenovo 13 9" Intel Celeron Dual Core 3205U CPU 108 GB ram 304 GB M 2

1.  Acer 15 6" 5 TB HDD Intel Core i7 7660U 32 GB ram
2.  Acer Nitro 15 6" 32 GB memory Intel Core i9 10900 CPU 5 TB Hard Drive
Label:  0
----------------------------------------------------
1.  Dell 14 0 inch 26 GB ram AMD Ryzen 3 1200 14nm 168 GB HDD
2.  XPS 14 0" laptop AMD Ryzen 3 1200 14nm 3 1 GHz CPU 26 GB memory 168 GB HDD
Label:  1
----------------------------------------------------
1.  Dell 14 0" laptop 168 GB HDD AMD Ryzen 3 1200 14nm 3 1 GHz CPU 10 GB ram
2.  Dell XPS 14 0 inch laptop 168 GB HDD 10 GB memory Ryzen 3 1200 14nm 4 Core CPU
Label:  1
----------------------------------------------------
1.  Dell 14 0 inch 26 GB ram AMD Ryzen 3 1200 14nm 168 GB HDD
2.  Dell 14 0" laptop 168 GB HDD AMD Ryzen 3 1200 14nm 3 1 GHz CPU 10 GB ram
Label:  0
----------------------------------------------------
1.  XPS 14 0" laptop AMD Ryzen 3 1200 14nm 3 1 GHz CPU 26 GB memory 168 GB HDD
2.  Dell XPS 14 0 inch laptop 168 GB HDD 10 GB memory Ryzen 3 1200 14nm 4 Core CPU
Label:  0
--------

----------------------------------------------------
1.  Acer Extensa 12 5" i3 7100U 12 GB memory 3 TB Hard Drive
2.  Acer 12 5" Intel Core i3 7100U 3 TB HDD 12 GB ram
Label:  1
----------------------------------------------------
1.  Acer Extensa 14 1 inch 12 GB ram Core i3 7100U 2 4GHz CPU 3 TB Hard Drive
2.  Acer Extensa 12 5" i3 7100U 12 GB memory 3 TB Hard Drive
Label:  0
----------------------------------------------------
1.  Acer Extensa 14 1" Intel Core i3 7100U CPU 12 GB ram 3 TB Hard Drive
2.  Acer 12 5" Intel Core i3 7100U 3 TB HDD 12 GB ram
Label:  0
----------------------------------------------------
1.  Aspire 13 0" AMD A6 Series A6 9220 CPU 38 GB ram 152 GB SSD
2.  Acer 13 0" laptop 38 GB memory 152 GB Solid State Drive A6 Series A6 9220
Label:  1
----------------------------------------------------
1.  Acer Aspire 13 0 inch laptop 160 GB M 2 SSD 38 GB ram AMD A6 Series A6 9220
2.  Acer Aspire 13 0 inch 160 GB SATA SSD AMD A6 Series A6 9220 2 5GHz CPU 38 GB ram
Label: 

1.  Acer Travelmate 15 0 inch 448 GB Hard Drive Celeron Quad N3160 1 6GHz 40 GB memory
2.  Acer Travelmate 15 0 inch 40 GB ram Intel Celeron Quad Core N3160 CPU 448 GB Hard Drive
Label:  1
----------------------------------------------------
1.  Acer Travelmate 15 4" laptop 40 GB ram Intel Celeron Quad Core N3160 CPU 448 GB HDD
2.  Acer Travelmate 15 0 inch 448 GB Hard Drive Celeron Quad N3160 1 6GHz 40 GB memory
Label:  0
----------------------------------------------------
1.  Acer 15 4" laptop 448 GB HDD Intel Celeron Quad Core N3160 1 6GHz CPU 40 GB ram
2.  Acer Travelmate 15 0 inch 40 GB ram Intel Celeron Quad Core N3160 CPU 448 GB Hard Drive
Label:  0
----------------------------------------------------
1.  Dell 11 6 inch laptop 240 GB Hard Drive Intel Celeron Quad Core N3450 CPU 52 GB ram
2.  Dell 11 6 inch 240 GB Internal Hard Drive 52 GB ram Intel Celeron Quad Core N3450 CPU
Label:  1
----------------------------------------------------
1.  Inspiron 11 6" Intel Celeron Quad Co

Label:  0
----------------------------------------------------
1.  HP Envy 15 4 inch 122 GB ram Ryzen 9 3900X CPU 272 GB Hard Drive
2.  HP 15 4" Intel Core M 7Y30 CPU 272 GB Internal Hard Drive 122 GB memory
Label:  0
----------------------------------------------------
1.  Acer Enduro 15 6" laptop 192 GB Solid State Drive 62 GB memory Ryzen 7 2700 8 Core 3 2 GHz CPU
2.  Acer 15 6" 62 GB ram 192 GB SATA SSD Ryzen 7 2700 3 2 GHz
Label:  1
----------------------------------------------------
1.  Acer Enduro 15 6" laptop 192 GB SATA SSD 20 GB memory Ryzen 7 2700 CPU
2.  Enduro 15 6 inch 192 GB SATA SSD AMD Ryzen 7 2700 8 Core 20 GB memory
Label:  1
----------------------------------------------------
1.  Acer Enduro 15 6" laptop 192 GB Solid State Drive 62 GB memory Ryzen 7 2700 8 Core 3 2 GHz CPU
2.  Acer Enduro 15 6" laptop 192 GB SATA SSD 20 GB memory Ryzen 7 2700 CPU
Label:  0
----------------------------------------------------
1.  Acer 15 6" 62 GB ram 192 GB SATA SSD Ryzen 7 2700 3 

1.  Travelmate 17 3 inch AMD E Series 9000e 1 5GHz 208 GB HDD 100 GB memory
2.  Acer Travelmate 17 3" 100 GB memory 208 GB Internal Hard Drive AMD E Series 9000e
Label:  1
----------------------------------------------------
1.  Asus Vivobook 17 3" laptop 208 GB Hard Drive 100 GB ram E Series 9000e CPU
2.  Travelmate 17 3 inch AMD E Series 9000e 1 5GHz 208 GB HDD 100 GB memory
Label:  0
----------------------------------------------------
1.  Asus Vivobook 17 3" laptop 100 GB ram AMD E Series 9000e 208 GB Internal Hard Drive
2.  Acer Travelmate 17 3" 100 GB memory 208 GB Internal Hard Drive AMD E Series 9000e
Label:  0
----------------------------------------------------
1.  Asus ZenBook 12 3" laptop 360 GB M 2 SSD 110 GB memory Intel Core i7 6600U
2.  Asus ZenBook 12 3 inch 110 GB ram 360 GB SSD Intel Core i7 6600U CPU
Label:  1
----------------------------------------------------
1.  Asus ZenBook 12 3 inch Ryzen 5 3600 6 Core 3 6 GHz 360 GB Solid State Drive 110 GB ram
2.  Asus ZenBo

1.  Asus 15 4" laptop AMD Ryzen 7 3800XT 3 9 GHz 328 GB HDD 126 GB memory
2.  Asus 15 4 inch laptop 328 GB HDD 4 GB memory Ryzen 7 3800XT CPU
Label:  0
----------------------------------------------------
1.  Republic Gamers 15 4 inch laptop AMD Ryzen 7 3800XT 126 GB ram 328 GB HDD
2.  Asus Republic Gamers 15 4" Ryzen 7 3800XT 3 9 GHz 4 GB ram 328 GB Hard Drive
Label:  0
----------------------------------------------------
1.  Apple Macbook 14 0" laptop AMD E Series E2 9000 240 GB Internal Hard Drive 64 GB memory
2.  Apple Macbook 14 0" laptop 64 GB memory E Series E2 9000 CPU 240 GB Hard Drive
Label:  1
----------------------------------------------------
1.  Macbook 11 6 inch AMD E Series E2 9000 CPU 240 GB Internal Hard Drive 64 GB ram
2.  Apple Macbook 11 6 inch 240 GB Internal Hard Drive 64 GB ram E Series E2 9000 2 2GHz CPU
Label:  1
----------------------------------------------------
1.  Apple Macbook 14 0" laptop AMD E Series E2 9000 240 GB Internal Hard Drive 64 GB memory
2. 

1.  Vivobook 14 1 inch laptop Intel Pentium Dual Core 4405Y 1 5GHz 56 GB memory 464 GB Solid State Drive
2.  Asus 14 1 inch Intel Pentium Dual Core 4405Y 464 GB SSD 56 GB ram
Label:  1
----------------------------------------------------
1.  Vivobook 14 1 inch laptop 56 GB ram Intel Pentium Dual Core 4405Y 304 GB SSD
2.  Vivobook 14 1 inch laptop Intel Pentium Dual Core 4405Y 1 5GHz 56 GB memory 464 GB Solid State Drive
Label:  0
----------------------------------------------------
1.  Asus 14 1" Intel Pentium Dual Core 4405Y 1 5GHz CPU 56 GB memory 304 GB M 2 SSD
2.  Asus 14 1 inch Intel Pentium Dual Core 4405Y 464 GB SSD 56 GB ram
Label:  0
----------------------------------------------------
1.  Spin 13 9 inch 344 GB Internal Hard Drive Intel Core i5 7500U 2 7GHz CPU 78 GB ram
2.  Acer Spin 13 9" laptop 344 GB Hard Drive Intel i5 7500U 78 GB ram
Label:  1
----------------------------------------------------
1.  HP 13 9" laptop 344 GB Internal Hard Drive 78 GB memory Intel i5 7500U
2

Label:  1
----------------------------------------------------
1.  Apple 17 0" laptop 80 GB ram AMD Ryzen 3 3300X 448 GB SSD
2.  Apple Macbook 17 0" laptop AMD Ryzen 3 3300X 90 GB memory 448 GB Solid State Drive
Label:  0
----------------------------------------------------
1.  Apple Macbook 17 0" laptop 448 GB SATA SSD 80 GB memory AMD Ryzen 3 3300X 3 8 GHz CPU
2.  Apple Macbook 17 0" 448 GB SATA SSD 90 GB ram Ryzen 3 3300X CPU
Label:  0
----------------------------------------------------
1.  Acer 17 3" laptop 16 GB memory 208 GB SSD Intel Core i7 7560U CPU
2.  Acer 17 3" laptop 16 GB memory Intel Core i7 7560U CPU 208 GB SATA SSD
Label:  1
----------------------------------------------------
1.  Acer Enduro 13 0 inch 16 GB ram Intel Core i7 7560U 208 GB SSD
2.  Acer Enduro 13 0" laptop Intel Core i7 7560U 2 4GHz 16 GB ram 208 GB Solid State Drive
Label:  1
----------------------------------------------------
1.  Acer 17 3" laptop 16 GB memory 208 GB SSD Intel Core i7 7560U CPU
2.  A