In [38]:
import re
import math
import pandas as pd
import os
import random
from tqdm import tqdm
import sys
from itertools import combinations
from nltk.corpus import stopwords
sys.path.append(os.getcwd())
from src.data_creation.laptop_data_creation import LaptopAttributes, populate_spec
from src.preprocessing import unit_matcher, remove_misc, remove_stop_words
from src.common import create_final_data, get_max_len


In [2]:
populate_spec()
class LaptopRetailerRegEx:
    laptop_brands = {'gateway', 'panasonic', 'toughbook', 'msi'}
    product_attrs = {'vivobook'}
    cpu_attributes = {'intel', 'm 2', '2 core', '4 core', '6 core', '8 core'}

    for brand in LaptopAttributes.laptop_brands:
        laptop_brands.add(brand.split(' ')[0].lower())
        product_attrs.add(' '.join(brand.split(' ')[1: ]).lower())

    intel_cpu_df = pd.read_csv('data/base/intel_cpus.csv')
    intel_cpu_df = intel_cpu_df['title'].map(lambda x: remove_stop_words(x, omit_punctuation=['.']).split(' '))
    for i in range(len(intel_cpu_df)):
        cpu_attributes.update(intel_cpu_df.iloc[i])

    amd_cpu_df = pd.read_csv('data/base/amd_cpus.csv')
    amd_cpu_df = amd_cpu_df['title'].map(lambda x: remove_stop_words(x, omit_punctuation=['.']).split(' '))
    for i in range(len(amd_cpu_df)):
        cpu_attributes.update(amd_cpu_df.iloc[i])

    laptop_brands = list(laptop_brands)
    laptop_brands.sort(key=len, reverse=True)

    product_attrs = list(product_attrs)
    product_attrs.sort(key=len, reverse=True)

    cpu_attributes = list(cpu_attributes)
    cpu_attributes.sort(key=len, reverse=True)

    ram_modifiers = ['memory', 'ram', 'ddr4', 'ddr4 ram', 'ddr4 memory']
    ram_modifiers.sort()

    hard_drive_modifiers = ['hdd', 'hard drive', 'disk drive', 'storage', 'hard drive storage', 'hdd storage']
    hard_drive_modifiers.sort(key=len, reverse=True)

    ssd_modifiers = ['ssd', 'solid state drive', 'solid state disk', 'pcie', 'pcie ssd', 'ssd storage']
    ssd_modifiers.sort(key=len, reverse=True)

    annoying_words = ['windows 10', 'win 10', 'windows 10 in s mode', 'windows', '3.0', '3.1', '3.2', 'optical drive', 'cd drive', 'dvd drive']
    annoying_words.sort(key=len, reverse=True)

    ram_modifier_matcher = re.compile("\\b" + "(?!\S)|\\b".join(ram_modifiers) + "(?!\S)", re.IGNORECASE)
    random_matcher = re.compile("\\b" + "(?!\S)|\\b".join(annoying_words) + "(?!\S)", re.IGNORECASE)
    cpu_matcher = re.compile("\\b" + "(?!\S)|\\b".join(cpu_attributes) + "(?!\S)", re.IGNORECASE)
    brand_matcher = re.compile("\\b" + "(?!\S)|\\b".join(laptop_brands) + "(?!\S)", re.IGNORECASE)
    product_attr_matcher = re.compile("\\b" + "(?!\S)|\\b".join(product_attrs) + "(?!\S)", re.IGNORECASE)
    ram_matcher = re.compile(' ?[0-9]+.{0,1}' + 'gb ?' + '(?:' + '|'.join([x for x in ram_modifiers]) + ')(?!\S)', re.IGNORECASE)
    hard_drive_matcher = re.compile(' ?[0-9]+.{0,1}' + '(?:gb|tb) ?' + '(?:' + '|'.join([x for x in hard_drive_modifiers]) + ')(?!\S)', re.IGNORECASE)
    ssd_matcher = re.compile(' ?[0-9]+.{0,1}' + '(?:gb|tb) ?' + '(?:' + '|'.join([x for x in ssd_modifiers]) + ')(?!\S)', re.IGNORECASE)
    gbtb_matcher = re.compile(' ?[0-9]+.{0,1}' + '(?:gb|tb)' + '(?!\S)', re.IGNORECASE)
    inch_matcher = re.compile('[1][0-9]\"?\"?.?[0-9]?\"?\"? ?(?:inch)?(?!\S)', re.IGNORECASE)
    del laptop_brands, product_attrs, cpu_attributes, intel_cpu_df, amd_cpu_df

In [3]:
amazon_laptops = pd.read_csv('data/base/amazon_laptop_titles.csv')
walmart_laptops = pd.read_csv('data/base/walmart_laptop_titles.csv')
newegg_laptops = pd.read_csv('data/base/newegg_laptop_titles.csv')

laptops = remove_misc(pd.concat([amazon_laptops, walmart_laptops, newegg_laptops]))
laptops['title'] = laptops['title'].apply(lambda x: remove_stop_words(x, omit_punctuation=['.']))
laptops = laptops.drop_duplicates(subset=['title'])
laptops

Unnamed: 0,title
0,"hp 2021 premium 14"" hd touchscreen laptop comp..."
1,"2021 newest asus tuf gaming laptop 15.6"" ips f..."
2,acer aspire 5 slim laptop 15.6 inches full hd ...
3,hp chromebook 11 inch laptop up 15 hour batter...
4,hp chromebook 14 inch hd laptop intel celeron ...
...,...
3614,lenovo thinkpad p71 workstation laptop windows...
3615,lenovo thinkpad t480s windows 10 pro laptop in...
3616,"newest dell inspiron 5000 15.6"" touchscreen le..."
3617,lenovo thinkpad p71 workstation laptop windows...


## Testing the RegEx Expressions

In [4]:
test = remove_stop_words('Acer Predator Helios 300 15.6"" inch Gaming Laptop i7-10750H 16GB DDR4 1TB SSD', omit_punctuation=['.'])

In [5]:
test

'acer predator helios 300 15.6"" inch gaming laptop i7 10750h 16gb ddr4 1tb ssd'

In [6]:
LaptopRetailerRegEx.brand_matcher.findall(test)

['acer']

In [7]:
LaptopRetailerRegEx.product_attr_matcher.findall(test)

['predator']

In [8]:
LaptopRetailerRegEx.inch_matcher.findall(test)

['15.6"" inch']

In [9]:
LaptopRetailerRegEx.cpu_matcher.findall(test)

['i7', '10750h']

In [10]:
LaptopRetailerRegEx.ram_matcher.findall(test)

[' 16gb ddr4']

In [11]:
LaptopRetailerRegEx.ssd_matcher.findall(test)

[' 1tb ssd']

In [12]:
LaptopRetailerRegEx.hard_drive_matcher.findall(test)

[]

In [13]:
LaptopRetailerRegEx.inch_matcher.findall('hp laptop intel celeron n4020 4gb ddr4 sdram 64gb emmc 14 inch hd led display microsoft 365 1 year subscription white')

['14 inch']

## Generate the Data

In [15]:
def get_key_attrs(title:str) -> tuple:
    """
    Get each major attribute of a laptop
    """

    # Remove random words that may end up in the important identifiers
    random_words = set(LaptopRetailerRegEx.random_matcher.findall(title))
    for word in random_words:
        title = title.replace(word, '')

    brand = list(map(lambda x: x.strip(), LaptopRetailerRegEx.brand_matcher.findall(title)))
    product_attr = list(map(lambda x: x.strip(), LaptopRetailerRegEx.product_attr_matcher.findall(title)))
    inch = list(map(lambda x: x.strip(), LaptopRetailerRegEx.inch_matcher.findall(title)))
    cpu = list(map(lambda x: x.strip(), LaptopRetailerRegEx.cpu_matcher.findall(title)))
    ram = list(map(lambda x: x.strip(), LaptopRetailerRegEx.ram_matcher.findall(title)))
    ssd = list(map(lambda x: x.strip(), LaptopRetailerRegEx.ssd_matcher.findall(title)))
    hard_drive = list(map(lambda x: x.strip(), LaptopRetailerRegEx.hard_drive_matcher.findall(title)))

    # Before getting the other gb attributes, make sure we don't get ones from ssd, hard drive or ram
    for x in ram:
        title = title.replace(x, '')
    
    for x in ssd:
        title = title.replace(x, '')

    for x in hard_drive:
        title = title.replace(x, '')

    other_gb_attrs = list(map(lambda x: x.strip(), LaptopRetailerRegEx.gbtb_matcher.findall(title)))

    return (brand, product_attr, inch, cpu, ram, ssd, hard_drive, other_gb_attrs)

In [16]:
def get_filler_tokens(orig_title: list, imp_tokens: list) -> list:
    """
    Get all of the filler words (words that are not major attributes)
    """

    filler_tokens = []
    for token in orig_title:
        if token not in imp_tokens:
            filler_tokens.append(token)

    return filler_tokens

In [17]:
def remove_filler_tokens(orig_title: list, filler_tokens: list) -> list:
    """"
    Generates new titles with less filler words in it
    """
    
    new_titles = []
    amt_filler_tokens = len(filler_tokens)
    if (len(filler_tokens) > 1):
        for x in range(len(filler_tokens)): # For as many filler tokens are there are, we are going to create that many new titles
            new_title = orig_title.copy()
            filler_tokens_cp = filler_tokens.copy()
            amt_to_remove = random.randint(int(amt_filler_tokens * 0.25), amt_filler_tokens)
            for x in range(amt_to_remove): # Get a random amount of filler tokens to remove
                filler = random.choice(filler_tokens_cp)
                new_title.remove(filler)
                filler_tokens_cp.remove(filler)
            new_titles.append(' '.join(new_title))
    
    return new_titles

In [18]:
def manipulate_ram(attr: str) -> str:
    """
    Uses different ways of saying ram
    """

    attr = attr.split('gb')[0] + 'gb ' + random.choice(LaptopRetailerRegEx.ram_modifiers)
    return attr

In [19]:
def manipulate_ssd(attr: str) -> str:
    """
    Uses different ways of saying an ssd
    """

    if 'gb' in attr:
        type_drive = 'gb '
    else:
        type_drive = 'tb '

    attr = attr.split(type_drive)[0] + type_drive + random.choice(LaptopRetailerRegEx.ssd_modifiers)
    return attr

In [20]:
def manipulate_hard_drive(attr: str) -> str:
    """
    Uses different ways of saying hard drive
    """

    if 'gb' in attr:
        type_drive = 'gb '
    else:
        type_drive = 'tb '

    attr = attr.split(type_drive)[0] + type_drive + random.choice(LaptopRetailerRegEx.hard_drive_modifiers)
    return attr

In [21]:
def manipulate_title_gbtb(titles: list, ssd_attrs: list, hard_drive_attrs: list, ram_attrs: list) -> str:
    """
    Uses the "manipulate" functions to vary the titles
    """

    modified_titles = []
    for x in titles:
        for drive in ssd_attrs:
            x = x.replace(drive, manipulate_ssd(drive))
        for drive in hard_drive_attrs:
            x = x.replace(drive, manipulate_hard_drive(drive))
        for mem in ram_attrs:
            x = x.replace(mem, manipulate_ram(mem))
        
        modified_titles.append(x)

    return modified_titles

In [45]:
def create_pos_laptop_data(df):
    """
    Using the scraped laptop data, create positive pairs
    """

    MAX_POS_TITLES = 6
    temp = []
    for title in df['title']:
        # Get each major attribute of a laptop
        brand, product_attr, inch, cpu, ram, ssd, hard_drive, other_gb_attrs = get_key_attrs(title)

        # Make sure the product is actually a laptop
        if len(ram) == 0 and len(ssd) == 0 and len(hard_drive) == 0 and len(other_gb_attrs) == 0:
            continue

        # Create a "simple" version of the title using only the major attributes
        shuffle = [cpu, *list(map(lambda x: x.split(' '), ram)),
        *list(map(lambda x: x.split(' '), ssd)), 
        *list(map(lambda x: x.split(' '), hard_drive)), 
        *list(map(lambda x: x.split(' '), other_gb_attrs))]
        random.shuffle(shuffle)

        pos_title1 = brand + product_attr + inch
        for x in shuffle:
            pos_title1 = pos_title1 + x

        # Get all of the filler words (words that are not major attributes)
        orig_title = title.split(' ')
        filler_tokens = get_filler_tokens(orig_title, pos_title1)

        # Generate a list of titles that do not have as many filler words
        new_titles = remove_filler_tokens(orig_title, filler_tokens)

        # Change up the less semantically meaningful attributes on drives/ram
        new_titles = manipulate_title_gbtb(new_titles, ssd, hard_drive, ram)
        
        # Choose how many combos we're going to have
        amt_new_titles = MAX_POS_TITLES
        if (len(new_titles) < MAX_POS_TITLES):
            amt_new_titles = len(new_titles)
        
        # Create the combination with the original title
        temp.append([title, ' '.join(pos_title1), 1])
        for x in range(amt_new_titles):
            pos = random.choice(new_titles)
            temp.append([title, pos, 1])
            new_titles.remove(pos)

        # Among the new titles, pair some of them up for more diversity
        combos = list(combinations(new_titles, 2))
        if (len(combos) > 4):
            ran_pairs = random.sample(combos, 4)
            for pair in ran_pairs:
                temp.append([pair[0], pair[1], 1])

    return pd.DataFrame(temp, columns=['title_one', 'title_two', 'label'])
        

In [46]:
pos_titles = create_pos_laptop_data(laptops)

In [47]:
pos_titles = pos_titles.drop_duplicates(subset=['title_one', 'title_two'])

In [48]:
pos_titles

Unnamed: 0,title_one,title_two,label
0,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 14"" 2 core amd ryzen 3 3250u 256gb ssd 8gb ram",1
1,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 14"" computer 2 amd ryzen 3 3250u 8gb ddr4 r...",1
2,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 14"" laptop computer 2 amd ryzen 3 3250u 8gb...",1
3,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp premium 14"" amd ryzen 3 3250u 8gb ram 256gb...",1
4,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 14"" touchscreen amd ryzen 3 3250u 8gb ddr4 ...",1
...,...,...,...
40141,"lenovo thinkpad t490s laptop 14.0"" fhd ips 250...","lenovo thinkpad 14.0"" i5 8365u 8gb 256gb ssd p...",1
40142,"lenovo thinkpad t490s laptop 14.0"" nits i5 836...","lenovo thinkpad t490s 14.0"" nits i5 8365u 8gb ...",1
40143,"lenovo thinkpad t490s laptop 14.0"" nits i5 836...","lenovo thinkpad 14.0"" 250 nits i5 8365u graphi...",1
40144,"lenovo thinkpad 14.0"" ips nits i5 8365u uhd 8g...","lenovo thinkpad 14.0"" fhd ips nits i5 8365u 8g...",1


In [49]:
def replace_drive_attribute(attr, ssd=False):
    """
    Replaces the drive attribute with a new one for negative data creation
    """

    gbs = [64, 128, 256, 484, 512, 768]
    tbs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    if 'gb' in attr:
        type_drive = 'gb'
    else:
        type_drive = 'tb'
    
    orig_amt = int(attr.split(type_drive)[0].strip())
    
    if type_drive == 'gb':
        if orig_amt in gbs:
            gbs.remove(orig_amt)
        if ssd:
            return str(random.choice(gbs)) + random.choice([' ', '']) + 'gb ' + random.choice(LaptopRetailerRegEx.ssd_modifiers)
        else:
            return str(random.choice(gbs)) + random.choice([' ', '']) + 'gb ' + random.choice(LaptopRetailerRegEx.hard_drive_modifiers)
    
    else:
        if orig_amt in tbs:
            tbs.remove(orig_amt)
        if ssd:
            return str(random.choice(tbs)) + random.choice([' ', '']) + 'tb ' + random.choice(LaptopRetailerRegEx.ssd_modifiers)
        else:
            return str(random.choice(tbs)) + random.choice([' ', '']) + 'tb ' + random.choice(LaptopRetailerRegEx.hard_drive_modifiers)


In [50]:
def replace_ram_attribute(attr):
    """
    Replaces the ram attribute with a new one for negative data creation
    """
    
    gbs = [4, 8, 16, 24, 32, 48, 64]
    orig_amt = int(attr.split('gb')[0].strip())
    
    if orig_amt in gbs:
        gbs.remove(orig_amt)
    return str(random.choice(gbs)) + random.choice([' ', '']) + 'gb ' + random.choice(LaptopRetailerRegEx.ram_modifiers)

In [51]:
def replace_other_attribute(attr):
    """
    Replaces an "other" gb attribute with a new one for negative data creation
    """

    gbs = [64, 128, 256, 484, 512, 768]
    tbs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    if 'gb' in attr:
        type_drive = 'gb'
    else:
        type_drive = 'tb'
    
    orig_amt = int(attr.split(type_drive)[0].strip())
    
    if type_drive == 'gb':
        if orig_amt in gbs:
            gbs.remove(orig_amt)
        return str(random.choice(gbs)) + random.choice([' ', '']) + 'gb'
    
    else:
        if orig_amt in tbs:
            tbs.remove(orig_amt)
        return str(random.choice(tbs)) + random.choice([' ', '']) + 'tb'

In [53]:
def create_neg_laptop_data(df):
    """
    Using the scraped laptop data, create positive pairs
    """
    
    temp = []
    for title in df['title']:
        # Get each major attribute of a laptop
        brand, product_attr, inch, cpu, ram, ssd, hard_drive, other_gb_attrs = get_key_attrs(title)

        # Make sure the product is actually a laptop
        if len(ram) == 0 and len(ssd) == 0 and len(hard_drive) == 0 and len(other_gb_attrs) == 0:
            continue
        
        # Substitute negative attributes
        neg_titles = []
        for x in ram:
            neg_title = title.replace(x, replace_ram_attribute(x))
            neg_titles.append(neg_title)
        
        for x in hard_drive:
            neg_title = title.replace(x, replace_drive_attribute(x))

        for x in ssd:
            neg_title = title.replace(x, replace_drive_attribute(x, True))
            neg_titles.append(neg_title)
        
        if (ram == [] and ssd == []):
            for x in other_gb_attrs:
                neg_title = title.replace(x, replace_other_attribute(x))
                neg_titles.append(neg_title)
        
        MAX_NEG_VARIATIONS = 4
        all_neg_titles_variations = []
        for neg_title in neg_titles:
            temp.append([title, neg_title, 0])

            # Get each major attribute of a laptop
            brand, product_attr, inch, cpu, ram, ssd, hard_drive, other_gb_attrs = get_key_attrs(neg_title)

            # Create a "simple" version of the title using only the major attributes
            shuffle = [cpu, *list(map(lambda x: x.split(' '), ram)),
            *list(map(lambda x: x.split(' '), ssd)), 
            *list(map(lambda x: x.split(' '), hard_drive)), 
            *list(map(lambda x: x.split(' '), other_gb_attrs))]
            random.shuffle(shuffle)

            base_neg_title = brand + product_attr + inch
            for x in shuffle:
                base_neg_title = base_neg_title + x
            
            orig_neg_title = neg_title.split(' ')
            filler_tokens = get_filler_tokens(orig_neg_title, base_neg_title)

            # Generate a list of titles that do not have as many filler words
            new_titles = remove_filler_tokens(orig_neg_title, filler_tokens)

            # Change up the less semantically meaningful attributes on drives/ram
            new_titles = manipulate_title_gbtb(new_titles, ssd, hard_drive, ram)
            all_neg_titles_variations.append(new_titles)

            # Add the negative titles
            for idx, new_title in enumerate(new_titles):
                temp.append([title, new_title, 0])
                if (idx + 1 == MAX_NEG_VARIATIONS):
                    break

        # Pair up titles from the negative title variations
        MAX_NEG_VARIATIONS = 5
        if len(all_neg_titles_variations) > 1:
            pot_combos = list(combinations(range(len(all_neg_titles_variations)), 2))
            for x in range(len(pot_combos)):
                idx_pair = random.choice(pot_combos)
                pot_combos.remove(idx_pair)
                try:   
                    for x in range(MAX_NEG_VARIATIONS):
                        t1 = random.choice(all_neg_titles_variations[idx_pair[0]])
                        t2 = random.choice(all_neg_titles_variations[idx_pair[1]])
                        temp.append([t1, t2, 0])

                except IndexError:
                    pass

                if x + 1 == MAX_NEG_VARIATIONS:
                        break

    return pd.DataFrame(temp, columns=['title_one', 'title_two', 'label'])


In [54]:
neg_titles = create_neg_laptop_data(laptops)

In [55]:
neg_titles = neg_titles.drop_duplicates(subset=['title_one', 'title_two'])

In [56]:
neg_titles

Unnamed: 0,title_one,title_two,label
0,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 2021 premium 14"" hd touchscreen laptop comp...",0
1,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 2021 premium 14"" hd touchscreen laptop comp...",0
2,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 2021 14"" touchscreen amd ryzen 3 3250u 24 g...",0
3,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 2021 premium 14"" touchscreen amd ryzen 3 32...",0
4,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 2021 premium 14"" hd 2 core amd ryzen 3 3250...",0
...,...,...,...
49481,"lenovo thinkpad t490s laptop 14.0"" fhd ips 250...","lenovo thinkpad t490s laptop 14.0"" fhd ips 250...",0
49482,"lenovo thinkpad t490s laptop 14.0"" fhd ips 250...","lenovo thinkpad 14.0"" fhd ips i5 8365u uhd 8gb...",0
49483,"lenovo thinkpad t490s laptop 14.0"" fhd ips 250...","lenovo thinkpad 14.0"" ips nits i5 8365u graphi...",0
49484,"lenovo thinkpad t490s laptop 14.0"" fhd ips 250...","lenovo thinkpad 14.0"" fhd ips 250 i5 8365u 8gb...",0


In [57]:
retailer_laptop_df = create_final_data(pos_titles, neg_titles)

In [58]:
retailer_laptop_df

Unnamed: 0,title_one,title_two,label
23936,dell xps 9500 laptop 15 intel core i7 10th gen...,dell xps 9500 laptop 15 intel core i7 i7 10875...,1
40093,lenovo thinkpad p71 workstation laptop windows...,lenovo thinkpad pro intel i7 7700hq 32gb ram 2...,1
9088,"asus vivobook 15.6"" fhd laptop nanoedge intel ...","asus vivobook 15.6"" laptop intel core i5 1035g...",1
11387,"asus vivobook 15.6"" fhd touchscreen laptop 108...","asus vivobook 15.6"" laptop nanoedge intel core...",1
18389,"lenovo 15.6"" legion 5 gaming laptop notebook 8...","lenovo 15.6"" legion 5 laptop i7 64 gb ddr4 ram...",0
...,...,...,...
6519,dell latitude e7450 laptop computer 2.90 ghz i...,dell latitude e7450 computer intel i5 core gen...,1
37747,"asus vivobook 15 15.6"" amd quad core ryzen 7 3...","asus vivobook 15 15.6"" customized amd quad cor...",1
22491,dell precision 7740 laptop 17.3 intel core i5 ...,dell precision 7740 laptop 17.3 intel core i5 ...,1
29417,msi 15 intel i7 1165g7 4 64gb ddr4 memory 512g...,msi modern 15 a11m home business intel i7 1165...,1


In [59]:
retailer_laptop_df.to_csv('data/train/retailer_laptop_data.csv')

In [60]:
retailer_laptop_df.dropna(how='all')

Unnamed: 0,title_one,title_two,label
23936,dell xps 9500 laptop 15 intel core i7 10th gen...,dell xps 9500 laptop 15 intel core i7 i7 10875...,1
40093,lenovo thinkpad p71 workstation laptop windows...,lenovo thinkpad pro intel i7 7700hq 32gb ram 2...,1
9088,"asus vivobook 15.6"" fhd laptop nanoedge intel ...","asus vivobook 15.6"" laptop intel core i5 1035g...",1
11387,"asus vivobook 15.6"" fhd touchscreen laptop 108...","asus vivobook 15.6"" laptop nanoedge intel core...",1
18389,"lenovo 15.6"" legion 5 gaming laptop notebook 8...","lenovo 15.6"" legion 5 laptop i7 64 gb ddr4 ram...",0
...,...,...,...
6519,dell latitude e7450 laptop computer 2.90 ghz i...,dell latitude e7450 computer intel i5 core gen...,1
37747,"asus vivobook 15 15.6"" amd quad core ryzen 7 3...","asus vivobook 15 15.6"" customized amd quad cor...",1
22491,dell precision 7740 laptop 17.3 intel core i5 ...,dell precision 7740 laptop 17.3 intel core i5 ...,1
29417,msi 15 intel i7 1165g7 4 64gb ddr4 memory 512g...,msi modern 15 a11m home business intel i7 1165...,1


In [40]:
get_max_len(retailer_laptop_df)

53

In [78]:
val_data = pd.read_csv('data/train/total_data.csv', skiprows=455000, names=['title_one', 'title_two', 'label', 'index'])

In [79]:
val_data

Unnamed: 0,title_one,title_two,label,index
0,kingston dt100g2 8gbz 8gb usb 2 0 hi speed dat...,dolphin 6110 mobile computer 6110gpb1232e0h ho...,0,
1,pny turbo 256gb usb 3 0 flash drive p fd256tbo...,advanced cable technology fb8830 reviews tweakers,0,
2,amd ryzen 5 3600 6 core 3 6 ghz processor,amd ryzen 5 2400g,0,
3,"asus vivobook 15.6"" fhd touchscreen laptop 108...","asus vivobook 15.6"" fhd touchscreen laptop nan...",0,
4,acer aspire z3 715 wtub one core i5 6400t 2 gh...,acer aspire z3 715 ur52 aio core i5 6400t 2 2g...,1,
...,...,...,...,...
9602,4064gb,4064 gb,1,2031.0
9603,goldtouch go travel laptop tablet stand gtls 0...,goldtouch gtls 0055 mobile laptop stand alumin...,1,
9604,buffalo terastation 7120r enterprise nas serve...,buffalo terastation 7120r enterprise nas serve...,1,
9605,seagate enterprise capacity 3 5 hdd sata 6gb 2...,seagate 8tb enterprise capacity sata 6gb 4k na...,1,


In [80]:
del val_data['index']

In [81]:
val_data

Unnamed: 0,title_one,title_two,label
0,kingston dt100g2 8gbz 8gb usb 2 0 hi speed dat...,dolphin 6110 mobile computer 6110gpb1232e0h ho...,0
1,pny turbo 256gb usb 3 0 flash drive p fd256tbo...,advanced cable technology fb8830 reviews tweakers,0
2,amd ryzen 5 3600 6 core 3 6 ghz processor,amd ryzen 5 2400g,0
3,"asus vivobook 15.6"" fhd touchscreen laptop 108...","asus vivobook 15.6"" fhd touchscreen laptop nan...",0
4,acer aspire z3 715 wtub one core i5 6400t 2 gh...,acer aspire z3 715 ur52 aio core i5 6400t 2 2g...,1
...,...,...,...
9602,4064gb,4064 gb,1
9603,goldtouch go travel laptop tablet stand gtls 0...,goldtouch gtls 0055 mobile laptop stand alumin...,1
9604,buffalo terastation 7120r enterprise nas serve...,buffalo terastation 7120r enterprise nas serve...,1
9605,seagate enterprise capacity 3 5 hdd sata 6gb 2...,seagate 8tb enterprise capacity sata 6gb 4k na...,1


In [84]:
x = val_data.to_numpy()

In [92]:
y

array([['kingston dt100g2 8gbz 8gb usb 2 0 hi speed datatraveler 100 generation ncix',
        'dolphin 6110 mobile computer 6110gpb1232e0h honeywell'],
       ['pny turbo 256gb usb 3 0 flash drive p fd256tbop ge data storage page 4 laptops outlet direct',
        'advanced cable technology fb8830 reviews tweakers'],
       ['amd ryzen 5 3600 6 core 3 6 ghz processor', 'amd ryzen 5 2400g'],
       ...,
       ['buffalo terastation 7120r enterprise nas server 96 tb ts 2rzh96t12d workgroup servers cdwg com',
        'buffalo terastation 7120r enterprise nas server 96 tb ts 2rzh96t12d workgroup servers cdw com'],
       ['seagate enterprise capacity 3 5 hdd sata 6gb 2016 4kn secure 8tb prijzen tweakers',
        'seagate 8tb enterprise capacity sata 6gb 4k native sed 3 5 internal hard drive buy connection public sector solutions'],
       ['6 cell pa3785u 1brs li ion battery toshiba nb300 nb305 mini notebook series bestbatt com',
        'roccat skeltr qwerty zwart prijzen tweakers']], dt

In [93]:
y = y.astype('str')

In [94]:
y.dtype

dtype('<U1071')