In [5]:
import re
import math
import pandas as pd
import os
import random
from tqdm import tqdm
import sys
from itertools import combinations
from nltk.corpus import stopwords
sys.path.append(os.getcwd())
from src.data_creation.laptop_data_creation import LaptopAttributes, populate_spec
from src.preprocessing import unit_matcher, remove_misc
from src.common import create_final_data


In [6]:
def remove_stop_words(phrase):
    '''
    Removes the stop words from a string
    '''

    # Creates the stopwords
    to_stop = stopwords.words('english')
    punctuation = "!”#$%&’()*+,-/:;<=>?@[\]^_`{|}~ "
    for c in punctuation:
        to_stop.append(c)
    to_stop.append('null')
    
    for punc in punctuation:
        phrase = phrase.replace(punc, ' ')
    
    return ' '.join((' '.join([x for x in phrase.split(' ') if x not in to_stop])).split()).lower()

In [250]:
populate_spec()
class LaptopRetailerRegEx:
    laptop_brands = {'gateway', 'panasonic', 'toughbook', 'msi'}
    product_attrs = {'vivobook'}
    cpu_attributes = {'intel', 'm 2', '2 core', '4 core', '6 core', '8 core'}

    for brand in LaptopAttributes.laptop_brands:
        laptop_brands.add(brand.split(' ')[0].lower())
        product_attrs.add(' '.join(brand.split(' ')[1: ]).lower())

    intel_cpu_df = pd.read_csv('data/base/intel_cpus.csv')
    intel_cpu_df = intel_cpu_df['title'].map(lambda x: remove_stop_words(x).split(' '))
    for i in range(len(intel_cpu_df)):
        cpu_attributes.update(intel_cpu_df.iloc[i])

    amd_cpu_df = pd.read_csv('data/base/amd_cpus.csv')
    amd_cpu_df = amd_cpu_df['title'].map(lambda x: remove_stop_words(x).split(' '))
    for i in range(len(amd_cpu_df)):
        cpu_attributes.update(amd_cpu_df.iloc[i])

    laptop_brands = list(laptop_brands)
    laptop_brands.sort(key=len, reverse=True)

    product_attrs = list(product_attrs)
    product_attrs.sort(key=len, reverse=True)

    cpu_attributes = list(cpu_attributes)
    cpu_attributes.sort(key=len, reverse=True)

    ram_modifiers = ['memory', 'ram', 'ddr4', 'ddr4 ram', 'ddr4 memory']
    ram_modifiers.sort()

    hard_drive_modifiers = ['hdd', 'hard drive', 'disk drive', 'storage', 'hard drive storage', 'hdd storage']
    hard_drive_modifiers.sort(key=len, reverse=True)

    ssd_modifiers = ['ssd', 'solid state drive', 'solid state disk', 'pcie', 'pcie ssd', 'ssd storage']
    ssd_modifiers.sort(key=len, reverse=True)

    annoying_words = ['windows 10', 'win 10', 'windows 10 in s mode', 'windows', '3.0', '3.1', '3.2', 'optical drive', 'cd drive', 'dvd drive']
    annoying_words.sort(key=len, reverse=True)

    ram_modifier_matcher = re.compile("\\b" + "(?!\S)|\\b".join(ram_modifiers) + "(?!\S)", re.IGNORECASE)
    random_matcher = re.compile("\\b" + "(?!\S)|\\b".join(annoying_words) + "(?!\S)", re.IGNORECASE)
    cpu_matcher = re.compile("\\b" + "(?!\S)|\\b".join(cpu_attributes) + "(?!\S)", re.IGNORECASE)
    brand_matcher = re.compile("\\b" + "(?!\S)|\\b".join(laptop_brands) + "(?!\S)", re.IGNORECASE)
    product_attr_matcher = re.compile("\\b" + "(?!\S)|\\b".join(product_attrs) + "(?!\S)", re.IGNORECASE)
    ram_matcher = re.compile(' ?[0-9]+.{0,1}' + 'gb ?' + '(?:' + '|'.join([x for x in ram_modifiers]) + ')(?!\S)', re.IGNORECASE)
    hard_drive_matcher = re.compile(' ?[0-9]+.{0,1}' + '(?:gb|tb) ?' + '(?:' + '|'.join([x for x in hard_drive_modifiers]) + ')(?!\S)', re.IGNORECASE)
    ssd_matcher = re.compile(' ?[0-9]+.{0,1}' + '(?:gb|tb) ?' + '(?:' + '|'.join([x for x in ssd_modifiers]) + ')(?!\S)', re.IGNORECASE)
    gbtb_matcher = re.compile(' ?[0-9]+.{0,1}' + '(?:gb|tb)' + '(?!\S)', re.IGNORECASE)
    inch_matcher = re.compile('[1][0-9]\"?\"?.?[0-9]?\"?\"? ?(?:inch)?(?!\S)', re.IGNORECASE)
    del laptop_brands, product_attrs, cpu_attributes, intel_cpu_df, amd_cpu_df

In [174]:
amazon_laptops = pd.read_csv('data/base/amazon_laptop_titles.csv')
walmart_laptops = pd.read_csv('data/base/walmart_laptop_titles.csv')
newegg_laptops = pd.read_csv('data/base/newegg_laptop_titles.csv')

laptops = remove_misc(pd.concat([amazon_laptops, walmart_laptops, newegg_laptops]))
laptops['title'] = laptops['title'].apply(lambda x: remove_stop_words(x))
laptops = laptops.drop_duplicates(subset=['title'])
laptops

Unnamed: 0,title
0,"hp 2021 premium 14"" hd touchscreen laptop comp..."
1,"2021 newest asus tuf gaming laptop 15.6"" ips f..."
2,acer aspire 5 slim laptop 15.6 inches full hd ...
3,hp chromebook 11 inch laptop up 15 hour batter...
4,hp chromebook 14 inch hd laptop intel celeron ...
...,...
3614,lenovo thinkpad p71 workstation laptop windows...
3615,lenovo thinkpad t480s windows 10 pro laptop in...
3616,"newest dell inspiron 5000 15.6"" touchscreen le..."
3617,lenovo thinkpad p71 workstation laptop windows...


## Testing the RegEx Expressions

In [219]:
test = remove_stop_words('Acer Predator Helios 300 15.6"" inch Gaming Laptop i7-10750H 16GB DDR4 1TB SSD')

In [220]:
test

'acer predator helios 300 15.6"" inch gaming laptop i7 10750h 16gb ddr4 1tb ssd'

In [199]:
LaptopRetailerRegEx.brand_matcher.findall(test)

['acer']

In [200]:
LaptopRetailerRegEx.product_attr_matcher.findall(test)

['predator']

In [241]:
LaptopRetailerRegEx.inch_matcher.findall(test)

['15.6"" inch']

In [242]:
LaptopRetailerRegEx.cpu_matcher.findall(test)

['i7', '10750h']

In [243]:
LaptopRetailerRegEx.ram_matcher.findall(test)

[' 16gb ddr4']

In [251]:
LaptopRetailerRegEx.ssd_matcher.findall(test)

[' 1tb ssd']

In [252]:
LaptopRetailerRegEx.hard_drive_matcher.findall(test)

[]

## Generate the Data

In [355]:
def get_key_attrs(title:str) -> tuple:
    """
    Get each major attribute of a laptop
    """

    # Remove random words that may end up in the important identifiers
    random_words = set(LaptopRetailerRegEx.random_matcher.findall(title))
    for word in random_words:
        title = title.replace(word, '')

    brand = list(map(lambda x: x.strip(), LaptopRetailerRegEx.brand_matcher.findall(title)))
    product_attr = list(map(lambda x: x.strip(), LaptopRetailerRegEx.product_attr_matcher.findall(title)))
    inch = list(map(lambda x: x.strip(), LaptopRetailerRegEx.inch_matcher.findall(title)))
    cpu = list(map(lambda x: x.strip(), LaptopRetailerRegEx.cpu_matcher.findall(title)))
    ram = list(map(lambda x: x.strip(), LaptopRetailerRegEx.ram_matcher.findall(title)))
    ssd = list(map(lambda x: x.strip(), LaptopRetailerRegEx.ssd_matcher.findall(title)))
    hard_drive = list(map(lambda x: x.strip(), LaptopRetailerRegEx.hard_drive_matcher.findall(title)))

    # Before getting the other gb attributes, make sure we don't get ones from ssd, hard drive or ram
    for x in ram:
        title = title.replace(x, '')
    
    for x in ssd:
        title = title.replace(x, '')

    for x in hard_drive:
        title = title.replace(x, '')

    other_gb_attrs = list(map(lambda x: x.strip(), LaptopRetailerRegEx.gbtb_matcher.findall(title)))

    return (brand, product_attr, inch, cpu, ram, ssd, hard_drive, other_gb_attrs)

In [354]:
def get_filler_tokens(orig_title: list, imp_tokens: list) -> list:
    """
    Get all of the filler words (words that are not major attributes)
    """

    filler_tokens = []
    for token in orig_title:
        if token not in imp_tokens:
            filler_tokens.append(token)

    return filler_tokens

In [356]:
def remove_filler_tokens(orig_title: list, filler_tokens: list) -> list:
    """"
    Generates new titles with less filler words in it
    """
    
    new_titles = []
    amt_filler_tokens = len(filler_tokens)
    if (len(filler_tokens) > 1):
        for x in range(len(filler_tokens)): # For as many filler tokens are there are, we are going to create that many new titles
            new_title = orig_title.copy()
            filler_tokens_cp = filler_tokens.copy()
            amt_to_remove = random.randint(int(amt_filler_tokens * 0.25), amt_filler_tokens)
            for x in range(amt_to_remove): # Get a random amount of filler tokens to remove
                filler = random.choice(filler_tokens_cp)
                new_title.remove(filler)
                filler_tokens_cp.remove(filler)
            new_titles.append(' '.join(new_title))
    
    return new_titles

In [357]:
def manipulate_ram(attr: str) -> str:
    """
    Uses different ways of saying ram
    """

    attr = attr.split('gb')[0] + 'gb ' + random.choice(LaptopRetailerRegEx.ram_modifiers)
    return attr

In [358]:
def manipulate_ssd(attr: str) -> str:
    """
    Uses different ways of saying an ssd
    """

    if 'gb' in attr:
        type_drive = 'gb '
    else:
        type_drive = 'tb '

    attr = attr.split(type_drive)[0] + type_drive + random.choice(LaptopRetailerRegEx.ssd_modifiers)
    return attr

In [359]:
def manipulate_hard_drive(attr: str) -> str:
    """
    Uses different ways of saying hard drive
    """

    if 'gb' in attr:
        type_drive = 'gb '
    else:
        type_drive = 'tb '

    attr = attr.split(type_drive)[0] + type_drive + random.choice(LaptopRetailerRegEx.hard_drive_modifiers)
    return attr

In [360]:
def manipulate_title_gbtb(titles: list, ssd_attrs: list, hard_drive_attrs: list, ram_attrs: list) -> str:
    """
    Uses the "manipulate" functions to vary the titles
    """

    modified_titles = []
    for x in titles:
        for drive in ssd_attrs:
            x = x.replace(drive, manipulate_ssd(drive))
        for drive in hard_drive_attrs:
            x = x.replace(drive, manipulate_hard_drive(drive))
        for mem in ram_attrs:
            x = x.replace(mem, manipulate_ram(mem))
        
        modified_titles.append(x)

    return modified_titles

In [361]:
def create_pos_laptop_data(df):
    """
    Using the scraped laptop data, create positive pairs
    """

    MAX_POS_TITLES = 6
    temp = []
    for title in df['title']:
        # Get each major attribute of a laptop
        brand, product_attr, inch, cpu, ram, ssd, hard_drive, other_gb_attrs = get_key_attrs(title)

        # Make sure the product is actually a laptop
        if ram == [] and ssd == [] and hard_drive and other_gb_attrs == []:
            continue
        
        # Create a "simple" version of the title using only the major attributes
        shuffle = [cpu, *list(map(lambda x: x.split(' '), ram)),
        *list(map(lambda x: x.split(' '), ssd)), 
        *list(map(lambda x: x.split(' '), hard_drive)), 
        *list(map(lambda x: x.split(' '), other_gb_attrs))]
        random.shuffle(shuffle)

        pos_title1 = brand + product_attr + inch
        for x in shuffle:
            pos_title1 = pos_title1 + x

        # Get all of the filler words (words that are not major attributes)
        orig_title = title.split(' ')
        filler_tokens = get_filler_tokens(orig_title, pos_title1)

        # Generate a list of titles that do not have as many filler words
        new_titles = remove_filler_tokens(orig_title, filler_tokens)

        # Change up the less semantically meaningful attributes on drives/ram
        new_titles = manipulate_title_gbtb(new_titles, ssd, hard_drive, ram)
        
        # Choose how many combos we're going to have
        amt_new_titles = MAX_POS_TITLES
        if (len(new_titles) < MAX_POS_TITLES):
            amt_new_titles = len(new_titles)
        
        # Create the combination with the original title
        temp.append([title, ' '.join(pos_title1), 1])
        for x in range(amt_new_titles):
            pos = random.choice(new_titles)
            temp.append([title, pos, 1])
            new_titles.remove(pos)
        
    return pd.DataFrame(temp, columns=['title_one', 'title_two', 'label'])
        

In [362]:
pos_titles = create_pos_laptop_data(laptops)

In [363]:
pos_titles = pos_titles.drop_duplicates(subset=['title_two'])

In [364]:
pos_titles

Unnamed: 0,title_one,title_two,label
0,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 14"" 256gb ssd 8gb ram 2 core amd ryzen 3 3250u",1
1,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 14"" amd ryzen 3 3250u 8gb ddr4 256gb solid ...",1
2,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 14"" amd ryzen 3 3250u 8gb ddr4 256gb ssd",1
3,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 14"" amd ryzen 3 3250u 8gb ddr4 ram 256gb ssd",1
4,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 2021 14"" hd touchscreen computer 2 core amd...",1
...,...,...,...
28119,"lenovo thinkpad t490s laptop 14.0"" fhd ips 250...","lenovo thinkpad 14.0"" i5 8365u 8gb 256gb ssd p...",1
28120,"lenovo thinkpad t490s laptop 14.0"" fhd ips 250...","lenovo thinkpad 14.0"" fhd ips i5 8365u uhd 8gb...",1
28121,"lenovo thinkpad t490s laptop 14.0"" fhd ips 250...","lenovo thinkpad t490s 14.0"" fhd ips 250 nits i...",1
28122,"lenovo thinkpad t490s laptop 14.0"" fhd ips 250...","lenovo thinkpad t490s laptop 14.0"" ips 250 nit...",1


In [None]:
def create_neg_lapotp_data(df):
    """
    Using the scraped laptop data, create positive pairs
    """

    MAX_POS_TITLES = 6
    temp = []
    for title in df['title']:
        # Get each major attribute of a laptop
        brand, product_attr, inch, cpu, ram, ssd, hard_drive, other_gb_attrs = get_key_attrs(title)

        # Make sure the product is actually a laptop
        if ram == [] and ssd == [] and hard_drive and other_gb_attrs == []:
            continue

In [190]:
LaptopRetailerRegEx.drive_matcher.findall('hp 2021 premium 14inch hd touchscreen laptop computer 2 core amd ryzen 3 3250u 2.6ghz 8gb ram 256gb ssd no dvd webcam bluetooth wi fi hdmi win 10 s rokc hdmi cable')

[' 8gb', ' 256gb ssd']