In [273]:
import re
import math
import pandas as pd
import os
import random
from tqdm import tqdm
import sys
from itertools import combinations
from nltk.corpus import stopwords
sys.path.append(os.getcwd())
from src.data_creation.laptop_data_creation import LaptopAttributes, populate_spec
from src.preprocessing import unit_matcher, remove_misc
from src.common import create_final_data


In [207]:
def remove_stop_words(phrase):
    '''
    Removes the stop words from a string
    '''

    # Creates the stopwords
    to_stop = stopwords.words('english')
    punctuation = "!”#$%&’()*+,-/:;<=>?@[\]^_`{|}~ "
    for c in punctuation:
        to_stop.append(c)
    to_stop.append('null')
    
    for punc in punctuation:
        phrase = phrase.replace(punc, ' ')
    
    return ' '.join((' '.join([x for x in phrase.split(' ') if x not in to_stop])).split()).lower()

In [468]:
populate_spec()
print(LaptopAttributes.inches)

"""" Set up sets """
laptop_brands = {'gateway', 'panasonic', 'toughbook', 'msi'}
product_attrs = {'vivobook'}
cpu_attributes = {'intel'}

for brand in LaptopAttributes.laptop_brands:
    laptop_brands.add(brand.split(' ')[0].lower())
    product_attrs.add(' '.join(brand.split(' ')[1: ]).lower())

intel_cpu_df = pd.read_csv('data/base/intel_cpus.csv')
intel_cpu_df = intel_cpu_df['title'].map(lambda x: remove_stop_words(x).split(' '))
for i in range(len(intel_cpu_df)):
    cpu_attributes.update(intel_cpu_df.iloc[i])

amd_cpu_df = pd.read_csv('data/base/amd_cpus.csv')
amd_cpu_df = amd_cpu_df['title'].map(lambda x: remove_stop_words(x).split(' '))
for i in range(len(amd_cpu_df)):
    cpu_attributes.update(amd_cpu_df.iloc[i])

laptop_brands = list(laptop_brands)
laptop_brands.sort(key=len, reverse=True)

product_attrs = list(product_attrs)
product_attrs.sort(key=len, reverse=True)

cpu_attributes = list(cpu_attributes)
cpu_attributes.sort(key=len, reverse=True)

cpu_matcher = re.compile("\\b" + "(?!\S)|\\b".join(cpu_attributes) + "(?!\S)", re.IGNORECASE)
brand_matcher = re.compile("\\b" + "(?!\S)|\\b".join(laptop_brands) + "(?!\S)", re.IGNORECASE)
product_attr_matcher = re.compile("\\b" + "(?!\S)|\\b".join(product_attrs) + "(?!\S)", re.IGNORECASE)
gb_matcher = unit_matcher('gb')
tb_matcher = unit_matcher('tb')
#inch_matcher = re.compile('[1][0-9]\"?"? [0-9]?\"?"?(?!\S)', re.IGNORECASE)
inch_matcher = re.compile('[1][0-9]\"?"?\.?[0-9]?\"?\"? ?\\binch?(?!\S)', re.IGNORECASE)

{'17.0', '13.9', '13.3', '13.5', '18.4', '14.1', '15.6', '12.5', '13.0', '15.4', '14.0', '17.3', '10.1', '11.3', '15.0', '11.6', '12.0', '12.3'}


In [226]:
amazon_laptops = pd.read_csv('data/base/amazon_laptop_titles.csv')
walmart_laptops = pd.read_csv('data/base/walmart_laptop_titles.csv')
newegg_laptops = pd.read_csv('data/base/newegg_laptop_titles.csv')

laptops = remove_misc(pd.concat([amazon_laptops, walmart_laptops, newegg_laptops]))
laptops['title'] = laptops['title'].apply(lambda x: remove_stop_words(x))
laptops = laptops.drop_duplicates(subset=['title'])
laptops

Unnamed: 0,title
0,"hp 2021 premium 14"" hd touchscreen laptop comp..."
1,"2021 newest asus tuf gaming laptop 15.6"" ips f..."
2,acer aspire 5 slim laptop 15.6 inches full hd ...
3,hp chromebook 11 inch laptop up 15 hour batter...
4,hp chromebook 14 inch hd laptop intel celeron ...
...,...
3614,lenovo thinkpad p71 workstation laptop windows...
3615,lenovo thinkpad t480s windows 10 pro laptop in...
3616,"newest dell inspiron 5000 15.6"" touchscreen le..."
3617,lenovo thinkpad p71 workstation laptop windows...


In [227]:
test = remove_stop_words('"Acer Predator Helios 300 15.6"" Gaming Laptop i7-10750H 16GB DDR4 1TB SSD"')

In [228]:
test

'"acer predator helios 300 15.6"" gaming laptop i7 10750h 16gb ddr4 1tb ssd"'

In [229]:
brand_matcher.findall(test)

['acer']

In [230]:
product_attr_matcher.findall(test)

['predator']

In [231]:
inch_matcher.findall(test)

['15.6""']

In [232]:
cpu_matcher.findall(test)

['i7', '10750h']

In [233]:
gb_matcher.findall(test)

[' 16gb']

In [234]:
tb_matcher.findall(test)

[' 1tb']

In [446]:
def create_pos_laptop_data(df):
    MAX_POS_TITLES = 6
    temp = []
    for title in df['title']:
        # Get each major attribute of a laptop
        brand = list(map(lambda x: x.strip(), brand_matcher.findall(title)))
        product_attr = list(map(lambda x: x.strip(), product_attr_matcher.findall(title)))
        inch = list(map(lambda x: x.strip(), inch_matcher.findall(title)))
        cpu = list(map(lambda x: x.strip(),cpu_matcher.findall(title)))
        gb = list(map(lambda x: x.strip(), gb_matcher.findall(title)))
        tb = list(map(lambda x: x.strip(), tb_matcher.findall(title)))

        # Make sure the product is actually a laptop
        if gb == [] and tb == []:
            continue
        
        # Create a "simple" version of the title using only the major attributes
        shuffle = [cpu, gb, tb]
        random.shuffle(shuffle)
        pos_title1 = brand + product_attr + inch + shuffle[0] + shuffle[1] + shuffle[2]
        
        # Get all of the filler words (words that are not major attributes)
        orig_title = title.split(' ')
        filler_tokens = []
        for token in orig_title:
            if token not in pos_title1:
                filler_tokens.append(token)

        # Generate a list of titles that do not have as many filler words
        new_titles = []
        amt_filler_tokens = len(filler_tokens)
        if (len(filler_tokens) > 1):
            for x in range(len(filler_tokens)): # For as many filler tokens are there are, we are going to create that many new titles
                new_title = orig_title.copy()
                filler_tokens_cp = filler_tokens.copy()
                amt_to_remove = random.randint(int(amt_filler_tokens * 0.25), amt_filler_tokens)
                for x in range(amt_to_remove): # Get a random amount of filler tokens to remove
                    filler = random.choice(filler_tokens_cp)
                    new_title.remove(filler)
                    filler_tokens_cp.remove(filler)
                new_titles.append(' '.join(new_title))
        
        # Choose how many combos we're going to have
        amt_new_titles = MAX_POS_TITLES
        if (len(new_titles) < MAX_POS_TITLES):
            amt_new_titles = len(new_titles)
        
        # Create the combination with the original title
        temp.append([title, ' '.join(pos_title1), 1])
        for x in range(amt_new_titles):
            pos = random.choice(new_titles)
            temp.append([title, pos, 1])
            new_titles.remove(pos)
        
    return pd.DataFrame(temp, columns=['title_one', 'title_two', 'label'])
        

In [447]:
pos_titles = create_pos_laptop_data(laptops)

In [448]:
pos_titles = pos_titles.drop_duplicates(subset=['title_two'])

In [449]:
pos_titles

Unnamed: 0,title_one,title_two,label
0,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 14"" 10 8gb 256gb 2 core amd ryzen 3 3250u",1
1,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 14"" hd touchscreen laptop 2 core amd ryzen ...",1
2,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp premium 14"" hd 2 core amd ryzen 3 3250u 8gb...",1
3,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 14"" hd touchscreen 2 core amd ryzen 3 3250u...",1
4,"hp 2021 premium 14"" hd touchscreen laptop comp...","hp 2021 14"" hd touchscreen laptop computer 2 c...",1
...,...,...,...
27529,"lenovo thinkpad t490s laptop 14.0"" fhd ips 250...","lenovo thinkpad laptop 14.0"" ips 250 i5 8365u ...",1
27530,"lenovo thinkpad t490s laptop 14.0"" fhd ips 250...","lenovo thinkpad 14.0"" 250 i5 8365u 8gb 256gb 1...",1
27532,"lenovo thinkpad t490s laptop 14.0"" fhd ips 250...","lenovo thinkpad t490s laptop 14.0"" fhd nits i5...",1
27533,"lenovo thinkpad t490s laptop 14.0"" fhd ips 250...","lenovo thinkpad laptop 14.0"" ips 250 nits i5 8...",1


In [None]:
def create_neg_lapotp_data(df):
    

In [467]:
inch_matcher.findall('hp 2021 premium 14inch hd touchscreen laptop computer 2 core amd ryzen 3 3250u 2.6ghz 8gb ram 256gb ssd no dvd webcam bluetooth wi fi hdmi win 10 s rokc hdmi cable')

[]