In [206]:
import re
import math
import pandas as pd
import os
import random
from tqdm import tqdm
import sys
from nltk.corpus import stopwords
sys.path.append(os.getcwd())
from src.data_creation.laptop_data_creation import LaptopAttributes, populate_spec
from src.preprocessing import unit_matcher, remove_misc
from src.common import create_final_data


In [207]:
def remove_stop_words(phrase):
    '''
    Removes the stop words from a string
    '''

    # Creates the stopwords
    to_stop = stopwords.words('english')
    punctuation = "!”#$%&’()*+,-/:;<=>?@[\]^_`{|}~ "
    for c in punctuation:
        to_stop.append(c)
    to_stop.append('null')
    
    for punc in punctuation:
        phrase = phrase.replace(punc, ' ')
    
    return ' '.join((' '.join([x for x in phrase.split(' ') if x not in to_stop])).split()).lower()

In [225]:
populate_spec()
print(LaptopAttributes.inches)

"""" Set up sets """
laptop_brands = {'gateway', 'panasonic', 'toughbook', 'msi'}
product_attrs = {'vivobook'}
cpu_attributes = {'intel'}

for brand in LaptopAttributes.laptop_brands:
    laptop_brands.add(brand.split(' ')[0].lower())
    product_attrs.add(' '.join(brand.split(' ')[1: ]).lower())

intel_cpu_df = pd.read_csv('data/base/intel_cpus.csv')
intel_cpu_df = intel_cpu_df['title'].map(lambda x: remove_stop_words(x).split(' '))
for i in range(len(intel_cpu_df)):
    cpu_attributes.update(intel_cpu_df.iloc[i])

amd_cpu_df = pd.read_csv('data/base/amd_cpus.csv')
amd_cpu_df = amd_cpu_df['title'].map(lambda x: remove_stop_words(x).split(' '))
for i in range(len(amd_cpu_df)):
    cpu_attributes.update(amd_cpu_df.iloc[i])

laptop_brands = list(laptop_brands)
laptop_brands.sort(key=len, reverse=True)

product_attrs = list(product_attrs)
product_attrs.sort(key=len, reverse=True)

cpu_attributes = list(cpu_attributes)
cpu_attributes.sort(key=len, reverse=True)

cpu_matcher = re.compile("\\b" + "(?!\S)|\\b".join(cpu_attributes) + "(?!\S)", re.IGNORECASE)
brand_matcher = re.compile("\\b" + "(?!\S)|\\b".join(laptop_brands) + "(?!\S)", re.IGNORECASE)
product_attr_matcher = re.compile("\\b" + "(?!\S)|\\b".join(product_attrs) + "(?!\S)", re.IGNORECASE)
gb_matcher = unit_matcher('gb')
tb_matcher = unit_matcher('tb')
#inch_matcher = re.compile('[1][0-9]\"?"? [0-9]?\"?"?(?!\S)', re.IGNORECASE)
inch_matcher = re.compile('[1][0-9]\"?"?\.?[0-9]?\"?"?(?!\S)', re.IGNORECASE)

{'17.0', '13.9', '13.3', '13.5', '18.4', '14.1', '15.6', '12.5', '13.0', '15.4', '14.0', '17.3', '10.1', '11.3', '15.0', '11.6', '12.0', '12.3'}


In [226]:
amazon_laptops = pd.read_csv('data/base/amazon_laptop_titles.csv')
walmart_laptops = pd.read_csv('data/base/walmart_laptop_titles.csv')
newegg_laptops = pd.read_csv('data/base/newegg_laptop_titles.csv')

laptops = remove_misc(pd.concat([amazon_laptops, walmart_laptops, newegg_laptops]))
laptops['title'] = laptops['title'].apply(lambda x: remove_stop_words(x))
laptops = laptops.drop_duplicates(subset=['title'])
laptops

Unnamed: 0,title
0,"hp 2021 premium 14"" hd touchscreen laptop comp..."
1,"2021 newest asus tuf gaming laptop 15.6"" ips f..."
2,acer aspire 5 slim laptop 15.6 inches full hd ...
3,hp chromebook 11 inch laptop up 15 hour batter...
4,hp chromebook 14 inch hd laptop intel celeron ...
...,...
3614,lenovo thinkpad p71 workstation laptop windows...
3615,lenovo thinkpad t480s windows 10 pro laptop in...
3616,"newest dell inspiron 5000 15.6"" touchscreen le..."
3617,lenovo thinkpad p71 workstation laptop windows...


In [227]:
test = remove_stop_words('"Acer Predator Helios 300 15.6"" Gaming Laptop i7-10750H 16GB DDR4 1TB SSD"')

In [228]:
test

'"acer predator helios 300 15.6"" gaming laptop i7 10750h 16gb ddr4 1tb ssd"'

In [229]:
brand_matcher.findall(test)

['acer']

In [230]:
product_attr_matcher.findall(test)

['predator']

In [231]:
inch_matcher.findall(test)

['15.6""']

In [232]:
cpu_matcher.findall(test)

['i7', '10750h']

In [233]:
gb_matcher.findall(test)

[' 16gb']

In [234]:
tb_matcher.findall(test)

[' 1tb']

In [235]:
def create_pos_laptop_data(df):
    for title in df['title']:
        brand = brand_matcher.findall(title)
        product_attr = product_attr_matcher.findall(title)
        inch = inch_matcher.findall(title)
        cpu = cpu_matcher.findall(title)
        gb = gb_matcher.findall(title)
        tb = tb_matcher.findall(title)

        if gb == [] and tb == []:
            continue
        
        shuffle = [cpu, gb, tb]
        random.shuffle(shuffle)
        pos_title1 = ' '.join(brand + product_attr + inch + shuffle[0] + shuffle[1] + shuffle[2])
        
        
