In [1]:
import pandas as pd
import sys
import os
import numpy as np
import random
from tqdm import tqdm
from nltk.corpus import stopwords
pd.set_option('mode.chained_assignment', None)

In [2]:
class Common:
    """Constants shared across the notebook."""

    # Number of training examples
    m = 19380

    # Max length of a title to be fed into the model
    MAX_LEN = 44

# Words that commonly come up in laptop titles
modifiers = [
    'premium',
    'new',
    'fast',
    'latest model',
]
add_ins = [
    'USB 3.0', 'USB 3.1 Type-C', 'USB Type-C',
    'Bluetooth', 'WIFI', 'Webcam', 'FP Reader',
    'HDMI', '802.11ac', '802.11 ac',
    'home', 'flagship', 'business',
    'GbE LAN', 'DVD-RW', 'DVD', 'Windows 10',
]

# Wordings used for the storage type when creating laptop data
hard_drive_types = [
    'HDD',
    'Hard Drive',
    'Internal Hard Drive',
]
ssd_types = [
    'SSD',
    'Solid State Drive',
    'M.2 SSD',
    'SATA SSD',
]

# Column names shared by every generated DataFrame
COLUMN_NAMES = [
    'title_one',
    'title_two',
    'label',
]
    
def get_max_len(df):
    """Return the largest number of space-separated pieces in any title.

    Both the title_one and title_two columns are inspected; every title is
    split with ``str.split(' ')``.  An empty frame gives 0.
    """
    piece_counts = (
        len(title.split(' '))
        for pair in df.itertuples()
        for title in (pair.title_one, pair.title_two)
    )
    return max(piece_counts, default=0)

def print_dataframe(df):
    """Print each row's two titles on consecutive lines, followed by a rule."""
    rule = '________________________________________________________________'
    for position in range(len(df)):
        pair = df.iloc[position]
        print(pair.title_one + '\n' + pair.title_two)
        print(rule)

def create_final_data(pos_df, neg_df):
    """Combine positive and negative examples into one shuffled, balanced frame.

    Both inputs are shuffled, the larger one is cut down to the size of the
    smaller one, and the concatenation is shuffled again before it is returned.
    """
    shuffled_pos = pos_df.sample(frac=1)
    shuffled_neg = neg_df.sample(frac=1)
    keep = min(len(shuffled_pos), len(shuffled_neg))
    balanced = pd.concat([shuffled_pos[:keep], shuffled_neg[:keep]])
    return balanced.sample(frac=1)

In [3]:
# ## Data Processing and Organization
# Here, all we really want to do is prepare the data for training. This includes:
# * Simplifying the original data
# * Normalizing the data 
# * Balancing the positive and negative examples
# * Creating the embedding representations that will actually get fed into the neural network
# Organizing and normalizing the data

# Characters that are replaced by a space before stop-word filtering.
# NOTE: the two quote characters are the typographic ” and ’, so the ASCII
# " and ' are NOT stripped (the inch marker `"` stays in generated titles).
_TITLE_PUNCTUATION = "!”#$%&’()*+,-./:;<=>?@[\\]^_`{|}~ "

# Filled on first use so the NLTK stop-word corpus is read only once
_STOP_TOKENS = None


def remove_stop_words(phrase):
    """Strip punctuation and English stop words from a title.

    Every character of the punctuation string is replaced by a space, the
    result is split on single spaces, tokens that are English stop words
    (NLTK), punctuation characters or the literal 'null' are dropped, and
    the remaining text is re-joined with single spaces.

    The comparison is case-sensitive; the phrase is not lower-cased here.

    Args:
        phrase: the title text to normalize.

    Returns:
        The normalized title as a single-space separated string.
    """
    global _STOP_TOKENS
    if _STOP_TOKENS is None:
        # A set gives constant-time membership tests for the filter below
        _STOP_TOKENS = frozenset(stopwords.words('english')) | set(_TITLE_PUNCTUATION) | {'null'}

    for mark in _TITLE_PUNCTUATION:
        phrase = phrase.replace(mark, ' ')

    kept = [token for token in phrase.split(' ') if token not in _STOP_TOKENS]

    # The second split/join collapses the empty tokens left by repeated spaces
    return ' '.join(' '.join(kept).split())

def remove_misc(df):
    """Drop the 'Unnamed: 0' column, then drop any row where every value is NaN."""
    return df.drop(columns=['Unnamed: 0']).dropna(how='all')


## Laptop Data (From Laptop Dataframe)

In [4]:
# This class will be used in order to exchange the different attributes
# to create negative examples
class LaptopAttributes:
    """Pools of known laptop attribute values.

    Each pool is a class-level set that starts with a single value; the sets
    are shared by every user of the class.
    """
    company = {'Apple'}
    product = {'MacBook Pro'}
    inches = {'13.3'}
    cpu = {'Intel Core i5 2.3GHz'}
    ram = {'4GB'}
    memory = {'256GB SSD'}
    gpu = {'Intel HD Graphics 520'}
    screen = {'1440x900'}

    @staticmethod
    def get_all_data():
        """Return a dict mapping each lower-case attribute name to its pool."""
        pool_names = ('company', 'product', 'inches', 'cpu',
                      'ram', 'memory', 'gpu', 'screen')
        return {name: getattr(LaptopAttributes, name) for name in pool_names}


In [5]:
# Create attribute sets
def create_attribute_sets(df):
    """Add every value of the laptop frame's attribute columns to LaptopAttributes.

    Inches values are stored as strings; all other columns are stored as-is.
    """
    column_pools = (
        ('Company', LaptopAttributes.company),
        ('Product', LaptopAttributes.product),
        ('Inches', LaptopAttributes.inches),
        ('Cpu', LaptopAttributes.cpu),
        ('Ram', LaptopAttributes.ram),
        ('Memory', LaptopAttributes.memory),
        ('Gpu', LaptopAttributes.gpu),
        ('ScreenResolution', LaptopAttributes.screen),
    )
    for column, pool in column_pools:
        values = df[column]
        if column == 'Inches':
            pool.update(str(value) for value in values)
        else:
            pool.update(values)


In [26]:
def concatenate_row(row):
    """Build a randomized laptop title string from one row of laptop data.

    The title starts with the company and the product name (everything from
    the first '(' of the product on is cut, because that part repeats spec
    values that would no longer match once specs are swapped).  It is
    followed by the type name and screen size in random order, then the CPU,
    RAM and storage in random order.
    """
    # Suffixes on the screen size and the RAM simulate real listing titles
    inch_attr = str(row['Inches']) + random.choice([' inch', '"'])
    ram_attr = row['Ram'] + random.choice([' ram', ' memory'])

    cpu_attr = row['Cpu']
    if random.choice([0, 1]):
        cpu_tokens = cpu_attr.split(' ')
        for brand_word in ('Intel', 'Core', 'AMD'):
            # One coin flip per word, whether or not the word is present
            if random.choice([0, 1]) and brand_word in cpu_tokens:
                cpu_tokens.remove(brand_word)
        cpu_attr = ' '.join(cpu_tokens)

    lead_attrs = [row['Company'], row['Product'].split('(')[0]]
    type_attrs = [row['TypeName'], inch_attr]
    spec_attrs = [cpu_attr, ram_attr, row['Memory']]  # row['ScreenResolution'] is left out

    random.shuffle(type_attrs)
    random.shuffle(spec_attrs)

    return ' '.join(lead_attrs + type_attrs + spec_attrs)

In [7]:
# NOTE(review): create_neg_laptop_data is defined in two cells of this notebook;
# keep the two definitions in sync or delete one of them.
def create_neg_laptop_data(laptop_df, attributes):
    """Create non-matching (label 0) title pairs by swapping laptop attributes.

    For every row of `laptop_df`, each column named in `attributes` is
    replaced, one after another, by a different value drawn from the matching
    LaptopAttributes pool.  The swaps accumulate on one altered copy of the
    row, so later pairs for the same row differ in more attributes.

    Args:
        laptop_df: the original laptop data.
        attributes: column names to swap; the lower-cased name must be a key
            of LaptopAttributes.get_all_data().

    Returns:
        DataFrame with columns title_one, title_two and label (always 0).

    Raises:
        ValueError: if a pool holds no value different from the row's own.
    """
    new_column_names = ['title_one', 'title_two', 'label']

    # Turn each pool into a list once: random.sample() no longer accepts
    # sets on Python 3.11+, and random.choice() needs a sequence.
    pools = LaptopAttributes.get_all_data()
    candidates = {name: list(pools[name.lower()]) for name in attributes}

    temp = []
    for row in tqdm(range(len(laptop_df))):
        orig_row = laptop_df.iloc[row]
        # Explicit copy so the swaps can never write through to laptop_df
        neg_row = orig_row.copy()
        for attribute_class in attributes:
            attribute_val = orig_row[attribute_class]

            # Compare as text: the Inches pool stores strings while the
            # value in the row may be numeric.
            options = [val for val in candidates[attribute_class]
                       if str(val) != str(attribute_val)]
            if not options:
                raise ValueError(
                    'no alternative value available for {!r}'.format(attribute_class))

            neg_row[attribute_class] = random.choice(options)

            # Concatenate and normalize the data
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(neg_row).lower())

            temp.append([title_one, title_two, 0])

    return pd.DataFrame(temp, columns=new_column_names)

In [8]:
def create_pos_laptop_data(laptop_df, rm_attrs, add_attrs):
    """Create matching (label 1) title pairs by blanking laptop attributes.

    For every row of `laptop_df` and every list in `rm_attrs`, one pair is
    produced: the title of the untouched row and the title of a copy of the
    row in which the listed columns are set to ''.

    Args:
        laptop_df: the original laptop data.
        rm_attrs: list of lists of column names; each inner list yields one
            pair per row.
        add_attrs: not used by this function; kept so existing calls work.

    Returns:
        DataFrame with columns title_one, title_two and label (always 1).
    """
    new_column_names = ['title_one', 'title_two', 'label']
    temp = []
    for row in tqdm(range(len(laptop_df))):
        orig_row = laptop_df.iloc[row]
        for attr_list in rm_attrs:
            # Explicit copy so blanking can never write through to laptop_df
            # (or to orig_row) when .iloc hands back a view
            new_row = orig_row.copy()
            for attr in attr_list:
                new_row[attr] = ''

            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(new_row).lower())

            temp.append([title_one, title_two, 1])

    return pd.DataFrame(temp, columns=new_column_names)

In [10]:
# NOTE(review): create_neg_laptop_data is defined in two cells of this notebook;
# keep the two definitions in sync or delete one of them.
def create_neg_laptop_data(laptop_df, attributes):
    """Create non-matching (label 0) title pairs by swapping laptop attributes.

    For every row of `laptop_df`, each column named in `attributes` is
    replaced, one after another, by a different value drawn from the matching
    LaptopAttributes pool.  The swaps accumulate on one altered copy of the
    row, so later pairs for the same row differ in more attributes.

    Args:
        laptop_df: the original laptop data.
        attributes: column names to swap; the lower-cased name must be a key
            of LaptopAttributes.get_all_data().

    Returns:
        DataFrame with columns title_one, title_two and label (always 0).

    Raises:
        ValueError: if a pool holds no value different from the row's own.
    """
    new_column_names = ['title_one', 'title_two', 'label']

    # Turn each pool into a list once: random.sample() no longer accepts
    # sets on Python 3.11+, and random.choice() needs a sequence.
    pools = LaptopAttributes.get_all_data()
    candidates = {name: list(pools[name.lower()]) for name in attributes}

    temp = []
    for row in tqdm(range(len(laptop_df))):
        orig_row = laptop_df.iloc[row]
        # Explicit copy so the swaps can never write through to laptop_df
        neg_row = orig_row.copy()
        for attribute_class in attributes:
            attribute_val = orig_row[attribute_class]

            # Compare as text: the Inches pool stores strings while the
            # value in the row may be numeric.
            options = [val for val in candidates[attribute_class]
                       if str(val) != str(attribute_val)]
            if not options:
                raise ValueError(
                    'no alternative value available for {!r}'.format(attribute_class))

            neg_row[attribute_class] = random.choice(options)

            # Concatenate and normalize the data
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(neg_row).lower())

            temp.append([title_one, title_two, 0])

    return pd.DataFrame(temp, columns=new_column_names)

In [11]:
def create_laptop_data():
    """Generate and save the laptop training pairs unless the file already exists.

    The laptop CSV is always read and its values are always added to
    LaptopAttributes, even when the output file is already present.
    """
    file_path = 'data/train/final_laptop_data.csv'

    # Load the laptop data and fill the LaptopAttributes pools
    laptop_df = pd.read_csv('data/train/laptops.csv', encoding='latin-1')
    create_attribute_sets(laptop_df)

    if os.path.exists(file_path):
        print('Already have laptop data. Moving on . . . ')
        return

    print('Generating laptop data . . . ')
    # Negative pairs first, then positive pairs
    neg_df = create_neg_laptop_data(laptop_df, attributes=['Cpu', 'Memory', 'Ram', 'Inches', 'Product'])
    pos_df = create_pos_laptop_data(laptop_df, rm_attrs = [['Company'], ['TypeName'], ['Product']], add_attrs = [])

    # Balance, shuffle once more, and save
    final_laptop_df = create_final_data(pos_df, neg_df).sample(frac=1)
    final_laptop_df.to_csv(file_path)


create_laptop_data()

  0%|          | 1/1303 [00:00<02:47,  7.79it/s]

Generating laptop data . . . 


100%|██████████| 1303/1303 [00:03<00:00, 384.32it/s]
100%|██████████| 1303/1303 [00:02<00:00, 589.73it/s]


In [21]:
# Scratch cell: repeats the loading and attribute-set steps that
# create_laptop_data() performs, so that laptop_df exists at top level.
file_path = 'data/train/final_laptop_data.csv'
# Load the laptop data
laptop_df = pd.read_csv('data/train/laptops.csv', encoding='latin-1')

# Create the attribute sets for the LaptopAttributes
create_attribute_sets(laptop_df)

In [12]:
# Needs the top-level laptop_df; the NameError recorded below is presumably
# from running this cell before the cell that loads laptops.csv.
neg_df = create_neg_laptop_data(laptop_df, attributes=['Cpu', 'Memory', 'Ram', 'Inches', 'Product'])

NameError: name 'laptop_df' is not defined

In [94]:
# Show the first negative pair, one title per line
print(neg_df.iloc[0].title_one, neg_df.iloc[0].title_two, sep='\n')

apple macbook pro 13 3 inch ultrabook intel core i5 2 3ghz 128gb ssd 8gb memory
apple macbook pro 13 3 inch ultrabook 8gb ram amd a6 series 7310 2ghz 128gb ssd


In [78]:
# Builds the positive pairs; one pair per row for each of the three removed attributes.
# NOTE(review): the copy-of-a-slice warning recorded below presumably comes from
# assigning into rows obtained through .iloc inside create_pos_laptop_data.
pos_df = create_pos_laptop_data(laptop_df, rm_attrs = [['Company'], ['TypeName'], ['Product']], add_attrs = [])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
100%|███████████████████████████████████████████████████████████| 1303/1303 [00:05<00:00, 254.88it/s]


In [79]:
# Show the first positive pair, one title per line
print(pos_df.iloc[0].title_one, pos_df.iloc[0].title_two, sep='\n')

apple macbook pro 13 3" ultrabook intel core i5 2 3ghz 8gb memory 128gb ssd
macbook pro ultrabook 13 3" 8gb memory 128gb ssd intel core i5 2 3ghz


## Laptop Data from Spec Attributes

In [13]:
class SpecAttributes:
    """
    Different from LaptopAttributes, this is specific for creating spec data.
    The spec data was gathered from PCPartPicker and is used to create more laptop data.
    """
    video_card = {'GeForce RTX 2070'}

    # '2 GB', '4 GB', ... '128 GB'
    ram = ['{} GB'.format(size) for size in range(2, 130, 2)]

    # '120 GB' ... '512 GB' in steps of 8, then '1 TB' ... '7 TB'
    hard_drive = (
        ['{} GB'.format(size) for size in range(120, 513, 8)]
        + ['{} TB'.format(size) for size in range(1, 8)]
    )

    # CPU name -> [cores, core clock]; filled in by populate_spec()
    cpu = {}

    laptop_brands = [
        'Lenovo ThinkPad', 'Lenovo ThinkBook', 'Lenovo IdeaPad', 'Lenovo Yoga', 'Lenovo Legion',
        'HP Envy', 'HP Chromebook', 'HP Spectre', 'HP ZBook', 'HP Probook', 'HP Elitebook',
        'HP Pavilion', 'HP Omen',
        'Dell Alienware', 'Dell Vostro', 'Dell Inspiron', 'Dell Latitude', 'Dell XPS',
        'Dell G Series', 'Dell Precision',
        'Apple Macbook', 'Apple Macbook Air', 'Apple Mac',
        'Acer Aspire', 'Acer swift', 'Acer Spin', 'Acer Switch', 'Acer Extensa',
        'Acer Travelmate', 'Acer Nitro', 'Acer Enduro', 'Acer Predator',
        'Asus ZenBook', 'Asus Vivobook', 'Asus Republic of Gamers', 'Asus ROG', 'Asus TUF GAMING',
    ]

    @staticmethod
    def get_all_data():
        """Return a dict mapping each attribute name to its pool (CPU names for 'cpu')."""
        return dict(
            cpu=SpecAttributes.cpu.keys(),
            ram=SpecAttributes.ram,
            hard_drive=SpecAttributes.hard_drive,
            video_card=SpecAttributes.video_card,
            laptop_brands=SpecAttributes.laptop_brands,
        )


In [25]:
def concatenate_spec_data(row):
    """Build a randomized laptop title string from one row of spec data.

    The title starts with company, product and screen size, followed by the
    storage, CPU and RAM in random order.  The CPU text is varied at random:
    the words 'Intel', 'Core' and 'AMD' may be dropped, and the core count,
    the clock value and the word 'CPU' may be appended.

    Args:
        row: mapping with the keys 'company', 'product', 'inches', 'cpu',
            'ram' and 'hard_drive'; row['cpu'] must be a key of
            SpecAttributes.cpu.

    Returns:
        The title as a single space-joined string.
    """
    # Suffixes on the screen size and the RAM simulate real listing titles
    inch_attr = str(row['inches']) + random.choice([' inch', '"'])
    ram_attr = row['ram'] + random.choice([' ram', ' memory'])

    cpu_attr = row['cpu']
    # Look the details up before the name is altered below
    cores = SpecAttributes.cpu[cpu_attr][0]
    ghz = SpecAttributes.cpu[cpu_attr][1]

    if random.random() > 0.5:
        cpu_tokens = cpu_attr.split(' ')
        for brand_word in ('Intel', 'Core', 'AMD'):
            # One coin flip per word, whether or not the word is present
            if random.choice([0, 1]) and brand_word in cpu_tokens:
                cpu_tokens.remove(brand_word)
        cpu_attr = ' '.join(cpu_tokens)

    # Random chance of putting the cores in the CPU attribute
    if random.random() > 0.7:
        cpu_attr = '{} {} {}'.format(cpu_attr, cores, 'Core')

    # Random chance of putting the GHz in the CPU attribute
    if random.random() > 0.7:
        cpu_attr = '{} {}'.format(cpu_attr, ghz)

    # Random chance of appending the word 'CPU'
    if random.random() > 0.55:
        cpu_attr = '{} {}'.format(cpu_attr, 'CPU')

    order_attrs = [row['company'], row['product'], inch_attr]
    spec_attrs = [row['hard_drive'], cpu_attr, ram_attr]  # row['screen'] is left out

    random.shuffle(spec_attrs)

    return ' '.join(order_attrs + spec_attrs)

In [24]:
def create_pos_spec_data(df, rm_attrs, add_attrs, sample_frac=2.8e-4):
    """Create matching (label 1) title pairs from the spec combinations.

    Only the first ``int(len(df) * sample_frac)`` rows of `df` are used.
    Each row gets a company/product split from its 'brand', a random screen
    size from LaptopAttributes.inches and a random storage size from
    SpecAttributes.hard_drive (replacing the row's own 'hard_drive').  For
    every list in `rm_attrs` one pair is produced: the full title and the
    title of a copy in which the listed keys are set to ''.

    Args:
        df: spec combinations with the columns 'brand', 'cpu', 'hard_drive'
            and 'ram'.
        rm_attrs: list of lists of keys to blank; each inner list yields one
            pair per row.
        add_attrs: not used by this function; kept so existing calls work.
        sample_frac: fraction of the leading rows of `df` to use.

    Returns:
        DataFrame with columns title_one, title_two and label (always 1).
    """
    temp = []
    for row in tqdm(range(int(len(df) * sample_frac))):
        # Explicit copy so the assignments below can never write through to df
        orig_row = df.iloc[row].copy()

        # Set product and company
        brand_parts = orig_row['brand'].split(' ', 1)
        orig_row['company'] = brand_parts[0]
        orig_row['product'] = brand_parts[1]

        # Random screen size, shared by both titles
        orig_row['inches'] = random.choice(list(LaptopAttributes.inches))

        # Random storage size; both titles use the same size and the same
        # family (HDD or SSD) but may use different wordings of that family
        hard_drive_attr = random.choice(list(SpecAttributes.hard_drive))
        drive_type = random.choice([hard_drive_types, ssd_types])

        new_row = orig_row.copy()
        orig_row['hard_drive'] = '{} {}'.format(hard_drive_attr, random.choice(drive_type))
        new_row['hard_drive'] = '{} {}'.format(hard_drive_attr, random.choice(drive_type))

        for attr_list in rm_attrs:
            # Work on a copy so every attr_list starts from the same row
            pos_row = new_row.copy()

            for attr in attr_list:
                pos_row[attr] = ''

            title_one = remove_stop_words(concatenate_spec_data(orig_row).lower())
            title_two = remove_stop_words(concatenate_spec_data(pos_row).lower())

            temp.append([title_one, title_two, 1])

    return pd.DataFrame(temp, columns=COLUMN_NAMES)

In [23]:
def _pick_other(current, candidates):
    """Return a random element of `candidates` that is not equal to `current`."""
    options = [value for value in candidates if value != current]
    if not options:
        raise ValueError('no alternative to {!r} available'.format(current))
    return random.choice(options)


def _random_drive_label():
    """Draw one HDD wording and one SSD wording, then return one of the two."""
    return random.choice([random.choice(hard_drive_types), random.choice(ssd_types)])


def create_neg_spec_laptop(df, attributes, sample_frac=1.91e-4):
    """Create non-matching (label 0) title pairs from the spec combinations.

    Only the first ``int(len(df) * sample_frac)`` rows of `df` are used.
    For every such row and every name in `attributes`, a fresh pair of rows
    is prepared (company/product split from 'brand', random screen size) and
    exactly that one attribute is changed in the negative row.

    Args:
        df: spec combinations with the columns 'brand', 'cpu', 'hard_drive'
            and 'ram'.
        attributes: attribute names to swap.  'inches', 'screen', 'product'
            and 'hard_drive' are handled specially; any other name is looked
            up (lower-cased) in SpecAttributes.get_all_data().
        sample_frac: fraction of the leading rows of `df` to use.

    Returns:
        DataFrame with columns title_one, title_two and label (always 0).

    Raises:
        ValueError: if a pool holds no value different from the row's own.
    """
    temp = []
    inch_pool = list(LaptopAttributes.inches)
    # One product name per brand entry, so each brand is equally likely
    product_pool = [brand.split(' ', 1)[1] for brand in SpecAttributes.laptop_brands]

    for row in tqdm(range(int(len(df) * sample_frac))):
        for attribute_class in attributes:
            # Explicit copy so the assignments below can never write through to df
            orig_row = df.iloc[row].copy()

            # Set product and company
            brand_parts = orig_row['brand'].split(' ', 1)
            orig_row['company'] = brand_parts[0]
            orig_row['product'] = brand_parts[1]

            # Random screen size, shared by both rows unless swapped below
            orig_row['inches'] = random.choice(inch_pool)

            neg_row = orig_row.copy()

            if attribute_class == 'inches':
                neg_row['inches'] = _pick_other(orig_row['inches'], inch_pool)

            elif attribute_class == 'screen':
                # NOTE(review): concatenate_spec_data does not put the screen
                # into the title, so this swap is not visible in the pair.
                orig_row['screen'] = random.choice(list(LaptopAttributes.screen))
                neg_row['screen'] = _pick_other(orig_row['screen'], LaptopAttributes.screen)

            elif attribute_class == 'product':
                neg_row['product'] = _pick_other(orig_row['product'], product_pool)

            elif attribute_class == 'hard_drive':
                new_drive_attr = _pick_other(orig_row['hard_drive'], SpecAttributes.hard_drive)
                # Each row gets its own independently drawn drive wording
                neg_row['hard_drive'] = '{} {}'.format(new_drive_attr, _random_drive_label())
                orig_row['hard_drive'] = '{} {}'.format(orig_row['hard_drive'], _random_drive_label())

            else:
                # The pool may be a dict view or a set; _pick_other turns it
                # into a list, which random.choice() requires
                pool = SpecAttributes.get_all_data()[attribute_class.lower()]
                neg_row[attribute_class] = _pick_other(orig_row[attribute_class], pool)

            # The storage still needs its wording when it was not the swapped
            # attribute; both rows then share the same wording
            if attribute_class != 'hard_drive':
                drive_type = _random_drive_label()
                neg_row['hard_drive'] = '{} {}'.format(neg_row['hard_drive'], drive_type)
                orig_row['hard_drive'] = '{} {}'.format(orig_row['hard_drive'], drive_type)

            # Concatenate and normalize the data
            title_one = remove_stop_words(concatenate_spec_data(orig_row).lower())
            title_two = remove_stop_words(concatenate_spec_data(neg_row).lower())

            temp.append([title_one, title_two, 0])

    return pd.DataFrame(temp, columns=COLUMN_NAMES)

In [17]:
def populate_spec():
    """Read the CPU table and store each CPU in SpecAttributes.cpu.

    Every entry maps the CPU's name to the list [cores, core_clock].
    """
    cpu_df = pd.read_csv('data/train/cpu_data.csv')
    for position in range(len(cpu_df)):
        entry = cpu_df.iloc[position]
        SpecAttributes.cpu[entry['name']] = [entry['cores'], entry['core_clock']]

#     # Getting the video card data into SpecAttributes
#     video_card_df = pd.read_csv('data/train/video-cards-data.csv')
#     temp_iloc = video_card_df.iloc()
#     for idx in range(len(video_card_df)):
#         row = temp_iloc[idx]
#         SpecAttributes.video_card.update([row['chipset']])

In [18]:
def gen_spec_combos():
    """Write every brand / CPU / storage / RAM combination, shuffled, to a CSV.

    WARNING: THIS TAKES A VERY LONG TIME AND YOU MUST HAVE AT LEAST 16GB RAM TO DO THIS
    """
    axes = [
        SpecAttributes.laptop_brands,
        list(SpecAttributes.cpu.keys()),
        SpecAttributes.hard_drive,
        SpecAttributes.ram,
    ]
    grid = np.meshgrid(*axes)

    # One row per combination, one column per attribute
    table = np.array(grid).T.reshape(-1, 4)
    np.random.shuffle(table)

    spec_frame = pd.DataFrame(data=table, columns=['brand', 'cpu', 'hard_drive', 'ram'])
    spec_frame.to_csv('data/train/spec_data.csv')

In [22]:
def create_spec_laptop_data():
    """Generate and save the spec-based training pairs unless the file already exists.

    When the output file is missing, the CPU table is loaded into
    SpecAttributes, the spec combinations are generated if their CSV is
    missing too, and the balanced positive/negative pairs are written out.
    The number of generated pairs is printed.
    """
    file_path = 'data/train/spec_train_data.csv'
    combos_path = 'data/train/spec_data.csv'

    if os.path.exists(file_path):
        print('Already have spec data. Moving on . . .')
        return

    print('Generating general spec data for laptops . . . ')
    populate_spec()
    if not os.path.exists(combos_path):
        print('Generating spec data combinations. WARNING: THIS WILL CONSUME RESOURCES AND TAKE A LONG TIME.')
        gen_spec_combos()

    spec_df = pd.read_csv(combos_path)
    pos_df = create_pos_spec_data(spec_df, rm_attrs=[['company'], ['product']], add_attrs=[])
    neg_df = create_neg_spec_laptop(spec_df, ['cpu', 'ram', 'hard_drive', 'product', 'inches'])

    final_spec_df = create_final_data(pos_df, neg_df)
    print(len(final_spec_df))
    final_spec_df.to_csv(file_path)


create_spec_laptop_data()

Generating general spec data for laptops . . . 


100%|██████████| 3665/3665 [00:14<00:00, 247.00it/s]
100%|██████████| 2500/2500 [00:45<00:00, 55.08it/s]


14660


In [64]:
# Scratch cell: loads the spec combinations CSV (the file gen_spec_combos() writes) at top level.
spec_df = pd.read_csv('data/train/spec_data.csv')

In [68]:
# Fills SpecAttributes.cpu, which concatenate_spec_data() reads for every title.
populate_spec()

In [95]:
# Needs spec_df and a populated SpecAttributes.cpu from the cells above.
spec_neg_df = create_neg_spec_laptop(spec_df, ['cpu', 'ram', 'hard_drive', 'product', 'inches'])

100%|████████████████████████████████████████████████████████████| 2500/2500 [01:16<00:00, 32.83it/s]


In [96]:
# Show one negative spec pair, one title per line
print(spec_neg_df.iloc[500].title_one, spec_neg_df.iloc[500].title_two, sep='\n')

dell vostro 10 1 inch core i7 10700kf 3 8 ghz 26 gb ram 160 gb internal hard drive
dell vostro 10 1" intel core i9 10900kf 3 7 ghz 160 gb internal hard drive 26 gb memory


In [99]:
# Needs spec_df and a populated SpecAttributes.cpu; yields two pairs per used row
# (one with the company blanked, one with the product blanked).
spec_pos_df = create_pos_spec_data(spec_df, rm_attrs = [['company'], ['product']], add_attrs = [])

100%|███████████████████████████████████████████████████████████| 3011/3011 [00:21<00:00, 139.47it/s]


In [101]:
# Show one positive spec pair, one title per line
print(spec_pos_df.iloc[5000].title_one, spec_pos_df.iloc[5000].title_two, sep='\n')

dell latitude 14 0 inch 78 gb ram intel core i5 7400 4 core cpu 208 gb hard drive
latitude 14 0 inch intel core i5 7400 3 ghz 78 gb memory 208 gb hard drive


In [107]:
# Reload the pairs saved by create_laptop_data().
final_laptop_df = pd.read_csv('data/train/final_laptop_data.csv')

In [108]:
# Reload the pairs saved by create_spec_laptop_data().
final_spec_df = pd.read_csv('data/train/spec_train_data.csv')

In [109]:
# Number of pairs in each saved data set, one count per line
print(len(final_laptop_df), len(final_spec_df), sep='\n')

7818
12044
