In [None]:
# pandas data analysis library
import pandas as pd

# regular expressions library
import re

# this library allows us to read xlsx files
import xlrd

In [None]:
# load the web scraper's CSV output into a dataframe
df = pd.read_csv(filepath_or_buffer='../data/webscrapper_output.csv')

In [None]:
# let each displayed cell show up to 1000 characters before pandas truncates it
pd.options.display.max_colwidth = 1000

Split the DataFrame into products with variants and products without variants

In [None]:
# boolean mask: True for every row whose product name appears more than once
# (keep=False flags all occurrences, not only the repeats)
variant_products = df.loc[:, 'productname'].duplicated(keep=False)

# inverted mask: True only for rows whose product name is unique
no_variant_products = ~variant_products

In [None]:
# keep only the products without variants
# .copy() makes df an independent dataframe, so the column assignments in the
# later cells modify it directly instead of raising SettingWithCopyWarning
df = df[no_variant_products].copy()

Add Domain To Image Links

In [None]:
def add_domain_to_image_links(image_link):
    """Prefix an image link with the website domain.

    Parameters
    ----------
    image_link : object
        A value taken from one of the image columns.

    Returns
    -------
    str
        'www.wonatrading.com/' followed by image_link when image_link is a
        string; an empty string for any other type (e.g. a missing value).
    """
    # guard clause: only strings can be turned into a link
    if not isinstance(image_link, str):
        return ''

    return 'www.wonatrading.com/' + image_link

In [None]:
# prefix the domain onto every link in both image columns
for image_column in ('image1', 'image2'):
    df[image_column] = df[image_column].apply(add_domain_to_image_links)

Clean Price Column

In [None]:
def clean_price(price):
    """Return the first price found in a raw price value.

    Parameters
    ----------
    price : object
        A value from the price column, normally a string such as '$12.50'.

    Returns
    -------
    object
        The first number in the string (digits with an optional decimal
        part) as a string, or '' when the string contains no number.
        Non-string values (e.g. a missing value) are returned unchanged.
    """
    # nothing to parse in a non-string cell; hand it back instead of raising
    if not isinstance(price, str):
        return price

    # one or more digits, optionally followed by a decimal part; the dot is
    # escaped so it only matches a literal decimal point and multi-digit
    # prices such as '12.50' are not cut short
    first_price = re.search(r'\d+(?:\.\d+)?', price)

    # '' rather than an IndexError when the text holds no number at all
    return first_price.group() if first_price else ''

In [None]:
# replace each raw price with the value returned by clean_price
df['price'] = df['price'].map(clean_price)

Combine all 3 category columns into 1 Column

In [None]:
# build a single category string per product in the form
# category1;category2/category3  (e.g. JEWELRY;ANKLET/FASHION)
df['category'] = (
    df['category1']
    + ';'
    + df['category2']
    + '/'
    + df['category3']
)

# the three source columns are no longer needed once they are combined
df.drop(columns=['category1', 'category2', 'category3'], inplace=True)

Extract Product Custom Fields (Color) from the Description

In [None]:
# regex pattern that finds colors in product description
# two alternatives: a single color ("Color : Gold ") or
# two comma-separated colors ("Color : Gold, Silver")
color_pattern = r'((Color : )\w+\s|(Color : )\w+,\s*\w+\s*)'

# series holding the whole "Color : ..." match (first capture group) for each
# description; rows without a match are NaN
color_series = df['description'].str.extract(color_pattern)[0]

# swap the "Color : " label for "Color="
# regex=False: plain text replacement, the same on every pandas version
# .str.strip(): the pattern also captures the whitespace after the color,
# which would otherwise end up in the output value ("Color=Gold ")
color_series = color_series.str.replace('Color : ', 'Color=', regex=False).str.strip()

color_series

Clean Description Functions

In [None]:
def remove_name_from_product_description(description, product_names=None):
    """Remove product names from a product description.

    A name is removed together with the single whitespace character in
    front of it; matching ignores case and names are matched literally.

    Parameters
    ----------
    description : str
        The description text to clean.
    product_names : iterable of str, optional
        Names to remove. Defaults to the 'productname' column of the
        notebook's global dataframe ``df``.

    Returns
    -------
    str
        The description with every occurrence of a product name removed,
        or the unchanged description when there are no usable names.
    """
    if product_names is None:
        # fall back to the names of the dataframe being cleaned
        product_names = df['productname']

    # keep non-empty strings only (skips missing values) and drop duplicates
    unique_names = {name for name in product_names if isinstance(name, str) and name}

    # longest first, so a short name cannot pre-empt a longer name that
    # starts with it; re.escape makes characters like '(' or '+' literal
    escaped_names = [
        re.escape(name)
        for name in sorted(unique_names, key=lambda name: (-len(name), name))
    ]

    if not escaped_names:
        return description

    # r'\s' requires a whitespace character right before the name
    pattern = r'\s(?:' + '|'.join(escaped_names) + ')'

    return re.sub(pattern, '', description, flags=re.IGNORECASE)

def remove_style_number_from_product_description(description):
    """Strip the ' Style No : <digits>' fragment from a description.

    The fragment is matched case-insensitively and includes the space in
    front of 'Style' and the whitespace character after the digits.

    Parameters
    ----------
    description : str
        The description text to clean.

    Returns
    -------
    str
        The description with every matching fragment removed.
    """
    style_number_regex = re.compile(r'( Style No : )\d+\s', re.IGNORECASE)

    # substitute each match with the empty string
    return style_number_regex.sub('', description)

def remove_colors_from_descrition(description):
    """Strip 'Color : ...' fragments from a description.

    Two shapes are matched, ignoring case:
    a single color followed by whitespace, (Color : )\\w+\\s,
    or two comma-separated colors, (Color : )\\w+,\\s*\\w+\\s*.

    Parameters
    ----------
    description : str
        The description text to clean.

    Returns
    -------
    str
        The description with every matching fragment removed.
    """
    color_regex = re.compile(
        r'((Color : )\w+\s|(Color : )\w+,\s*\w+\s*)',
        re.IGNORECASE,
    )

    # substitute each match with the empty string
    return color_regex.sub('', description)

Clean Description: apply all of the cleaning functions to the DataFrame's description column

In [None]:
# remove style number from description
df['description'] = df['description'].apply(remove_style_number_from_product_description)

# remove colors from description
df['description'] = df['description'].apply(remove_colors_from_descrition)

# list of all product names (missing names are dropped so they cannot break
# the join below)
productname_list = df['productname'].dropna().astype(str).tolist()

# one alternation that matches any product name literally:
# - re.escape keeps characters such as '(' or '+' in a name from being read as regex syntax
# - longest names first, so a short name cannot pre-empt a longer one that starts with it
productname_pattern = '|'.join(
    re.escape(name)
    for name in sorted(set(productname_list), key=lambda name: (-len(name), name))
    if name
)

# remove product names from product description column
# regex=True is explicit: without it, newer pandas versions treat the
# joined pattern as literal text and remove nothing
if productname_pattern:
    df['description'] = df['description'].str.replace(productname_pattern, '', regex=True)

# print description column
df['description']

Change column names to the column names in no variant template

In [None]:
# scraper column names, and the template column name each one maps to
# (the two lists are paired by position)
input_columns = ['productname','category','image1','image2','description','price']
output_columns = ['Product Name', 'Category','Product Image File - 1','Product Image File - 2','Product Description','Price']

# old name -> new name
change_column_names_dictionary = {
    old_name: new_name
    for old_name, new_name in zip(input_columns, output_columns)
}

# rename the dataframe's columns to the names used by the no variant template
# (inplace=True modifies df itself)
df.rename(columns=change_column_names_dictionary, inplace=True)

df

Create Data Frame for No Variant Template

In [None]:
# load the no variant template spreadsheet into a dataframe
no_variant_excel_df = pd.read_excel(io='No Variant Template Excel Spreadsheet.xlsx')

# show the template
no_variant_excel_df

Column Names

In [None]:
# the template's 'Field' column holds the output column names
column_names = no_variant_excel_df.loc[:, 'Field']

# show the column names
column_names

Default Values

In [None]:
# the template's 'EXAMPLE' column holds the default value for each field:
# NaN entries become empty strings, then the series is converted to a list
default_values = (
    no_variant_excel_df
    .loc[:, 'EXAMPLE']
    .fillna(value='')
    .tolist()
)

# show the default values
default_values

No Variant Template Dataframe

In [None]:
# empty dataframe whose columns are the field names read from the template
no_variant_template_df = pd.DataFrame(columns=column_names)

# show the (still empty) dataframe
no_variant_template_df

Add Data to no_variant_template_df

In [None]:
# add the web scraper rows to no_variant_template_df
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# sort=False keeps the template's column order
no_variant_template_df = pd.concat([no_variant_template_df, df], sort=False)

Fill NaN values with default values

In [None]:
# pair each column name with the default value at the same position
column_defaults = dict(zip(no_variant_template_df.columns, default_values))

# fill the NaN cells of every column with that column's default value
# (inplace=True modifies the dataframe itself)
no_variant_template_df.fillna(column_defaults, inplace=True)

# show the dataframe
no_variant_template_df

Match Cost Price Column with Price Column 

In [None]:
# copy the 'Price' column into 'Cost Price'
no_variant_template_df = no_variant_template_df.assign(
    **{'Cost Price': no_variant_template_df['Price']}
)

# show the cost price column
no_variant_template_df['Cost Price']

Fill Option Set with product name

In [None]:
# copy the 'Product Name' column into 'Option Set'
no_variant_template_df = no_variant_template_df.assign(
    **{'Option Set': no_variant_template_df['Product Name']}
)

# show the option set column
no_variant_template_df['Option Set']

In [None]:
# store the extracted colors in 'Product Custom Fields'
# (values are matched to rows by index label)
no_variant_template_df = no_variant_template_df.assign(
    **{'Product Custom Fields': color_series}
)

# show the Product Custom Fields column
no_variant_template_df['Product Custom Fields']

Write dataframe to csv file

In [None]:
# write the finished template to output.csv without the index column
no_variant_template_df.to_csv(path_or_buf='output.csv', index=False)