In [0]:
import numpy as np
from pyspark.sql.functions import col,  struct
from textdistance import smith_waterman, jaccard
from pyspark.sql.types import StringType

In [0]:
# Location of the processed Disqo US dataset (Delta format).
disqo_us_path = "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/disqo_us"

# Read the Delta table into a Spark DataFrame.
disqo_us_df = (
    spark.read
    .option("header", "true")
    .format("delta")
    .load(disqo_us_path)
)

In [0]:
# Page domains of the non-Amazon retailers kept from the Disqo data.
retailers = ['walmart.com', 'costco.com', 'target.com']

# Keep only rows whose page_domain is one of the retailers above.
retailers_us = disqo_us_df.filter(
    col('page_domain').isin(retailers)
)

In [0]:
# Reference taxonomy: top-level category -> list of accepted sub-category labels.
# The matching functions look up dictionary[level_0] and return one of these labels.
category_dictionary = {
    'appliances': [
        'ovens, ranges & cooktops',
        'dishwashers',
        'freezers & ice makers',
        'heating, cooling & air quality',
        'kitchen appliance packages',
        'refrigerators',
        'vacuum cleaners & floor care',
        'washers & dryers',
    ],
    'arts, crafts & sewing': [
        'fabric',
        'crafting',
        'sewing',
        'painting, drawing, & art supplies',
        'scrapbooking & stamping',
        'beading & jewelry making',
        'knitting & crochet',
    ],
    'automotive': [
        'tires & wheels',
        'oils & fluids',
        'auto electronics',
        'automotive tools & equipment',
        'car care',
        'auto parts & accessories',
        'rv parts & accessories',
    ],
    'baby': [
        'nursing & feeding',
        'baby care, health & safety',
        'diapers & wipes',
        'car seats & accessories',
        'strollers & accessories',
        'baby & toddler toys',
        'gear & activity',
        'nursery',
        'bath & potty',
        'pregnancy & maternity',
    ],
    'beauty': [
        'hair care',
        'makeup',
        'skin care',
        'fragrance',
        'foot, hand & nail care',
        'tools & accessories',
        'bath & body',
    ],
    'clothing, shoes & jewelry': [
        'women',
        'men',
        'kids',
        'shoe, jewelry & watch accessories',
        'luggage & travel gear',
    ],
    'electronics': [
        'cell phones & accessories',
        'computers',
        'television & video',
        'audio',
        'car & auto electronics',
        'cameras & photos',
        'security & surveillance',
        'ipad & tablets',
        'video game consoles & accessories',
        'drones',
    ],
    'entertainment': [
        'video games',
        'movies & tv',
        'music & vinyl',
        'books',
    ],
    'gift cards & tickets': ['gift cards & tickets'],
    'grocery & gourmet food': ['grocery & gourmet food'],
    'health & personal care': [
        'personal care',
        'medicines & medical supplies',
        'vitamins & dietary supplements',
        'health care',
        'nutrition',
    ],
    'home & kitchen': [
        'bed & bath',
        'kitchen & dining',
        'furniture',
        'home decor',
        'storage & organization',
        'vacuums & floor care',
        'irons & steamers',
    ],
    'office products': ['office products'],
    'patio, lawn & garden': [
        'outdoor lighting',
        'outdoor power equipment',
        'patio furniture',
        'outdoor decor',
        'grills & outdoor cooking',
        'gardening & lawn care',
        'sheds, shade & outdoor storage',
        'pools, hot tubs & supplies',
    ],
    'pet supplies': ['pet supplies'],
    'sports & outdoors': [
        'exercise & fitness',
        'winter sports',
        'boating & water sports',
        'camping',
        'hunting & fishing',
        'golf',
        'bikes, scooters & boards',
        'general & team sports',
        'fan shop',
        'game room',
        'airsoft & paintball',
        'clothing',
    ],
    'tools & home improvement': [
        'paint, wall treatment & supplies',
        'hardware',
        'safety & security',
        'plumbing',
        'electrical',
        'building materials',
        'tools',
        'welding & soldering',
        'lighting & light bulbs',
    ],
    'toys & games': ['toys & games'],
    'handmade': ['handmade'],
    'industrial & scientific': ['industrial & scientific'],
}

In [0]:
def reverseAmazonLevels(level_0, level_1, level_2):
    """Join the leading non-null category levels into a '|'-separated path.

    Levels are consumed in order and the path stops at the first ``None``;
    anything after that gap is ignored.

    Args:
        level_0: Top-level category string, or None.
        level_1: Second-level category string, or None.
        level_2: Third-level category string, or None.

    Returns:
        '' when level_0 is None, otherwise 'level_0', 'level_0|level_1' or
        'level_0|level_1|level_2' depending on where the first None occurs.
    """
    path = []
    for level in (level_0, level_1, level_2):
        # Identity comparison is the idiomatic (and unambiguous) None test.
        if level is None:
            break
        path.append(level)
    return '|'.join(path)
# Spark UDF wrapper: receives a single struct column and forwards its first
# three fields to reverseAmazonLevels, producing a string column.
# NOTE(review): `udf` is not imported in this notebook's import cell; presumably
# the notebook runtime predefines it — confirm, or import it explicitly.
reverseAmazonLevelsUDF = udf(
    lambda levels: reverseAmazonLevels(levels[0], levels[1], levels[2]),
    StringType(),
)

# Load the Amazon dataset (Delta format).
amazon_us_path = "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/amazon_with_category"
amazon_us_df = (
    spark.read
    .option("header", "true")
    .format("delta")
    .load(amazon_us_path)
)

# Collapse level_0..level_2 into one 'category_hierarchy' string, then drop
# the individual level columns and the match-confidence column.
amazon_us_df = (
    amazon_us_df
    .withColumn(
        'category_hierarchy',
        reverseAmazonLevelsUDF(struct(col('level_0'), col('level_1'), col('level_2'))),
    )
    .drop('level_0', 'level_1', 'level_2', 'level_3', 'level_4', 'level_5', 'level_6', 'level_7', 'Match_Confidence')
)

In [0]:
# Merge Amazon with the other online retailers: project the Amazon frame onto
# retailers_us's columns (in the same order) and append the existing retailers_us rows.
# NOTE(review): retailers_us is overwritten with a union that contains itself, so
# re-running this cell would append the Amazon rows again — confirm it runs exactly once.
retailers_us = amazon_us_df.select(retailers_us.columns).union(retailers_us)

In [0]:
def jaccard(category_line, refer_s):
    """Share of the reference's distinct words that also occur in ``category_line``.

    Both strings are split on single spaces and compared as sets. The
    denominator is the number of distinct reference words (not the size of
    the union), so the score is a containment ratio with respect to
    ``refer_s``. This definition replaces the ``jaccard`` name imported from
    ``textdistance`` at the top of the notebook.

    Args:
        category_line: Candidate label.
        refer_s: Reference label.

    Returns:
        The ratio rounded to two decimals.
    """
    line_words = set(category_line.split(' '))
    reference_words = set(refer_s.split(' '))
    shared_words = line_words & reference_words
    return round(float(len(shared_words)) / len(reference_words), 2)

def matchingCommon(level_1, level_1_ref):
    """Score ``level_1`` against every reference label with two similarity metrics.

    Before scoring, ', ' and ' & ' are replaced by a single space in both the
    candidate and each reference.

    Args:
        level_1: Candidate sub-category label (string).
        level_1_ref: Sequence of reference sub-category labels.

    Returns:
        A 4-tuple ``(combined, word_overlap, smith_waterman_scores, zero_overlap_count)``:
        the element-wise sum of the two metrics, the word-overlap scores from
        ``jaccard``, the Smith-Waterman similarity divided by the normalised
        reference length, and the number of references whose word-overlap
        score is exactly 0.
    """
    def _normalise(text):
        # Drop the list punctuation so only the words remain.
        return text.replace(', ', ' ').replace(' & ', ' ')

    cleaned_level_1 = _normalise(level_1)
    cleaned_refs = [_normalise(ref) for ref in level_1_ref]

    # Word-overlap score per reference.
    jaccard_similarities = np.array(
        [jaccard(cleaned_level_1, cleaned_ref) for cleaned_ref in cleaned_refs]
    )
    jaccard_zero_counter = sum(1 for score in jaccard_similarities if score == 0)

    # Smith-Waterman score per reference, scaled by the reference length.
    smith_waterman_similarities = np.array(
        [smith_waterman.similarity(cleaned_level_1, cleaned_ref) / len(cleaned_ref)
         for cleaned_ref in cleaned_refs]
    )

    # Combined score is the plain sum of both metrics.
    similaries = smith_waterman_similarities + jaccard_similarities

    return similaries, jaccard_similarities, smith_waterman_similarities, jaccard_zero_counter

def matchingAmazon(levels, dictionary):
    """Map an Amazon category path onto a (category, sub-category) pair.

    Hand-written rules for known mismatches are applied first, in order; some
    only rewrite ``level_0``/``level_1`` and fall through, others return
    immediately. Whatever is left is matched against ``dictionary[level_0]``
    with ``matchingCommon``.

    Args:
        levels: Sequence of category names, most general first. Only the first
            three entries are used. ``None`` or an empty sequence is accepted.
        dictionary: Mapping from category name to a list of sub-category
            labels (presumably ``category_dictionary`` — confirm at the caller).

    Returns:
        ``(category, sub_category)``. ``sub_category`` is '' when no
        sub-category applies; both are '' when the path is empty or the
        category is not in ``dictionary``.
    """
    # Test emptiness before len() so a None path returns instead of raising.
    if not levels:
        return '', ''
    length = len(levels)
    level_0 = levels[0]
    level_1 = levels[1] if length > 1 else None
    level_2 = levels[2] if length > 2 else None

    # corner cases for appliances
    if level_0 == 'appliances' and level_1 in ('laundry appliances', 'parts & accessories') and level_2 is not None:
        # These wrapper levels carry no information; use the level below.
        level_1 = level_2
    if level_0 == 'home & kitchen' and level_1 == 'heating, cooling & air quality':
        return 'appliances', level_1
    if level_0 == 'tools & home improvement' and level_1 == 'appliances' and level_2 is not None:
        level_0, level_1 = 'appliances', level_2

    # corner cases for automotive
    if level_0 == 'automotive' and level_1 == 'car electronics & accessories':
        return level_0, 'auto electronics'

    # corner cases for baby
    if level_0 == 'baby' and level_1 == 'diapering':
        return level_0, 'diapers & wipes'
    if level_0 == 'baby' and level_1 == 'apparel & accessories':
        return level_0, ''

    # corner cases for clothing, shoes & jewelry
    if level_0 == 'clothing, shoes & jewelry' and level_1 in ('boys', 'girls', 'baby'):
        return level_0, 'kids'
    if level_0 == 'clothing, shoes & jewelry' and level_1 in ('novelty & more', 'costumes & accessories', 'uniforms, work & safety'):
        return level_0, ''
    if level_0 == 'luggage & travel gear':
        return 'clothing, shoes & jewelry', level_0

    # corner cases for electronics
    if level_0 == 'electronics' and level_1 in ('gps, finders & accessories', 'ebook readers & accessories', 'accessories & supplies'):
        return level_0, ''
    if level_0 == 'electronics' and level_1 == 'headphones':
        return level_0, 'audio'
    if level_0 == 'electronics' and level_1 == 'computers & accessories':
        return level_0, 'computers'
    if level_0 == 'cell phones & accessories':
        return 'electronics', level_0
    if level_0 == 'computers':
        return 'electronics', level_0

    # corner cases for entertainment
    if level_0 in ('video games', 'movies & tv', 'books'):
        return 'entertainment', level_0
    if level_0 == 'cds & vinyl':
        return 'entertainment', 'music & vinyl'

    # corner cases for grocery & gourmet food
    if level_0 == 'grocery & gourmet food':
        return level_0, level_0

    # corner cases for health & personal care
    if level_0 == 'beauty' and level_1 in ('shave & hair removal', 'oral care', 'personal care'):
        return 'health & personal care', 'personal care'
    if level_0 == 'health & personal care' and level_1 in ('vision care', 'oral care', 'baby & child care'):
        return level_0, 'personal care'
    # Both sub-levels are tied to the health category; without grouping,
    # 'household supplies' would match under every top-level category.
    if level_0 == 'health & personal care' and level_1 in ('wellness & relaxation', 'household supplies'):
        return level_0, ''

    # corner cases for home & kitchen
    if level_0 == 'home & kitchen' and level_1 == 'lighting & ceiling fans':
        return level_0, ''
    if level_0 == 'home & kitchen' and level_1 in ("kids' rooms", "teens' rooms"):
        # May leave level_1 as None; handled by the None check before matching.
        level_1 = level_2
    if level_0 == 'home & kitchen' and level_1 == 'bedding':
        return level_0, 'bed & bath'
    if level_0 == 'tools & home improvement' and level_1 == 'storage & home organization':
        return 'home & kitchen', 'storage & organization'
    if level_0 == 'arts, crafts & sewing' and level_1 == 'organization, storage & transport':
        return 'home & kitchen', 'storage & organization'

    # corner cases for pet supplies
    if level_0 == 'pet supplies':
        return level_0, level_0

    # corner cases for sports & outdoors
    if level_0 == 'sports & outdoors' and level_1 == 'sports & fitness':
        if level_2 in ('other sports', 'tennis & racquet sports', 'team sports'):
            return level_0, 'general & team sports'
        if level_2 == 'leisure sports & game room':
            return level_0, 'game room'
        if level_2 in ('airsoft & paintball', 'golf', 'hunting & fishing'):
            return level_0, level_2
        if level_2 in ('swimming', 'boating & sailing', 'water sports'):
            return level_0, 'boating & water sports'
    if level_0 == 'sports & outdoors' and level_1 == 'outdoor recreation':
        if level_2 in ('cycling', 'skates, skateboards & scooters'):
            return level_0, 'bikes, scooters & boards'
        if level_2 == 'winter sports':
            return level_0, level_2
        if level_2 == 'outdoor clothing':
            return level_0, 'clothing'
        if level_2 in ('climbing', 'camping & hiking'):
            return level_0, 'camping'

    # corner cases for gift cards & tickets
    if level_0 == 'gift cards':
        return 'gift cards & tickets', 'gift cards & tickets'

    # corner cases for tools & home improvement
    if level_0 == 'tools & home improvement' and level_1 == 'kitchen & bath fixtures':
        return level_0, ''

    # corner cases for office products
    if level_0 == 'office products':
        return level_0, level_0
    if level_0 == 'electronics' and level_1 == 'office electronics':
        return 'office products', 'office products'

    # single-bucket categories: toys & games, industrial & scientific, handmade
    if level_0 in ('toys & games', 'industrial & scientific', 'handmade'):
        return level_0, level_0

    # Generic path: unknown category -> nothing; known category without a
    # sub-level -> category only; otherwise fuzzy-match the sub-level.
    if level_0 not in dictionary:
        return '', ''
    if level_1 is None:
        return level_0, ''
    level_1_ref = dictionary[level_0]
    similaries, jaccard_similarities, smith_waterman_similarities, jaccard_zero_counter = matchingCommon(level_1, level_1_ref)

    # Best combined score too low to trust.
    if max(similaries) < 0.3:
        return level_0, ''
    # At most one reference shares any word, that overlap is weak, and the
    # string-alignment score is even weaker: treat as no match.
    if jaccard_zero_counter >= (len(level_1_ref) - 1) and max(jaccard_similarities) <= 0.33 and max(smith_waterman_similarities) < max(jaccard_similarities):
        return level_0, ''

    return level_0, level_1_ref[similaries.argmax()]

def matchingWalmart(levels, dictionary):
    """Map a Walmart category path onto a (category, sub-category) pair.

    Walmart-specific names are first normalised and a long list of
    hand-written rules is applied in order; some rules only rewrite
    ``level_0``/``level_1`` and fall through, others return immediately.
    Whatever is left is matched against ``dictionary[level_0]`` with
    ``matchingCommon``.

    Args:
        levels: Sequence of category names, most general first. Only the first
            three entries are used. ``None`` or an empty sequence is accepted.
        dictionary: Mapping from category name to a list of sub-category
            labels (presumably ``category_dictionary`` — confirm at the caller).

    Returns:
        ``(category, sub_category)``. ``sub_category`` is '' when no
        sub-category applies; both are '' when the path is empty or the
        category is not in ``dictionary``.
    """
    # Test emptiness before len() so a None path returns instead of raising.
    if not levels:
        return '', ''
    length = len(levels)
    level_0 = levels[0]
    # The stripped character is an invisible mark kept verbatim from the
    # original code (presumably U+200E LEFT-TO-RIGHT MARK — TODO confirm).
    level_1 = levels[1].strip('‎') if length > 1 else None
    level_2 = levels[2].strip('‎') if length > 2 else None

    if level_0 == 'home':
        level_0 = 'home & kitchen'

    # corner cases for appliances
    if level_0 == 'home & kitchen' and level_1 == 'appliances':
        if level_2 == 'irons, steamers & accessories':
            level_1 = level_2
        else:
            # Promote 'appliances' to the top level; level_1 may become None.
            level_0, level_1 = level_1, level_2
    if level_0 == 'appliances' and level_1 == 'uv light sanitizer wands':
        return level_0, ''
    if level_0 == 'appliances' and level_1 == 'appliances parts & accessories':
        level_1 = level_2
    if level_0 in ('home improvement', 'tools & home improvement') and level_1 == 'heating, cooling, & air quality':
        # Return the reference spelling (no comma before '&').
        return 'appliances', 'heating, cooling & air quality'

    # corner cases for clothing, shoes & jewelry
    if level_0 == 'home & kitchen' and level_1 == 'luggage':
        # Return the reference label rather than the raw Walmart name.
        return 'clothing, shoes & jewelry', 'luggage & travel gear'

    # corner cases for home & kitchen
    if level_0 == 'home & kitchen' and level_1 == 'personalized gifts':
        return level_0, ''
    if level_0 == 'home & kitchen' and level_1 in ("kids' rooms", "teens' rooms"):
        level_1 = level_2
    if level_0 == 'home & kitchen' and level_1 and 'bedding' in level_1:
        return level_0, 'bed & bath'
    if level_0 == 'home & kitchen' and level_1 in ('featured shops', 'shop by brand', 'personalized gifts', 'sustainable home'):
        return level_0, ''
    if level_0 == 'arts crafts & sewing' and level_1 == 'arts & crafts furniture and storage':
        return 'home & kitchen', 'storage & organization'
    if level_0 == 'tools & home improvement' and level_1 == 'garage & storage':
        return 'home & kitchen', 'storage & organization'

    # corner cases for arts, crafts & sewing
    if level_0 == 'arts crafts & sewing':
        level_0 = 'arts, crafts & sewing'
    if level_0 == 'arts, crafts & sewing' and level_1 in ('yarn', 'lion brand', 'cricut', 'sizzix', 'brother', 'david textiles', 'spinrite', 'diy wedding'):
        return level_0, ''
    # NOTE(review): this rule only fires for an empty level_0 and returns an
    # empty category; 'arts, crafts & sewing' may have been intended — confirm.
    if level_0 == '' and level_1 == 'scrapbooking':
        return level_0, 'scrapbooking & stamping'

    # corner cases for automotive
    if level_0 == 'auto & tires':
        level_0 = 'automotive'
    if level_0 == 'automotive' and level_1 in ('auto body', 'automotive interior', 'oem parts', 'exterior car accessories', 'automotive replacement parts'):
        return level_0, 'auto parts & accessories'
    # Both names are scoped to automotive; an unparenthesised `or` would
    # match 'atv & off-road' under any top-level category.
    if level_0 == 'automotive' and level_1 in ('motorcycle', 'atv & off-road'):
        return level_0, ''
    if level_0 == 'automotive' and level_1 == 'tires & accessories':
        return level_0, 'tires & wheels'

    # corner cases for baby
    if level_0 == 'baby' and level_1 == 'baby activities & gear':
        return level_0, 'gear & activity'
    if level_0 == 'baby' and level_1 == 'diapering':
        return level_0, 'diapers & wipes'
    if level_0 == 'baby' and level_1 in ('new baby products', "parent's choice baby products", 'shop by price', 'registry', "kristen & dax's baby registry", 'baby new arrivals', 'baby gift ideas', 'premium baby brands', 'baby best sellers'):
        return level_0, ''
    # Scoped to baby for the same precedence reason as the automotive rule.
    if level_0 == 'baby' and level_1 in ('baby bath & skin care', 'baby shower gifts'):
        return level_0, 'bath & potty'

    # corner cases for beauty
    if level_0 == 'premium beauty':
        level_0 = 'beauty'
        level_1 = level_2
    # NOTE(review): ' black owned beauty brands' has a leading space in the
    # original rule; kept as-is — confirm against the raw data.
    if level_0 == 'beauty' and level_1 in ('beauty by top brands', 'beauty earth day', 'beauty stock up', ' black owned beauty brands', 'seasonal beauty'):
        return level_0, ''
    if level_0 == 'beauty' and level_1 == 'self-care & pampering':
        level_1 = level_2
    if level_0 == 'beauty' and level_1 == 'hairitage':
        return level_0, 'hair care'

    # corner cases for clothing, shoes & jewelry
    if level_0 in ('clothing', 'clothing and accessories'):
        level_0 = 'clothing, shoes & jewelry'
    if level_0 == 'clothing, shoes & jewelry' and level_1:
        if level_1 == 'shoes':
            return level_0, 'shoe, jewelry & watch accessories'
        # 'womens' must be tested first: it also contains 'mens'.
        if 'womens' in level_1:
            return level_0, 'women'
        if 'mens' in level_1:
            return level_0, 'men'
        if level_1 == 'big and tall':
            return level_0, 'men'
        if level_1 in ("valentine's day gift guide", 'fashion brands', 'premium brands', 'pre-owned', "mother's day gift guide"):
            return level_0, ''
        if level_1 in ('back to school clothing', 'school uniform store', 'juniors'):
            return level_0, 'kids'
        if level_1 == 'bags & accessories':
            return level_0, 'luggage & travel gear'
    if level_0 == 'jewelry':
        return 'clothing, shoes & jewelry', 'shoe, jewelry & watch accessories'

    # corner cases for electronics
    if level_0 == 'cell phones':
        return 'electronics', 'cell phones & accessories'
    if level_0 == 'electronics' and level_1 in ('walmart private brand electronics', 'walmart exclusive electronics', 'shop electronics by brand', 'electronics accessories'):
        return level_0, ''
    if level_0 == 'electronics' and level_1 == 'home phones':
        return level_0, 'cell phones & accessories'
    # level_2 can be None for a two-level path, so guard the substring test.
    if level_0 == 'electronics' and level_1 == 'smart home' and level_2 and 'security' in level_2:
        return level_0, 'security & surveillance'

    # corner cases for entertainment
    if level_0 == 'books':
        return 'entertainment', 'books'
    if level_0 == 'video games':
        return 'entertainment', 'video games'
    if level_0 == 'movies & tv shows':
        return 'entertainment', 'movies & tv'
    if level_0 in ('music', 'music on cd or vinyl', 'musical instruments'):
        return 'entertainment', 'music & vinyl'

    # corner cases for gift cards & tickets
    if level_0 in ('gifts & registry', 'gift cards & tickets'):
        return 'gift cards & tickets', 'gift cards & tickets'

    # corner cases for grocery & gourmet food
    if level_0 in ('food', 'unnav'):
        return 'grocery & gourmet food', 'grocery & gourmet food'

    # corner cases for health & personal care
    if level_0 == 'health':
        level_0 = 'health & personal care'
    if level_0 == 'pharmacy':
        return 'health & personal care', 'medicines & medical supplies'
    if level_0 == 'health & personal care' and level_1:
        if level_1 in ('equate', 'health holiday gifts', 'spa & relaxation'):
            return level_0, ''
        if level_1 in ('aromatherapy', 'light therapy', 'diabetes care brands', 'bedroom safety & aids', 'diabetes care', 'diabetes management', 'sleep & snoring aids', 'family planning', 'first aid', 'quit smoking', 'sleep better', 'sleep solutions', 'smoking cessation', 'sound machines'):
            return level_0, 'health care'
        if level_1 in ('foot care', 'massage', 'ear care', 'eye care', 'lip care', 'vision centers', 'self-care & pampering') or 'sexual wellness' in level_1:
            return level_0, 'personal care'
        if level_1 in ('acid reflux', 'allergy and sinus', 'cold cough and flu', 'cough cold and flu medicine', 'cpap products', 'digestive health', 'gas relief', 'laxatives', 'motion sickness & nausea', 'must-have covid-19 supplies', 'nausea medicine', 'otc network', 'pain management', 'pain relievers', 'probiotics', 'restock health essentials', 'stomach pain'):
            return level_0, 'medicines & medical supplies'
        if level_1 in ('keto diet', 'new & trending keto'):
            return level_0, 'vitamins & dietary supplements'
        if level_1 in ('protein & fitness', 'sports medicine & injury recovery solution', 'superfoods & cleanses', 'weight management'):
            return level_0, 'nutrition'
    if level_0 in ('personal care', 'cuidado personal'):
        return 'health & personal care', 'personal care'

    # corner case for office products
    if level_0 == 'office supplies':
        return 'office products', 'office products'

    # corner case for pet supplies
    if level_0 == 'pets':
        return 'pet supplies', 'pet supplies'

    # corner cases for patio, lawn & garden
    if level_0 == 'patio & garden':
        level_0 = 'patio, lawn & garden'
    if level_0 == 'patio, lawn & garden' and level_1 in ('garden center', 'clearance patio & garden', 'live plants'):
        return level_0, 'gardening & lawn care'
    if level_0 == 'patio, lawn & garden' and level_1 == 'outdoor shade':
        return level_0, 'sheds, shade & outdoor storage'

    # corner cases for sports & outdoors
    if level_0 == 'sports & outdoors' and level_1 and '@' in level_1:
        return level_0, ''
    if level_0 == 'sports & outdoors' and level_1 == 'sports':
        if level_2 in ('volleyball', 'tennis & racquets', 'squash', 'softball gear & equipment', 'soccer', 'racquetball', 'pickleball', 'football gear & equipment', 'equestrian gear', 'bowling', 'basketball', 'baseball gear & equipment'):
            return level_0, 'general & team sports'
        if level_2 in ('swimming', 'aqua fitness'):
            return level_0, 'boating & water sports'
        if level_2 in ('ice & roller hockey', 'hockey equipment'):
            return level_0, 'winter sports'
        if level_2 == 'golf equipment':
            return level_0, 'golf'
    # Must run before the clearance level is replaced by level_2 below,
    # otherwise the clearance variant of this rule can never match.
    if level_0 == 'sports & outdoors' and level_1 in ('sports & outdoors clearance', 'recreation') and level_2 in ('game room clearance', 'game room'):
        return level_0, 'game room'
    if level_0 == 'sports & outdoors' and level_1 == 'sports & outdoors clearance':
        level_1 = level_2
    if level_0 == 'sports & outdoors' and level_1 == 'recreation' and level_2 == 'skateboards & skates':
        return level_0, 'bikes, scooters & boards'
    if level_0 == 'sports & outdoors' and level_1 == 'outdoor sports' and level_2:
        level_1 = level_2
    if level_0 == 'sports & outdoors' and level_1 == 'snowsports':
        return level_0, 'winter sports'
    if level_0 == 'sports & outdoors' and level_1 in ('shooting', 'self defense tools'):
        return level_0, 'hunting & fishing'
    if level_0 == 'sports & outdoors' and level_1 and (level_1 == 'paddling' or 'boat' in level_1):
        return level_0, 'boating & water sports'
    if level_0 == 'sports & outdoors' and level_1 == 'recreational shooting':
        return level_0, 'airsoft & paintball'
    # NOTE(review): only fires for an empty level_0; 'sports & outdoors' may
    # have been intended — confirm.
    if level_0 == '' and level_1 in ('the eddie bauer shop', 'the mossy oak shop', 'the realtree shop'):
        return level_0, ''

    # corner cases for tools & home improvement
    if level_0 == 'home improvement':
        level_0 = "tools & home improvement"
    if level_0 == 'tools & home improvement' and level_1 and 'renovation' in level_1:
        level_1 = level_2
    if level_0 == 'tools & home improvement' and level_1 and 'plumbing' in level_1:
        return level_0, 'plumbing'
    if level_0 == 'tools & home improvement' and level_1 and 'lighting' in level_1:
        return level_0, 'lighting & light bulbs'
    if level_0 == 'tools & home improvement' and level_1 == 'paint':
        return level_0, 'paint, wall treatment & supplies'
    if level_0 == 'tools & home improvement' and level_1 == 'water purification':
        return level_0, ''

    # corner cases for toys & games
    if level_0 == 'toys':
        return 'toys & games', 'toys & games'

    # single-bucket categories: industrial & scientific, handmade
    if level_0 in ('industrial & scientific', 'handmade'):
        return level_0, level_0

    # Generic path. Unlike matchingAmazon, a missing level_1 returns level_0
    # even when level_0 is not a known category.
    if level_1 is None:
        return level_0, ''
    if level_0 not in dictionary:
        return '', ''
    level_1_ref = dictionary[level_0]
    similaries, jaccard_similarities, smith_waterman_similarities, jaccard_zero_counter = matchingCommon(level_1, level_1_ref)

    # Best combined score too low to trust.
    if max(similaries) < 0.3:
        return level_0, ''
    # At most one reference shares any word, that overlap is weak, and the
    # string-alignment score is even weaker: treat as no match.
    if jaccard_zero_counter >= (len(level_1_ref) - 1) and max(jaccard_similarities) < 0.33 and max(smith_waterman_similarities) < max(jaccard_similarities):
        return level_0, ''

    return level_0, level_1_ref[similaries.argmax()]

def matchingTarget(levels, dictionary):
    """Map a Target category breadcrumb onto the reference (level_0, level_1) pair.

    Hand-written corner cases are evaluated first, in order; several of them
    rewrite ``level_0``/``level_1`` so that later rules (or the final fuzzy
    match) see the re-rooted values. Whatever is not settled by a corner case
    is matched against ``dictionary[level_0]`` via ``matchingCommon``.

    Args:
        levels: breadcrumb segments, most general first (the caller lower-cases
            and splits the raw string on '|'). May be None or empty.
        dictionary: reference taxonomy, level-0 name -> list of level-1 names.

    Returns:
        Tuple ``(level_0, level_1)``. ``level_1`` is '' when no sub-category is
        selected; ``('', '')`` when ``levels`` is empty/None or the resolved
        ``level_0`` is not a key of ``dictionary``.
    """
    # Guard first: calling len() on None used to raise before this check ran.
    if not levels:
        return '', ''
    length = len(levels)
    level_0 = levels[0]
    # The stripped character is an invisible mark found around some segments
    # (NOTE(review): looks like U+200E LEFT-TO-RIGHT MARK - confirm).
    level_1 = levels[1].strip('‎') if length > 1 else None
    level_2 = levels[2].strip('‎') if length > 2 else None

    # corner case for home & kitchen that has to run BEFORE 'home appliances'
    # is re-rooted to 'appliances' below; placed after that rewrite it could
    # never match.
    # NOTE(review): confirm 'iron & steamers' is the spelling used in the dictionary.
    if level_0 == 'home & kitchen' and level_1 == 'home appliances' and level_2 == 'irons, steamers & accessories':
        return level_0, 'iron & steamers'

    # corner cases for appliances
    if level_0 == 'home & kitchen' and level_1 == 'heating, cooling & air quality':
        return 'appliances', level_1
    if level_0 == 'home & kitchen' and level_1 == 'home appliances':
        level_0 = 'appliances'
        level_1 = level_2
    if level_0 == 'appliances' and level_1 in ['refurbished & reconditioned appliances', 'home appliances deals']:
        return level_0, ''
    if level_0 == 'kitchen & dining' and level_1 == 'kitchen appliances' and level_2 == 'freezers & ice makers':
        return 'appliances', level_2
    if level_0 == 'kitchen & dining' and level_2 and level_1 in ['kitchen appliances', 'cookware'] and ('oven' in level_2 or 'cooktops' in level_2):
        return 'appliances', 'ovens, ranges & cooktops'
    # 'frige' is a substring of 'refrigerators'
    if level_0 == 'kitchen & dining' and level_2 and level_1 == 'kitchen appliances' and 'frige' in level_2:
        return 'appliances', 'refrigerators'
    if level_0 == 'kitchen & dining':
        return 'home & kitchen', 'kitchen & dining'

    # corner cases for home & kitchen
    if level_0 == 'home & kitchen' and level_1 == "kids' rooms" and level_2:
        # drop the first six characters of level_2
        # (presumably a leading "kids' " prefix - verify against the data)
        level_1 = level_2[6:]
    if level_0 == 'home & kitchen' and level_1 == 'bedding':
        return level_0, 'bed & bath'
    if level_0 == 'home & kitchen' and level_1 == 'decor style ideas':
        return level_0, 'home decor'
    if level_0 == 'home & kitchen' and level_1 == 'home new arrivals':
        return level_0, ''

    # corner cases for home improvement
    if level_0 == 'home & kitchen' and level_1 == 'home improvement':
        level_0 = 'tools & home improvement'
        level_1 = level_2

    # corner cases for arts, crafts & sewing
    if level_0 == 'home & kitchen' and level_1 in ['art, crafts & sewing', 'arts, crafts & sewing', 'art, crafts', 'arts & crafts']:
        level_0 = 'arts, crafts & sewing'
        level_1 = level_2
    if level_0 == 'appliances' and level_1 and 'sewing' in level_1:
        return 'arts, crafts & sewing', 'sewing'

    # corner cases for automotive
    if level_0 == 'home & kitchen' and level_1 == 'automotive':
        level_0 = level_1
        level_1 = level_2
    if level_0 == 'automotive' and level_2 in ['exterior car accessories', 'interior car accessories']:
        return level_0, 'auto parts & accessories'

    # corner cases for baby
    if level_0 == 'baby' and level_1 == 'diapering':
        return level_0, 'diapers & wipes'
    if level_0 == 'health & household' and level_1 == 'disposable diapers':
        return 'baby', 'diapers & wipes'

    # corner cases for beauty
    if level_0 == 'beauty' and level_1 == 'nails':
        return level_0, 'foot, hand & nail care'

    # corner cases for clothing, shoes & jewelry
    if level_0 == 'clothing, shoes & accessories':
        level_0 = 'clothing, shoes & jewelry'
    if level_0 == 'clothing, shoes & jewelry' and level_1:
        # order matters: 'women' contains 'men', so it is tested first
        for needle, audience in (('women', 'women'), ('men', 'men'), ('kid', 'kids')):
            if needle in level_1:
                return level_0, audience
    if level_0 == 'shoes':
        return 'clothing, shoes & jewelry', 'shoe, jewelry & watch accessories'
    if 'women' in level_0:
        return 'clothing, shoes & jewelry', 'women'
    if level_0 == 'men':
        return 'clothing, shoes & jewelry', 'men'
    if 'kids' in level_0:
        return 'clothing, shoes & jewelry', 'kids'
    if level_0 == 'luggage':
        return 'clothing, shoes & jewelry', 'luggage & travel gear'

    # corner cases for electronics
    if level_0 == 'electronics' and level_1 == 'tvs & home theater':
        return level_0, 'television & video'
    if level_0 == 'electronics' and level_1 in ['batteries', 'home phones']:
        return level_0, 'cell phones & accessories'
    if level_0 == 'electronics' and level_1 in ['headphones', 'electronics cleaning supplies']:
        return level_0, ''

    # corner cases for entertainment
    if level_0 == 'video games':
        return 'entertainment', level_0
    if level_0 == 'movies, music & books' and level_1 == 'books':
        return 'entertainment', level_1
    if level_0 == 'movies, music & books' and level_1 == 'music':
        return 'entertainment', "music & vinyl"
    if level_0 == 'movies, music & books' and level_1 == 'movies':
        return 'entertainment', "movies & tv"

    # corner cases for gift cards & tickets
    if 'gift' in level_0:
        return 'gift cards & tickets', 'gift cards & tickets'

    # corner cases for grocery
    if level_0 == 'grocery':
        return 'grocery & gourmet food', 'grocery & gourmet food'

    # corner cases for health & personal care
    if level_0 == 'health' and level_1 in ['first aid', 'incontinence']:
        return 'health & personal care', 'health care'
    if level_0 == 'personal care':
        return 'health & personal care', 'personal care'
    if level_0 == 'health' and level_1 in ['feminine products', 'sexual health', 'foot care', 'eye care']:
        return 'health & personal care', 'personal care'
    if level_0 == 'health' and level_1 in ["kids' health & wellness", "health & wellness", "health & beauty wellness"]:
        return 'health & personal care', ""
    if level_0 == 'health':
        level_0 = 'health & personal care'

    # corner cases for school & office supplies
    if level_0 == 'school & office supplies':
        return 'office products', 'office products'

    # corner cases for pet supplies
    if level_0 == 'pets':
        return 'pet supplies', 'pet supplies'

    # corner cases for patio, lawn & garden
    if level_0 == 'patio & garden':
        level_0 = 'patio, lawn & garden'

    # corner cases for sports & outdoors
    if level_0 == 'sports & outdoors' and level_1 == 'bikes & cycling':
        return level_0, 'bikes, scooters & boards'
    if level_0 == 'sports & outdoors' and level_1 == 'coolers':
        return level_0, 'camping'
    if level_0 == 'sports & outdoors' and level_1 == 'sports equipment':
        if level_2 in ['baseball equipment & gear', 'basketball equipment & gear', 'football equipment & gear',
                       'volleyball equipment & gear', 'soccer equipment & gear', 'softball equipment & gear',
                       't ball equipment & gear', 'tennis equipment & gear', 'wrestling equipment & gear',
                       'pickleball equipment & gear', 'training & coaching aids', 'hockey equipment & gear']:
            return level_0, 'general & team sports'
        if level_2 == 'figure skating equipment & gear':
            return level_0, 'bikes, scooters & boards'
        if level_2 == 'snowboarding equipment & gear':
            return level_0, "winter sports"
        # any other third level replaces the generic 'sports equipment' label
        if level_2:
            level_1 = level_2
    if level_0 == 'sports & outdoors' and level_1 == 'outdoor recreation':
        if level_2:
            if level_2 in ['hiking', 'hiking gear', 'coolers', 'sleeping bags']:
                return level_0, 'camping'
            if level_2 in ['swimming gear', 'swimming pools']:
                return level_0, 'boating & water sports'
            level_1 = level_2
    if level_0 == 'sports & outdoors' and level_1 == 'camping & outdoor recreation':
        if level_2:
            if level_2 == 'fishing gear':
                return level_0, 'hunting & fishing'
            if level_2 in ['swimming gear', 'swimming pools']:
                return level_0, 'boating & water sports'
            return level_0, 'camping'
    # 'sports equipment' only survives to this point when there was no third level
    if level_0 == 'sports & outdoors' and level_1 in ['sports & outdoors gift ideas', 'new in sports, outdoors, & recreation', 'sports equipment']:
        return level_0, ''

    # corner cases for toys & games
    if level_0 == 'toys':
        return 'toys & games', 'toys & games'

    # corner cases for handmade
    if level_0 == 'handmade':
        return level_0, level_0

    # no corner case applied: fall back to fuzzy matching inside the reference taxonomy
    if level_0 not in dictionary:
        return '', ''
    if level_1 is None:
        return level_0, ''
    level_1_ref = dictionary[level_0]
    similarities, jaccard_similarities, smith_waterman_similarities, jaccard_zero_counter = matchingCommon(level_1, level_1_ref)
    # best candidate too dissimilar: keep only the top-level category
    if max(similarities) < 0.3:
        return level_0, ''
    # Reject weak matches as well: (nearly) every reference has a zero Jaccard
    # score and neither metric is convincing.
    # NOTE(review): interpretation of jaccard_zero_counter is inferred from its
    # name - confirm in matchingCommon.
    if jaccard_zero_counter >= (len(level_1_ref) - 1) and max(jaccard_similarities) < 0.33 and max(smith_waterman_similarities) < max(jaccard_similarities):
        return level_0, ''

    return level_0, level_1_ref[similarities.argmax()]

def matchingCostco(levels, dictionary):
    """Map a Costco category breadcrumb onto the reference (level_0, level_1) pair.

    Hand-written corner cases are evaluated first, in order; some of them
    rename ``level_0`` so that later rules (or the final fuzzy match) see the
    reference name. Whatever is not settled by a corner case is matched
    against ``dictionary[level_0]`` via ``matchingCommon``.

    Args:
        levels: breadcrumb segments, most general first (the caller lower-cases
            and splits the raw string on '|'). May be None or empty.
        dictionary: reference taxonomy, level-0 name -> list of level-1 names.

    Returns:
        Tuple ``(level_0, level_1)``. ``level_1`` is '' when no sub-category is
        selected; ``('', '')`` when ``levels`` is empty/None, the breadcrumb is
        explicitly excluded, or the resolved ``level_0`` is not a key of
        ``dictionary``.
    """
    # Guard first: calling len() on None used to raise before this check ran.
    if not levels:
        return '', ''
    length = len(levels)
    level_0 = levels[0]
    # The stripped character is an invisible mark found around some segments
    # (NOTE(review): looks like U+200E LEFT-TO-RIGHT MARK - confirm).
    level_1 = levels[1].strip('‎') if length > 1 else None
    level_2 = levels[2].strip('‎') if length > 2 else None

    # corner cases for appliances
    if level_0 == 'home & kitchen' and level_1 == 'small kitchen appliances':
        return 'appliances', 'kitchen appliance packages'
    if level_0 == 'appliances' and level_1 == 'appliance warehouse displays':
        return level_0, ''
    if level_0 == 'appliances' and level_1 == 'wine cellars & coolers':
        return 'appliances', 'freezers & ice makers'
    if level_0 == 'appliances' and level_1 in ['cooking appliances', 'microwaves']:
        return level_0, 'ovens, ranges & cooktops'

    # corner cases for arts, crafts & sewing
    if level_0 == 'toys & books' and level_1 == 'arts, crafts & hobby':
        return 'arts, crafts & sewing', ''

    # corner cases for automotive
    if level_0 == 'tires & auto':
        level_0 = 'automotive'
    if level_0 == 'automotive' and level_1 == 'atv & powersport accessories':
        return level_0, 'auto parts & accessories'
    if level_0 == 'automotive' and level_1 == 'garage & shop accessories':
        return level_0, 'automotive tools & equipment'
    if level_0 == 'automotive' and level_1 == 'motor oil':
        return level_0, 'oils & fluids'

    # corner cases for baby
    if level_0 == 'baby' and level_1 == 'baby essentials':
        return level_0, ''
    if level_0 == 'baby' and level_1 == 'kids tables & chairs':
        return level_0, 'nursery'

    # corner cases for beauty
    if level_0 == 'beauty' and level_1 == 'nail care':
        return level_0, 'foot, hand & nail care'
    if level_0 == 'beauty' and level_1 in ['sunscreen', 'sun care', 'sunscreen & sun care']:
        return level_0, 'skin care'

    # corner cases for clothing, shoes & jewelry
    if level_0 == 'clothing, luggage & handbags':
        level_0 = "clothing, shoes & jewelry"
    if level_0 == "clothing, shoes & jewelry" and level_1 == 'handbags & wallets':
        return level_0, 'luggage & travel gear'
    if level_0 == 'clothing, shoes & jewelry' and level_1:
        if 'boys' in level_1 or 'girls' in level_1:
            return level_0, 'kids'
        # order matters: 'women' contains 'men', so it is tested first
        if 'women' in level_1:
            return level_0, 'women'
        if 'men' in level_1:
            return level_0, 'men'
    if level_0 == 'jewelry, watches & sunglasses':
        return 'clothing, shoes & jewelry', 'shoe, jewelry & watch accessories'
    if level_0 == 'clothing, shoes & jewelry' and level_1 == 'footwear':
        return level_0, 'shoe, jewelry & watch accessories'

    # corner cases for electronics
    if level_0 == 'computers':
        return 'electronics', level_0
    if level_0 == 'electronics' and level_1 == 'smart home & safety':
        return level_0, 'security & surveillance'
    if level_0 == 'electronics' and level_1 == 'cameras & camcorders' and level_2 == 'drones':
        return level_0, level_2
    if level_0 == 'electronics' and level_1 == 'tvs':
        return level_0, 'television & video'
    if level_0 == 'electronics' and level_1 == 'video games':
        return 'entertainment', level_1
    if level_0 == 'electronics' and level_1 in ['digital codes', 'batteries']:
        return level_0, ''

    # corner cases for entertainment
    if level_0 == 'electronics' and level_1 == 'musical instruments':
        return 'entertainment', 'music & vinyl'
    if level_0 == 'toys & books' and level_1 == 'books':
        return 'entertainment', level_1
    if level_0 == 'movies & tv shows':
        return 'entertainment', 'movies & tv'

    # corner cases for gift cards & tickets
    if level_0 == 'gift cards & tickets':
        return level_0, level_0

    # corner cases for pet supplies
    if level_0 == 'food, household & pet' and level_1 == 'pet supplies':
        return 'pet supplies', 'pet supplies'

    # corner cases for grocery (every path of this level_0 returns)
    if level_0 == 'grocery, household essentials & pet':
        if level_1 and (level_1 == 'pet supplies' or 'dog' in level_1):
            return 'pet supplies', 'pet supplies'
        # NOTE(review): '2-day deliver' looks truncated ("2-day delivery"?) and
        # is compared for equality - confirm against the data.
        if level_1 in ['2-day deliver', 'cleaning supplies', 'paper & plastic products']:
            return '', ''
        if not level_1:
            return '', ''
        return 'grocery & gourmet food', 'grocery & gourmet food'
    if level_0 == 'gourmet foods':
        return 'grocery & gourmet food', 'grocery & gourmet food'
    if level_0 == 'food, household & pet':
        return 'grocery & gourmet food', 'grocery & gourmet food'

    # corner cases for health & personal care
    if level_0 == 'health & personal care' and level_1 == 'health & medicines':
        return level_0, 'medicines & medical supplies'
    if level_0 == 'health & personal care' and level_1 == 'travel immunizations':
        return level_0, 'health care'
    if level_0 == 'health & personal care' and level_1 == 'massage equipment':
        return level_0, 'personal care'

    # corner cases for office products
    if level_0 == 'office products':
        return level_0, level_0
    # must stay ahead of the generic furniture rule further down
    if level_0 == 'furniture' and level_1 == 'office furniture':
        return 'office products', 'office products'

    # corner cases for toys & games
    if level_0 == 'toys & books':
        return 'toys & games', 'toys & games'

    # corner cases for home & kitchen
    if level_0 == 'home & kitchen' and level_1 in ['cookware & bakeware', 'tableware']:
        return level_0, 'kitchen & dining'
    if level_0 == 'home & kitchen' and level_1 == 'rugs':
        return level_0, 'home decor'
    if level_0 in ['furniture & mattresses', 'mattresses', 'furniture', 'all chairs']:
        return 'home & kitchen', 'furniture'

    # corner cases for sports & outdoors
    if level_0 == 'sports & fitness':
        level_0 = 'sports & outdoors'
    if level_0 == 'sports & outdoors' and level_1 == 'tailgating':
        return level_0, ''
    if level_0 == 'sports & outdoors' and level_1 == 'paddle, surf & kayaks':
        return level_0, 'boating & water sports'

    # corner cases for tools & home improvement
    if level_0 == 'home improvement':
        level_0 = 'tools & home improvement'
    if level_0 == 'tools & home improvement' and level_1 and 'flooring' in level_1:
        return level_0, 'building materials'
    if level_0 == 'tools & home improvement' and level_1 in ['garage', 'storage & organization']:
        return 'home & kitchen', 'storage & organization'
    if level_0 == 'tools & home improvement' and level_1 == 'safes':
        return level_0, 'safety & security'
    if level_0 == 'tools & home improvement' and level_1 == 'installed products & services':
        return level_0, ''
    if level_0 == 'tools & home improvement' and level_1 == 'tools & hardware':
        return level_0, 'tools'

    # corner cases for patio, lawn & garden
    if level_0 == 'patio, lawn & garden' and level_1 in ['backyard playground equipment', 'outdoor heating & cooling']:
        return level_0, ''
    if level_0 == 'patio, lawn & garden' and level_1 in ['awnings & window coverings', 'backyard & outdoor structures', 'patio covers & shade structures']:
        return level_0, 'sheds, shade & outdoor storage'
    if level_0 == 'patio, lawn & garden' and level_1 in ['landscaping', 'bulbs & seed', 'plants, bulbs & seeds']:
        return level_0, 'gardening & lawn care'
    if level_0 == 'tools & home improvement' and level_1 and 'generators' in level_1:
        return 'patio, lawn & garden', 'outdoor power equipment'

    # corner cases for handmade
    if level_0 == 'handmade':
        return level_0, level_0

    # no corner case applied: fall back to fuzzy matching inside the reference taxonomy
    if level_0 not in dictionary:
        return '', ''
    if level_1 is None:
        return level_0, ''
    level_1_ref = dictionary[level_0]
    similarities, jaccard_similarities, smith_waterman_similarities, jaccard_zero_counter = matchingCommon(level_1, level_1_ref)
    # best candidate too dissimilar: keep only the top-level category
    if max(similarities) < 0.3:
        return level_0, ''
    # Reject weak matches as well: (nearly) every reference has a zero Jaccard
    # score and neither metric is convincing.
    # NOTE(review): interpretation of jaccard_zero_counter is inferred from its
    # name - confirm in matchingCommon.
    if jaccard_zero_counter >= (len(level_1_ref) - 1) and max(jaccard_similarities) < 0.33 and max(smith_waterman_similarities) < max(jaccard_similarities):
        return level_0, ''

    return level_0, level_1_ref[similarities.argmax()]

def processForAmazon(levels, dictionary):
    """Resolve an Amazon breadcrumb and serialise the result as 'level_0|level_1'.

    Delegates to ``matchingAmazon``; when it yields no top-level category the
    result is the empty string.
    """
    top, sub = matchingAmazon(levels, dictionary)
    if not top:
        return ''
    return '|'.join((top, sub))

def processForWalmart(levels, dictionary):
    """Resolve a Walmart breadcrumb and serialise the result as 'level_0|level_1'.

    A leading 'home page' segment is dropped before the breadcrumb is handed
    to ``matchingWalmart``.

    Args:
        levels: breadcrumb segments, most general first. May be None or empty.
        dictionary: reference taxonomy, level-0 name -> list of level-1 names.

    Returns:
        'level_0|level_1', or '' when ``levels`` is empty/None or no top-level
        category was resolved.
    """
    # levels[0] below would raise on an empty or missing breadcrumb
    if not levels:
        return ''
    if levels[0] == 'home page':
        levels = levels[1:]

    final_level_0, final_level_1 = matchingWalmart(levels, dictionary)
    if final_level_0:
        return '|'.join([final_level_0, final_level_1])
    return ''

def processForTarget(levels, dictionary):
    """Resolve a Target breadcrumb and serialise the result as 'level_0|level_1'.

    Two top-level names are normalised here before ``matchingTarget`` runs:
    'home' is renamed to 'home & kitchen', and 'furniture' is answered
    directly (kitchen & dining furniture goes to 'kitchen & dining', anything
    else to 'furniture', both under 'home & kitchen').

    Args:
        levels: breadcrumb segments, most general first. May be None or empty.
            The list is not modified.
        dictionary: reference taxonomy, level-0 name -> list of level-1 names.

    Returns:
        'level_0|level_1', or "" when ``levels`` is empty/None or no top-level
        category was resolved.
    """
    # levels[0] below would raise on an empty or missing breadcrumb
    if not levels:
        return ""
    level_0 = levels[0]
    if level_0 == 'home':
        level_0 = 'home & kitchen'
        # build a new list instead of overwriting the caller's first element
        levels = [level_0] + list(levels[1:])
    if level_0 == 'furniture':
        # The previous literals were misspelled ('kichen', 'kiichen'), so this
        # branch never matched and would have produced a non-existent category.
        if len(levels) > 1 and levels[1] == 'kitchen & dining furniture':
            return 'home & kitchen|kitchen & dining'
        return 'home & kitchen|furniture'
    final_level_0, final_level_1 = matchingTarget(levels, dictionary)
    if final_level_0:
        return '|'.join([final_level_0, final_level_1])
    return ""

def processForCostco(levels, dictionary):
    """Resolve a Costco breadcrumb and serialise the result as 'level_0|level_1'.

    A leading 'home' segment is dropped before the breadcrumb is handed to
    ``matchingCostco``.

    Args:
        levels: breadcrumb segments, most general first. May be None or empty.
        dictionary: reference taxonomy, level-0 name -> list of level-1 names.

    Returns:
        'level_0|level_1', or "" when ``levels`` is empty/None or no top-level
        category was resolved.
    """
    # levels[0] below would raise on an empty or missing breadcrumb
    if not levels:
        return ""
    if levels[0] == 'home':
        levels = levels[1:]
    final_level_0, final_level_1 = matchingCostco(levels, dictionary)
    if final_level_0:
        return '|'.join([final_level_0, final_level_1])
    return ""

def mapping_category_hierarchy(domain, category_hierarchy_column, dictionary):
    """Route a raw category hierarchy string to the retailer-specific mapper.

    The string is lower-cased and split on '|' before being dispatched on
    ``domain``.

    Args:
        domain: retailer domain, e.g. 'walmart.com'.
        category_hierarchy_column: raw '|'-separated breadcrumb; may be None/''.
        dictionary: reference taxonomy passed through to the mapper.

    Returns:
        Whatever the selected ``processFor*`` helper returns, or None when the
        input string is empty/None or the domain is not handled.
    """
    if not category_hierarchy_column:
        return None
    breadcrumb = category_hierarchy_column.lower().split('|')

    if domain == 'amazon.com':
        return processForAmazon(breadcrumb, dictionary)
    if domain == 'walmart.com':
        return processForWalmart(breadcrumb, dictionary)
    if domain == 'target.com':
        return processForTarget(breadcrumb, dictionary)
    if domain == 'costco.com':
        return processForCostco(breadcrumb, dictionary)
    # The three retailers below are not yet being included.
    # NOTE(review): their processFor* helpers are not visible in this part of
    # the notebook - confirm they exist before routing these domains here.
    if domain == 'instacart.com':
        return processForInstacart(breadcrumb, dictionary)
    if domain == 'bestbuy.com':
        return processForBestbuy(breadcrumb, dictionary)
    if domain == 'homedepot.com':
        return processForHomedepot(breadcrumb, dictionary)
    return None
    
def split_levels_0(processed_levels):
    """Return the part of a 'level_0|level_1' string before the first '|'.

    Returns None when the input is None or empty.
    """
    if processed_levels:
        first, _, _ = processed_levels.partition('|')
        return first
    return None

def split_levels_1(processed_levels):
    """Return the second '|'-separated field of a 'level_0|level_1' string.

    Returns None when the input is None/empty or contains no '|'.
    """
    if processed_levels:
        # only the first two fields matter, so stop splitting after them
        fields = processed_levels.split('|', 2)
        if len(fields) >= 2:
            return fields[1]
    return None

# Spark UDF wrappers, all returning strings.
# The mapping UDF takes a single struct column: field 0 is the domain and
# field 1 the raw category hierarchy; the reference taxonomy is the
# notebook-level category_dictionary.
mapping_category_hierarchy_UDF = udf(
    lambda pair: mapping_category_hierarchy(pair[0], pair[1], category_dictionary),
    returnType=StringType(),
)
# Extract the first / second field of the 'level_0|level_1' string.
split_levels_0_UDF = udf(lambda joined: split_levels_0(joined), returnType=StringType())
split_levels_1_UDF = udf(lambda joined: split_levels_1(joined), returnType=StringType())

In [0]:
# Rows that carry a category hierarchy and are not product-search events,
# reduced to the two columns the mapping works on.
# NOTE(review): `sample` is not referenced again in this part of the notebook -
# confirm it is still needed (the cache is only materialised by an action).
sample = (
    retailers_us
    .where((col('category_hierarchy').isNotNull()) & (col('event_name') != 'productSearch'))
    .select('page_domain', 'category_hierarchy')
    .cache()
)

# Map every row to a 'level_0|level_1' string, split it into two columns and
# drop the intermediate column. `retailers_us` is intentionally left defined
# (the former `del retailers_us` made this cell fail with NameError on re-run).
retailers_us_with_levels = (
    retailers_us
    .withColumn('levels', mapping_category_hierarchy_UDF(struct(col('page_domain'), col('category_hierarchy'))))
    .withColumn('level_0', split_levels_0_UDF(col('levels')))
    .withColumn('level_1', split_levels_1_UDF(col('levels')))
    .drop(col('levels'))
)

In [0]:
# Destination of the enriched dataset; the %sql cell below registers a table
# over this same location.
retailers_us_delta_file_path = "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/retailers_us_with_levels"

# Replace the previous contents with a single Delta overwrite commit instead of
# deleting the directory first: a failed write then no longer leaves the
# location empty. overwriteSchema lets the new column set replace the old one.
# NOTE(review): this assumes any existing data at the path is a Delta table.
(
    retailers_us_with_levels.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(retailers_us_delta_file_path)
)

In [0]:
%sql
-- Register a table over the Delta files written to this location by the
-- previous cell. Any existing definition is dropped first, because the
-- CREATE TABLE below has no IF NOT EXISTS / OR REPLACE clause.
drop table if exists retailers_us_with_levels;
-- Heads-up: the existing table has to be deleted (see the DROP above) before this one is created.
CREATE TABLE retailers_us_with_levels
USING delta
LOCATION "/mnt/delta/general_data/Disqo_Dataset_Processed_Delta/retailers_us_with_levels"