In [74]:
from collections import defaultdict
import pandas as pd

In [75]:
products = pd.read_csv("../data/processed/products.csv")
products['Brand'] = products['Brand'].astype(str).str.lower().str.strip()

def convert_to_str(value):
    if not isinstance(value, str):
        return str(value)  # Convert non-strings to strings
    else:
        return value

products['Brand'] = products['Brand'].apply(convert_to_str)

print(products['Brand'])

0            bontrager
1               armada
2                  nan
3            obermeyer
4            bontrager
             ...      
21391       fox racing
21392    coal headwear
21393           armada
21394              uni
21395              fox
Name: Brand, Length: 21396, dtype: object


In [79]:
unique_counts = products['Brand'].value_counts().tail(100)
print(unique_counts)

Brand
nan                         8209
fox                         2453
burton                      1013
tgt                          763
airblaster                   728
                            ... 
msw                            1
cghabitats                     1
sportsaccessoriesamerica       1
gear                           1
crankbrothers                  1
Name: count, Length: 66, dtype: int64


In [85]:
def condense_brands(brands, threshold=2, distance_func=None):
    """
    Condenses similar brands based on a distance threshold.

    Args:
        brands: List of brand strings.
        threshold: Maximum allowed distance between similar brands.
        distance_func: Function to calculate the distance between two brands.  
            If not provided, a basic string comparison is used.

    Returns:
        A list of brands with duplicates replaced by a chosen representative.
    """

    if distance_func is None:
        # Basic string comparison (modify this based on your needs)
        def distance_func(brand1, brand2):
            return sum(c1 != c2 for c1, c2 in zip(brand1, brand2))

    duplicates = defaultdict(list)
    for i, brand in enumerate(brands):
        cleaned_brand = "".join(c for c in brand.lower() if c.isalnum())
        for j in range(i + 1, len(brands)):
            other_brand = brands[j]  # Lowercase for comparison

            distance = distance_func(cleaned_brand, other_brand)
            if distance <= threshold:
                duplicates[cleaned_brand].append(other_brand)
                duplicates[other_brand].append(cleaned_brand)

    representative_brands = {}
    for brand, near_duplicates in duplicates.items():
        # Choose representative based on your logic (e.g., most frequent)
        representative_brand = max(set(near_duplicates), key=near_duplicates.count)
        representative_brands[brand] = representative_brand

    for i, brand in enumerate(brands):
        print(f"Brand at index {i}: {brand}")  # Check brand type
        cleaned_brand = "".join(c for c in brand if c.isalnum())
        if brand in representative_brands:  # Use brand.lower() for lookup
            brands[i] = representative_brands[brand.lower()]

    return brands

products['Brand'] = condense_brands(products['Brand'],threshold=4)

Brand at index 0: nan
Brand at index 1: nan
Brand at index 2: nan
Brand at index 3: nan
Brand at index 4: nan
Brand at index 5: nan
Brand at index 6: nan
Brand at index 7: nan
Brand at index 8: nan
Brand at index 9: nan
Brand at index 10: nan
Brand at index 11: nan
Brand at index 12: nan
Brand at index 13: nan
Brand at index 14: nan
Brand at index 15: nan
Brand at index 16: nan
Brand at index 17: nan
Brand at index 18: nan
Brand at index 19: nan
Brand at index 20: nan
Brand at index 21: nan
Brand at index 22: nan
Brand at index 23: nan
Brand at index 24: nan
Brand at index 25: nan
Brand at index 26: nan
Brand at index 27: nan
Brand at index 28: nan
Brand at index 29: nan
Brand at index 30: nan
Brand at index 31: nan
Brand at index 32: nan
Brand at index 33: nan
Brand at index 34: nan
Brand at index 35: nan
Brand at index 36: nan
Brand at index 37: nan
Brand at index 38: nan
Brand at index 39: nan
Brand at index 40: nan
Brand at index 41: nan
Brand at index 42: nan
Brand at index 43: na

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [78]:
brand_list = products['Brand'].value_counts().index.tolist() #create a new brand called 'other' --> 1s and 2 count; top 50 brands
brand_list.sort()
print(brand_list)

['1', 'airblaster', 'anon', 'armada', 'auclair', 'black crows', 'blackburn', 'blue84', 'bontrager', 'bula', 'burton', 'cap', 'cghabitats', 'champion', 'coal', 'crankbrothers', 'dakine', 'dragon', 'electra', 'endura', 'ergon', 'fox', 'full tilt', 'g3', 'gear', 'gear for sports', 'gearforsports', 'giro', 'head', 'kmc', 'ks', 'kuat', 'line', 'maxxis', 'microshift', 'msw', 'mucoff', 'nan', 'never summer industries', 'obermeyer', 'odi', 'quicksilver', 'quiksilver', 'salomon', 'schwalbe', 'screamer', 'seirus', 'shimano', 'silipint', 'sks', 'smith', 'sportsaccessoriesamerica', 'sram', 'stance', 'sunlite', 'swix', 'terramar', 'tgt', 'tifosioptics', 'trek', 'uni', 'union', 'volcom', 'volkl', 'yes', 'yesterdays']


In [84]:
unique_counts = products['Brand'].value_counts().tail(100)
print(unique_counts)

Brand
nan    21396
Name: count, dtype: int64


['zoom']


In [82]:
products.groupby('Brand').nunique()
#histogram of all value counts; decide which brand to capture and put the other in an other category


Unnamed: 0_level_0,Category,Description,Keyword,UPC,MSRP,Quantity,SKU,Color,Size,StyleNumber,StyleName,ParentCategory
Brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
airblaster,43,914,21,901,78,12,924,292,65,64,100,6
armada,24,500,5,452,75,10,500,91,65,139,93,4
burton,50,1031,15,1016,123,10,1032,179,81,125,172,8
fox,127,2408,100,2278,291,31,2521,496,221,308,281,13
g3,94,1826,42,1632,312,23,1832,372,152,104,120,10
head,67,1194,43,1062,132,20,1200,174,122,139,103,9
,218,9141,213,7919,656,55,9233,1156,894,852,948,25
obermeyer,16,447,5,446,51,7,449,37,38,47,56,4
smith,82,1164,64,984,209,36,1170,298,137,214,159,11
stance,52,797,22,721,153,17,798,111,62,189,169,6


In [40]:
clean_brands = {
    "yesterdays": "yesterday",
    "wheels manufactuing": "wheels manufacturing",
    "j and b importers": "j&b",
    "j. america": "j america",
    "shimano": "shm",
    "nize ize": "nite ize",
    "head - tyrolia": "head/tyrolia",
    "head-tyrolia": "head/tyrolia",
    "marker volkl usa": "marker volkl",
    "marker_pse": "marker",
    "louis garneau": "garneau",
    "never summer industries": "never summer",
    "oticket by hive": "e*thirteen by the hive",
    "shimano": "shm",
    "microshift": "msw",
    "mountain mtn. media": "mountain media",
    "mountainflow": "mountain flow",
    "mr tuffy": "mr tuffy",
    "neighborhoods": "neighbor hood",
    "nite ize": "nize ize",
    "portland design works": "pdw",  # Portland Design Works is often abbreviated to PDW
    "promax": "pro-max",  # promax is often hyphenated
    "qbp": "quality wheels",  # QBP is the parent company of Quality Wheels
    "shimano": "shimano",  # shimano is already listed correctly
    "shimano": "shimano",  # shimano is already listed correctly
    "ski hood": "false",  # Ski hood is not in the list
    "smith optics": "smtih",  # smith optics is misspelled
    "sockguy": "sock guy",  # sockguy is misspelled
    "sports accessories/america": "sports accessories america",
    "stance": "stans",  # Stance is a different brand than Stans
    "stans no tubes": "stan's notubes",
    "stans notubes": "stan's notubes",
    "thule": "thoule",  # thule is misspelled
    "tifosi optics": "tifsoi optics",  # tifosi optics is misspelled
    "time sports": "time",  # Time is the main brand, Time Sports is a division
    "toe toaster": "toe toasters",  # toe toaster is misspelled
    "trek bikes": "trek",  # Trek is the main brand, Trek Bikes is redundant
    "triflow": "tri-flow",  # triflow is often hyphenated
    "under armor": "under armour",  # under armor is misspelled
    "union bindings": "union",  # Union is the main brand, Union Bindings is redundant
    "velox": "velo",  # Velox is a sub-brand of Velo
    "vittoria": "victora",  # vittoria is misspelled
    "völkl": "volkl",  # völkl is already listed correctly
    "wtb": "w.t.b",  # WTB is often abbreviated with periods
    "yes": "yes.",  # yes. is a typo of yes
}