# Products

## Imports

In [514]:
import pandas as pd
# Source of the raw product catalogue: a CSV hosted in the project's GitHub repo.
products_url = "https://raw.githubusercontent.com/MerleSt/Eniac/main/Data-Eniac/products.csv"
# Read the CSV into a DataFrame and display it as the cell output.
products = pd.read_csv(filepath_or_buffer=products_url)
products

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
0,RAI0007,Silver Rain Design mStand Support,Aluminum support compatible with all MacBook,59.99,499.899,1,8696
1,APP0023,Apple Mac Keyboard Keypad Spanish,USB ultrathin keyboard Apple Mac Spanish.,59,589.996,0,13855401
2,APP0025,Mighty Mouse Apple Mouse for Mac,mouse Apple USB cable.,59,569.898,0,1387
3,APP0072,Apple Dock to USB Cable iPhone and iPod white,IPhone dock and USB Cable Apple iPod.,25,229.997,0,1230
4,KIN0007,Mac Memory Kingston 2GB 667MHz DDR2 SO-DIMM,2GB RAM Mac mini and iMac (2006/07) MacBook Pr...,34.99,31.99,1,1364
...,...,...,...,...,...,...,...
19321,BEL0376,Belkin Travel Support Apple Watch Black,compact and portable stand vertically or horiz...,29.99,269.903,1,12282
19322,THU0060,"Enroute Thule 14L Backpack MacBook 13 ""Black",Backpack with capacity of 14 liter compartment...,69.95,649.903,1,1392
19323,THU0061,"Enroute Thule 14L Backpack MacBook 13 ""Blue",Backpack with capacity of 14 liter compartment...,69.95,649.903,1,1392
19324,THU0062,"Enroute Thule 14L Backpack MacBook 13 ""Red",Backpack with capacity of 14 liter compartment...,69.95,649.903,0,1392


## Drop Duplicates

In [515]:
products.duplicated().sum()

8746

In [516]:
# Rebind the name to the de-duplicated frame instead of mutating in place,
# then confirm that no fully duplicated rows are left.
products = products.drop_duplicates()
products.duplicated().sum()

0

In [517]:
products.shape

(10580, 7)

## Missing Values

In [518]:
products.isna().sum()

sku             0
name            0
desc            7
price          46
promo_price     0
in_stock        0
type           50
dtype: int64

### Desc

In [519]:
# Derive both the count and the share from the data so the message cannot
# drift out of sync with the frame (the count used to be typed in by hand).
desc_missing = products["desc"].isna().sum()
desc_missing_pct = ((desc_missing / products.shape[0]) * 100).round(5)
f"{desc_missing} missing values represents {desc_missing_pct}% of the rows in our DataFrame"

'7 missing values represents 0.06616% of the rows in our DataFrame'

In [520]:
products.desc.isna().value_counts(normalize=True)

False    0.999338
True     0.000662
Name: desc, dtype: float64

In [521]:
# Keep only the rows that have a description, then re-check the null counts.
has_desc = products["desc"].notna()
products = products.loc[has_desc, :]
products.isna().sum()

sku             0
name            0
desc            0
price          46
promo_price     0
in_stock        0
type           50
dtype: int64

### Price

In [522]:
# Derive both the count and the share from the data so the message cannot
# drift out of sync with the frame (the count used to be typed in by hand).
price_missing = products["price"].isna().sum()
price_missing_pct = ((price_missing / products.shape[0]) * 100).round(5)
f"{price_missing} missing values represents {price_missing_pct}% of the rows in our DataFrame"

'46 missing values represents 0.43507% of the rows in our DataFrame'

In [523]:
products.price.isna().value_counts(normalize=True)

False    0.995649
True     0.004351
Name: price, dtype: float64

In [524]:
# Keep only the rows that have a price, then re-check the null counts.
has_price = products["price"].notna()
products = products.loc[has_price, :]
products.isna().sum()

sku             0
name            0
desc            0
price           0
promo_price     0
in_stock        0
type           50
dtype: int64

### Type

In [525]:
# Derive both the count and the share from the data so the message cannot
# drift out of sync with the frame (the count used to be typed in by hand).
# Bracket access is used because `type` is also a Python builtin name.
type_missing = products["type"].isna().sum()
type_missing_pct = ((type_missing / products.shape[0]) * 100).round(5)
f"{type_missing} missing values represents {type_missing_pct}% of the rows in our DataFrame"

'50 missing values represents 0.47497% of the rows in our DataFrame'

In [526]:
products.type.isna().value_counts(normalize=True)

False    0.99525
True     0.00475
Name: type, dtype: float64

In [527]:
# Drop the rows without a product type. The explicit .copy() makes the result
# an independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning.
products = products.loc[products["type"].notna(), :].copy()
products.isna().sum()

sku            0
name           0
desc           0
price          0
promo_price    0
in_stock       0
type           0
dtype: int64

## Datatypes & Format

In [528]:
products.dtypes

sku            object
name           object
desc           object
price          object
promo_price    object
in_stock        int64
type           object
dtype: object

In [529]:
# Cast the 0/1 stock flag to bool. assign() returns a new frame instead of
# writing into one that was produced by .loc filtering, which avoids
# SettingWithCopyWarning; the column keeps its position.
products = products.assign(in_stock=products['in_stock'].astype(bool))

### Price

In [530]:
# Count prices that contain two dots (e.g. "1.639.792").
# Raw string: "\d" in a normal string literal is an invalid escape sequence
# and triggers a Deprecation/SyntaxWarning on current Python versions.
products.price.str.contains(r"\d+\.\d+\.\d+").value_counts()

False    10104
True       373
Name: price, dtype: int64

In [531]:
# Count promo prices that contain two dots.
# Raw string: "\d" in a normal string literal is an invalid escape sequence
# and triggers a Deprecation/SyntaxWarning on current Python versions.
products.promo_price.str.contains(r"\d+\.\d+\.\d+").value_counts()

False    5921
True     4556
Name: promo_price, dtype: int64

In [532]:
# Show the rows whose price contains two dots. The pattern is a raw string so
# that "\d" is not parsed as an (invalid) string escape sequence.
products[products['price'].str.contains(r"\d+\.\d+\.\d+", na=False)]

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
665,CRU0015-2,Crucial memory Mac 16GB (2x8GB) SO-DIMM DDR3 1...,RAM 16GB (2x8GB) 135V MacBook Pro iMac (2012/2...,1.639.792,1.629.894,True,1364
827,PAC0339,NewerTech miniStack 4TB Hard Drive Mac,External Box Hard Drive Mac + 4TB.,2.199.791,2.199.901,False,11935397
885,PAC0376,OWC Mercury Elite Pro Dual Thunderbolt + 8TB,RAID outer box 35 inch SATA connection Thunder...,5.609.698,5.549.895,False,11935397
898,REP0156,iPhone 5 GSM antenna repair,Repair service including parts and labor for i...,69.989.909,699.899,False,"1,44E+11"
941,REP0185,Home button repair iPad mini,Repair service including parts and labor for i...,69.989.909,699.899,False,"1,44E+11"
...,...,...,...,...,...,...,...
19312,REP0424,Input repair Headphones iPad,Repair service including parts and labor for iPad,6.999.003,69.99,False,"1,44E+11"
19313,REP0421,iPad charging connector repair,Repair service including parts and labor for iPad,6.999.003,69.99,False,"1,44E+11"
19314,REP0416,iPad front camera repair,Repair service including parts and labor for iPad,6.999.003,69.99,False,"1,44E+11"
19315,REP0413,repair rear camera iPad,Repair service including parts and labor for iPad,6.999.003,69.99,False,"1,44E+11"


In [533]:
# mask = products['promo_price'].str.contains(r'^(?!\d{2}\.\d{3}$)(?:\d+\.\d{3}\.\d{3}|\d+\.\d{3})$', regex=True)

# # Handle the n.nnn.nnn format
# two_dots_mask = products['promo_price'][mask].str.count('\.') == 2
# products.loc[mask & two_dots_mask, 'promo_price'] = products['promo_price'][mask & two_dots_mask].str.replace('.', '', 1).str.replace('.', '', regex=False).str[:-3] + '.' + products['promo_price'][mask & two_dots_mask].str[-3:]

# # Handle the nnn.nnn format
# one_dot_mask = products['promo_price'][mask].str.count('\.') == 1
# products.loc[mask & one_dot_mask, 'promo_price'] = products['promo_price'][mask & one_dot_mask].str.replace('.', '', regex=False).str[:-4] + '' + products['promo_price'][mask & one_dot_mask].str[-4:]
# products.tail(100)

In [534]:
# Share of rows whose price contains two dots (e.g. "1.639.792").
# Summing the boolean mask counts the matches directly. The previous
# value_counts()[1] lookup depended on how pandas resolves the integer key 1
# against a True/False index and raises when only one of the two values occurs.
# The pattern is a raw string so "\d" is not an invalid escape sequence.
has_two_dots = products["price"].str.contains(r"\d+\.\d+\.\d+")
two_dot_percentage = ((has_two_dots.sum() / products.shape[0]) * 100).round(2)
print(f"The 2 dot problem represents {two_dot_percentage}% of the rows in our DataFrame")

The 2 dot problem represents 3.56% of the rows in our DataFrame


In [535]:
# Collect the SKUs whose price contains two dots and drop every row that
# carries one of those SKUs. Raw string so "\d" is not an invalid escape.
two_dot_pattern = r"\d+\.\d+\.\d+"
two_dot_order_sku_list = products.loc[products.price.str.contains(two_dot_pattern), "sku"]
# .copy() detaches the filtered frame so the column assignments in later cells
# do not raise SettingWithCopyWarning.
products = products.loc[~products.sku.isin(two_dot_order_sku_list)].copy()
# Sanity check: no two-dot prices should remain (expects an empty frame).
products[products['price'].str.contains(two_dot_pattern, na=False)]

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type


In [536]:
products['price'] = pd.to_numeric(products['price'])

In [537]:
products.dtypes

sku             object
name            object
desc            object
price          float64
promo_price     object
in_stock          bool
type            object
dtype: object

### Promo Price

Format the prices so that they match the pattern `r'\d+\.\d{2}'` (digits, a dot, then exactly two decimals).

In [538]:
products.loc[:,'price'] = products['price'].round(2)

In [539]:
products[['promo_price','price']].head(50)

Unnamed: 0,promo_price,price
0,499.899,59.99
1,589.996,59.0
2,569.898,59.0
3,229.997,25.0
4,31.99,34.99
5,420.003,45.0
6,146.471,18.99
7,274.694,36.99
8,669.904,74.0
9,330.003,35.0


``` python
# Extract the number of characters before the last dot for both columns
products['price_pre_decimal_count'] = products['price'].apply(lambda x: len(str(int(x))))
products['promo_price_pre_decimal_count'] = products['promo_price'].str.extract(r'(\d+)\.').astype(str).applymap(len)[0]

# Handle format: nn.nn and nnn.nnn
mask_2_3 = (products['price_pre_decimal_count'] == 2) & (products['promo_price_pre_decimal_count'] == 3)
products.loc[mask_2_3, 'promo_price'] = products['promo_price'].str[:-2] + products['promo_price'].str[-2:]

# Handle format: ppp.pp and p.ppp.ppp
mask_3_1_3 = (products['price_pre_decimal_count'] == 3) & (products['promo_price'].str.count('\.') == 2)
products.loc[mask_3_1_3, 'promo_price'] = products['promo_price'].str.replace('.', '', 1)

# Convert to float and round
products['price'] = products['price'].round(2)
products['promo_price'] = products['promo_price'].astype(float).round(2)

# Handle cases where promo_price > price
mask_promo_gt_price = products['promo_price'] > products['price']
products.loc[mask_promo_gt_price, 'promo_price'] = products['promo_price'] / 10

# Drop helper columns
products = products.drop(columns=['price_pre_decimal_count', 'promo_price_pre_decimal_count'])

# Display
print(products)
````


In [540]:
# Helper column: number of digits in front of the decimal point of `price`
# (truncate to int, render as text, measure the length).
products['price_pre_decimal_count'] = (
    products['price'].astype(int).astype(str).str.len()
)
products.head(50)

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type,price_pre_decimal_count
0,RAI0007,Silver Rain Design mStand Support,Aluminum support compatible with all MacBook,59.99,499.899,True,8696,2
1,APP0023,Apple Mac Keyboard Keypad Spanish,USB ultrathin keyboard Apple Mac Spanish.,59.0,589.996,False,13855401,2
2,APP0025,Mighty Mouse Apple Mouse for Mac,mouse Apple USB cable.,59.0,569.898,False,1387,2
3,APP0072,Apple Dock to USB Cable iPhone and iPod white,IPhone dock and USB Cable Apple iPod.,25.0,229.997,False,1230,2
4,KIN0007,Mac Memory Kingston 2GB 667MHz DDR2 SO-DIMM,2GB RAM Mac mini and iMac (2006/07) MacBook Pr...,34.99,31.99,True,1364,2
5,APP0073,Apple Composite AV Cable iPhone and iPod white,IPhone and iPod AV Cable Dock to Composite Video.,45.0,420.003,False,1230,2
6,KIN0008,Mac Memory Kingston 1GB 667MHz DDR2 SO-DIMM,1GB RAM Mac mini and iMac (2006/07) MacBook Pr...,18.99,146.471,False,1364,2
7,KIN0009,Mac Memory Kingston 2GB 800MHz DDR2 SO-DIMM,2GB RAM iMac with Intel Core 2 Duo (Penryn).,36.99,274.694,False,1364,2
8,KIN0001-2,Mac memory Kingston 4GB (2x2GB) 667MHz DDR2 SO...,RAM 4GB (2x2GB) Mac mini and iMac (2006/07) Ma...,74.0,669.904,False,1364,2
9,APP0100,Apple Adapter Mini Display Port to VGA,Adapter Mini Display Port to VGA MacBook and M...,35.0,330.003,False,1325,2


In [546]:
def count_digits_before_last_dot(s):
    """Return how many characters of ``s`` precede its last dot, ignoring dots.

    Every '.' in front of the final one is discarded before counting, so
    '2.565.841' gives 4 and '499.899' gives 3. A string without any dot is
    counted in full. Characters are not checked for actually being digits.
    """
    head, last_dot, _ = s.rpartition('.')
    if not last_dot:
        # No dot at all: the whole string counts.
        return len(s)
    return len(head.replace('.', ''))

# Helper column: characters in front of the last dot of each raw promo_price string.
promo_strings = products['promo_price']
products['promo_price_pre_decimal_count'] = promo_strings.map(count_digits_before_last_dot)
products.tail(50)
# products.loc[products['promo_price_pre_decimal_count']==6]

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type,price_pre_decimal_count,promo_price_pre_decimal_count
19258,TWS0106-A,Open - Twelve South Bridge Magic Keyboard for ...,Light reconditioned support to hold the Keyboa...,48.0,260.978,False,8696,2,3
19259,TPL0030-A,Open - TP-Link TL-PA4010P Passthrough Powerlin...,Refurbished Kit internet amplifiers with trans...,54.33,381.891,False,1334,2,3
19260,HOC0025,Hoco Grand Series Metal strap 38mm Apple Watch...,Stainless steel strap Hoco for Apple Watch 38mm.,54.99,499.899,True,2449,2,3
19261,HOC0027,Hoco Grand Series Metal 38mm Apple Watch Strap...,Stainless steel strap Hoco for Apple Watch 38mm.,65.99,599.906,True,2449,2,3
19262,HOC0029,Hoco Grand Series 38mm Apple Watch metal Strap...,Stainless steel strap Hoco for Apple Watch 38mm.,65.99,599.906,True,2449,2,3
19263,HOC0026,Hoco Grand Series Metal strap 42mm Apple Watch...,Stainless steel strap Hoco for Apple Watch 42mm.,54.99,499.899,False,2449,2,3
19264,HOC0028,Hoco Grand Series Metal 42mm Apple Watch Strap...,Stainless steel strap Hoco for Apple Watch 42mm.,65.99,599.906,False,2449,2,3
19265,WDT0417,"WD Hard Drive 6TB Gold 35 ""Servers",Hard Western Digital 6TB 35 inches SATA 6 Gb /...,329.0,2.565.841,False,12655397,3,4
19266,WDT0416,"WD Hard Drive 8TB Gold 35 ""Servers",Hard Western Digital 8TB 35 inches SATA 6 Gb /...,419.0,3.059.945,True,12655397,3,4
19267,WDT0415,"WD Hard Drive 10TB Gold 35 ""Servers",Hard Western Digital 10TB 35 inches SATA 6 Gb ...,519.0,3.865.841,False,12655397,3,4


In [558]:
def correct_promo_price(row):
    """Return the promo_price string of ``row`` with its decimal dot repositioned.

    ``row`` must provide 'promo_price' (a string), 'price_pre_decimal_count'
    and 'promo_price_pre_decimal_count'.

    * Counts differ: all dots are stripped and a single dot is re-inserted
      after 'price_pre_decimal_count' characters.
    * Counts match and the string has exactly two dots: only the first dot is
      removed.
    * Otherwise the string is returned unchanged.
    """
    raw = row['promo_price']
    target_width = row['price_pre_decimal_count']
    same_width = target_width == row['promo_price_pre_decimal_count']

    if not same_width:
        # Rebuild the number so it has as many leading characters as `price`.
        digits = raw.replace('.', '')
        return digits[:target_width] + '.' + digits[target_width:]

    if raw.count('.') == 2:
        # Widths already agree; only the first dot has to go.
        return raw.replace('.', '', 1)

    return raw

# Repair every promo_price row by row; the function reads several columns of
# the row, hence axis=1. It is passed directly, with no wrapping lambda.
products['promo_price'] = products.apply(correct_promo_price, axis=1)

# Show rows where price is greater than 1000
expensive = products['price'] > 1000
products.loc[expensive]

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type,price_pre_decimal_count,promo_price_pre_decimal_count
51,APP0344,"Apple Thunderbolt Display 27 ""Monitor Mac",Monitor Display 27-inch Apple Thunderbolt (MC9...,1149.00,1044.9923,False,1296,4,4
100,APP0390,"Apple MacBook Pro 133 ""Core i5 25GHz | 4GB RAM...",MacBook Pro laptop 133 inches (MD101Y / A).,1199.00,1145.5917,False,1282,4,4
109,PAC0508,Apple MacBook Pro 133 '' 25GHz | 16GB RAM | 1T...,Apple MacBook Pro Fusion Drive 16GB 2 internal...,1919.00,1699.9895,False,1282,4,4
118,PAC0507,Apple MacBook Pro 133 '' 25Ghz | 16GB RAM | Fu...,Apple MacBook Pro Fusion Drive 16GB 2 internal...,1639.00,1598.9896,False,1282,4,4
127,PAC0515,"Apple MacBook Pro 133 ""i7 29GHz | RAM 16GB | 5...",Apple MacBook Pro 133 inches (MD101Y / A) and ...,2039.00,2037.9897,False,1282,4,4
...,...,...,...,...,...,...,...,...,...
19168,APP2067-A,"Open - Apple MacBook Air 13 ""1.8GHz dual-core ...",Reconditioned computer MacBook Air 13 inch i5 ...,1355.59,1134.2684,False,"2,17E+11",4,5
19194,DLL0053,"Dell UltraSharp UP2718Q Monitor 27 ""4K HDR",Monitor 27 inch 4K 4K and 6ms response height ...,1869.99,1569.9895,False,1296,4,5
19197,PAC2510,"Apple iMac 27 ""Core i5 3.3GHz Retina 5K | 16GB...",27-inch iMac 5K Retina refitted with 16GB of R...,2869.00,2099.0045,False,"5,74E+15",4,5
19198,AP20461,"Apple MacBook Pro 15 ""Core i7 Touch Bar 26GHz ...",Refurbished MacBook Pro and 15-inch Apple cert...,2699.00,2198.9935,True,"1,02E+12",4,5


In [559]:
# Re-check that no promo price with two dots is left after the repair.
# Raw string: "\d" in a normal string literal is an invalid escape sequence
# and triggers a Deprecation/SyntaxWarning on current Python versions.
products.promo_price.str.contains(r"\d+\.\d+\.\d+").value_counts()

False    10104
Name: promo_price, dtype: int64

In [561]:
# Parse the repaired promo_price strings as floats and keep two decimals.
promo_as_float = products['promo_price'].astype(float)
products['promo_price'] = promo_as_float.round(2)
products

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type,price_pre_decimal_count,promo_price_pre_decimal_count
0,RAI0007,Silver Rain Design mStand Support,Aluminum support compatible with all MacBook,59.99,49.99,True,8696,2,2
1,APP0023,Apple Mac Keyboard Keypad Spanish,USB ultrathin keyboard Apple Mac Spanish.,59.00,59.00,False,13855401,2,2
2,APP0025,Mighty Mouse Apple Mouse for Mac,mouse Apple USB cable.,59.00,56.99,False,1387,2,2
3,APP0072,Apple Dock to USB Cable iPhone and iPod white,IPhone dock and USB Cable Apple iPod.,25.00,23.00,False,1230,2,2
4,KIN0007,Mac Memory Kingston 2GB 667MHz DDR2 SO-DIMM,2GB RAM Mac mini and iMac (2006/07) MacBook Pr...,34.99,31.99,True,1364,2,2
...,...,...,...,...,...,...,...,...,...
19321,BEL0376,Belkin Travel Support Apple Watch Black,compact and portable stand vertically or horiz...,29.99,26.99,True,12282,2,3
19322,THU0060,"Enroute Thule 14L Backpack MacBook 13 ""Black",Backpack with capacity of 14 liter compartment...,69.95,64.99,True,1392,2,3
19323,THU0061,"Enroute Thule 14L Backpack MacBook 13 ""Blue",Backpack with capacity of 14 liter compartment...,69.95,64.99,True,1392,2,3
19324,THU0062,"Enroute Thule 14L Backpack MacBook 13 ""Red",Backpack with capacity of 14 liter compartment...,69.95,64.99,False,1392,2,3


In [562]:
# Divide promo_price by 10 wherever it exceeds price (presumably the decimal
# point is still one place too far to the right in those rows -- confirm).
# Boolean-mask assignment replaces the row-by-row iterrows() loop; each
# affected row is still divided exactly once.
promo_above_price = products['promo_price'] > products['price']
products.loc[promo_above_price, 'promo_price'] = products.loc[promo_above_price, 'promo_price'] / 10

# Show rows where price is greater than 1000
products.loc[products['price'] > 1000]


Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type,price_pre_decimal_count,promo_price_pre_decimal_count
51,APP0344,"Apple Thunderbolt Display 27 ""Monitor Mac",Monitor Display 27-inch Apple Thunderbolt (MC9...,1149.00,1044.99,False,1296,4,4
100,APP0390,"Apple MacBook Pro 133 ""Core i5 25GHz | 4GB RAM...",MacBook Pro laptop 133 inches (MD101Y / A).,1199.00,1145.59,False,1282,4,4
109,PAC0508,Apple MacBook Pro 133 '' 25GHz | 16GB RAM | 1T...,Apple MacBook Pro Fusion Drive 16GB 2 internal...,1919.00,1699.99,False,1282,4,4
118,PAC0507,Apple MacBook Pro 133 '' 25Ghz | 16GB RAM | Fu...,Apple MacBook Pro Fusion Drive 16GB 2 internal...,1639.00,1598.99,False,1282,4,4
127,PAC0515,"Apple MacBook Pro 133 ""i7 29GHz | RAM 16GB | 5...",Apple MacBook Pro 133 inches (MD101Y / A) and ...,2039.00,2037.99,False,1282,4,4
...,...,...,...,...,...,...,...,...,...
19168,APP2067-A,"Open - Apple MacBook Air 13 ""1.8GHz dual-core ...",Reconditioned computer MacBook Air 13 inch i5 ...,1355.59,1134.27,False,"2,17E+11",4,5
19194,DLL0053,"Dell UltraSharp UP2718Q Monitor 27 ""4K HDR",Monitor 27 inch 4K 4K and 6ms response height ...,1869.99,1569.99,False,1296,4,5
19197,PAC2510,"Apple iMac 27 ""Core i5 3.3GHz Retina 5K | 16GB...",27-inch iMac 5K Retina refitted with 16GB of R...,2869.00,2099.00,False,"5,74E+15",4,5
19198,AP20461,"Apple MacBook Pro 15 ""Core i7 Touch Bar 26GHz ...",Refurbished MacBook Pro and 15-inch Apple cert...,2699.00,2198.99,True,"1,02E+12",4,5
