Author:
Naeimeh Sharghivand <https://www.linkedin.com/in/naeimeh-sharghivand-87666053/>

Last modified: 17-Jun-2023

# Importing required modules

In [None]:
import pandas as pd
# Notebook-wide display settings: allow up to 1000 rows in printed frames
# and render every float with exactly two decimals.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.float_format', lambda value: '%.2f' % value)

# Importing the Dataset

In [None]:
# products.csv — read from a Google Drive share link.
# The second-to-last path segment of the share link is passed as the `id`
# parameter of Drive's direct-download endpoint, which pandas can read from.
url = "https://drive.google.com/file/d/1kEoSHwPEeamEa2KbwVd3TRoZ56AZVnN-/view?usp=sharing"
file_id = url.split("/")[-2]
path = "https://drive.google.com/uc?export=download&id=" + file_id
products = pd.read_csv(path)

# Cleaning the Eniac's `products` dataset

## Data Cleaning with Pandas

In [None]:
# Peek at 15 random rows. A fixed seed keeps the displayed sample identical
# across "Restart & Run All", so the saved output stays reproducible.
products.sample(15, random_state=42)

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
16625,APP2348,"Apple Macbook Pro 15 ""Core i7 Touch Bar 31GHz ...",New MacBook Pro 15-inch Core i7 Touch Bar 31Gh...,3999.0,37.990.043,1,"1,02E+12"
12214,BEL0194,MIXIT Belkin Car Charger Premium White,Metallic car charger for iPad iPhone and iPad,14.99,149.895,1,13615399
10200,PAC1594,"Apple iMac 27 ""Core i5 3.2GHz Retina 5K | 32GB...",Desktop computer iMac 27-inch 3.2GHz Core i5 5...,3889.0,28.489.898,0,"5,74E+15"
2882,APP1093,Apple iPod Touch 64GB Rosa,New 6th generation iPod Touch 64GB with 8 mega...,292.81,3.428.112,0,11821715
44,MOS0062,Moshi USB Ethernet Adapter,USB to Ethernet adapter.,29.0,249.865,0,1325
14132,APP1889,"Apple MacBook Pro 15 ""Core i7 Touch Bar 29GHz ...",New MacBook Pro 15-inch Core i7 Touch Bar 29Gh...,3559.0,3.387.585,0,2158
4907,PAC1042,"Apple iMac 27 ""Core i5 3.3GHz Retina 5K | 32GB...",IMac desktop computer 27 inch 5K Retina i5 3.3...,3349.0,2796.99,0,"5,74E+15"
5631,PAC1055,"Apple iMac 27 ""Core i7 Retina 5K 4GHz | 32GB |...",IMac desktop computer 27 inch 5K Retina 4GHz i...,3949.0,33.669.896,0,"5,74E+15"
2319,MOP0062,Mophie Juice Pack Air Case Battery (2750 mAh) ...,Case 2750 mAh battery and LED indicator for iP...,109.99,519.937,0,"5,49E+11"
6040,PAC1062,"Apple iMac 27 ""Core i5 3.3GHz Retina 5K | 32GB...",IMac desktop computer 27 inch 5K Retina i5 3.3...,3649.0,30.809.903,0,"5,74E+15"


In [None]:
# Column dtypes and non-null counts of the freshly loaded table.
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


In [None]:
# Number of missing values per column.
products.isna().sum()

sku             0
name            0
desc            7
price          46
promo_price     0
in_stock        0
type           50
dtype: int64

In [None]:
# Number of rows that repeat an earlier row in every column.
products.duplicated().sum()

8746

In [None]:
# Remove fully identical rows, keeping the first occurrence (pandas default).
# Mutates `products` in place; re-running the cell is harmless.
products.drop_duplicates(inplace=True)

-> Duplicated rows are removed.

In [None]:
# Row count after removing exact duplicates.
len(products)

10580

In [None]:
# How many distinct values does the in_stock column take?
products['in_stock'].nunique()

2

In [None]:
# Number of rows whose sku already appeared in an earlier row.
products['sku'].duplicated().sum()

1

In [None]:
# Show the rows flagged as repeating an earlier sku.
repeated_sku = products['sku'].duplicated()
products[repeated_sku]

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
8000,APP1197,"Apple iMac 21.5 ""Core i5 31 GHz Retina display...",Desktop Apple iMac 21.5 inch i5 31 GHz Retina ...,,1305.59,0,1282


In [None]:
# Show every row for the duplicated sku so the two records can be compared.
products[products['sku'].eq('APP1197')]

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
7992,APP1197,"Apple iMac 21.5 ""Core i5 31 GHz Retina display...",Desktop Apple iMac 21.5 inch i5 31 GHz Retina ...,1729.0,1305.59,0,1282
8000,APP1197,"Apple iMac 21.5 ""Core i5 31 GHz Retina display...",Desktop Apple iMac 21.5 inch i5 31 GHz Retina ...,,1305.59,0,1282


In [None]:
# 'APP1197' appears twice; the rows differ only in `price`, which is missing in one.
# Drop the row with the missing price so the priced record survives.
# Dropping by position (index[0]) removed the priced row instead, and the
# remaining NaN-price row was later discarded with the other missing prices,
# losing the SKU entirely.
sku_mask = products['sku'] == 'APP1197'  # named to avoid shadowing the builtin `filter`
rows_to_drop = products[sku_mask & products['price'].isna()].index
products.drop(rows_to_drop, inplace=True)

-> One of the duplicated 'sku' rows is removed.

In [None]:
# Sanity check: count of fully identical rows still present.
products.duplicated().sum()

0

In [None]:
# Missing values per column after de-duplication.
products.isna().sum()

sku             0
name            0
desc            7
price          46
promo_price     0
in_stock        0
type           50
dtype: int64

Having a closer look at `products.desc`

In [None]:
# Rows that have no description.
desc_missing = products['desc'].isna()
products[desc_missing]

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
16126,WDT0211-A,"Open - Purple 2TB WD 35 ""PC Security Mac hard ...",,107,814.659,0,1298
16128,APP1622-A,Open - Apple Smart Keyboard Pro Keyboard Folio...,,1.568.206,1.568.206,0,1298
17843,PAC2334,Synology DS718 + NAS Server | 10GB RAM,,566.35,5.659.896,0,12175397
18152,KAN0034-A,Open - Kanex USB-C Gigabit Ethernet Adapter Ma...,,29.99,237.925,0,1298
18490,HTE0025,Hyper Pearl 1600mAh battery Mini USB Mirror an...,,24.99,22.99,1,1515
18612,OTT0200,OtterBox External Battery Power Pack 20000 mAHr,,79.99,56.99,1,1515
18690,HOW0001-A,Open - Honeywell thermostat Lyric zonificador ...,,199.99,1.441.174,0,11905404


-> Missing `products.desc` values are filled using `products.name`.

In [None]:
# Where the description is missing, reuse the product name as the description.
desc_missing = products['desc'].isna()
products.loc[desc_missing, 'desc'] = products.loc[desc_missing, 'name']

### Looking at the `products.price`

In [None]:
# Number of rows without a price.
products['price'].isna().sum()

46

In [None]:
# Present (False) vs. missing (True) prices, as absolute counts.
price_missing = products['price'].isna()
price_missing.value_counts()

False    10533
True        46
Name: price, dtype: int64

In [None]:
# Share of rows with a missing price. The mean of the boolean mask is the
# fraction of True values; this replaces indexing a value_counts() result
# (whose index holds booleans) with the integer 1, which is fragile and
# raises when no price is missing at all.
missing_price_pct = products['price'].isna().mean() * 100
print(f'The missing percentage of the price: {round(missing_price_pct, 2)}%')

The missing percentage of the price: 0.43%


In [None]:
# First ten rows that lack a price.
products[products['price'].isna()].head(10)

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
34,TWS0019,Twelve South MagicWand support Apple Magic Tra...,MagicWand for wireless keyboard and Magic Trac...,,299.899,0,8696
1900,AII0008,Aiino Case MacBook Air 11 '' Transparent,MacBook Air 11-inch casing with matte finish.,,22.99,0,13835403
2039,CEL0020,Celly Ambo Luxury Leather Case + iPhone 6 Case...,Cover and housing together with magnet for iPh...,,399.905,0,11865403
2042,CEL0007,Celly Wallet Case with removable cover Black i...,Case Book for iPhone 6 card case type.,,128.998,0,11865403
2043,CEL0012,Celly Silicone Hard Shell iPhone 6 Blue,Hard Shell Silicone iPhone 6.,,4.99,0,11865403
2044,CEL0014,Celly Silicone Hard Shell iPhone 6 Amarillo,Hard Shell Silicone iPhone 6.,,59.895,0,11865403
2049,CEL0015,Celly fur-lined Powerbank battery 4000mAh Black,Leather-wrapped External Battery 4000mAh for i...,,239.895,0,1515
2051,CEL0018,Celly Wallet Leather Case cover Black iPhone 6,Card case with transparent protective cover fo...,,294.877,0,11865403
2052,CEL0023,Celly Ambo Luxury Leather Case + Case Gold iPh...,Cover and housing together with magnet for iPh...,,329.894,0,11865403
2053,CEL0025,Celly Ambo Luxury Leather Case + Case iPhone 6...,Cover and housing together with magnet for iPh...,,449.878,0,11865403


In [None]:
# Remove every row without a price. Mutates `products` in place.
products.dropna(subset=['price'], inplace=True)

-> The 46 rows with a missing price, most of which belong to products that were not in stock, are deleted.

Next, count how many values in `products.price` are affected by one of two formatting problems: two decimal points (e.g. `1.568.206`) or three or more decimal places.

In [None]:
# A price counts as malformed when it contains two decimal points
# (e.g. "1.568.206") or three or more digits after a decimal point.
# Raw strings are required: in a normal string "\d" and "\." are invalid
# escape sequences and trigger a warning on recent Python versions.
price_as_text = products.price.astype(str)
has_two_dots = price_as_text.str.contains(r"\d+\.\d+\.\d+")
has_3plus_decimals = price_as_text.str.contains(r"\d+\.\d{3,}")
price_problems_qty = int((has_two_dots | has_3plus_decimals).sum())
price_problems_qty

542

In [None]:
# Report the malformed prices as an absolute count and as a share of all rows.
price_problems_pct = round((price_problems_qty / products.shape[0]) * 100, 2)
print(f"The column price has in total {price_problems_qty} wrong values. This is {price_problems_pct}% of the rows of the DataFrame")

The column price has in total 542 wrong values. This is 5.15% of the rows of the DataFrame


In [None]:
# Keep only rows whose price is well-formed (same two patterns as the count above).
# - Raw strings avoid the invalid "\d" / "\." escape sequences.
# - astype(str) matches the counting cell and guarantees a boolean mask even
#   if a value is not a string (`.str.contains` returns NaN for those).
# - .copy() makes the result an independent frame, so the later assignment to
#   products['price'] does not raise SettingWithCopyWarning.
price_as_text = products['price'].astype(str)
price_is_malformed = (
    price_as_text.str.contains(r"\d+\.\d+\.\d+")
    | price_as_text.str.contains(r"\d+\.\d{3,}")
)
products = products.loc[~price_is_malformed, :].copy()

-> Rows with a malformed price are deleted.

In [None]:
# Spot-check 10 random rows after the price clean-up. The seed is fixed so the
# displayed sample is reproducible across runs.
products.sample(10, random_state=42)

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
13976,APP1815,"Apple MacBook Pro 13 ""with Touch Bar GHz Core ...",New MacBook Pro 13-inch Core i5 Touch Bar to 2...,2199.0,20.365.849,0,2158
12799,IFX0116,Kit libreación iFixit iPhone 5 Black,Set screws and screwdrivers Philips for iPhone 5,9.95,9.49,0,12645406
1084,PUR0113,Pure Sport Armband iPhone Amarillo,Sweat resistant sports bracelet iPhone.,19.99,199.892,0,5405
14752,GTE0085,G-Technology G-Drive Slim USB Drive 1TB SSD-C ...,SSD slim design and USB-C connection compatibl...,388.99,307.795,0,11935397
11424,BAN0007,Panama Band & Strap Watch Strap Black 42mm Apple,Leather strap for easy installation Apple Watc...,59.0,399.905,0,2449
10852,MOS0167,Moshi iGlaze Armor Case for iPhone 6 / 6S Copp...,Adjustable thin aluminum casing for iPhone 6 /...,45.0,299.899,0,11865403
14743,TIG0019-A,Open - Tigra Support + Bike Case iPhone 6 Plus,Bike Waterproof Case for iPhone 6 Plus support,39.99,294.348,0,1298
12589,IFX0078,iFixit piece WiFi Antenna + Tools for iPhone 6,Wifi antenna for iPhone 6 together with the to...,19.95,199.904,1,21485407
13553,APP1780,Apple Watch Series 1 38mm Aluminum Gold Case R...,Apple Watch 38 mm Series 1 sports-strap in pin...,339.0,3.389.997,0,24885185
13348,OTT0163,OtterBox Defender iPhone Case Triple Layer 7 B...,An ultra-resistant protective layers 3 iPhone 7,49.99,349.896,0,11865403


### Looking at the `products.promo_price`

In [None]:
# Count promo prices with two decimal points or three or more decimals —
# the same checks applied to `price`. Raw strings avoid the invalid
# "\d" / "\." escape sequences of the non-raw literals.
promo_as_text = products.promo_price.astype(str)
promo_is_malformed = (
    promo_as_text.str.contains(r"\d+\.\d+\.\d+")
    | promo_as_text.str.contains(r"\d+\.\d{3,}")
)
promo_problems_qty = int(promo_is_malformed.sum())
promo_problems_qty

9232

In [None]:
# Percentage of malformed (True) vs. well-formed (False) promo prices.
# Raw strings avoid the invalid "\d" / "\." escape sequences.
promo_as_text = products.promo_price.astype(str)
promo_is_malformed = (
    promo_as_text.str.contains(r"\d+\.\d+\.\d+")
    | promo_as_text.str.contains(r"\d+\.\d{3,}")
)
promo_is_malformed.value_counts(normalize=True) * 100

True    92.40
False    7.60
Name: promo_price, dtype: float64

Over 90% of the values in the `promo_price` column are corrupted. The column is therefore unreliable, so I drop it.

In [None]:
# Build the cleaned table as a new frame without promo_price; `products` itself is left unchanged.
products_cl = products.drop(columns=['promo_price'])

In [None]:
# First rows of the cleaned table, to confirm promo_price is gone.
products_cl.head()

Unnamed: 0,sku,name,desc,price,in_stock,type
0,RAI0007,Silver Rain Design mStand Support,Aluminum support compatible with all MacBook,59.99,1,8696
1,APP0023,Apple Mac Keyboard Keypad Spanish,USB ultrathin keyboard Apple Mac Spanish.,59.0,0,13855401
2,APP0025,Mighty Mouse Apple Mouse for Mac,mouse Apple USB cable.,59.0,0,1387
3,APP0072,Apple Dock to USB Cable iPhone and iPod white,IPhone dock and USB Cable Apple iPod.,25.0,0,1230
4,KIN0007,Mac Memory Kingston 2GB 667MHz DDR2 SO-DIMM,2GB RAM Mac mini and iMac (2006/07) MacBook Pr...,34.99,1,1364


In [None]:
# Remaining missing values per column in the cleaned table.
products_cl.isna().sum()

sku          0
name         0
desc         0
price        0
in_stock     0
type        46
dtype: int64

In [None]:
# Rows that have no product type.
type_missing = products_cl['type'].isna()
products_cl[type_missing]

Unnamed: 0,sku,name,desc,price,in_stock,type
307,SAN0017,SanDisk Cruzer Edge USB 2.0 Flash Drive 16GB,Pendrive USB 2.0 Mac and PC.,8.0,0,
530,SAN0026,SanDisk Extreme Cruzer 16GB USB 3.0 Flash Drive,USB 3.0 flash drive 16GB USB Mac and PC.,22.0,0,
798,APP0675,Apple iPhone 5S 32GB Space Gray,New Free iPhone 5S 32GB (ME435Y / A).,559.0,0,
1193,APP0823,Apple iPhone 6 16GB Silver,New iPhone 6 16GB Free (MG482QL / A).,639.0,0,
1199,APP0829,Apple iPhone 6 Plus 16GB Silver,New iPhone 6 Plus 16G Free (MGA92QL / A).,749.0,0,
1200,APP0822,Apple iPhone 6 16GB Space Gray,New iPhone 6 16GB Free (MG472QL / A).,639.0,0,
1201,APP0825,Apple iPhone 6 64GB Space Gray,New iPhone 6 64GB Free (MG4F2QL / A).,749.0,0,
1202,APP0826,Apple iPhone 6 64GB Silver,New iPhone 6 64GB Free (MG4H2QL / A).,749.0,0,
1203,APP0828,Apple iPhone 6 Plus 16GB Space Gray,New 16GB iPhone 6 Plus Free (MGA82QL / A).,749.0,0,
1280,APP0856,Apple iPhone 6 128GB Gold,New Free iPhone 6 128GB (MG4E2QL / A).,899.0,0,


In [None]:
# Of the rows without a type, which are currently in stock?
type_missing = products_cl['type'].isna()
is_in_stock = products_cl['in_stock'] == 1
products_cl[type_missing & is_in_stock]

Unnamed: 0,sku,name,desc,price,in_stock,type
18822,DOD0010,Dodocool Sports Wireless Stereo Headphones Black,Wireless stereo headphones with remote control...,24.99,1,


Except for one row, all products with a missing `type` are not in stock. Hence, the rows with a missing `type` are deleted.

In [None]:
# Remove every row without a product type. Mutates `products_cl` in place.
products_cl.dropna(subset=['type'], inplace=True)

In [None]:
# Dtypes and non-null counts after dropping rows with a missing type.
# NOTE(review): the saved output already lists `price` as float64 although the
# numeric conversion happens in the cells below — the cells appear to have been
# run out of order; re-run with "Restart & Run All" to refresh the output.
products_cl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9945 entries, 0 to 19325
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sku       9945 non-null   object 
 1   name      9945 non-null   object 
 2   desc      9945 non-null   object 
 3   price     9945 non-null   float64
 4   in_stock  9945 non-null   int64  
 5   type      9945 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 543.9+ KB


In [None]:
# Convert the (string) price column of `products` to a numeric dtype.
# `assign` returns a new frame instead of writing into `products`, which was
# produced by boolean filtering; assigning a column into such a frame can
# raise SettingWithCopyWarning.
products = products.assign(price=pd.to_numeric(products['price']))

In [None]:
# Convert the price column of the cleaned table to a numeric dtype.
# The values are read from `products_cl` itself rather than from `products`,
# so the cell no longer depends on index alignment with another frame or on
# that frame having been converted first.
products_cl['price'] = pd.to_numeric(products_cl['price'])

In [None]:
# Confirm the final dtypes of the cleaned table (price should be numeric).
products_cl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9945 entries, 0 to 19325
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sku       9945 non-null   object 
 1   name      9945 non-null   object 
 2   desc      9945 non-null   object 
 3   price     9945 non-null   float64
 4   in_stock  9945 non-null   int64  
 5   type      9945 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 543.9+ KB


In [None]:
# Final check: missing values per column in the cleaned table.
products_cl.isna().sum()

sku         0
name        0
desc        0
price       0
in_stock    0
type        0
dtype: int64

In [None]:
# Export the cleaned table as CSV and Parquet in the working directory.
# In Google Colab the files are additionally offered as browser downloads;
# outside Colab the import is unavailable, so the download step is skipped
# instead of failing before anything is written.
try:
    from google.colab import files
except ImportError:  # not running inside Google Colab
    files = None

products_cl.to_csv('products_cl.csv', index=False)
# NOTE(review): unlike the CSV, the Parquet file keeps the DataFrame index
# (pandas default) — confirm this difference is intended.
products_cl.to_parquet('products_cl.parquet')

if files is not None:
    files.download('products_cl.csv')
    files.download('products_cl.parquet')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>