# Identifying and Handling Missing Data
# step 1 Finding missing values

In [1]:
#import libraries

import pandas as pd
import numpy as np

In [4]:
#Load dataset

df=pd.read_csv('data_cleaning_in_python.csv')
df.head(5)

Unnamed: 0,customer_id,Product Name,Price,Quantity,Customer_Type,Order_Date,customer_email
0,1027,laptop,169.12,3,PREMIUM,,bob@email.com
1,1046,Smartphone,1189.32,4,PREMIUM,,charlie@email.com
2,1097,MOUSE,MISSING,2,Regular,2024-01-06,eve@email.com
3,1042,laptop,MISSING,5,Premium,2024-01-21,alice@email.com
4,1045,headphones,MISSING,3,regular,2024-01-04,charlie@email.com


In [5]:
# info about the df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   customer_id       230 non-null    int64 
 1    Product Name     230 non-null    object
 2   Price             230 non-null    object
 3   Quantity          230 non-null    int64 
 4   Customer_Type     230 non-null    object
 5   Order_Date        99 non-null     object
 6    customer_email   230 non-null    object
dtypes: int64(2), object(5)
memory usage: 12.7+ KB


In [6]:
#missing values
df.isnull().sum()

customer_id           0
 Product Name         0
Price                 0
Quantity              0
Customer_Type         0
Order_Date          131
 customer_email       0
dtype: int64

In [7]:
# Let's also check for 'MISSING' and empty strings
print("\nOther forms of missing data:")
print("'MISSING' values in Price column:", (df['Price'] == 'MISSING').sum())
print("Empty strings in Order_Date:", (df['Order_Date'] == '').sum())


Other forms of missing data:
'MISSING' values in Price column: 118
Empty strings in Order_Date: 0


## Step 2: Cleaning Missing Data

In [9]:
# First, let's replace 'MISSING' and empty strings with actual NaN valuespr

df['Price'] = df['Price'].replace('MISSING', np.nan)

In [10]:
df.isnull().sum()

customer_id           0
 Product Name         0
Price               118
Quantity              0
Customer_Type         0
Order_Date          131
 customer_email       0
dtype: int64

In [12]:
# Strategy 1: Drop rows with missing values (use carefully!)
df_drop_missing = df.dropna()


df_drop_missing.shape

(49, 7)

In [13]:
# Strategy 2: Fill missing values with appropriate replacements
df_filled = df.copy()

In [15]:
# Fill missing prices with the median price (after converting to numeric)

#convert price to numeric column
df_filled['Price'] = pd.to_numeric(df_filled['Price'], errors='coerce')

In [17]:
# Calculate median price

median_price= df_filled['Price'].median()

In [20]:
#replace missing price with median price
df_filled['Price']= df_filled['Price'].fillna(median_price)

In [21]:
df_filled.isnull().sum()

customer_id           0
 Product Name         0
Price                 0
Quantity              0
Customer_Type         0
Order_Date          131
 customer_email       0
dtype: int64

In [24]:
df_filled['Order_Date']= df_filled['Order_Date'].fillna('2024-01-22')

In [25]:
df_filled.isnull().sum()

customer_id         0
 Product Name       0
Price               0
Quantity            0
Customer_Type       0
Order_Date          0
 customer_email     0
dtype: int64

## ️ Cleaning Column Names and Data Types
### Fixing column names

In [26]:
df_filled.columns

Index(['customer_id', ' Product Name ', 'Price', 'Quantity', 'Customer_Type',
       'Order_Date', ' customer_email '],
      dtype='object')

In [27]:
# Method 1: Strip spaces and standardize
df_filled.columns = df_filled.columns.str.strip() # Remove leading/trailing spaces

In [28]:
df_filled.columns

Index(['customer_id', 'Product Name', 'Price', 'Quantity', 'Customer_Type',
       'Order_Date', 'customer_email'],
      dtype='object')

In [29]:
df_filled=df_filled.rename(columns= {
    "Product Name" : "product_names",
    "Price" : "price",
    "Quantity" : "quantity",
    "Customer Type" : "customer_type"
})

df_filled.columns

Index(['customer_id', 'product_names', 'price', 'quantity', 'Customer_Type',
       'Order_Date', 'customer_email'],
      dtype='object')

# cleaning data text

In [30]:
df_filled.head()

Unnamed: 0,customer_id,product_names,price,quantity,Customer_Type,Order_Date,customer_email
0,1027,laptop,169.12,3,PREMIUM,2024-01-22,bob@email.com
1,1046,Smartphone,1189.32,4,PREMIUM,2024-01-22,charlie@email.com
2,1097,MOUSE,527.895,2,Regular,2024-01-06,eve@email.com
3,1042,laptop,527.895,5,Premium,2024-01-21,alice@email.com
4,1045,headphones,527.895,3,regular,2024-01-04,charlie@email.com


 ## 🧽 Cleaning Text Data
### Standardizing Text Fields

In [32]:
print("Get duplicates")

print(df_filled.duplicated().sum())

Get duplicates
10


In [33]:
df_filled=df_filled.duplicates().sum()

print("duplicate rows:",print(df_filled.duplicated().sum()))

AttributeError: 'DataFrame' object has no attribute 'duplicates'