In [3]:
import pandas as pd
import numpy as np

# Example DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Eve'],
    'Age': [25, np.nan, 35, 120, 28, -5, 28],
    'Salary': [50000, 60000, 70000, 80000, None, 45000, None]
}
df = pd.DataFrame(data)


In [4]:
# ✅ Import Libraries
import pandas as pd
import numpy as np

# ✅ Sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Eve'],
    'Age': [25, np.nan, 35, 120, 28, -5, 28],
    'Salary': [50000, 60000, 70000, 80000, None, 45000, None]
}

df = pd.DataFrame(data)
print("Original DataFrame:\n", df)

# -----------------------------------------
# 🔍 1. Handling Null Values
# -----------------------------------------
print("\n🔍 Null Values (Before):\n", df.isnull().sum())

# Option 1: Drop rows with any null values
df_no_nulls = df.dropna()

# Option 2: Fill nulls with median or mean
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)

print("\n✅ Null Values (After Fill):\n", df.isnull().sum())

# -----------------------------------------
# 🚨 2. Handling Outliers (Using IQR)
# -----------------------------------------
def remove_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]

df = remove_outliers_iqr(df, 'Age')
df = remove_outliers_iqr(df, 'Salary')

print("\n✅ After Removing Outliers:\n", df)

# -----------------------------------------
# 🔁 3. Handling Duplicates
# -----------------------------------------
print("\n🔁 Duplicates Found:", df.duplicated().sum())
df = df.drop_duplicates()

print("\n✅ Final Cleaned DataFrame:\n", df)


Original DataFrame:
       Name    Age   Salary
0    Alice   25.0  50000.0
1      Bob    NaN  60000.0
2  Charlie   35.0  70000.0
3    David  120.0  80000.0
4      Eve   28.0      NaN
5    Frank   -5.0  45000.0
6      Eve   28.0      NaN

🔍 Null Values (Before):
 Name      0
Age       1
Salary    2
dtype: int64

✅ Null Values (After Fill):
 Name      0
Age       0
Salary    0
dtype: int64

✅ After Removing Outliers:
   Name   Age   Salary
1  Bob  28.0  60000.0
4  Eve  28.0  61000.0
6  Eve  28.0  61000.0

🔁 Duplicates Found: 1

✅ Final Cleaned DataFrame:
   Name   Age   Salary
1  Bob  28.0  60000.0
4  Eve  28.0  61000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].mean(), inplace=True)


In [5]:
def remove_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]

# Remove outliers from 'Age'
df = remove_outliers_iqr(df, 'Age')


In [6]:
# Check for duplicates
print(df.duplicated().sum())

# Remove duplicate rows
df = df.drop_duplicates()


0


In [7]:
print(df)


  Name   Age   Salary
1  Bob  28.0  60000.0
4  Eve  28.0  61000.0
