# Pandas Data Cleaning Example
This notebook demonstrates how to use all key pandas functions for data cleaning using a sample CSV file.

In [1]:
import pandas as pd

# Load CSV file
df = pd.read_csv('sample_data_cleaning.csv')
df

Unnamed: 0,Name,Age,Gender,City,Salary
0,Alice,25.0,F,New York,70000.0
1,Bob,30.0,M,Los Angeles,80000.0
2,Charlie,,M,Chicago,50000.0
3,David,22.0,M,,60000.0
4,,28.0,F,Houston,


## Data Exploration

In [2]:
# Basic info
df.head()
df.tail()
df.info()
df.describe(include='all')
df.shape
df.columns
df.index
df.dtypes
df.values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    4 non-null      object 
 1   Age     4 non-null      float64
 2   Gender  5 non-null      object 
 3   City    4 non-null      object 
 4   Salary  4 non-null      float64
dtypes: float64(2), object(3)
memory usage: 332.0+ bytes


array([['Alice', 25.0, 'F', 'New York', 70000.0],
       ['Bob', 30.0, 'M', 'Los Angeles', 80000.0],
       ['Charlie', nan, 'M', 'Chicago', 50000.0],
       ['David', 22.0, 'M', nan, 60000.0],
       [nan, 28.0, 'F', 'Houston', nan]], dtype=object)

## Handling Missing Values

In [9]:
# Check for missing values
df.isnull()
df.isnull().sum()



Name              1
Age               1
Gender            0
City              1
Salary            1
Tax               1
SalaryAfterTax    1
dtype: int64

In [7]:
# Drop rows/columns
df.dropna()
df.dropna(axis=1)

Unnamed: 0,Gender
0,F
1,M
2,M
3,M
4,F


In [8]:
# Fill missing values
df.fillna(0)
df.fillna(method='ffill')
df.fillna(method='bfill')

  df.fillna(method='ffill')
  df.fillna(method='bfill')


Unnamed: 0,Name,Age,Gender,City,Salary,Tax,SalaryAfterTax
0,Alice,25.0,F,New York,70000.0,7000.0,63000.0
1,Bob,30.0,M,Los Angeles,80000.0,8000.0,72000.0
2,Charlie,22.0,M,Chicago,50000.0,5000.0,45000.0
3,David,22.0,M,Houston,60000.0,6000.0,54000.0
4,,28.0,F,Houston,,,


## Data Manipulation

In [4]:
# Rename columns
df.rename(columns={'Name': 'FullName'})
# Replace values
df.replace({'M': 'Male', 'F': 'Female'})
# Drop columns
df.drop('City', axis=1)
# Filter rows
df[df['Age'] > 25]
# Add new column
df['Tax'] = df['Salary'] * 0.1
# Apply function
df['SalaryAfterTax'] = df['Salary'].apply(lambda x: x * 0.9 if pd.notnull(x) else x)

## Sorting and Grouping

In [5]:
# Sort
df.sort_values(by='Age')
# Group
df.groupby('Gender')['Salary'].mean()
# Aggregation
df.agg({'Age': ['mean', 'max'], 'Salary': ['sum', 'min']})

Unnamed: 0,Age,Salary
mean,26.25,
max,30.0,
sum,,260000.0
min,,50000.0


## Merging and Joining

In [6]:
# Create another DataFrame
df2 = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Bonus': [5000, 4000, 3000]
})

# Merge
pd.merge(df, df2, how='left', left_on='Name', right_on='Name')
# Join
df.set_index('Name').join(df2.set_index('Name'), how='left')

Unnamed: 0_level_0,Age,Gender,City,Salary,Tax,SalaryAfterTax,Bonus
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alice,25.0,F,New York,70000.0,7000.0,63000.0,5000.0
Bob,30.0,M,Los Angeles,80000.0,8000.0,72000.0,4000.0
Charlie,,M,Chicago,50000.0,5000.0,45000.0,3000.0
David,22.0,M,,60000.0,6000.0,54000.0,
,28.0,F,Houston,,,,
