## Data Cleaning in Pandas

In [1]:
import pandas as pd

In [2]:
# Build a small employee table from row tuples. Two values are deliberately
# left as None (Tariq's salary, Ayesha's age) so that the frame contains
# missing data to clean.
columns = ("Name", "Age", "Department", "Salary")
rows = [
    ("Nouman", 27, "IT", 78000),
    ("Ahsan", 31, "HR", 62000),
    ("Tariq", 29, "Finance", None),
    ("Ayesha", None, "Marketing", 73000),
    ("Sara", 26, "Sales", 50000),
    ("Hira", 33, "IT", 81000),
]
# One dict per employee, keyed by column name.
data = [dict(zip(columns, row)) for row in rows]
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Department,Salary
0,Nouman,27.0,IT,78000.0
1,Ahsan,31.0,HR,62000.0
2,Tariq,29.0,Finance,
3,Ayesha,,Marketing,73000.0
4,Sara,26.0,Sales,50000.0
5,Hira,33.0,IT,81000.0


### Check for null values 

In [3]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

Name          0
Age           1
Department    0
Salary        1
dtype: int64

### Get rid of rows with null values

In [4]:
# With no arguments, dropna() returns a copy without any row that contains
# at least one missing value; df itself is left unchanged.
df.dropna()

Unnamed: 0,Name,Age,Department,Salary
0,Nouman,27.0,IT,78000.0
1,Ahsan,31.0,HR,62000.0
4,Sara,26.0,Sales,50000.0
5,Hira,33.0,IT,81000.0


In [5]:
# how="any" drops a row as soon as one of its values is missing, which gives
# the same result as the bare dropna() call.
df.dropna(how="any")

Unnamed: 0,Name,Age,Department,Salary
0,Nouman,27.0,IT,78000.0
1,Ahsan,31.0,HR,62000.0
4,Sara,26.0,Sales,50000.0
5,Hira,33.0,IT,81000.0


In [6]:
# how="all" only drops rows in which every value is missing; no row here
# qualifies, so all six rows are returned.
df.dropna(how="all")

Unnamed: 0,Name,Age,Department,Salary
0,Nouman,27.0,IT,78000.0
1,Ahsan,31.0,HR,62000.0
2,Tariq,29.0,Finance,
3,Ayesha,,Marketing,73000.0
4,Sara,26.0,Sales,50000.0
5,Hira,33.0,IT,81000.0


### Fill the missing values

In [7]:
# Replace every missing value with 0. Convenient, but not good practice: a 0
# age or salary is not a real observation and skews statistics such as the mean.
df.fillna(value=0)

Unnamed: 0,Name,Age,Department,Salary
0,Nouman,27.0,IT,78000.0
1,Ahsan,31.0,HR,62000.0
2,Tariq,29.0,Finance,0.0
3,Ayesha,0.0,Marketing,73000.0
4,Sara,26.0,Sales,50000.0
5,Hira,33.0,IT,81000.0


In [8]:
# Fill the missing age with the column mean (NaN is skipped when averaging)
# and write the result back into df.
mean_age = df["Age"].mean()
df["Age"] = df["Age"].fillna(mean_age)

In [9]:
# Fill the missing salary with the column median. The result is only
# displayed, not assigned, so df["Salary"] still contains the NaN afterwards.
median_salary = df["Salary"].median()
df["Salary"].fillna(median_salary)

0    78000.0
1    62000.0
2    73000.0
3    73000.0
4    50000.0
5    81000.0
Name: Salary, dtype: float64

### Non-Mathematical Fill

In [10]:
# Forward fill: a missing value is replaced by the last valid value above it.
# NOTE(review): "Age" was already mean-filled in an earlier cell, so there is
# nothing left to fill here and the column comes back unchanged.
df["Age"].ffill()

0    27.0
1    31.0
2    29.0
3    29.2
4    26.0
5    33.0
Name: Age, dtype: float64

In [11]:
# Backward fill: a missing value is replaced by the next valid value below it.
# NOTE(review): "Age" was already mean-filled in an earlier cell, so there is
# nothing left to fill here and the column comes back unchanged.
df["Age"].bfill()

0    27.0
1    31.0
2    29.0
3    29.2
4    26.0
5    33.0
Name: Age, dtype: float64

### Replacing Specific Values

In [12]:
# Rename one employee: map the old value to the new one and store the
# updated column back into df.
df["Name"] = df["Name"].replace({"Nouman": "Nomii"})
df

Unnamed: 0,Name,Age,Department,Salary
0,Nomii,27.0,IT,78000.0
1,Ahsan,31.0,HR,62000.0
2,Tariq,29.0,Finance,
3,Ayesha,29.2,Marketing,73000.0
4,Sara,26.0,Sales,50000.0
5,Hira,33.0,IT,81000.0


In [13]:
# Same replacement expressed at the DataFrame level: the outer key selects the
# column, the inner dict maps old value -> new value.
# Calling df["Name"].replace(..., inplace=True) instead acts on a temporary
# copy of the column; pandas warns about it and from 3.0 on it leaves df
# untouched, so the result is assigned back explicitly.
df = df.replace({"Name": {"Nouman": "Nomii"}})
df


Unnamed: 0,Name,Age,Department,Salary
0,Nomii,27.0,IT,78000.0
1,Ahsan,31.0,HR,62000.0
2,Tariq,29.0,Finance,
3,Ayesha,29.2,Marketing,73000.0
4,Sara,26.0,Sales,50000.0
5,Hira,33.0,IT,81000.0
