In [5]:
#Data Preprocessing and Cleaning in Python

import pandas as pd
import numpy as np

# Sample dataset with missing values and messy data
data = {
    "Name": ["Ram", "Shyam", "Hari", "Sita", None],
    "Age": [25, np.nan, 35, 29, 40],
    "Salary": ["20000", "30000", "N/A", "25000", "40000"],
    "Department": ["IT", "HR", None, "Finance", "IT"]
}

df = pd.DataFrame(data)

print("Original Data:")
print(df)
# 1. Handle Missing Values
df["Name"].fillna("Unknown", inplace=True)
df["Age"].fillna(df["Age"].mean(), inplace=True)
df["Department"].fillna("General", inplace=True)

# 2. Handle Incorrect Data
df["Salary"].replace("N/A", np.nan, inplace=True)
df["Salary"] = df["Salary"].astype(float)
df["Salary"].fillna(df["Salary"].mean(), inplace=True)

# 3. Remove Duplicates (if any)
df.drop_duplicates(inplace=True)

# 4. Normalize / Standardize text data
df["Department"] = df["Department"].str.upper()

print("\nCleaned Data:")
print(df)


Original Data:
    Name   Age Salary Department
0    Ram  25.0  20000         IT
1  Shyam   NaN  30000         HR
2   Hari  35.0    N/A       None
3   Sita  29.0  25000    Finance
4   None  40.0  40000         IT

Cleaned Data:
      Name    Age   Salary Department
0      Ram  25.00  20000.0         IT
1    Shyam  32.25  30000.0         HR
2     Hari  35.00  28750.0    GENERAL
3     Sita  29.00  25000.0    FINANCE
4  Unknown  40.00  40000.0         IT


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Name"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values a