In [1]:
import pandas as pd

# Example dataset
data = {
    "Name": ["Alice", "Bob", "Charlie", None, "Eve"],
    "Age": [25, 30, None, 40, 29],
    "Salary": [50000, 60000, 55000, None, 70000],
    "Department": ["HR", "IT", "Finance", "IT", None]
}
df = pd.DataFrame(data)

print(df.head())
print("\nData Info:")
print(df.info())
print("\nSummary Stats:")
print(df.describe(include="all"))


      Name   Age   Salary Department
0    Alice  25.0  50000.0         HR
1      Bob  30.0  60000.0         IT
2  Charlie   NaN  55000.0    Finance
3     None  40.0      NaN         IT
4      Eve  29.0  70000.0       None

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        4 non-null      object 
 1   Age         4 non-null      float64
 2   Salary      4 non-null      float64
 3   Department  4 non-null      object 
dtypes: float64(2), object(2)
memory usage: 292.0+ bytes
None

Summary Stats:
         Name        Age        Salary Department
count       4   4.000000      4.000000          4
unique      4        NaN           NaN          3
top     Alice        NaN           NaN         IT
freq        1        NaN           NaN          2
mean      NaN  31.000000  58750.000000        NaN
std       NaN   6.377042   8539.125638    

In [2]:
# Create example
df = pd.DataFrame({
    "Name": ["A", "B", None, "D", "E"],
    "Age": [23, None, 25, None, 29],
    "Salary": [50000, 52000, None, None, 60000]
})

# Check missing
print(df.isnull().sum())

# Drop missing rows
df_drop_rows = df.dropna()

# Drop missing columns
df_drop_cols = df.dropna(axis=1)

# Fill missing with constant
df_fill_const = df.fillna("Unknown")

# Fill numerical with mean/median/mode
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Salary"] = df["Salary"].fillna(df["Salary"].median())

# Forward/Backward fill -- for time series data
df_ffill = df.fillna(method="ffill")
df_bfill = df.fillna(method="bfill")


Name      1
Age       2
Salary    2
dtype: int64


  df_ffill = df.fillna(method="ffill")
  df_bfill = df.fillna(method="bfill")


In [3]:
df = pd.DataFrame({
    "ID": [1, 2, 2, 3, 3, 3],
    "Name": ["A", "B", "B", "C", "C", "C"]
})

# Find duplicates
print(df.duplicated())

# Drop duplicates
df_unique = df.drop_duplicates()

df_unique

0    False
1    False
2     True
3    False
4     True
5     True
dtype: bool


Unnamed: 0,ID,Name
0,1,A
1,2,B
3,3,C


In [4]:
df = pd.DataFrame({
    "Department": ["IT", "hr", "Finance", "FINANCE", "it", None]
})

# Lowercase normalization
df["Department"] = df["Department"].str.lower()

# Replace variants
df["Department"] = df["Department"].replace({"finance": "fin", "it": "tech"})

# Fill missing categorical
df["Department"] = df["Department"].fillna("unknown")

# Encoding categories
df["Dept_code"] = df["Department"].astype("category").cat.codes

df

Unnamed: 0,Department,Dept_code
0,tech,2
1,hr,1
2,fin,0
3,fin,0
4,tech,2
5,unknown,3


In [5]:
df = pd.DataFrame({
    "Salary": ["50,000", "60k", "70,000", None, "80K"]
})

# Remove symbols/letters and convert
df["Salary"] = df["Salary"].replace({",": "", "k": "000", "K": "000"}, regex=True)
df["Salary"] = pd.to_numeric(df["Salary"], errors="coerce")

# Fill missing
df["Salary"] = df["Salary"].fillna(df["Salary"].median())

df

Unnamed: 0,Salary
0,50000.0
1,60000.0
2,70000.0
3,65000.0
4,80000.0


In [6]:
df = pd.DataFrame({
    "JoinDate": ["2021/01/01", "2020-05-15", "15-06-2019", None]
})

# Convert to datetime
df["JoinDate"] = pd.to_datetime(df["JoinDate"], errors="coerce", dayfirst=True)

# Extract features
df["Year"] = df["JoinDate"].dt.year
df["Month"] = df["JoinDate"].dt.month
df["Day"] = df["JoinDate"].dt.day
df["Weekday"] = df["JoinDate"].dt.day_name()
df

Unnamed: 0,JoinDate,Year,Month,Day,Weekday
0,2021-01-01,2021.0,1.0,1.0,Friday
1,NaT,,,,
2,NaT,,,,
3,NaT,,,,
