In [1]:
import pandas as pd

# 1. Load the Data (example dataset).
# Row 4 is an exact copy of row 0, and each column contains at least one
# missing entry, so the frame exercises both cleaning steps below.
data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Alice", None, "Eve"],
    "Age": [25, None, 30, 22, 25, 28, None],
    "City": ["NY", "LA", None, "LA", "NY", "NY", "SF"]
}
df = pd.DataFrame(data)

# 2. Handling Missing Values
# 2.1 Identify Missing Values
print("Missing values per column:\n", df.isnull().sum())

# 2.2 Fill Missing Values
# The fill statistic is computed over de-duplicated rows: section 3 discards
# repeated rows as errors, so a repeated record must not be weighted twice
# when deriving the value that stands in for the missing ages.
age_mean = df.drop_duplicates()["Age"].mean()
df["Age"] = df["Age"].fillna(age_mean)          # Fill Age with mean
df["City"] = df["City"].fillna("Unknown")      # Fill City with "Unknown"
df["Name"] = df["Name"].fillna("Unknown")      # Fill Name with "Unknown"

# 3. Handling Duplicates
# 3.1 Identify Duplicates
# duplicated() flags every repeat of an earlier row (the first occurrence stays False).
print("\nDuplicates in dataframe:\n", df.duplicated())

# 3.2 Remove Duplicates
# drop_duplicates() returns a new frame; the surviving rows keep their original index labels.
df_cleaned = df.drop_duplicates()

# 4. Combined Practice on New Dataset
new_data = {
    "Name": ["Tom", "Jerry", "Tom", None, "Spike", "Jerry"],
    "Age": [34, 29, 34, 22, None, 29],
    "City": ["Boston", None, "Boston", "Boston", "Chicago", None]
}
new_df = pd.DataFrame(new_data)

# Handling missing values
# Same rule as in 2.2: take the median over de-duplicated rows so repeated
# records do not pull the fill value towards themselves.
age_median = new_df.drop_duplicates()["Age"].median()
new_df["Age"] = new_df["Age"].fillna(age_median)
new_df["City"] = new_df["City"].fillna("Unknown")
new_df["Name"] = new_df["Name"].fillna("Unknown")

# Remove duplicates
new_df_cleaned = new_df.drop_duplicates()

print("\nCleaned new dataframe:\n", new_df_cleaned)


Missing values per column:
 Name    1
Age     2
City    1
dtype: int64

Duplicates in dataframe:
 0    False
1    False
2    False
3    False
4     True
5    False
6    False
dtype: bool

Cleaned new dataframe:
       Name   Age     City
0      Tom  34.0   Boston
1    Jerry  29.0  Unknown
3  Unknown  22.0   Boston
4    Spike  29.0  Chicago


In [2]:
import pandas as pd

# 1. Standardize Text Data
# Names arrive in mixed casing and ages as floats; both are normalised while
# the frame is being built.
data = {
    "Name": ["Alice", "BOB", "Charlie", "david", "EVE"],
    "Age": [25.7, 30.2, 22.9, 27.5, 31.1],
}
df = pd.DataFrame(data).assign(
    # 1.1 Convert All Names to Lowercase
    Name=lambda people: people["Name"].str.lower(),
    # 2. Format Numerical Data
    # 2.1 Round Age to a whole number, then store it with an integer dtype
    Age=lambda people: people["Age"].round().astype(int),
)

# 3. Combined Practice on Another Dataset
product_data = {
    "Product": ["iPhone", "samsung", "PIXEL", "iphone", "Samsung"],
    "Price": [999.995, 799.1234, 899.987, 999.50, 799.1],
}
prod_df = pd.DataFrame(product_data).assign(
    # 3.1 Standardize Product Names (lowercase)
    Product=lambda products: products["Product"].str.lower(),
    # 3.2 Format Prices to Two Decimal Places
    Price=lambda products: products["Price"].round(2),
)

# Show both cleaned tables, people first and products second.
for table in (df, prod_df):
    print(table)


      Name  Age
0    alice   26
1      bob   30
2  charlie   23
3    david   28
4      eve   31
   Product    Price
0   iphone  1000.00
1  samsung   799.12
2    pixel   899.99
3   iphone   999.50
4  samsung   799.10
