In [6]:
import pandas as pd

# Sample DataFrame with missing values
data = {
    "age": [25, None, 35, None, 45],
    "income": [50000, 60000, 70000, 80000, 90000],
    "status": ["employed", "unemployed", "employed", "unemployed", "employed"]
}
df = pd.DataFrame(data)

# Fill missing 'age' values:
# If status is 'employed', fill missing age with 40,
# else fill missing age with 30
df["age"] = df.apply(
    lambda row: 40 if pd.isna(row["age"]) and row["status"] == "employed"
    else (30 if pd.isna(row["age"]) and row["status"] == "unemployed" else row["age"]),
    axis=1
)

print(df)


    age  income      status
0  25.0   50000    employed
1  30.0   60000  unemployed
2  35.0   70000    employed
3  30.0   80000  unemployed
4  45.0   90000    employed


In [7]:

import pandas as pd
import numpy as np

# Sample DataFrame with outliers
data = {
    "age": [25, 30, 35, 40, 100, 28, 27, 29, 31, 32],
    "income": [50000, 52000, 51000, 48000, 600000, 49500, 50500, 49000, 51500, 53000]
}
df = pd.DataFrame(data)

# Calculate z-scores for the 'age' column
df['age_zscore'] = (df['age'] - df['age'].mean()) / df['age'].std()

# Define threshold for outlier detection (e.g., abs(z) > 3)
threshold = 3

# Filter out outliers
df_no_outliers = df[df['age_zscore'].abs() <= threshold].drop(columns=['age_zscore'])

print(df_no_outliers)

   age  income
0   25   50000
1   30   52000
2   35   51000
3   40   48000
4  100  600000
5   28   49500
6   27   50500
7   29   49000
8   31   51500
9   32   53000


In [8]:
import pandas as pd
import numpy as np

# Sample DataFrame with missing values and Age as floats
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [25.0, np.nan, 30.5, np.nan]
}
df = pd.DataFrame(data)

# Fill missing 'Age' values with the mean age
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)

# Convert 'Age' column to integers
df['Age'] = df['Age'].astype(int)

print(df)


      Name  Age
0    Alice   25
1      Bob   27
2  Charlie   30
3    David   27


In [9]:
import pandas as pd

def clean_data(df, fillna_values=None):
    """
    Automate data cleaning:
    - Fill missing values (dict: column -> value)
    - Remove duplicates
    - Standardize column names (lowercase, strip spaces)
    
    Args:
        df (pd.DataFrame): Input DataFrame
        fillna_values (dict, optional): Dict with column names as keys and fill values as values
    
    Returns:
        pd.DataFrame: Cleaned DataFrame
    """
    df_clean = df.copy()
    
    # Standardize column names: lowercase and strip spaces
    df_clean.columns = df_clean.columns.str.lower().str.strip()
    
    # Fill missing values if specified
    if fillna_values:
        df_clean.fillna(value=fillna_values, inplace=True)
    
    # Remove duplicates
    df_clean.drop_duplicates(inplace=True)
    
    return df_clean

# Example usage
data = {
    " Name ": ["Alice", "Bob", "Alice", "Charlie"],
    "Age": [25, None, 25, 30]
}
df = pd.DataFrame(data)

cleaned_df = clean_data(df, fillna_values={"age": 28})
print(cleaned_df)


      name   age
0    Alice  25.0
1      Bob  28.0
3  Charlie  30.0


In [10]:
import pandas as pd

def min_max_normalize(series):
    """
    Normalize a pandas Series using min-max scaling to range [0, 1].
    """
    min_val = series.min()
    max_val = series.max()
    return (series - min_val) / (max_val - min_val)

# Example usage
data = {
    "score": [50, 20, 30, 80, 100]
}
df = pd.DataFrame(data)

df["score_normalized"] = min_max_normalize(df["score"])
print(df)


   score  score_normalized
0     50             0.375
1     20             0.000
2     30             0.125
3     80             0.750
4    100             1.000
