In [9]:
# Question 1: Handling Missing Values with Conditional Filling
# Description: Fill missing values in a specific column based on a condition from another column.
import pandas as pd
import numpy as np

# Sample DataFrame
data = {'ID': [1, 2, 3, 4, 5],
        'Age': [25, np.nan, 30, np.nan, 28],
        'Gender': ['M', 'F', 'M', 'F', 'F'],
        'Salary': [50000, np.nan, 60000, 55000, np.nan]}

df = pd.DataFrame(data)

# Fill missing 'Age' based on the average 'Age' by 'Gender'
df['Age'] = df.groupby('Gender')['Age'].transform(lambda x: x.fillna(x.mean()))

# Fill missing 'Salary' based on the average 'Salary' by 'Gender'
df['Salary'] = df.groupby('Gender')['Salary'].transform(lambda x: x.fillna(x.mean()))

print(df)



   ID   Age Gender   Salary
0   1  25.0      M  50000.0
1   2  28.0      F  55000.0
2   3  30.0      M  60000.0
3   4  28.0      F  55000.0
4   5  28.0      F  55000.0


In [10]:
# Question 2: Removing Outliers by Rescaling
# Description: Remove outliers by standardizing a numerical column using z-scores.

import pandas as pd
import numpy as np
from scipy.stats import zscore

# Sample DataFrame
data = {'ID': [1, 2, 3, 4, 5, 6, 7],
        'Salary': [50000, 45000, 60000, 700000, 55000, 65000, 48000]}

df = pd.DataFrame(data)

# Calculate Z-scores for the 'Salary' column
df['Z_Score'] = zscore(df['Salary'])

# Identify outliers (Z-score > 3 or Z-score < -3)
outliers = df[np.abs(df['Z_Score']) > 3]

print("Outliers based on Z-score:")
print(outliers)
# Rescale outliers by replacing them with the mean
df['Salary'] = np.where(np.abs(df['Z_Score']) > 3, df['Salary'].mean(), df['Salary'])

# Drop the Z_Score column
df = df.drop(columns='Z_Score')

print("DataFrame after rescaling outliers:")
print(df)



Outliers based on Z-score:
Empty DataFrame
Columns: [ID, Salary, Z_Score]
Index: []
DataFrame after rescaling outliers:
   ID    Salary
0   1   50000.0
1   2   45000.0
2   3   60000.0
3   4  700000.0
4   5   55000.0
5   6   65000.0
6   7   48000.0


In [11]:
# Question 3: Applying Data Type Conversion
# Description: Convert the 'Age' column to integers after filling missing values.
import pandas as pd
import numpy as np

# Sample DataFrame
data = {'ID': [1, 2, 3, 4, 5],
        'Age': [25, np.nan, 30, np.nan, 28]}

df = pd.DataFrame(data)

# Step 1: Fill missing 'Age' with the mean of the column
mean_age = df['Age'].mean()  # Calculate the mean of the 'Age' column
df['Age'].fillna(mean_age, inplace=True)  # Fill missing values with the mean

# Step 2: Convert 'Age' column to integers
df['Age'] = df['Age'].astype(int)

print(df)



   ID  Age
0   1   25
1   2   27
2   3   30
3   4   27
4   5   28


In [12]:
import pandas as pd

# Function to automate data cleaning
def clean_data(df, fill_missing_method='mean', fill_value=None):
    """
    Clean the data by:
    1. Filling missing values based on a chosen method.
    2. Removing duplicates.
    3. Standardizing column names.

    Parameters:
    df (pd.DataFrame): The DataFrame to be cleaned.
    fill_missing_method (str): Method for filling missing values ('mean', 'median', 'mode').
    fill_value (any): A custom value for filling missing data (used if fill_missing_method='value').

    Returns:
    pd.DataFrame: Cleaned DataFrame.
    """
    
    # 1. Standardize column names (convert to lowercase and remove spaces)
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    # 2. Fill missing values for numeric columns
    for column in df.select_dtypes(include=['number']).columns:  # Only for numeric columns
        if fill_missing_method == 'mean':
            df[column].fillna(df[column].mean(), inplace=True)
        elif fill_missing_method == 'median':
            df[column].fillna(df[column].median(), inplace=True)
        elif fill_missing_method == 'mode':
            df[column].fillna(df[column].mode()[0], inplace=True)
        elif fill_missing_method == 'value' and fill_value is not None:
            df[column].fillna(fill_value, inplace=True)
        else:
            raise ValueError("Invalid fill_missing_method or fill_value.")

    # For non-numeric columns (like 'Name'), fill missing values with the mode
    for column in df.select_dtypes(exclude=['number']).columns:  # Only for non-numeric columns
        df[column].fillna(df[column].mode()[0], inplace=True)

    # 3. Remove duplicates
    df.drop_duplicates(inplace=True)
    
    return df

# Sample DataFrame
data = {'ID': [1, 2, 3, 4, 5, 5],
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Eve'],
        'Age': [25, 30, None, 35, None, 28],
        'Salary': [50000, 55000, 60000, None, 50000, 55000]}

df = pd.DataFrame(data)

# Apply the clean_data function
df_cleaned = clean_data(df, fill_missing_method='mean')

print(df_cleaned)


   id     name   age   salary
0   1    Alice  25.0  50000.0
1   2      Bob  30.0  55000.0
2   3  Charlie  29.5  60000.0
3   4    David  35.0  54000.0
4   5      Eve  29.5  50000.0
5   5      Eve  28.0  55000.0


In [13]:
# Question 5: Complex Data Normalization
# Description: Normalize a numeric column to a range using min-max scaling.

import pandas as pd

# Sample DataFrame
data = {'ID': [1, 2, 3, 4, 5],
        'Salary': [50000, 55000, 60000, 70000, 65000]}

df = pd.DataFrame(data)

# Step 1: Min-Max Scaling of the 'Salary' column
min_salary = df['Salary'].min()
max_salary = df['Salary'].max()

df['Normalized_Salary'] = (df['Salary'] - min_salary) / (max_salary - min_salary)

print(df)


   ID  Salary  Normalized_Salary
0   1   50000               0.00
1   2   55000               0.25
2   3   60000               0.50
3   4   70000               1.00
4   5   65000               0.75
