In [8]:
# Question 1: Handling Missing Values with Conditional Filling
# Description: Fill missing values in a specific column based on a condition from another column.
import pandas as pd
import numpy as np

# Sample DataFrame
data = {'ID': [1, 2, 3, 4, 5],
        'Age': [25, np.nan, 30, np.nan, 28],
        'Gender': ['M', 'F', 'M', 'F', 'F'],
        'Salary': [50000, np.nan, 60000, 55000, np.nan]}

df = pd.DataFrame(data)

# Fill missing 'Age' based on the average 'Age' by 'Gender'
df['Age'] = df.groupby('Gender')['Age'].transform(lambda x: x.fillna(x.mean()))

# Fill missing 'Salary' based on the average 'Salary' by 'Gender'
df['Salary'] = df.groupby('Gender')['Salary'].transform(lambda x: x.fillna(x.mean()))

print(df)



   ID   Age Gender   Salary
0   1  25.0      M  50000.0
1   2  28.0      F  55000.0
2   3  30.0      M  60000.0
3   4  28.0      F  55000.0
4   5  28.0      F  55000.0


In [9]:
import pandas as pd
import numpy as np

# Function to automate data cleaning
def clean_data(df, fill_missing_method='mean', fill_value=None):
    """
    Clean the data by:
    1. Filling missing values based on a chosen method.
    2. Removing outliers.
    3. Converting columns to appropriate data types.
    4. Standardizing column names.

    Parameters:
    df (pd.DataFrame): The DataFrame to be cleaned.
    fill_missing_method (str): Method for filling missing values ('mean', 'median', 'mode', 'value').
    fill_value (any): A custom value for filling missing data (used if fill_missing_method='value').

    Returns:
    pd.DataFrame: Cleaned DataFrame.
    """
    
    # 1. Standardize column names (convert to lowercase and remove spaces)
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    # 2. Fill missing values for numeric columns
    valid_methods = ['mean', 'median', 'mode', 'value']
    if fill_missing_method not in valid_methods:
        raise ValueError(f"Invalid fill_missing_method: {fill_missing_method}. Valid methods are {valid_methods}.")
    
    for column in df.select_dtypes(include=['number']).columns:  # Only for numeric columns
        if fill_missing_method == 'mean':
            df[column].fillna(df[column].mean(), inplace=True)
        elif fill_missing_method == 'median':
            df[column].fillna(df[column].median(), inplace=True)
        elif fill_missing_method == 'mode':
            mode_value = df[column].mode()
            if len(mode_value) > 1:
                print(f"Warning: Ambiguous mode for '{column}', using the first value.")
                mode_value = mode_value[0]
            df[column].fillna(mode_value, inplace=True)
        elif fill_missing_method == 'value' and fill_value is not None:
            df[column].fillna(fill_value, inplace=True)

    # 3. Handle missing values for non-numeric columns
    for column in df.select_dtypes(exclude=['number']).columns:  # Only for non-numeric columns
        mode_value = df[column].mode()
        if len(mode_value) > 1:
            print(f"Warning: Ambiguous mode for '{column}', using the first value.")
            mode_value = mode_value[0]
        df[column].fillna(mode_value, inplace=True)

    # 4. Remove duplicates
    df.drop_duplicates(inplace=True)

    # 5. Outlier detection using IQR (Interquartile Range)
    for column in df.select_dtypes(include=['number']).columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[column] = df[column].clip(lower_bound, upper_bound)  # Clip outliers to the IQR bounds

    # 6. Data type conversion (e.g., Age column to integer after rounding)
    if 'age' in df.columns:
        mean_age = round(df['age'].mean())  # Round the mean to avoid casting issues
        df['age'].fillna(mean_age, inplace=True)
        df['age'] = df['age'].astype(int)  # Now safely convert to int

    # 7. Normalize 'Salary' column using Min-Max Scaling
    if 'salary' in df.columns:
        min_salary = df['salary'].min()
        max_salary = df['salary'].max()
        if max_salary == min_salary:
            raise ValueError("Min and Max values for 'Salary' are the same. Cannot normalize data.")
        df['normalized_salary'] = (df['salary'] - min_salary) / (max_salary - min_salary)

    return df


# Sample DataFrame for testing
data = {'ID': [1, 2, 3, 4, 5, 5],
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Eve'],
        'Gender': ['F', 'M', 'M', 'M', 'F', None],
        'Age': [25, 30, None, 35, None, 28],
        'Salary': [50000, 55000, 60000, None, 50000, 55000]}

df = pd.DataFrame(data)

# Apply the clean_data function
df_cleaned = clean_data(df, fill_missing_method='mean')

# Output the cleaned DataFrame
print(df_cleaned)


   id     name gender  age   salary  normalized_salary
0   1    Alice      F   26  50000.0                0.0
1   2      Bob      M   30  55000.0                0.5
2   3  Charlie      M   29  60000.0                1.0
3   4    David      M   32  54000.0                0.4
4   5      Eve      F   29  50000.0                0.0
5   5      Eve    NaN   28  55000.0                0.5


In [14]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Sample DataFrame for testing
data = {
    'ID': [1, 2, 3, 4, 5, 6],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Eve'],
    'Gender': ['F', 'M', 'M', 'M', 'F', None],
    'Age': [25, 30, None, 35, None, 28],
    'Salary': [50000, 55000, 60000, None, 50000, 55000]
}

df = pd.DataFrame(data)

# Function to standardize column names
def standardize_columns(df):
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    return df

# Function to handle missing values based on a condition
def fill_missing_by_condition(df, column, condition_column, method='mean'):
    """
    Fill missing values in a specific column based on the condition from another column.
    Parameters:
        df (pd.DataFrame): The DataFrame to process
        column (str): The column to fill missing values for
        condition_column (str): The column to base the condition on (e.g., 'Gender')
        method (str): The method to fill missing values ('mean', 'median', 'mode')
    """
    if method not in ['mean', 'median', 'mode']:
        raise ValueError(f"Invalid method '{method}'. Valid methods are 'mean', 'median', 'mode'.")
    
    # Fill missing values by the condition column's groups
    if method == 'mean':
        df[column] = df.groupby(condition_column)[column].transform(lambda x: x.fillna(x.mean()))
    elif method == 'median':
        df[column] = df.groupby(condition_column)[column].transform(lambda x: x.fillna(x.median()))
    elif method == 'mode':
        df[column] = df.groupby(condition_column)[column].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))
    
    return df

# Function to remove outliers using IQR (Interquartile Range)
def remove_outliers_iqr(df, column):
    """
    Remove outliers from the specified column using the IQR method.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
    return df

# Function to convert 'Age' column to integer after rounding
def convert_age_to_integer(df):
    """
    Convert 'Age' column to integers after filling missing values and rounding.
    """
    mean_age = round(df['age'].mean())
    df['age'].fillna(mean_age, inplace=True)
    df['age'] = df['age'].astype(int)
    return df

# Function to normalize 'Salary' column using Min-Max scaling
def normalize_salary(df):
    """
    Normalize 'Salary' column using Min-Max scaling.
    """
    min_salary = df['salary'].min()
    max_salary = df['salary'].max()
    
    if max_salary == min_salary:
        raise ValueError("Min and Max values for 'Salary' are the same. Cannot normalize data.")
    
    df['normalized_salary'] = (df['salary'] - min_salary) / (max_salary - min_salary)
    return df

# Function to clean data - Handles missing values, duplicates, outliers, type conversion, and normalization
def clean_data(df, fill_missing_method='mean', fill_value=None):
    """
    Automate the data cleaning process.
    1. Fill missing values.
    2. Remove duplicates.
    3. Remove outliers using IQR.
    4. Convert columns to appropriate data types.
    5. Normalize numeric columns.
    """
    # Standardize column names
    df = standardize_columns(df)
    
    # Fill missing values based on the specified method
    valid_methods = ['mean', 'median', 'mode', 'value']
    if fill_missing_method not in valid_methods:
        raise ValueError(f"Invalid fill_missing_method: {fill_missing_method}. Valid methods are {valid_methods}.")
    
    for column in df.select_dtypes(include=['number']).columns:
        if fill_missing_method == 'mean':
            df[column].fillna(df[column].mean(), inplace=True)
        elif fill_missing_method == 'median':
            df[column].fillna(df[column].median(), inplace=True)
        elif fill_missing_method == 'mode':
            mode_value = df[column].mode()
            if len(mode_value) > 1:
                print(f"Warning: Ambiguous mode for '{column}', using the first value.")
                mode_value = mode_value[0]
            df[column].fillna(mode_value, inplace=True)
        elif fill_missing_method == 'value' and fill_value is not None:
            df[column].fillna(fill_value, inplace=True)
    
    # Remove duplicates
    df.drop_duplicates(inplace=True)
    
    # Remove outliers using IQR for numeric columns
    for column in df.select_dtypes(include=['number']).columns:
        df = remove_outliers_iqr(df, column)
    
    # Convert 'Age' column to integers after rounding
    if 'age' in df.columns:
        df = convert_age_to_integer(df)
    
    # Normalize 'Salary' column using Min-Max scaling
    if 'salary' in df.columns:
        df = normalize_salary(df)
    
    return df

# Clean the sample data using the clean_data function
df_cleaned = clean_data(df, fill_missing_method='mean')

# Display cleaned data
print(df_cleaned)


   id     name gender  age   salary  normalized_salary
0   1    Alice      F   26  50000.0                0.0
1   2      Bob      M   30  55000.0                0.5
2   3  Charlie      M   29  60000.0                1.0
3   4    David      M   32  54000.0                0.4
4   5      Eve      F   29  50000.0                0.0
5   6      Eve   None   28  55000.0                0.5


In [10]:
# Question 2: Removing Outliers by Rescaling
# Description: Remove outliers by standardizing a numerical column using z-scores.

import pandas as pd
import numpy as np
from scipy.stats import zscore

# Sample DataFrame
data = {'ID': [1, 2, 3, 4, 5, 6, 7],
        'Salary': [50000, 45000, 60000, 700000, 55000, 65000, 48000]}

df = pd.DataFrame(data)

# Calculate Z-scores for the 'Salary' column
df['Z_Score'] = zscore(df['Salary'])

# Identify outliers (Z-score > 3 or Z-score < -3)
outliers = df[np.abs(df['Z_Score']) > 3]

print("Outliers based on Z-score:")
print(outliers)
# Rescale outliers by replacing them with the mean
df['Salary'] = np.where(np.abs(df['Z_Score']) > 3, df['Salary'].mean(), df['Salary'])

# Drop the Z_Score column
df = df.drop(columns='Z_Score')

print("DataFrame after rescaling outliers:")
print(df)



Outliers based on Z-score:
Empty DataFrame
Columns: [ID, Salary, Z_Score]
Index: []
DataFrame after rescaling outliers:
   ID    Salary
0   1   50000.0
1   2   45000.0
2   3   60000.0
3   4  700000.0
4   5   55000.0
5   6   65000.0
6   7   48000.0


In [11]:
# Question 3: Applying Data Type Conversion
# Description: Convert the 'Age' column to integers after filling missing values.
import pandas as pd
import numpy as np

# Sample DataFrame
data = {'ID': [1, 2, 3, 4, 5],
        'Age': [25, np.nan, 30, np.nan, 28]}

df = pd.DataFrame(data)

# Step 1: Fill missing 'Age' with the mean of the column
mean_age = df['Age'].mean()  # Calculate the mean of the 'Age' column
df['Age'].fillna(mean_age, inplace=True)  # Fill missing values with the mean

# Step 2: Convert 'Age' column to integers
df['Age'] = df['Age'].astype(int)

print(df)



   ID  Age
0   1   25
1   2   27
2   3   30
3   4   27
4   5   28


In [12]:
import pandas as pd

# Function to automate data cleaning
def clean_data(df, fill_missing_method='mean', fill_value=None):
    """
    Clean the data by:
    1. Filling missing values based on a chosen method.
    2. Removing duplicates.
    3. Standardizing column names.

    Parameters:
    df (pd.DataFrame): The DataFrame to be cleaned.
    fill_missing_method (str): Method for filling missing values ('mean', 'median', 'mode').
    fill_value (any): A custom value for filling missing data (used if fill_missing_method='value').

    Returns:
    pd.DataFrame: Cleaned DataFrame.
    """
    
    # 1. Standardize column names (convert to lowercase and remove spaces)
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    # 2. Fill missing values for numeric columns
    for column in df.select_dtypes(include=['number']).columns:  # Only for numeric columns
        if fill_missing_method == 'mean':
            df[column].fillna(df[column].mean(), inplace=True)
        elif fill_missing_method == 'median':
            df[column].fillna(df[column].median(), inplace=True)
        elif fill_missing_method == 'mode':
            df[column].fillna(df[column].mode()[0], inplace=True)
        elif fill_missing_method == 'value' and fill_value is not None:
            df[column].fillna(fill_value, inplace=True)
        else:
            raise ValueError("Invalid fill_missing_method or fill_value.")

    # For non-numeric columns (like 'Name'), fill missing values with the mode
    for column in df.select_dtypes(exclude=['number']).columns:  # Only for non-numeric columns
        df[column].fillna(df[column].mode()[0], inplace=True)

    # 3. Remove duplicates
    df.drop_duplicates(inplace=True)
    
    return df

# Sample DataFrame
data = {'ID': [1, 2, 3, 4, 5, 5],
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Eve'],
        'Age': [25, 30, None, 35, None, 28],
        'Salary': [50000, 55000, 60000, None, 50000, 55000]}

df = pd.DataFrame(data)

# Apply the clean_data function
df_cleaned = clean_data(df, fill_missing_method='mean')

print(df_cleaned)


   id     name   age   salary
0   1    Alice  25.0  50000.0
1   2      Bob  30.0  55000.0
2   3  Charlie  29.5  60000.0
3   4    David  35.0  54000.0
4   5      Eve  29.5  50000.0
5   5      Eve  28.0  55000.0


In [13]:
# Question 5: Complex Data Normalization
# Description: Normalize a numeric column to a range using min-max scaling.

import pandas as pd

# Sample DataFrame
data = {'ID': [1, 2, 3, 4, 5],
        'Salary': [50000, 55000, 60000, 70000, 65000]}

df = pd.DataFrame(data)

# Step 1: Min-Max Scaling of the 'Salary' column
min_salary = df['Salary'].min()
max_salary = df['Salary'].max()

df['Normalized_Salary'] = (df['Salary'] - min_salary) / (max_salary - min_salary)

print(df)


   ID  Salary  Normalized_Salary
0   1   50000               0.00
1   2   55000               0.25
2   3   60000               0.50
3   4   70000               1.00
4   5   65000               0.75
