In [13]:
# Ques_13.ipynb - Data Standardization & Validation (Optimized Version)

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# === Step 1: Generate or Load Sample Data ===
def generate_sample_data():
    """Generates sample data with numerical and categorical columns."""
    data = {
        'Age': [25, 30, 35, 40, 28, 25, 35, 33, 45, 42],
        'Salary': [50000, 60000, 80000, 120000, 65000, 50000, 80000, 70000, 90000, 95000],
        'Gender': ['Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male'],
        'City': ['NY', 'LA', 'Chicago', 'Houston', 'LA', 'NY', 'Chicago', 'Seattle', 'Boston', 'Houston']
    }
    return pd.DataFrame(data)


# === Step 2: Data Validation ===
def validate_data(df):
    """Basic validation checks for input DataFrame."""
    if df.empty:
        raise ValueError("DataFrame is empty. Please check the input source.")
    if df.isnull().any().any():
        print("⚠️ Warning: Missing values detected in the dataset.")
    return True


# === Step 3: Standardize Numerical Data ===
def standardize_data(df, numerical_columns):
    """Standardize numerical columns using StandardScaler."""
    scaler = StandardScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    return df


# === Step 4: Encode Categorical Data using OneHotEncoder ===
def encode_categorical_data(df, categorical_columns):
    """Encode categorical columns using OneHotEncoder to avoid unseen labels."""
    ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)  # Drop first to avoid multicollinearity
    encoded_array = ohe.fit_transform(df[categorical_columns])
    df_encoded = pd.DataFrame(encoded_array, columns=ohe.get_feature_names_out(categorical_columns))
    df = pd.concat([df, df_encoded], axis=1).drop(columns=categorical_columns)  # Concatenate one-hot encoded columns
    return df


# === Step 5: Validate Specific Columns ===
def validate_column_values(df, column_name, valid_values):
    """Ensure that the values in a specific column are valid (e.g., valid categories)."""
    if column_name in df.columns:
        invalid_values = df[~df[column_name].isin(valid_values)]
        if not invalid_values.empty:
            print(f"⚠️ Invalid values found in column '{column_name}':\n", invalid_values)
    return df


# === Step 6: Save Cleaned Data ===
def save_cleaned_data(df, path="Q13_cleaned_standardized.csv"):
    """Save the cleaned and standardized data to a CSV file."""
    df.to_csv(path, index=False)
    print(f"✅ Cleaned data saved to: {path}")


# === MAIN EXECUTION ===
if __name__ == "__main__":
    # Step 1: Load Data
    df = generate_sample_data()
    print("🔹 Original Data:\n", df)

    # Step 2: Validate Data
    try:
        validate_data(df)
    except Exception as e:
        print(f"❌ Validation Error: {e}")
        exit(1)

    # Step 3: Standardize numerical columns ('Age' and 'Salary')
    numerical_columns = ['Age', 'Salary']
    df_standardized = standardize_data(df, numerical_columns)
    print("\n✅ After Standardizing Numerical Columns:\n", df_standardized)

    # Step 4: Encode categorical columns ('Gender' and 'City') using OneHotEncoder
    categorical_columns = ['Gender', 'City']
    df_encoded = encode_categorical_data(df_standardized, categorical_columns)
    print("\n✅ After Encoding Categorical Columns:\n", df_encoded)

    # Step 5: Validate Categorical Columns ('Gender' with valid values ['Male', 'Female'])
    df_validated = validate_column_values(df_encoded, 'Gender', ['Male', 'Female'])
    print("\n✅ After Validating Categorical Column 'Gender':\n", df_validated)

    # Step 6: Save cleaned and processed data
    save_cleaned_data(df_validated)


🔹 Original Data:
    Age  Salary  Gender     City
0   25   50000    Male       NY
1   30   60000  Female       LA
2   35   80000  Female  Chicago
3   40  120000    Male  Houston
4   28   65000  Female       LA
5   25   50000    Male       NY
6   35   80000    Male  Chicago
7   33   70000  Female  Seattle
8   45   90000  Female   Boston
9   42   95000    Male  Houston

✅ After Standardizing Numerical Columns:
         Age    Salary  Gender     City
0 -1.330283 -1.255292    Male       NY
1 -0.574440 -0.772487  Female       LA
2  0.181402  0.193122  Female  Chicago
3  0.937245  2.124340    Male  Houston
4 -0.876777 -0.531085  Female       LA
5 -1.330283 -1.255292    Male       NY
6  0.181402  0.193122    Male  Chicago
7 -0.120935 -0.289683  Female  Seattle
8  1.693087  0.675926  Female   Boston
9  1.239582  0.917329    Male  Houston

✅ After Encoding Categorical Columns:
         Age    Salary  Gender_Male  City_Chicago  City_Houston  City_LA  \
0 -1.330283 -1.255292          1.0         

In [14]:
# Task B: Addressing Inconsistent Representations

# 16. Standardizing Date Formats:
# - Identify and correct inconsistent date formats within the dataset.








# 17. Pattern Matching for Consistency:
# - Standardize phone numbers to a specific pattern (e.g., (123) 456-7890).





# 18. Handling Mixed Case Text:
# - Convert all text entries to a consistent case (e.g., all uppercase).









