## Automating Data Cleaning in Python

In [1]:
# Write your code from here
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Step 1: Build a small demonstration dataset in place of loading a file.
# 'gender' is non-numeric, so it is left out of scaling by the dtype selection below.
data = {
    'age': [25, 32, 47, 51, 62],
    'income': [50000, 64000, 120000, 110000, 150000],
    'gender': ['M', 'F', 'F', 'M', 'M']
}
df = pd.DataFrame(data)

# Step 2: Select every numeric column.
# include='number' matches all numeric dtypes; an explicit ['int64', 'float64']
# list would silently skip other numeric widths such as int32 or float32.
numerical_cols = df.select_dtypes(include='number').columns

# Step 3: Define the pipeline with a single scaling step.
pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# Step 4: Fit the scaler on the numeric columns and transform them in one call.
scaled_data = pipeline.fit_transform(df[numerical_cols])

# Convert the scaled values back to a DataFrame. Passing index=df.index keeps the
# rows aligned with df even if df does not use the default 0..n-1 index.
scaled_df = pd.DataFrame(scaled_data, columns=numerical_cols, index=df.index)

print(scaled_df)












        age    income
0 -1.382872 -1.324367
1 -0.856780 -0.944426
2  0.270562  0.575340
3  0.571186  0.303953
4  1.397904  1.389500


    Task: Basic Pipeline with Scaling
1. Objective: Create a pipeline that scales numerical features in a dataset.
2. Steps:
    - Load a sample dataset with Pandas.
    - Define a pipeline using `Pipeline` from `sklearn.pipeline`.
    - Use StandardScaler to scale features.

    Task: Pipeline with Imputation
1. Objective: Automate data cleaning by handling missing values.
2. Steps:
    - Load a dataset with missing values.
    - Define a pipeline that uses `SimpleImputer` to fill in missing values.

In [2]:
# Write your code from here
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Step 1: Create a sample dataset with missing values in every column.
# The missing 'gender' entry is NOT filled here: only numeric columns are
# passed to the imputation pipeline below.
data = {
    'age': [25, np.nan, 47, 51, 62],
    'income': [50000, 64000, np.nan, 110000, 150000],
    'gender': ['M', 'F', 'F', 'M', None]  # Missing value in categorical column
}
df = pd.DataFrame(data)

# Step 2: Select every numeric column to impute.
# include='number' matches all numeric dtypes; an explicit ['int64', 'float64']
# list would silently skip other numeric widths such as int32 or float32.
numerical_cols = df.select_dtypes(include='number').columns

# Step 3: Define a pipeline whose only step replaces missing numeric values
# with the mean of their column.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean'))
])

# Step 4: Fit the imputer on the numeric columns and transform them in one call.
imputed_data = pipeline.fit_transform(df[numerical_cols])

# Convert the imputed values back to a DataFrame. Passing index=df.index keeps
# the rows aligned with df even if df does not use the default 0..n-1 index.
imputed_df = pd.DataFrame(imputed_data, columns=numerical_cols, index=df.index)

print(imputed_df)

     age    income
0  25.00   50000.0
1  46.25   64000.0
2  47.00   93500.0
3  51.00  110000.0
4  62.00  150000.0
