In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Preprocessing walkthrough on the Iris dataset: inspect the data, check for
# missing values, drop duplicate rows, standardize the feature columns, and
# print the result.

# Load the Iris dataset
iris = load_iris()
# Keep the feature column names so later steps select features by name
# rather than by "every column except the last one".
feature_cols = list(iris.feature_names)
df = pd.DataFrame(data=iris.data, columns=feature_cols)
df['target'] = iris.target

# Display the first few rows of the dataset
print("Initial Data:")
print(df.head())

# Step 1: Handling Missing Values
# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# In this dataset, there are no missing values. If there were, we could handle them as follows
# (only the feature columns are imputed, so the class label is never replaced by an average):
# df[feature_cols] = df[feature_cols].fillna(df[feature_cols].mean())  # Fill missing values with the mean
# df = df.dropna().reset_index(drop=True)  # Drop rows with missing values

# Step 2: Handling Duplicates
# Check for duplicate rows
print("\nDuplicate Rows:")
print(df.duplicated().sum())

# Drop duplicate rows. ignore_index=True renumbers the remaining rows 0..n-1;
# without it the index keeps a gap where the duplicate was removed, and the
# frame no longer lines up row-for-row with positional arrays.
df = df.drop_duplicates(ignore_index=True)

# Step 3: Feature Scaling
from sklearn.preprocessing import StandardScaler

# Standardize the feature columns only; 'target' is the class label and must
# stay untouched.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[feature_cols])

df[feature_cols] = scaled_features

# Step 4: Encoding Categorical Variables
# In this dataset, the target variable is already encoded as integers. If it were categorical, we could use:
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# df['target'] = le.fit_transform(df['target'])

# Step 5: Feature Selection (if needed)
# For demonstration, we'll keep all features. If feature selection were needed:
# from sklearn.feature_selection import SelectKBest, f_classif
# selector = SelectKBest(score_func=f_classif, k=2)
# selected_features = selector.fit_transform(df[feature_cols], df['target'])
# selected_cols = [col for col, keep in zip(feature_cols, selector.get_support()) if keep]
# target = df['target'].to_numpy()  # labels of the de-duplicated rows
# df = pd.DataFrame(data=selected_features, columns=selected_cols)
# df['target'] = target  # NOT iris.target: that still has the original row count

# Final Cleaned Data
print("\nCleaned Data:")
print(df.head())


Initial Data:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  

Missing Values:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

Duplicate Rows:
1

Cleaned Data:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0          -0.898033          1.012401          -1.333255         -1.308624   
1          -1.139562         -0.137353          -1.333255         -1.308624   
2          -1.381091   