In [1]:
# Iris Dataset ETL Pipeline - CODTECH Task 1

import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 📥 Extract - pull the Iris dataset via seaborn (comes back as a DataFrame)
df = sns.load_dataset('iris')
# One print call; sep="\n" puts the header and the preview on separate lines.
print("Original Data Sample:", df.head(), sep="\n")

# 🧹 Transform - Clean and Preprocess

# Numeric measurement columns to standardize. Defined once so the columns
# selected for scaling and the columns assigned back cannot drift apart.
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# 1. Check for missing values
# sep="" prevents print() from inserting its default space between the
# header and the Series, which would push the first row out of alignment.
print("\nMissing values:\n", df.isnull().sum(), sep="")

# 2. Encode target column 'species'
# The string labels are replaced in place by integer codes:
# setosa → 0, versicolor → 1, virginica → 2
le = LabelEncoder()
df['species'] = le.fit_transform(df['species'])

# 3. Scale numerical features
# The scaler is fitted on, and applied to, the full dataset; the scaled
# values overwrite the original measurements in `df`.
scaler = StandardScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])

# ✅ Check transformed data
print("\nTransformed Data Sample:")
print(df.head())

# 📤 Load - write the transformed frame to CSV, omitting the row index
df.to_csv(path_or_buf='processed_iris.csv', index=False)
# Confirmation message for the reader of the cell output.
print("\n✅ Processed dataset saved as 'processed_iris.csv'")


Original Data Sample:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa

Missing values:
 sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

Transformed Data Sample:
   sepal_length  sepal_width  petal_length  petal_width  species
0     -0.900681     1.019004     -1.340227    -1.315444        0
1     -1.143017    -0.131979     -1.340227    -1.315444        0
2     -1.385353     0.328414     -1.397064    -1.315444        0
3     -1.506521     0.098217     -1.283389    -1.315444        0
4     -1.021849     1.249201     -1.340227    -1.315444        0

✅ Processed dataset saved as 'processed_iris.csv'
