In [1]:
# preprocessing_iris.py
# Task 1: Data Preprocessing and Exploration
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# 1. Load dataset
# Build a DataFrame of the measurements and attach the species name of each
# row as a categorical column derived from the integer target codes.
iris = load_iris()
species_labels = pd.Categorical.from_codes(iris.target, iris.target_names)
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    species=species_labels
)

# 2. Preprocess
# Check for missing values.
# sep="\n" puts the header on its own line; with print's default " "
# separator the first row of the Series was indented by one space.
print("Missing values:", df.isnull().sum(), sep="\n")

# Min-Max scaling for features (each feature column is rescaled to [0, 1]).
# NOTE(review): the scaler is fit on the full dataset, i.e. before the
# train/test split further down. For model evaluation, fit it on the
# training rows only so test-set statistics do not leak into training.
scaler = MinMaxScaler()
df[iris.feature_names] = scaler.fit_transform(df[iris.feature_names])

# Encode species (one-hot encoding example).
# The encoded columns live in df_encoded; df itself keeps the original
# categorical 'species' column.
encoder = OneHotEncoder(sparse_output=False)
species_encoded = encoder.fit_transform(df[['species']])
species_df = pd.DataFrame(
    species_encoded,
    columns=encoder.get_feature_names_out(['species']),
    index=df.index,  # keep rows aligned with df when concatenating
)
df_encoded = pd.concat([df.drop('species', axis=1), species_df], axis=1)

# 3. Exploration
# Summary statistics (count, mean, std, min, quartiles, max) of the
# numeric columns, printed under a header line.
summary_stats = df.describe()
print("\nSummary statistics:")
print(summary_stats)

# Pairplot: pairwise feature plots coloured by species, written to disk
# and closed through the grid's own figure handle.
pair_grid = sns.pairplot(data=df, hue='species')
pair_grid.figure.savefig("pairplot.png")
plt.close(pair_grid.figure)

# Correlation heatmap of the feature columns, annotated with the
# coefficient in each cell.
feature_corr = df[iris.feature_names].corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap")
heatmap_fig.savefig("correlation_heatmap.png")
plt.close(heatmap_fig)

# Boxplots for outlier detection: one figure per feature, grouped by
# species, each saved to its own PNG and closed before the next is drawn.
for feature in iris.feature_names:
    box_fig, box_ax = plt.subplots()
    sns.boxplot(data=df, x='species', y=feature, ax=box_ax)
    box_ax.set_title(f"Boxplot of {feature} by Species")
    file_stem = feature.replace(' ', '_')  # spaces -> underscores for the file name
    box_fig.savefig(f"boxplot_{file_stem}.png")
    plt.close(box_fig)

# 4. Train/test split function
def split_data(X, y, test_size=0.2, random_state=42, stratify=None):
    """Split features and labels into train and test subsets.

    Thin wrapper around ``train_test_split`` with reproducible defaults.

    Args:
        X: Feature table with one row per sample.
        y: Labels aligned row-for-row with ``X``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``
            (default 0.2).
        random_state: Seed forwarded to ``train_test_split`` so repeated
            calls give the same split (default 42).
        stratify: Optional class labels forwarded to ``train_test_split``
            as ``stratify`` so both subsets keep the same class
            proportions. ``None`` (the default) gives the plain,
            unstratified split.

    Returns:
        The ``train_test_split`` result, unpackable as
        ``X_train, X_test, y_train, y_test``.
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )

# Example split: feature columns as X, species label as y, using
# split_data's default test size and seed.
X = df.loc[:, iris.feature_names]
y = df.loc[:, 'species']
X_train, X_test, y_train, y_test = split_data(X, y)
split_shapes = [part.shape for part in (X_train, X_test, y_train, y_test)]
print("\nTrain/test split shapes:", *split_shapes)

# Save preprocessed dataset: write df to CSV without the index column,
# then confirm on stdout.
output_csv = "iris_preprocessed.csv"
df.to_csv(output_csv, index=False)
print("\nPreprocessed data saved to iris_preprocessed.csv")


Missing values:
 sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
species              0
dtype: int64

Summary statistics:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            0.428704          0.440556           0.467458   
std             0.230018          0.181611           0.299203   
min             0.000000          0.000000           0.000000   
25%             0.222222          0.333333           0.101695   
50%             0.416667          0.416667           0.567797   
75%             0.583333          0.541667           0.694915   
max             1.000000          1.000000           1.000000   

       petal width (cm)  
count        150.000000  
mean           0.458056  
std            0.317599  
min            0.000000  
25%            0.083333  
50%            0.500000  
75%            0.708333  
max            1.000000  

Train/test sp