In [None]:
import numpy as np
import pandas as pd
from google.colab import drive, files

# Attach Google Drive to the Colab filesystem at the mount point named below
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Step 1: Load the Iris dataset**

In [None]:
# Absolute path under the Drive mount point (/content/drive). The previous
# relative path only resolved while the working directory was /content, so
# any earlier directory change would break the load.
IRIS_CSV_PATH = '/content/drive/My Drive/irisdataset.csv'

# Read the raw Iris table and report its (rows, columns) shape
iris_data = pd.read_csv(IRIS_CSV_PATH)
print("Original dataset shape:", iris_data.shape)

Original dataset shape: (150, 5)


 # **Step 2: Data Quality Checks**

In [None]:
# Count the NaN entries of every column, keyed by column name
missing_values = {}
for column_name in iris_data.columns:
    column_nulls = iris_data[column_name].isna()
    missing_values[column_name] = column_nulls.sum()

print("\nMissing values in each column:")
print(missing_values)


Missing values in each column:
{'sepal.length': np.int64(0), 'sepal.width': np.int64(0), 'petal.length': np.int64(0), 'petal.width': np.int64(0), 'variety': np.int64(0)}


In [None]:
# Flag every row that exactly repeats an earlier row (the first occurrence is not flagged)
duplicate_mask = iris_data.duplicated(keep='first')
duplicate_rows = duplicate_mask.sum()
print(f"\nNumber of duplicate rows: {duplicate_rows}")

# Show the flagged rows only when at least one exists
if duplicate_mask.any():
    print("\nDuplicate rows:")
    print(iris_data[duplicate_mask])


Number of duplicate rows: 1

Duplicate rows:
     sepal.length  sepal.width  petal.length  petal.width    variety
142           5.8          2.7           5.1          1.9  Virginica


In [None]:
# Keep the first occurrence of each row and discard later exact repeats;
# the surviving rows retain their original index labels
is_repeat = iris_data.duplicated(keep='first')
iris_data_clean = iris_data[~is_repeat]
print("\nDataset shape after removing duplicates:", iris_data_clean.shape)


Dataset shape after removing duplicates: (149, 5)


# **Step 3: Feature Selection**

In [None]:
# Names of the four numeric measurement columns used as model inputs
features = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']

# Select from the de-duplicated frame, not the raw one, so the duplicate row
# removed during the data-quality step does not re-enter the pipeline.
# The index is reset to a contiguous 0..n-1 range because dropping rows leaves
# a gap in the index; a gapped index on `y` would misalign with DataFrames
# that are later rebuilt from the NumPy array `X`.
selection_source = iris_data_clean.reset_index(drop=True)

X = selection_source[features].values  # NumPy array of the feature columns
y = selection_source['variety']        # Series of class labels, one per row of X

 # **Step 4: Label Encoding**

In [None]:
# Sorted distinct class labels; each label's position becomes its integer code
unique_classes = np.unique(y)
class_to_index = dict(zip(unique_classes, range(len(unique_classes))))
print("\nClass encoding mapping:")
print(class_to_index)

# Translate every label to its integer code (an unknown label raises KeyError)
y_encoded = np.array(list(map(class_to_index.__getitem__, y)))


Class encoding mapping:
{'Setosa': 0, 'Versicolor': 1, 'Virginica': 2}


# **Step 5: Z-score Standardization**

In [None]:
# Per-feature (column-wise) mean and population standard deviation (NumPy default ddof=0)
feature_means = np.mean(X, axis=0)
feature_stds = np.std(X, axis=0)

# A constant column has zero standard deviation; dividing by it would give
# NaN/inf. Such columns are divided by 1 instead, so they become all zeros
# after centering.
safe_stds = np.where(feature_stds == 0, 1.0, feature_stds)

# Z-score standardization: broadcasting applies each column's mean and
# standard deviation to every row, replacing the explicit per-column loop
X_scaled = (X - feature_means) / safe_stds

# Convert back to DataFrame for easier handling
X_scaled_df = pd.DataFrame(X_scaled, columns=features)

# Verify standardization: means should be ~0 and standard deviations ~1
# (a constant column would show a standard deviation of 0 instead)
print("\nManual verification of standardization:")
print("Feature means after standardization:")
print(np.mean(X_scaled, axis=0))
print("\nFeature standard deviations after standardization:")
print(np.std(X_scaled, axis=0))


Manual verification of standardization:
Feature means after standardization:
[-4.73695157e-16 -7.81597009e-16 -4.26325641e-16 -4.73695157e-16]

Feature standard deviations after standardization:
[1. 1. 1. 1.]


 # **Step 6: Combine processed features and target**

In [None]:
# Start from a copy so the standardized feature frame itself is left untouched
processed_data = X_scaled_df.copy()
processed_data['target'] = y_encoded

# Keep original labels for reference. The labels are attached by position
# rather than by index: assigning a Series aligns on index labels, and if `y`
# carries a non-contiguous index (e.g. after rows were dropped) that alignment
# would silently produce NaN or shifted labels. A length mismatch now raises
# instead of being padded with NaN.
processed_data['variety'] = np.asarray(y)

print("\nFinal preprocessed data:")
print(processed_data.head())


Final preprocessed data:
   sepal.length  sepal.width  petal.length  petal.width  target variety
0     -0.900681     1.019004     -1.340227    -1.315444       0  Setosa
1     -1.143017    -0.131979     -1.340227    -1.315444       0  Setosa
2     -1.385353     0.328414     -1.397064    -1.315444       0  Setosa
3     -1.506521     0.098217     -1.283389    -1.315444       0  Setosa
4     -1.021849     1.249201     -1.340227    -1.315444       0  Setosa


# **Step 7: Save the fully processed dataset**

In [None]:
# Write the processed table to a CSV file (index column omitted), then pass
# the same file to Colab's files.download helper
OUTPUT_CSV = 'iris_processed_standardized.csv'
processed_data.to_csv(OUTPUT_CSV, index=False)
files.download(OUTPUT_CSV)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>