In [None]:
import pandas as pd

# Load the dataset
Iris = pd.read_csv('Iris.csv')

# Display the first few rows and basic information about the dataset
print(Iris.head())
print(Iris.info())


   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
None


In [None]:
# Check for missing values
print("Missing values before handling:", Iris.isnull().sum())


Missing values before handling: Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [None]:
# Function to remove outliers using IQR method

def remove_outliers(df, columns):
  Q1 = df[columns].quantile(0.25)
  Q3 = df[columns].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  return df[~((df[columns] < lower_bound) | (df[columns] > upper_bound)).any(axis=1)]

# Specify numerical columns where you want to detect outliers
numerical_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
Iris_no_outliers = remove_outliers(Iris, numerical_columns)

# Check the shape before and after removing outliers
print("Shape before removing outliers:", Iris.shape)
print("Shape after removing outliers:", Iris_no_outliers.shape)


Shape before removing outliers: (150, 6)
Shape after removing outliers: (146, 6)


In [None]:
# No categorical variables to encode in the Iris dataset, as all are numerical
# No need for categorical encoding

# Separate features (X) and target (y)
X = Iris.drop('Species', axis=1)  # Features
y = Iris['Species']  # Target

# Check the data types of X and y
print(X.dtypes)
print(y.dtypes)


Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
dtype: object
object


In [None]:
from collections import Counter

# Check class distribution of target variable
print("Class distribution before handling imbalance:", Counter(y))

# No significant imbalance in the Iris dataset, so no handling is required


Class distribution before handling imbalance: Counter({'Iris-setosa': 50, 'Iris-versicolor': 50, 'Iris-virginica': 50})


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Use RandomForestClassifier for feature selection
clf = RandomForestClassifier(random_state=42)
selector = SelectFromModel(clf)
selector.fit(X, y)

# Get selected feature indices and names
selected_features_idx = selector.get_support(indices=True)
selected_features = X.columns[selected_features_idx]

# Subset X with selected features
X_selected = X[selected_features]

# Check the selected features
print("Selected features:", selected_features)


Selected features: Index(['Id', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')


In [None]:
from sklearn.decomposition import PCA

# Perform PCA for dimensionality reduction
pca = PCA(n_components=2)  # Reduce to 2 principal components
X_pca = pca.fit_transform(X_selected)

# Check the shape after PCA
print("Shape after PCA:", X_pca.shape)


Shape after PCA: (150, 2)
