Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Load the Dataset

In [None]:
# Load the Iris dataset using pandas
iris = pd.read_csv("iris.csv")

# Fail fast if this copy of the CSV uses a different header naming scheme:
# the cells below select these columns by name, and a mismatch would otherwise
# only surface later as a bare KeyError in the middle of a plotting cell.
expected_columns = {"sepal_length", "sepal_width", "petal_length", "petal_width", "species"}
missing_columns = expected_columns - set(iris.columns)
if missing_columns:
    raise ValueError(f"iris.csv is missing expected column(s): {sorted(missing_columns)}")

# Display the first 5 rows of the dataset.
iris.head()

In [None]:
# 1. How many rows and columns does the dataset have?
# The row count is the length of the index; the column count is the length of
# the column labels.
rows = len(iris.index)
columns = len(iris.columns)
print(f"Rows: {rows}, Columns: {columns}")

# 2. Are there any missing values?
# Count the missing entries per column; a column of zeros means nothing is missing.
iris.isna().sum()

Histograms

In [None]:
# Create histograms for each numerical feature.
# `numerical_feature` is reused by the later cells to select the measurement columns.
numerical_feature = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

for each_col in numerical_feature:
    # One fresh figure per feature, drawn through the explicit Axes interface.
    hist_fig, hist_ax = plt.subplots()
    hist_ax.grid(axis="y")
    hist_ax.hist(
        iris[each_col],
        bins=20,
        edgecolor="black",
        color="orange",
        zorder=2,
    )
    hist_ax.set_title(f"Histogram for {each_col}")
    hist_ax.set_xlabel(each_col)
    hist_ax.set_ylabel("Frequency")
    plt.show()

In [None]:
# 1. Which feature has the most variability?
# Variability is measured here by the standard deviation of each feature.
feature_std = iris[numerical_feature].std()
print(feature_std)

# The feature with the highest standard deviation has the most variability,
# so report it explicitly instead of leaving it to be read off the table.
most_variable_feature = feature_std.idxmax()
print(f"Most variable feature: {most_variable_feature} (std = {feature_std.max():.2f})")

Boxplots

In [None]:
# Create a boxplot for each numerical feature.
for each_col in numerical_feature:
    # Pass the values as `x=` explicitly. seaborn >= 0.12 interprets the first
    # positional argument as `data`, which draws a bare vector along the y-axis
    # and would leave the x-label below describing the wrong axis.
    sns.boxplot(x=iris[each_col], color="red")
    plt.title(f"Boxplot for {each_col}")
    plt.xlabel(each_col)
    plt.show()

In [None]:
# 1. Identify features with outliers
# A boxplot marks a point as an outlier when it lies more than 1.5 * IQR beyond
# the quartiles, so apply that same rule numerically to back up the visual check.
features = iris[numerical_feature]
first_quartile = features.quantile(0.25)
third_quartile = features.quantile(0.75)
iqr = third_quartile - first_quartile
is_outlier = (features < first_quartile - 1.5 * iqr) | (features > third_quartile + 1.5 * iqr)
# Number of outlying observations per feature (0 means no outliers).
outlier_counts = is_outlier.sum()
print(outlier_counts)

# 2. Are there significant differences in the range of values for features?
# Find the range (max - min) for each feature
ranges = features.max() - features.min()
print(ranges)

Pairwise Scatterplots

In [None]:
# Scatterplot matrix of all feature pairs via Seaborn's pairplot(): points are
# coloured by species, each species gets its own marker shape, and the diagonal
# shows a KDE curve per species.
species_markers = ["o", "s", "D"]
sns.pairplot(
    data=iris,
    hue="species",
    diag_kind="kde",
    markers=species_markers,
)
plt.show()

In [None]:
# 1. Are there any pairs of features that clearly separate the species?
# Yes: the points fall into two groups, setosa on its own and versicolor together with virginica.

# 2. Which features seem most correlated?
# petal_length and petal_width seem the most correlated.

Correlation Heatmap

In [None]:
# Compute the correlation matrix and visualize it as a heatmap.
correlation_matrix = iris[numerical_feature].corr()

# Pin the colour scale to the full [-1, 1] correlation range and centre it on 0
# so the neutral colour of the diverging palette means "no correlation".
# Without this the scale is fitted to the data, and the neutral colour can land
# on a positive correlation.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# 1. Which pair of features is most strongly correlated?
# petal_length and petal_width, with a correlation coefficient of 0.96.

# 2. Does this correlation make sense based on the scatterplots?
# Yes: as petal_length increases, petal_width increases as well.

Violin Plots

In [None]:
# Create a violin plot for sepal_length grouped by species.
# `split=True` is intentionally not used: it is meant for comparing exactly two
# hue levels. With the three species, seaborn < 0.13 raises a ValueError and
# seaborn 0.13 draws only half of each violin. `dodge=False` keeps every violin
# centred on its own species row, since hue repeats the grouping variable.
sns.violinplot(
    data=iris,
    x="sepal_length",
    y="species",
    hue="species",
    dodge=False,
    inner="quart",  # dashed lines at the quartiles (the middle one is the median)
)
plt.title("Violin Plot of Sepal Length")
plt.xlabel("Sepal Length")
plt.ylabel("Species")
plt.show()

In [None]:
# 1. Which species has the widest spread for sepal_length?
# Virginica

# 2. How does the median sepal_length vary across species?
# Setosa has the lowest median and virginica the highest; the median of versicolor lies between the two.

3D Scatter Plot

In [None]:
# Create a 3D scatter plot for sepal_length, sepal_width, and petal_length.
# Importing Axes3D registers the "3d" projection on older Matplotlib releases;
# recent releases register it automatically, so the name itself is never used.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

fig = plt.figure()
axis = fig.add_subplot(111, projection="3d")

# Draw one scatter series per species so that each species gets its own colour
# and a legend entry. Colouring by integer category code alone leaves the
# reader unable to tell which cluster belongs to which species.
for species_name, species_rows in iris.groupby("species"):
    axis.scatter(
        species_rows["sepal_length"],
        species_rows["sepal_width"],
        species_rows["petal_length"],
        marker="o",
        label=species_name,
    )

axis.set_xlabel('Sepal Length')
axis.set_ylabel('Sepal Width')
axis.set_zlabel('Petal Length')
axis.set_title("3D Scatter Plot for Sepal Length, Sepal Width, and Petal Length")
axis.legend(title="Species")
plt.show()


In [None]:
# 1. Are there distinct clusters in 3D space for the species?
# Yes, there are distinct clusters for each species 

# 2. How do these clusters compare to the pairwise scatterplots?
# The 3D scatter plot shows clear clusters of species, but the pairwise scatterplots give more insight into the relationships between pairs of features.

Create a Dashboard of Insights

In [None]:
# Three-panel summary dashboard: correlations, the most correlated feature pair,
# and the distribution of petal_length per species.
# Uses `correlation_matrix` computed in the Correlation Heatmap section.
sns.set_theme(style="whitegrid")

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# A heatmap for correlations.
# The scale is pinned to [-1, 1] and centred on 0 so the neutral colour of the
# diverging palette corresponds to "no correlation".
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    ax=axes[0],
)
axes[0].set_title('Correlation Heatmap')

# A scatterplot for the two most correlated features.
sns.scatterplot(
    data=iris,
    x='petal_length',
    y="petal_width",
    hue='species',
    palette='viridis',
    ax=axes[1],
)
axes[1].set_title('Scatterplot of Petal Length vs Petal Width')

# A violin plot for petal_length grouped by species.
# dodge=False keeps each violin centred on its species tick; hue repeats the
# x variable, so dodging would only shrink and offset the violins.
sns.violinplot(
    data=iris,
    x='species',
    y='petal_length',
    hue='species',
    palette='Set2',
    dodge=False,
    ax=axes[2],
)
axes[2].set_title('Violin Plot for Petal Length by Species')

# Keep axis labels and the colour bar from running into the neighbouring panels.
fig.tight_layout()
plt.show()

In [None]:
# 1. Which features are most useful for distinguishing species?
# petal length 

# 2. Are there any unexpected patterns?
# Yes:
# - setosa is clearly separate from the other two species, versicolor and virginica
# - the correlation between petal_length and petal_width is higher than expected


Interactive Visualizations

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots

# Side-by-side interactive figure: a scatterplot on the left, a histogram on the right.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Sepal Length vs Sepal Width", "Histogram of Sepal Length"),
)

# Scatterplot
scatter = px.scatter(iris, x='sepal_length', y='sepal_width', color='species')
for trace in scatter.data:
    fig.add_trace(trace, row=1, col=1)

# Histogram
hist = px.histogram(iris, x='sepal_length', color='species', nbins=20)
for trace in hist.data:
    # Both px figures emit one trace per species with the same name and legend
    # group. Hide the second legend entry so each species is listed once; a
    # legend click still toggles the species in both panels via the shared group.
    trace.update(showlegend=False)
    fig.add_trace(trace, row=1, col=2)

# Only the traces are copied from the px figures, so their layout (axis titles,
# bar mode, legend title) has to be restored on the combined figure.
fig.update_xaxes(title_text="sepal_length", row=1, col=1)
fig.update_yaxes(title_text="sepal_width", row=1, col=1)
fig.update_xaxes(title_text="sepal_length", row=1, col=2)
fig.update_yaxes(title_text="count", row=1, col=2)
fig.update_layout(barmode="relative", legend_title_text="species")

fig.show()

In [None]:
# How does interactivity help in exploring the dataset?
# - Zooming and panning
# - Selecting and hovering
# - Toggling the visibility of certain features