# In[None]:
# Task: Data Loading, Analysis, and Visualization
# Dataset: Iris (from sklearn)

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris

# -------------------------------
# Task 1: Load and Explore the Dataset
# -------------------------------

try:
    # Load the Iris dataset bundled with scikit-learn. With as_frame=True the
    # returned object exposes `.frame`, a pandas DataFrame that the rest of
    # this script works on (feature columns plus the "target" label column).
    iris = load_iris(as_frame=True)
    df = iris.frame

    # Display first few rows
    print("First 5 rows of the dataset:")
    print(df.head())

    # Dataset info. DataFrame.info() writes its report to stdout itself and
    # returns None, so it is called directly; wrapping it in print() would
    # emit a stray "None" line after the report.
    print("\nDataset Information:")
    df.info()

    # Check missing values
    print("\nMissing values per column:")
    print(df.isnull().sum())

    # Demonstrate how missing values would be handled: fill each numeric
    # column's gaps with that column's mean. This is a no-op when the
    # report above shows zero missing values.
    df.fillna(df.mean(numeric_only=True), inplace=True)

except FileNotFoundError:
    print("Error: Dataset file not found. Please check the file path.")
    # Re-raise: the analysis and plotting sections below use `df` and `iris`
    # unconditionally, so continuing would only fail later with a NameError.
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    # Same reasoning as above: report the problem, then stop with the real
    # traceback instead of a confusing follow-on error.
    raise


# -------------------------------
# Task 2: Basic Data Analysis
# -------------------------------

# Summary statistics (count, mean, std, quartiles, ...) for the numeric
# columns of the frame.
print("\nBasic Statistics of Numerical Columns:")
print(df.describe())

# Per-class means of every column, indexed by the numeric "target" code.
grouped = df.groupby(by="target").mean()
print("\nMean values grouped by species:")
print(grouped)

# Translate numeric target codes into species names: position i in
# iris.target_names is the label for code i.
code_to_species = {code: name for code, name in enumerate(iris.target_names)}
df["species"] = df["target"].map(code_to_species)

# Example pattern: compare the average petal length across species.
petal_length_by_species = df.groupby("species")["petal length (cm)"]
print("\nAverage Petal Length per Species:")
print(petal_length_by_species.mean())


# -------------------------------
# Task 3: Data Visualization
# -------------------------------

# Apply the seaborn "whitegrid" theme to every figure created below.
# set_theme() is the preferred entry point; sns.set() is only a legacy alias
# for it that the seaborn documentation says may be removed.
sns.set_theme(style="whitegrid")

# 1. Line chart - running total of sepal length in index order, used here
#    as a stand-in for a trend over time.
df_sorted = df.sort_index()
running_total = df_sorted["sepal length (cm)"].cumsum()

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(df_sorted.index, running_total, label="Cumulative Sepal Length")
ax.set_title("Line Chart: Cumulative Sepal Length Over Index")
ax.set_xlabel("Index")
ax.set_ylabel("Cumulative Sepal Length (cm)")
ax.legend()
plt.show()

# 2. Bar chart - average petal length per species.
#    barplot aggregates each species' rows to their mean by default.
#    errorbar=None turns off the error bars; it replaces the deprecated
#    ci=None, which raises a FutureWarning on seaborn >= 0.12.
plt.figure(figsize=(8, 5))
sns.barplot(data=df, x="species", y="petal length (cm)", errorbar=None)
plt.title("Bar Chart: Average Petal Length per Species")
plt.xlabel("Species")
plt.ylabel("Average Petal Length (cm)")
plt.show()

# 3. Histogram - how sepal width is distributed across all samples,
#    using 15 bins with black bar outlines.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df["sepal width (cm)"], bins=15, edgecolor="black")
ax.set(
    title="Histogram: Distribution of Sepal Width",
    xlabel="Sepal Width (cm)",
    ylabel="Frequency",
)
plt.show()

# 4. Scatter plot - sepal length against petal length, one colour per
#    species (relies on the "species" column added during the analysis step).
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df,
    x="sepal length (cm)",
    y="petal length (cm)",
    hue="species",
    ax=ax,
)
ax.set_title("Scatter Plot: Sepal Length vs Petal Length")
ax.set_xlabel("Sepal Length (cm)")
ax.set_ylabel("Petal Length (cm)")
ax.legend(title="Species")
plt.show()
