# In [None]:
# Analyzing Data with Pandas and Visualizing Results with Matplotlib

# -------------------------------
# 1. Import Libraries
# -------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply one shared seaborn theme (white grid background, muted colour cycle)
# so every figure produced below has the same look.
sns.set(
    style="whitegrid",
    palette="muted",
)

# -------------------------------
# 2. Load and Explore Dataset
# -------------------------------
print("=== Loading Iris Dataset ===")
# seaborn's example datasets are fetched from its online data repository on
# first use and then cached locally, so the first run needs network access.
df = sns.load_dataset("iris")

# Display first rows
print("\nFirst 5 rows:")
print(df.head())

# Dataset info
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would add a stray "None" line after the summary.
df.info()

# Missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# -------------------------------
# 3. Clean the Dataset
# -------------------------------
print("\n=== Cleaning Dataset ===")
# Keep only the rows that have a value in every column.
df_cleaned = df.dropna()

# Report the row count before and after cleaning.
print(f"Original dataset rows: {df.shape[0]}")
print(f"Cleaned dataset rows: {df_cleaned.shape[0]}")

# -------------------------------
# 4. Basic Data Analysis
# -------------------------------
print("\n=== Basic Statistics ===")
# Summary statistics for the cleaned data.
summary_stats = df_cleaned.describe()
print(summary_stats)

# Mean sepal length for each species.
sepal_length_by_species = df_cleaned.groupby("species")["sepal_length"]
avg_sepal_length = sepal_length_by_species.mean()
print("\nAverage Sepal Length per Species:")
print(avg_sepal_length)

# -------------------------------
# 5. Data Visualizations
# -------------------------------

# 1. Line Chart - Sepal Length Trend Over Samples
# Draw on an explicit Axes rather than relying on pyplot's "current figure".
line_fig, line_ax = plt.subplots(figsize=(8, 5))
df_cleaned["sepal_length"].plot.line(
    ax=line_ax,
    title="Sepal Length Trend Over Samples",
)
line_ax.set_xlabel("Sample Index")
line_ax.set_ylabel("Sepal Length")
plt.show()

# 2. Bar Chart - Average Petal Length per Species
# One bar per species; bar height is the mean petal length of that species.
bar_fig, bar_ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=df_cleaned,
    x="species",
    y="petal_length",
    estimator="mean",
    ax=bar_ax,
)
bar_ax.set_title("Average Petal Length per Species")
plt.show()

# 3. Histogram - Distribution of Sepal Width
# Sepal widths counted into 15 bins.
hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
sepal_widths = df_cleaned["sepal_width"]
hist_ax.hist(sepal_widths, bins=15, color="skyblue", edgecolor="black")
hist_ax.set_title("Distribution of Sepal Width")
hist_ax.set_xlabel("Sepal Width")
hist_ax.set_ylabel("Frequency")
plt.show()

# 4. Scatter Plot - Sepal Length vs Petal Length
# One point per row of the cleaned data, coloured by species.
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df_cleaned,
    x="sepal_length",
    y="petal_length",
    hue="species",
    ax=scatter_ax,
)
scatter_ax.set_title("Sepal Length vs Petal Length")
plt.show()

# -------------------------------
# 6. Observations
# -------------------------------
print("\n=== Observations ===")
# Hand-written takeaways; these are fixed strings, not computed from the data.
observations = (
    "- Iris-virginica generally has the longest petals among the species.",
    "- Sepal length and petal length are positively correlated across all species.",
    "- Sepal width shows a normal-like distribution, with most flowers having width between 2.5–3.5 cm.",
    "- Clear differences between species can be seen in petal size, useful for classification.",
)
for observation in observations:
    print(observation)
