# Foundations: pandas data analysis and visualization


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.datasets import load_digits


This cell imports `scikit-learn` for datasets, `matplotlib` and `seaborn` for plotting, `numpy` for numerical values such as `NaN`, and `pandas` for handling tabular data.

## pandas basics

In [None]:
# Build a four-participant table; each dictionary key becomes one column
data = dict(
    name=["Alice", "Bob", "Charlie", "David"],
    age=[25, 30, 35, 40],
    score=[85.5, 92.0, 88.0, 95.5],
)
df = pd.DataFrame(data)
print("DataFrame:\n", df, "\n")

# Pull out individual columns and reduce them to summary numbers
names = df["name"]
scores = df["score"]
print("Names column:\n", names)
print("Average age:", df["age"].mean())
print("Describe scores:\n", scores.describe())

# Bar chart of score per participant; labels are set on the Axes that
# pandas returns rather than through the implicit pyplot "current axes"
score_ax = df.plot.bar(x="name", y="score", title="Participant scores", legend=False)
score_ax.set_xlabel("name")
score_ax.set_ylabel("score")
plt.tight_layout()
plt.show()

- Create a small DataFrame from a Python dictionary
- Print the entire DataFrame to inspect it
- Select the `name` column
- Compute the average of the `age` column
- Use `describe` to summarize the `score` column
- Plot the scores as a bar chart

## Exploratory data analysis

In [None]:
# Wrap the Iris measurements in a DataFrame and append the class label column
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Quick textual inspection: leading rows, then per-column summary statistics
print("First five rows:\n", df.head(), "\n")
print("Summary statistics:\n", df.describe(), "\n")

# Sepal length against petal length, with points coloured by class label
scatter_ax = df.plot.scatter(
    x="sepal length (cm)",
    y="petal length (cm)",
    c="target",
    cmap="viridis",
)
scatter_ax.set_title("Iris feature scatter plot")
plt.show()

# Distribution of a single feature (petal width) as a 20-bin histogram
hist_ax = df["petal width (cm)"].hist(bins=20)
hist_ax.set_title("Petal width distribution")
hist_ax.set_xlabel("width (cm)")
hist_ax.set_ylabel("count")
plt.show()

# Every pairwise feature combination in one grid, coloured by class
sns.pairplot(df, hue="target")
# y slightly above 1 lifts the title clear of the top row of panels
plt.suptitle("Iris pair plot", y=1.02)
plt.show()

# Correlations between the measurement columns only (class label excluded)
corr = df[iris.feature_names].corr()
heatmap_ax = sns.heatmap(corr, annot=True, cmap="coolwarm")
heatmap_ax.set_title("Feature correlation heatmap")
plt.tight_layout()
plt.show()

- Load the classic Iris dataset and store it in a DataFrame
- Show the first rows and summary statistics
- Scatter and histogram plots provide a basic look at feature relationships and distributions
- `seaborn.pairplot` draws pairwise comparisons for all features
- Finally, a heatmap visualizes feature correlations

## Digits dataset exploration

In [None]:
# Load the handwritten digits dataset. `digits.data` supplies the tabular
# features for the DataFrame; `digits.images` supplies the 2-D arrays that
# are drawn further down. (`load_digits` is imported in the top import cell.)
digits = load_digits()
df_digits = pd.DataFrame(digits.data)
df_digits["target"] = digits.target
print(df_digits.head(), "\n")

# Class distribution. value_counts() orders by frequency, so sort by the
# index (the digit itself) to get bars in the natural 0-9 order.
class_counts = df_digits["target"].value_counts().sort_index()
count_ax = class_counts.plot.bar()
count_ax.set_title("Digit class distribution")
count_ax.set_xlabel("digit")
count_ax.set_ylabel("count")
plt.show()

# Show a few example images from the scikit-learn dataset.
# zip() stops at the shortest input, so only as many samples as there are
# subplot panels (2 x 5 = 10) are drawn.
fig, axes = plt.subplots(2, 5, figsize=(8, 4))
for ax, image, label in zip(axes.ravel(), digits.images, digits.target):
    ax.imshow(image, cmap="gray_r")
    ax.set_title(str(label))
    ax.axis("off")
fig.suptitle("Sample digits")
fig.tight_layout()
plt.show()



- Load the handwritten digits dataset from scikit-learn
- Display the distribution of digit classes
- Plot a few example images from this dataset
- Show one representative image for each digit

## Handling missing data

In [None]:
# Small frame with gaps: column A has one NaN, column B has two.
# (`np` is imported in the top import cell.)
df_missing = pd.DataFrame({"A": [1, 2, np.nan, 4], "B": [5, np.nan, np.nan, 8]})
print("Original:\n", df_missing, "\n")

# Strategy 1: impute. fillna() with a Series fills each column using the
# value whose label matches the column name, here that column's mean.
# A new frame is returned; df_missing itself is left untouched.
df_filled = df_missing.fillna(df_missing.mean(numeric_only=True))
print("Fill NA with column means:\n", df_filled)

# Strategy 2: discard. dropna() removes every row of the ORIGINAL frame
# that contains at least one NaN (it does not operate on df_filled).
df_dropped = df_missing.dropna()
print("Drop rows with NA:\n", df_dropped)

This cell demonstrates two independent strategies for handling missing values, each applied to the original DataFrame: filling NaNs with the column means, and dropping every row that contains a NaN.