# Foundations: pandas data analysis and visualization


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets

The cell above imports `scikit-learn` for its bundled datasets, `matplotlib` and `seaborn` for plotting, and `numpy` and `pandas` for numerical and tabular data.

## pandas basics

Documentation: [pandas documentation](https://pandas.pydata.org/docs/)

In [None]:
# Build a small participant table: each dictionary key becomes a column and
# each list holds that column's values, one entry per participant.
data = dict(
    name=["Alice", "Bob", "Charlie", "David"],
    age=[25, 30, 35, 40],
    score=[85.5, 92.0, 88.0, 95.5],
)
df = pd.DataFrame(data)
df

In [None]:
# Basic selection: pull a single column out of the frame as a Series.
# The label is printed on its own and the Series is left as the cell's last
# expression; passing both to one print() call inserts a separator space after
# the newline and shifts the first row out of alignment.
print("Names column:")
df["name"]

In [None]:
# Arithmetic mean of the age column
mean_age = df["age"].mean()
print("Average age:", mean_age)

In [None]:
# Summary statistics of the score column.
# The label is printed separately so the first row of the summary is not
# pushed out of alignment by print()'s separator space.
print("Describe scores:")
df["score"].describe()

In [None]:
# Bar chart with one bar per participant; labels are set on the Axes object
# that pandas returns.
score_ax = df.plot.bar(x="name", y="score", title="Participant scores", legend=False)
score_ax.set_xlabel("name")
score_ax.set_ylabel("score")
plt.tight_layout()
plt.show()

## Exploratory data analysis

In [None]:
# Fetch the Iris data bundled with scikit-learn and wrap its feature matrix in
# a DataFrame whose columns carry the dataset's feature names.
iris = datasets.load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

In [None]:
# Attach each sample's class label as an extra "target" column
df = df.assign(target=iris.target)
df.head()

In [None]:
# Summary statistics
# Print the label on its own and leave the frame as the cell's last expression:
# the notebook then renders it as a table, and the header row is not shifted by
# the separator space print() adds between its arguments.
print("Summary statistics:")
df.describe()

In [None]:
# Sepal length against petal length, with points coloured by the "target" column
scatter_ax = df.plot.scatter(
    x="sepal length (cm)",
    y="petal length (cm)",
    c="target",
    cmap="viridis",
)
scatter_ax.set_title("Iris feature scatter plot")
plt.show()

In [None]:
# Distribution of petal widths, split into 20 bins
hist_ax = df["petal width (cm)"].hist(bins=20)
hist_ax.set_title("Petal width distribution")
hist_ax.set_xlabel("width (cm)")
hist_ax.set_ylabel("count")
plt.show()

In [None]:
# Pairwise correlations between the feature columns (the class label is
# excluded), drawn as an annotated heatmap.
feature_columns = df.drop(columns="target")
corr = feature_columns.corr()
heatmap_ax = sns.heatmap(corr, annot=True, cmap="coolwarm")
heatmap_ax.set_title("Feature correlation heatmap")
plt.tight_layout()
plt.show()

## Digits dataset exploration

In [None]:
# Load the digits dataset through the `datasets` module imported at the top of
# the notebook, so no extra mid-notebook import is needed.
digits = datasets.load_digits()
# One row per sample of digits.data; the columns keep pandas' default integer
# labels, and the class label is appended as a "target" column.
df_digits = pd.DataFrame(digits.data)
df_digits["target"] = digits.target
df_digits.head()

In [None]:
# Number of samples per digit class, most frequent class first
target_column = df_digits["target"]
target_column.value_counts()

In [None]:
# value_counts() orders the classes by frequency; sort by label so the x-axis
# runs through the digits in ascending order.
digit_counts = df_digits["target"].value_counts().sort_index()
count_ax = digit_counts.plot.bar()
count_ax.set_title("Digit class distribution")
count_ax.set_xlabel("digit")
count_ax.set_ylabel("count")
plt.show()

In [None]:
# Show a few example images from the scikit-learn dataset
# The grid is two rows of five panels, so the figure is wider than it is tall;
# this leaves room for each image, its title and the overall heading.
fig, axes = plt.subplots(2, 5, figsize=(10, 4.5))
# zip stops after the ten axes, so only the first ten images are drawn
for ax, image, label in zip(axes.ravel(), digits.images, digits.target):
    ax.imshow(image, cmap="gray_r")
    ax.set_title(str(label))
    ax.axis("off")
fig.suptitle("Sample digits")
fig.tight_layout()
plt.show()

## Handling missing data

In [None]:
# Small example frame with gaps: column A is missing one value and column B is
# missing two. `np` comes from the import cell at the top of the notebook.
df_missing = pd.DataFrame({"A": [1, 2, np.nan, 4], "B": [5, np.nan, np.nan, 8]})
df_missing

In [None]:
# Replace each missing entry with the mean of its own column; the result is a
# new frame and df_missing itself is left unchanged.
print("Fill NA with column means:")
column_means = df_missing.mean(numeric_only=True)
df_missing.fillna(column_means)

In [None]:
# Keep only the rows that have no missing values; df_missing is not modified
print("Drop rows with NA:")
df_missing.dropna(axis=0, how="any")