# Foundations: NumPy, pandas and Matplotlib


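This notebook walks through the foundations of NumPy, pandas and Matplotlib, and doubles as a demonstration of how to use **Jupyter notebooks**. Each cell can be run individually by clicking the play button or pressing `Shift+Enter`; every code cell re-imports the libraries it needs. Sections 9 and 11 additionally require scikit-learn and torchvision (section 11 downloads the CIFAR-10 dataset).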

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 0. NumPy basics

In [None]:
import numpy as np
# Build five small arrays first, each with a different constructor...
a = np.asarray([1, 2, 3])                 # from a Python list
b = np.zeros(shape=(2, 3))                # 2x3 block of zeros
c = np.identity(3)                        # 3x3 identity matrix
d = np.reshape(np.arange(12), (3, 4))     # 0..11 laid out as 3 rows of 4
e = np.full(shape=(2, 2), fill_value=7)   # 2x2 filled with a constant

# ...then show them in the order they were created.
print("1D array:", a)
print("\n2D zeros array:\n", b)
print("\nIdentity matrix:\n", c)
print("\nReshaped array:\n", d)
print("\nConstant array:\n", e)

## 1. Array math

In [None]:
import numpy as np
# Two short integer vectors used by every example in this cell
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])

# Arithmetic is applied element by element
print("x + y =", np.add(x, y))
print("x * y =", np.multiply(x, y))

# Inner (dot) product of the two vectors
print("x dot y =", np.dot(x, y))

# Universal functions map over every element
print("sin(x) =", np.sin(x))

# Reductions collapse the array to a single number
print("mean of y =", np.mean(y))

# Element-wise square of y
print("y squared =", np.square(y))

## 2. Indexing and slicing

In [None]:
import numpy as np
# Integers 0..9 to index into
a = np.arange(0, 10)
print("a =", a)

# Half-open slice: positions 2, 3 and 4
print("a[2:5] =", a[slice(2, 5)])

# The same elements walked back to front
print("reverse =", np.flip(a))

# A boolean array of the same length selects where it is True
mask = np.mod(a, 2) == 0
print("even elements =", a[mask])

# A list of integer positions picks out exactly those elements
idx = [1, 3, 5]
print("selected indices =", np.take(a, idx))

# Indexing with None (np.newaxis) inserts a length-1 axis: (10,) -> (10, 1)
col_vec = a[:, None]
print("\ncolumn vector shape:", col_vec.shape)

## 3. Broadcasting

In [None]:
import numpy as np
x = np.arange(3)
print("x =", x)

# A scalar is stretched to match every element of x
print("x + 5 =", np.add(x, 5))

# (3, 1) column combined with a (3,) row broadcasts to a (3, 3) grid
a = x[:, np.newaxis]
b = np.arange(10, 40, 10)
print("\na =\n", a)
print("b =", b)
print("a + b =\n", np.add(a, b))

# The (3,) row x is reused for each of the two rows of m
m = np.ones(shape=(2, 3))
print("\nm =\n", m)
print("m * x =\n", np.multiply(m, x))

## 4. Random numbers & statistics

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Random samples from a normal distribution.
# A seeded Generator makes every number printed below identical on each
# "Restart & Run All"; np.random.randn would draw from unseeded global state.
rng = np.random.default_rng(0)
samples = rng.standard_normal(1000)
print("first five samples:", samples[:5])

# Compute basic statistics
print("mean =", samples.mean())
# ndarray.std() uses ddof=0 by default, i.e. the population standard deviation
print("std =", samples.std())

print("25th percentile =", np.percentile(samples, 25))

# Histogram (counts per bin)
# `bins` holds the 6 edges of the 5 equal-width bins, so consecutive pairs
# of edges give each bin's left/right boundary.
hist, bins = np.histogram(samples, bins=5)
print("\nhistogram:")
for b_left, b_right, count in zip(bins[:-1], bins[1:], hist):
    print(f"{b_left: .2f} to {b_right: .2f}: {count}")

# Draw the samples as a 30-bin, density-normalised histogram and label it
plt.hist(samples, bins=30, density=True, alpha=0.7)
plt.gca().set(
    title="Histogram of random samples",
    xlabel="value",
    ylabel="density",
)
plt.show()

## 5. Polynomial fitting

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Sample 0.5*x^2 - x + 2 at 20 evenly spaced points on [-3, 3] and add
# unit-scale Gaussian noise from a seeded generator
rng = np.random.default_rng(0)
x = np.linspace(-3, 3, 20)
y = 0.5 * np.square(x) - x + 2 + rng.normal(scale=1.0, size=x.shape)

# Least-squares quadratic; coefficients come back highest power first
coeffs = np.polyfit(x, y, 2)
print("coefficients:", coeffs)

# Wrap the coefficients in a callable polynomial and evaluate it on the same grid
p = np.poly1d(coeffs)
y_fit = p(x)

# Peek at the start of the fitted curve
print("\nfirst 5 fitted values:", y_fit[:5])

# Overlay the fitted curve (red line) on the noisy observations (markers)
plt.scatter(x, y, label="data")
plt.plot(x, y_fit, label="fit", color="red")
plt.gca().set_title("Polynomial fit")
plt.legend()
plt.show()

## 6. Saving and loading

In [None]:
import numpy as np
# Round-trip a small array through NumPy's on-disk formats.
# All files are written to the current working directory.
arr = np.arange(9).reshape(3, 3)
print("Original array:\n", arr)

# Binary .npy file holding a single array
np.save('array.npy', arr)
print('Array saved to array.npy')

loaded = np.load('array.npy')
print('Loaded array:\n', loaded)

# Save multiple arrays in a compressed npz
np.savez_compressed('arrays.npz', first=arr, second=arr * 2)
# np.load on an .npz returns an NpzFile that keeps the archive open until it
# is closed; the context manager closes it so no file handle is leaked.
with np.load('arrays.npz') as data:
    print('\nArrays in npz:', list(data.keys()))

# Plain-text copy, one row per line, written with an integer format
np.savetxt('array.txt', arr, fmt='%d')
print('Also saved to array.txt')

## 7. Vectorization speed comparison

In [None]:
import numpy as np
import time
n = 1000000
# Pin the dtype: the platform-default integer can be 32-bit (e.g. Windows with
# NumPy < 2), and the sum 0 + 1 + ... + (n - 1) = 499999500000 overflows int32.
data = np.arange(n, dtype=np.int64)

# Sum using Python loop
# perf_counter() is the high-resolution clock intended for measuring short
# durations; time.time() can be too coarse to time the vectorized sum at all.
start = time.perf_counter()
total = 0
for value in data:
    total += value
loop_time = time.perf_counter() - start

# Sum using vectorized operation
start = time.perf_counter()
vector_total = np.sum(data)
vector_time = time.perf_counter() - start

print("loop sum =", total, "took", loop_time, "seconds")
print("vectorized sum =", vector_total, "took", vector_time, "seconds")

## 8. pandas basics

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Create a DataFrame from a Python dictionary
# (each key becomes a column, each list that column's values)
data = {
    "name": ["Alice", "Bob", "Charlie", "David"],
    "age": [25, 30, 35, 40],
    "score": [85.5, 92.0, 88.0, 95.5],
}
df = pd.DataFrame(data)
print("DataFrame:\n", df, "\n")

# Basic selection and summary statistics
print("Names column:\n", df["name"])
print("Average age:", df["age"].mean())
print("Describe scores:\n", df["score"].describe())

# Simple visualization of the scores
# pandas returns the Axes it drew on; labelling through that object avoids
# relying on pyplot's implicit "current axes".
ax = df.plot.bar(x="name", y="score", title="Participant scores", legend=False)
ax.set_xlabel("name")
ax.set_ylabel("score")
plt.tight_layout()
plt.show()

## 9. Exploratory data analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
# Pull the Iris data bundled with scikit-learn into a DataFrame:
# one column per feature, plus the class label as a trailing "target" column
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Peek at the top of the table
print("First five rows:\n", df.head(), "\n")

# Per-column count / mean / spread / quartiles
print("Summary statistics:\n", df.describe(), "\n")

# Sepal length against petal length, coloured by the class label
df.plot.scatter(
    x="sepal length (cm)",
    y="petal length (cm)",
    c="target",
    cmap="viridis",
)
plt.title("Iris feature scatter plot")
plt.show()

# Distribution of petal widths across all samples
df["petal width (cm)"].hist(bins=20)
plt.gca().set(
    title="Petal width distribution",
    xlabel="width (cm)",
    ylabel="count",
)
plt.show()

## 10. CSV input and output

In [None]:
import pandas as pd
# Two-column table to round-trip through a CSV file
data = {
    "city": ["San Diego", "Los Angeles", "San Francisco"],
    "population": [1.4, 3.9, 0.88],
}
df = pd.DataFrame.from_dict(data)
print("Original DataFrame:\n", df, "\n")

# Persist to disk; index=False leaves the row index out of the file
csv_path = "cities.csv"
df.to_csv(csv_path, index=False)
print(f"Data written to {csv_path}")

# Parse the same file into a fresh DataFrame
loaded = pd.read_csv(csv_path)
print("\nLoaded from CSV:\n", loaded)

## 11. CIFAR-10 image exploration

In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
# Fetch the CIFAR-10 training split into ./data (download=True is passed),
# converting each image with ToTensor on access
transform = transforms.ToTensor()
cifar = datasets.CIFAR10(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)

print("Number of images:", len(cifar))

# Draw a single shuffled mini-batch of four images
loader = DataLoader(cifar, batch_size=4, shuffle=True)
images, labels = next(iter(loader))

# Show the batch side by side, one image per subplot
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(8, 2))
for ax, img in zip(axes, images):
    # Reorder axes to (1, 2, 0) for imshow -- presumably channels-first to channels-last
    ax.imshow(img.permute(1, 2, 0))
    ax.set_axis_off()
fig.suptitle("CIFAR-10 samples")
plt.show()