### 2.3.2 Basic Commands

In [None]:
# Print statement
print("fit a model with", 11, "variables")

In [None]:
# Function documentation
print?

In [None]:
# Addition
3 + 5

In [None]:
# String concatenation
"hello" + " " + "world"

In [None]:
# List definition
x = [3, 4, 5]
x

In [None]:
# Concatenating lists
y = [4, 9, 7]
x + y

### 2.3.3 Introduction to Numerical Python

In [None]:
# Module import
import numpy as np

In [None]:
# NumPy array definition
x = np.array([3, 4, 5])
y = np.array([4, 9, 7])

In [None]:
# NumPy array addition
x + y

In [None]:
# Two dimensional arrays
x = np.array([[1, 2], [3, 4]])
x

In [None]:
# Array dimensions
x.ndim

In [None]:
# Array datatype
x.dtype

In [None]:
# Floating point array
np.array([[1, 2], [3.0, 4]]).dtype

In [None]:
# NumPy array documentation
np.array?

In [None]:
# Set array datatype
np.array([[1, 2], [3, 4]], float).dtype

In [None]:
# Array shape
x.shape

In [None]:
# Array sum method
x = np.array([1, 2, 3, 4])
x.sum()

In [None]:
# NumPy sum function
x = np.array([1, 2, 3, 4])
np.sum(x)

In [None]:
# Array reshaping
x = np.array([1, 2, 3, 4, 5, 6])
print("beginning x:\n", x)
x_reshape = x.reshape((2, 3))
print("reshaped x:\n", x_reshape)

In [None]:
# Element access
x_reshape[0, 0]

In [None]:
# More element access
x_reshape[1, 2]

In [None]:
# Shared memory: x_reshape was produced by x.reshape(...), so a write through
# x_reshape is also visible when x is printed afterwards.
print("x before we modify x_reshape:\n", x)
print("x_reshape before we modify x_reshape:\n", x_reshape)
x_reshape[0, 0] = 5
print("x_reshape after we modify its top left element:\n", x_reshape)
print("x after we modify top left element of x_reshape:\n", x)

In [None]:
# Tuple immutability: tuples do not support item assignment, so the
# assignment below is expected to raise a TypeError.
my_tuple = (3, 4, 5)
my_tuple[0] = 2

In [None]:
# Array properties
x_reshape.shape, x_reshape.ndim, x_reshape.T

In [None]:
# Square root
np.sqrt(x)

In [None]:
# Squares
x**2

In [None]:
# Alternative square root syntax
x**0.5

In [None]:
# Normal distribution
x = np.random.normal(size=50)
x

In [None]:
# y is x plus normal noise with mean 50 and standard deviation 1
y = x + np.random.normal(loc=50, size=50)

In [None]:
# Correlation matrix
np.corrcoef(x, y)

In [None]:
# Randomness
print(np.random.normal(scale=5, size=2))
print(np.random.normal(scale=5, size=2))

In [None]:
# Seeding randomness
rng = np.random.default_rng(1303)
print(rng.normal(scale=5, size=2))
rng2 = np.random.default_rng(1303)
print(rng2.normal(scale=5, size=2))

In [None]:
# Mean
rng = np.random.default_rng(3)
y = rng.standard_normal(10)
np.mean(y), y.mean()

In [None]:
# Variance
np.var(y), y.var(), np.mean((y - y.mean())**2)

In [None]:
# Standard deviation
np.sqrt(np.var(y)), np.std(y)

In [None]:
# Matrix applications
X = rng.standard_normal((10, 3))
X

In [None]:
# Column means (axis=0 averages over the rows)
X.mean(axis=0)

In [None]:
# Same column means, passing the axis positionally
X.mean(0)

### 2.3.4 Graphics

In [None]:
# Matplotlib
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8, 8))
x = rng.standard_normal(100)
y = rng.standard_normal(100)
ax.plot(x, y)

In [None]:
# Unpacking figure and axis
output = plt.subplots(figsize=(8, 8))
fig = output[0]
ax = output[1]

In [None]:
# Changing markers
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(x, y, "o");

In [None]:
# Scatter function
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(x, y, marker="o");

In [None]:
# Scatter function without semicolon
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(x, y, marker="o")

In [None]:
# Titling and axes labelling
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(x, y, marker="o")
ax.set_xlabel("this is the x-axis")
ax.set_ylabel("this is the y-axis")
ax.set_title("Plot of X vs Y");

In [None]:
# Change figure size
fig.set_size_inches(12, 3)
fig

In [None]:
# Multiple subplots
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 5))

In [None]:
# Cherry picking subplots
axes[0, 1].plot(x, y, "o")
axes[1, 2].scatter(x, y, marker="+")
fig

In [None]:
# Save figures
# Forward slashes keep the relative paths valid on Windows, macOS and Linux;
# a backslash is only a path separator on Windows.
fig.savefig("../assets/Figure.png", dpi=400)
fig.savefig("../assets/Figure.pdf", dpi=200);

In [None]:
# Continued modification: narrow the x-range of one subplot, then save again
axes[0, 1].set_xlim([-1, 1])
# Forward slashes keep the relative path valid on every operating system.
fig.savefig("../assets/Figure_updated.jpg")
fig

In [None]:
# Contour plot
fig, ax = plt.subplots(figsize=(8, 8))
x = np.linspace(-np.pi, np.pi, 50)
y = x
f = np.multiply.outer(np.cos(y), 1 / (1 + x**2))
ax.contour(x, y, f);

In [None]:
# Increase levels
fig, ax = plt.subplots(figsize=(8, 8))
ax.contour(x, y, f, levels=45);

In [None]:
# Image and heatmap plotting
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(f);

### 2.3.5 Sequences and Slice Notation

In [None]:
# Sequence with np.linspace
seq1 = np.linspace(0, 10, 11)
seq1

In [None]:
# Sequence with np.arange
seq2 = np.arange(0, 10)
seq2

In [None]:
# Slice notation
"hello world"[3:6]

In [None]:
# Function call instead
"hello world"[slice(3, 6)]

### 2.3.6 Indexing Data

In [None]:
# Build a 4x4 array holding the integers 0 through 15, filled row by row
A = np.arange(16).reshape(4, 4)
A

In [None]:
# Array indexing
A[1, 2]

In [None]:
# Select multiple rows
A[[1, 3]]

In [None]:
# Select multiple columns
A[:, [0, 2]]

In [None]:
# Attempt to create submatrix
A[[1, 3], [0, 2]]

In [None]:
# Equivalent result
np.array([A[1, 0], A[3, 2]])

In [None]:
# Fails again with more columns: the row index list has 2 entries and the
# column index list has 3, so NumPy cannot pair them element by element and
# raises an IndexError.
A[[1, 3], [0, 2, 3]]

In [None]:
# Correct way to retrieve submatrix
A[[1, 3]][:, [0, 2]]

In [None]:
# Alternative method to create submatrix
idx = np.ix_([1, 3], [0, 2, 3])
A[idx]

In [None]:
# Another alternative method for creating submatrix
A[1:4:2, 0:3:2]

In [None]:
# Create array of False
keep_rows = np.zeros(A.shape[0], bool)
keep_rows

In [None]:
# Set selected entries of the Boolean array to True
keep_rows[[1, 3]] = True
keep_rows

In [None]:
# Numerical equivalence of Booleans
np.all(keep_rows == np.array([0, 1, 0, 1]))

In [None]:
# Select the first, second, first, second rows (indexing)
A[np.array([0, 1, 0, 1])]

In [None]:
# Select the second and fourth rows (Boolean indexing)
A[keep_rows]

In [None]:
# Boolean indexing to create submatrix
keep_cols = np.zeros(A.shape[1], bool)
keep_cols[[0, 2, 3]] = True
idx_bool = np.ix_(keep_rows, keep_cols)
A[idx_bool]

In [None]:
# Same result with mixed indexing
idx_mixed = np.ix_([1, 3], keep_cols)
A[idx_mixed]

### 2.3.7 Loading Data

In [None]:
# Read in a CSV
import pandas as pd
# Forward slashes keep the relative path valid on Windows, macOS and Linux.
auto = pd.read_csv("../data/Auto.csv")
auto

In [None]:
# Alternative read as data file, with columns separated by runs of whitespace
# sep=r"\s+" replaces delim_whitespace=True, which pandas deprecated in 2.2.
# Forward slashes keep the relative path valid on every operating system.
auto = pd.read_csv("../data/Auto.data", sep=r"\s+")
auto

In [None]:
# View horsepower column
auto["horsepower"]

In [None]:
# Unique column values
np.unique(auto["horsepower"])

In [None]:
# Fix NA encoded with ?: treat "?" as missing so horsepower is parsed as numeric
# sep=r"\s+" replaces delim_whitespace=True, which pandas deprecated in 2.2.
# Forward slashes keep the relative path valid on every operating system.
auto = pd.read_csv("../data/Auto.data", na_values=["?"], sep=r"\s+")
auto["horsepower"].sum()

In [None]:
# DataFrame shape
auto.shape

In [None]:
# Drop rows with NA values
auto_new = auto.dropna()
auto_new.shape

In [None]:
# Check columns names
auto = auto_new
auto.columns

In [None]:
# DataFrame slice
auto[:3]

In [None]:
# DataFrame slice with Booleans
idx_80 = auto["year"] > 80
auto[idx_80]

In [None]:
# Select multiple columns
auto[["mpg", "horsepower"]]

In [None]:
# View DataFrame index
auto.index

In [None]:
# Set index
auto_re = auto.set_index("name")
auto_re

In [None]:
# Setting index removes column
auto_re.columns

In [None]:
# Using loc for row selection
rows = ["amc rebel sst", "ford torino"]
auto_re.loc[rows]

In [None]:
# Using iloc for row selection
auto_re.iloc[[3, 4]]

In [None]:
# Using iloc for column selection
auto_re.iloc[:, [0, 2, 3]]

In [None]:
# Using iloc for row and column selection
auto_re.iloc[[3, 4], [0, 2, 3]]

In [None]:
# Indices are not inherently unique
auto_re.loc["ford galaxie 500", ["mpg", "origin"]]

In [None]:
# Using loc with Boolean indexing
idx_80 = auto_re["year"] > 80
auto_re.loc[idx_80, ["weight", "origin"]]

In [None]:
# More concise syntax with lambda
auto_re.loc[lambda df: df["year"] > 80, ["weight", "origin"]]

In [None]:
# Multiple comparisons with elementwise AND operator
auto_re.loc[lambda df: (df["year"] > 80) & (df["mpg"] > 30), ["weight", "origin"]]

In [None]:
# Combining elementwise AND and OR operators
auto_re.loc[lambda df: (df["displacement"] < 300) & (df.index.str.contains("ford") | df.index.str.contains("datsun")), ["weight", "origin"]]

### 2.3.8 For Loops

In [None]:
# Basic for loop: accumulate the running sum of the listed values
total = 0
for value in [3, 2, 19]:
    total = total + value
print(f"Total is: {total}")

In [None]:
# Nested for loop: every value is paired with every weight
total = 0
for value in [2, 3, 19]:
    for weight in [3, 2, 1]:
        total = total + value * weight
print(f"Total is: {total}")

In [None]:
# Using zip to loop in synchrony: each value is paired with its own weight
total = 0
for value, weight in zip([2, 3, 19], [0.2, 0.3, 0.5]):
    total = total + weight * value
print(f"Weighted average is: {total}")

In [None]:
# Generate DataFrame with some missing values
# Each entry of the mask M is NaN with probability 0.2 and 0 otherwise, so
# adding it to A leaves most entries unchanged and turns the rest into NaN.
rng = np.random.default_rng(1)
A = rng.standard_normal((127, 5))
M = rng.choice([0, np.nan], p=[0.8, 0.2], size=A.shape)
A = A + M
D = pd.DataFrame(A, columns=["food", "bar", "pickle", "snack", "popcorn"])
D[:3]

In [None]:
# Use string formatting to print out percentage of missing values
# The template is the same for every column, so it is defined once up front.
template = "Column '{0}' has {1:.2%} missing values"
for col in D.columns:
    missing_fraction = np.isnan(D[col]).mean()
    print(template.format(col, missing_fraction))

### 2.3.9 Additional Graphical and Numerical Summaries

In [None]:
# Must provide the appropriate information to plotting function
# NOTE(review): `horsepower` and `mpg` are not defined as variables in the
# visible cells (they are columns of `auto`), so this call is presumably meant
# to raise a NameError -- confirm.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(horsepower, mpg, "o");

In [None]:
# Solve by accessing the columns directly
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(auto["horsepower"], auto["mpg"], "o");

In [None]:
# Plot using DataFrame attribute
ax = auto.plot.scatter("horsepower", "mpg")
# The semicolon belongs on the final statement: it suppresses the echoed
# return value of set_title, whereas on the assignment above it did nothing.
ax.set_title("Horsepower vs. MPG");

In [None]:
# Save the figure using the figure attribute
fig = ax.figure
# Forward slashes keep the relative path valid on Windows, macOS and Linux.
fig.savefig("../assets/horsepower_mpg.png");

In [None]:
# Pass predefined axes to DataFrame method
fig, axes = plt.subplots(ncols=3, figsize=(15, 5))
auto.plot.scatter("horsepower", "mpg", ax=axes[1]);

In [None]:
# Convert the numeric cylinders column to a categorical column
auto["cylinders"] = auto["cylinders"].astype("category")
auto["cylinders"].dtype

In [None]:
# Plot boxplots
fig, ax = plt.subplots(figsize=(8, 8))
auto.boxplot("mpg", by="cylinders", ax=ax);

In [None]:
# Plot histogram
fig, ax = plt.subplots(figsize=(8, 8))
auto.hist("mpg", ax=ax);

In [None]:
# Formatting a histogram
fig, ax = plt.subplots(figsize=(8, 8))
auto.hist("mpg", color="red", bins=12, ax=ax);

In [None]:
# Scatterplot matrix
pd.plotting.scatter_matrix(auto);

In [None]:
# Scatterplot matrix for a column subset
pd.plotting.scatter_matrix(auto[["mpg", "displacement", "weight"]]);

In [None]:
# Describe method for summaries
auto[["mpg", "weight"]].describe()

In [None]:
# Summaries of single columns
# A notebook cell only echoes its final expression, so the first summary is
# printed explicitly; otherwise its result would be silently discarded.
print(auto["cylinders"].describe())
auto["mpg"].describe()

## 2.4 Exercises

**Question 1**

(a) Flexible. An extremely large $n$ creates a more stable model trained on more data.

(b) Inflexible. A small $n$ means fewer training examples, so an inflexible method is preferable: it infers less from the limited training data and is therefore less likely to overfit.

(c) Flexible. The flexible model will be able to capture the highly non-linear model.

(d) Inflexible. Very high variance means flexible models will capture data features that only exist due to variance.

**Question 2**

(a) Regression. Inference. $n = 500, p = 3$

(b) Classification. Prediction. $n = 20, p = 13$

(c) Regression. Prediction. $n = 52, p = 3$

**Question 3**

(a)

In [None]:
# Display the saved sketch for Question 3(a)
import matplotlib.pyplot as plt
# Forward slashes keep the relative path valid on Windows, macOS and Linux.
img = plt.imread("../assets/3a.png")
plt.imshow(img)
plt.show()