In [None]:
from calendar import month_abbr
from typing import Sequence, Tuple

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.axes import Axes
from sklearn import preprocessing
from sklearn.decomposition import PCA

# Problem 1

In [None]:
# Read the forest-fires data and recode the textual month as its calendar number.
forestfires = pd.read_csv("forestfires.csv")

# month_abbr[0] is the empty string, so enumerate() pairs every abbreviation
# with its month number (1-12). Values that match no abbreviation after
# capitalisation become NaN.
month_to_number = {abbr: number for number, abbr in enumerate(month_abbr)}
forestfires["month"] = forestfires["month"].str.capitalize().map(month_to_number)

forestfires

## a

In [None]:
# 2x2 grid of scatter plots: burned area against four candidate predictors.
fig, axes_arr = plt.subplots(2, 2)
fig.set_size_inches(10, 10)

def plot_(loc: Tuple[int, int], x_lable: str, color: str) -> None:
    """Scatter ``area`` against one column of ``forestfires`` on a grid panel.

    Reads the module-level ``axes_arr`` grid and ``forestfires`` frame.

    Args:
        loc: (row, column) position of the panel inside ``axes_arr``.
        x_lable: Name of the ``forestfires`` column plotted on the x axis;
            also used for the x-axis label and in the panel title.
        color: Matplotlib colour passed to ``scatter`` as ``c``.
    """
    row, col = loc
    panel: Axes = axes_arr[row, col]
    panel.scatter(forestfires[x_lable], forestfires["area"], s=4, c=color)
    panel.set_xlabel(x_lable)
    panel.set_ylabel("area")
    panel.set_title(f"area vs. {x_lable}")

# One panel per (grid position, column, colour) triple.
for position, column, colour in (
    ((0, 0), "temp", "b"),
    ((0, 1), "month", "r"),
    ((1, 0), "DC", "g"),
    ((1, 1), "wind", "y"),
):
    plot_(position, column, colour)

## b

In [None]:
# Histogram of the wind column.
sns.histplot(data=forestfires, x="wind")

## c

In [None]:
# Summary statistics of the wind column.
wind = forestfires["wind"]
wind.describe()

## d

In [None]:
# Same histogram of wind, with a kernel density estimate drawn on top.
sns.histplot(data=forestfires, x="wind", kde=True)

## e

In [None]:
# Density plot of the numeric month column, with one tick per month number.
month_ticks = range(1, 13)
forestfires["month"].plot(
    kind="density",
    xlabel="month",
    xlim=(1, 12),
    xticks=month_ticks,
)

## f

In [None]:
# Pairwise scatter plots of the four selected columns.
pair_columns = ["temp", "RH", "DC", "DMC"]
pd.plotting.scatter_matrix(forestfires.loc[:, pair_columns], figsize=(20, 20))
plt.show()

- As we can see, the `DC` and `DMC` indices from the FWI system have a strong positive correlation, which means these two variables tend to increase and decrease together.

- Temperature (`temp`) and relative humidity (`RH`) have a weak negative correlation.

- Other combinations of variables seem not to be correlated.

## g

In [None]:
# One box plot per variable, side by side, so their outliers can be compared.
fig, axes_arr = plt.subplots(1, 3)
fig.set_size_inches(30, 10)

for panel, column in zip(axes_arr, ("wind", "ISI", "DC")):
    panel.boxplot(forestfires[column])
    panel.set_title(f"Boxplot of {column}")

# Render explicitly so the cell does not echo the repr of the last call.
plt.show()

- `wind`:

    There are several outliers beyond the upper whisker (more than $1.5 \times \mathrm{IQR}$ above $Q_3$).

- `ISI`:

    There are many outliers stacked beyond the upper whisker (more than $1.5 \times \mathrm{IQR}$ above $Q_3$) and a single extreme outlier far from the majority.

- `DC`:

    There are several outliers just below the lower whisker (more than $1.5 \times \mathrm{IQR}$ below $Q_1$).

## h

In [None]:
# Histogram of DMC (left) next to the histogram of its natural log (right).
fig, axes_arr = plt.subplots(1, 2)
fig.set_size_inches(20, 10)

axes_arr[0].hist(forestfires["DMC"])
axes_arr[0].set_title("Histogram of DMC")
axes_arr[0].set_xlabel("DMC")

axes_arr[1].hist(np.log(forestfires["DMC"]))
axes_arr[1].set_title("Histogram of log(DMC)")
axes_arr[1].set_xlabel("log(DMC)")

# Render explicitly so the cell does not echo the tuple returned by hist().
plt.show()

Compared with the histogram of `DMC`, the histogram of the log of `DMC` is skewed to the left and has a higher kurtosis.

# Problem 2

In [None]:
# Load the cereals data set and display it.
cereals = pd.read_csv(filepath_or_buffer="Cereals-1.csv")
cereals

## a

- quantitative / numerical:

    - `calories`
    - `protein`
    - `fat`
    - `sodium`
    - `fiber`
    - `carbo`
    - `sugars`
    - `potass`
    - `vitamins`
    - `weight`
    - `cups`
    - `rating`

- ordinal:

    - `shelf`

- nominal:

    - `mfr`
    - `type`

## b

In [None]:
# Columns treated as quantitative in part (a).
numerical_columns = [
    "calories", "protein", "fat", "sodium",
    "fiber", "carbo", "sugars", "potass",
    "vitamins", "weight", "cups", "rating",
]
numericals = cereals.loc[:, numerical_columns]
numericals.describe()

## c

In [None]:
# One histogram per numerical column, all on a single figure.
hist_axes = numericals.hist(figsize=(20, 20))
plt.show()

- `sodium` has the largest variability, with values ranging from 0 to 320.

- seem skewed:

    - `protein`
    - `fat`
    - `fiber`
    - `potass`
    - `rating`

- some values of `vitamins` seem to be extreme outliers.

## d

In [None]:
# Calories of the rows with type "C" versus the rows with type "H".
is_cold = cereals["type"] == "C"
is_hot = cereals["type"] == "H"
calories_by_type = (
    cereals.loc[is_cold, "calories"],
    cereals.loc[is_hot, "calories"],
)
plt.boxplot(calories_by_type, labels=("cold", "hot"))
plt.show()

- The `calories` of cold cereals vary over a wide range and have many outliers.

- Because there are very few hot cereals in the data, we cannot draw any conclusions about the distribution of their `calories` or about its relationship to the `calories` of cold cereals.

## e

In [None]:
# Rating distribution for each of the three shelf values.
shelf_levels = (1, 2, 3)
rating_by_shelf = tuple(
    cereals.loc[cereals["shelf"] == level, "rating"] for level in shelf_levels
)
plt.boxplot(rating_by_shelf, labels=shelf_levels)
plt.show()

The consumer `rating` for `shelf` height 2 is relatively lower than for the other two heights. Therefore, we could consider removing `shelf` height 2.

## f

In [None]:
# Pairwise Pearson correlations between the numerical columns.
numericals.corr(method="pearson")

In [None]:
# Pairwise scatter plots of every numerical column.
pd.plotting.scatter_matrix(frame=numericals, figsize=(20, 20))
plt.show()

- `fiber` and `potass`.

- Remove one of the pair of variables that has strong correlation.

- See code cell below:

In [None]:
# Standardise each numerical column, then compare the correlation matrix of
# the standardised data with that of the raw data.
standardized_values = preprocessing.scale(numericals)
normalized = pd.DataFrame(standardized_values, columns=numericals.columns)

corr_difference = normalized.corr() - numericals.corr()
corr_difference.round(4)

According to the table above, the correlations would **NOT** change if we normalized the data first.

## g

In [None]:
# Everything from the fourth column onwards, keeping only complete rows.
cal2rating = cereals.iloc[:, 3:].dropna()

pcs = PCA().fit(cal2rating)

# One column per principal component, one row per summary statistic.
variance_ratio = pcs.explained_variance_ratio_
pcs_summary = pd.DataFrame(
    [
        np.sqrt(pcs.explained_variance_),
        variance_ratio,
        np.cumsum(variance_ratio),
    ],
    index=[
        "Standard deviation",
        "Proportion of variance",
        "Cumulative proportion",
    ],
    columns=[f"PC{k}" for k in range(1, len(variance_ratio) + 1)],
)
pcs_summary.round(4)

In [None]:
# Fraction of the total variance that the retained components must explain.
INFO_RATIO = 0.85

# Transposed so that rows are the input columns and columns are the components.
pcs_components = pd.DataFrame(
    data=pcs.components_.T,
    index=cal2rating.columns,
    columns=pcs_summary.columns,
)

# Count components until the cumulative proportion first reaches INFO_RATIO.
cumulative_row = pcs_summary.loc["Cumulative proportion"]
for pc_count, cumulative in enumerate(cumulative_row, start=1):
    if cumulative >= INFO_RATIO:
        num_of_pc = pc_count
        break

pcs_components.iloc[:, :num_of_pc]

In [None]:
# Project the rows onto the principal components and show the retained ones.
projected = pcs.transform(cal2rating)
scores = pd.DataFrame(projected, columns=pcs_summary.columns)
scores.iloc[:, :num_of_pc]

# Problem 3

In [None]:
# Load the Boston housing data set and display it.
house_price = pd.read_csv(filepath_or_buffer="BostonHousing-1.csv")
house_price

- `CRIM`: Per capita crime rate by town
- `ZN`: Proportion of residential land zoned for lots over 25,000 sq. ft
- `INDUS`: Proportion of non-retail business acres per town
- `CHAS`: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- `NOX`: Nitric oxide concentration (parts per 10 million)
- `RM`: Average number of rooms per dwelling
- `AGE`: Proportion of owner-occupied units built prior to 1940
- `DIS`: Weighted distances to five Boston employment centers
- `RAD`: Index of accessibility to radial highways
- `TAX`: Full-value property tax rate per $10,000
- `PTRATIO`: Pupil-teacher ratio by town
- `LSTAT`: Percentage of lower status of the population
- `MEDV`: Median value of owner-occupied homes in $1000s
- `CAT. MEDV`: 1 if MEDV > 30 else 0

## a

In [None]:
# Split the frame into the columns treated as categorical and all the rest.
ctg_cols = ["CHAS", "RAD", "CAT. MEDV"]
categoricals = house_price[ctg_cols]
numericals = house_price.drop(columns=ctg_cols)

Standardize the numerical variables as follows (max-min rescaling would be heavily affected by outliers):

In [None]:
# Standardise every numerical column and restore the column labels.
standardized_values = preprocessing.scale(numericals)
scaled_nmr = pd.DataFrame(standardized_values, columns=numericals.columns)

scaled_nmr

Max-min rescale the categorical variables so that they all share the same range ([0, 1]).

In [None]:
# Column-wise min-max rescaling: (value - min) / (max - min).
ctg_min = categoricals.min()
ctg_max = categoricals.max()
scaled_ctg = (categoricals - ctg_min) / (ctg_max - ctg_min)
scaled_ctg

## b

In [None]:
scaled = scaled_nmr.join(scaled_ctg)

# Correlate every column with the last column of the joined frame
# (presumably `CAT. MEDV`, the final entry of ctg_cols -- confirm).
target = scaled.iloc[:, -1]
correlation = scaled.corrwith(target).to_frame("Correlation")

correlation

In [None]:
# Ascending sort, then reversed so the largest signed correlation comes first.
correlation.sort_values(by="Correlation").iloc[::-1]

Top 5 features by (signed) correlation with `CAT. MEDV`:

1. `MEDV`
2. `RM`
3. `ZN`
4. `DIS`
5. `CHAS`

## c

In [None]:
# PCA on the original data; loadings are transposed so rows are the input columns.
pcs_o = PCA().fit(house_price)
components_o = pd.DataFrame(
    data=pcs_o.components_.T,
    index=house_price.columns,
    columns=[f"PC{k}" for k in range(1, house_price.shape[1] + 1)],
)

# Same decomposition on the preprocessed (scaled) data.
pcs_p = PCA().fit(scaled)
components_p = pd.DataFrame(
    data=pcs_p.components_.T,
    index=scaled.columns,
    columns=[f"PC{k}" for k in range(1, scaled.shape[1] + 1)],
)

In [None]:
# First two loadings from each PCA, side by side: "_o" = original, "_p" = preprocessed.
loadings_o = components_o.iloc[:, :2].add_suffix("_o")
loadings_p = components_p.iloc[:, :2].add_suffix("_p")
components = loadings_o.join(loadings_p)
components

In [None]:
# Overlay the PC1/PC2 loadings of the original and the preprocessed data.
_, ax = plt.subplots()
ax.scatter(components["PC1_o"], components["PC2_o"], label="original data")
ax.scatter(components["PC1_p"], components["PC2_p"], c="r", label="preprocessed data")

# Label the plot so the two point sets can be told apart.
ax.set_xlabel("PC1 loading")
ax.set_ylabel("PC2 loading")
ax.set_title("Loadings on the first two principal components")
ax.legend()

# Render explicitly so the cell does not echo the repr of the last call.
plt.show()

## d

In [None]:
# Fraction of the total variance that the retained components must explain.
INFO_RATIO = 0.9

cumulative_proportion_o = pcs_o.explained_variance_ratio_.cumsum()

# Count components of the original-data PCA until INFO_RATIO is first reached.
for pc_count, cumulative in enumerate(cumulative_proportion_o, start=1):
    if cumulative >= INFO_RATIO:
        num_of_pc = pc_count
        break

num_of_pc

In [None]:

# Same count for the preprocessed-data PCA. INFO_RATIO is not set in this
# cell; it is read from the notebook namespace.
cumulative_proportion_p = pcs_p.explained_variance_ratio_.cumsum()

for pc_count, cumulative in enumerate(cumulative_proportion_p, start=1):
    if cumulative >= INFO_RATIO:
        num_of_pc = pc_count
        break

num_of_pc

1 for original data and 7 for preprocessed data.

# Problem 4

See attached file.