In [34]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit

#### Problem1.

In [35]:
# Address of the dataset on GitHub. This is the "blob" (HTML page) address rather
# than the raw file; load_housing_data() rewrites it to the raw-content address
# before downloading.
HOUSING_PATH = "https://github.com/GwakJunwoo/fba_fml/blob/master/datasets/housing/housing.csv"

def load_housing_data(housing_path=HOUSING_PATH):
    """Load the housing dataset into a DataFrame.

    Parameters
    ----------
    housing_path : str or os.PathLike
        Where to read the data from. Accepted forms:

        * a directory containing ``housing.csv``;
        * the path of a CSV file;
        * an ``http(s)`` URL of a CSV file. A GitHub ``/blob/`` page URL is
          rewritten to the matching ``raw.githubusercontent.com`` address.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.

    Notes
    -----
    When the URL points at a file named ``housing.csv`` and a file with that
    name exists in the working directory, the local copy is read instead of
    downloading. This keeps the default call working offline, exactly as it
    did when the function always read the local ``housing.csv``.
    """
    csv_name = "housing.csv"
    location = os.fspath(housing_path)

    # Local directory or file: honour the argument instead of ignoring it.
    if not location.startswith(("http://", "https://")):
        if os.path.isdir(location):
            location = os.path.join(location, csv_name)
        return pd.read_csv(location)

    # Remote file with an already-downloaded copy next to the notebook.
    if location.endswith("/" + csv_name) and os.path.isfile(csv_name):
        return pd.read_csv(csv_name)

    # A GitHub blob URL serves an HTML page; switch to the raw-content host.
    github_prefix = "https://github.com/"
    if location.startswith(github_prefix) and "/blob/" in location:
        location = (
            "https://raw.githubusercontent.com/"
            + location[len(github_prefix):].replace("/blob/", "/", 1)
        )
    return pd.read_csv(location)

# Read the dataset once; the cells below split and preprocess this DataFrame.
housing = load_housing_data()

In [38]:
# Plain random 80/20 split. The cells below work from the stratified split
# instead, so these two frames are only a point of comparison.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Income strata used to balance the split. They are kept in a standalone Series
# rather than a temporary `income_cat` column, so `housing` is never mutated and
# nothing has to be dropped from the resulting sets afterwards.
income_cat = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields arrays of row *positions*, so rows are selected with .iloc;
# .loc would only be equivalent while the index happens to be 0..n-1.
train_index, test_index = next(split.split(housing, income_cat))
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]


# Split the stratified training set into the target and the predictors.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

# The median imputer is fitted on the frame without `ocean_proximity`,
# which the later cells handle as a categorical attribute.
housing_num = housing.drop(columns="ocean_proximity")
imputer = SimpleImputer(strategy='median')

# Learn the per-column medians and fill the gaps in one chained call.
X = imputer.fit(housing_num).transform(housing_num)

# Re-wrap the imputed array with the original row and column labels.
housing_tr = pd.DataFrame(X, index=housing_num.index, columns=housing_num.columns)

In [41]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# List-style column selection keeps a one-column DataFrame rather than a Series.
housing_cat = housing.loc[:, ['ocean_proximity']]

# Encoding 1: ordinal codes for the category column.
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)

# Encoding 2: one-hot encoding of the same column.
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

In [42]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions that CombinedAttributesAdder.transform() reads from its input array.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D array.

    Two columns are always appended, rooms / households and
    population / households. A third, bedrooms / rooms, is appended when
    ``add_bedrooms_per_room`` is true. The source columns are located by the
    module-level ``*_ix`` positions.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]

        # Same order as the output columns: rooms/household, population/household.
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)

        return np.c_[(X, *extra_columns)]

# Stand-alone run of the transformer with the bedrooms-per-room ratio switched
# off, so only rooms_per_household and population_per_household are appended.
# It is applied to `housing.values`, i.e. every column of `housing` as one array.
# NOTE(review): no later cell visible here reads `housing_extra_attribs` —
# presumably a demonstration; confirm before removing.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [44]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric branch: median imputation -> ratio features -> standardisation.
numeric_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)
housing_num_tr = num_pipeline.fit_transform(housing_num)

# Route columns by name: the columns of `housing_num` go through the numeric
# branch, `ocean_proximity` is one-hot encoded.
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# Feature matrix that the models below are trained on.
housing_prepared = full_pipeline.fit_transform(housing)

In [47]:
# Two support-vector regressors that differ only in the kernel; no other
# hyperparameter is set explicitly.
m1 = svm.SVR(kernel = 'linear')
m2 = svm.SVR(kernel = 'rbf')

In [48]:
# Train the linear-kernel regressor on the prepared training features.
m1.fit(housing_prepared, housing_labels)

In [49]:
# Train the RBF-kernel regressor on the same prepared training features.
m2.fit(housing_prepared, housing_labels)

In [51]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear-kernel model on the very rows it was fitted on,
# i.e. a training error rather than a held-out estimate.
pred = m1.predict(housing_prepared)
m1_mse = mean_squared_error(y_true=housing_labels, y_pred=pred)
m1_rmse = np.sqrt(m1_mse)
print(m1_rmse)

106873.526604078


In [52]:
# RMSE of the RBF-kernel model on the very rows it was fitted on,
# i.e. a training error rather than a held-out estimate.
pred = m2.predict(housing_prepared)
m2_mse = mean_squared_error(y_true=housing_labels, y_pred=pred)
m2_rmse = np.sqrt(m2_mse)
print(m2_rmse)

118451.153930123


In [53]:
from sklearn.model_selection import cross_val_score

# Settings shared by both runs: 10 folds, scored with negated MSE.
cv_settings = dict(scoring='neg_mean_squared_error', cv=10)
scores1 = cross_val_score(m1, housing_prepared, housing_labels, **cv_settings)
scores2 = cross_val_score(m2, housing_prepared, housing_labels, **cv_settings)

# Flip the sign back and take the root to get one RMSE per fold.
m1_fold_rmse = np.sqrt(-scores1)
m2_fold_rmse = np.sqrt(-scores2)

# NOTE(review): the output saved under this cell lists ten per-fold values,
# so it appears to predate the `.mean()` call below — re-run to refresh it.
print(f'CV score for M1 is {m1_fold_rmse.mean()}')
print(f'CV score for M2 is {m2_fold_rmse.mean()}')

CV score for M1 is [106846.91418486 108840.55721006 103122.02433423 109649.5401006
 103796.57205905 112219.65143015 109351.47147536 107877.83853403
 109166.01514235 108053.90140047]
CV score for M2 is [116628.19183878 119988.26365356 113823.78366263 120271.03189229
 114581.92489539 122665.05912251 119753.16461411 118174.92428728
 120129.13952467 118707.9580594 ]


In [60]:
from sklearn.model_selection import GridSearchCV

# One sub-grid per kernel. `gamma` is a kernel coefficient that the linear
# kernel does not use, so crossing it with kernel='linear' would only refit
# identical models once per gamma value (3 of the 12 candidates were duplicates).
param_grid = [
    {'kernel': ['linear'], 'C': [0.5, 1., 2.]},
    {'kernel': ['rbf'], 'C': [0.5, 1., 2.], 'gamma': ['scale', 'auto']},
]

# 5-fold search scored with negated MSE; train scores are recorded as well.
grid_search1 = GridSearchCV(m1, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search1.fit(housing_prepared, housing_labels)

In [None]:
# Print the cross-validated RMSE of every hyperparameter combination tried.
# The search object is named `grid_search1`, and it was scored with
# 'neg_mean_squared_error', so each mean score is negated before the root.
cvres = grid_search1.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

#### Problem2.

$\begin{aligned} {\left[\begin{array}{ll}x_{11} & x_{12} \\ x_{21} & x_{22}\end{array}\right]\left[\begin{array}{l}y_{1} \\ y_{2}\end{array}\right]+\left[\begin{array}{l}z_{1} \\ z_{2}\end{array}\right] } &=\left[\begin{array}{l}x_{11} y_{1}+x_{12} y_{2}+z_{1} \\ x_{21} y_{1}+x_{22} y_{2}+z_{2}\end{array}\right] \\ {\left[\begin{array}{l}y_{1} \\ y_{2}\end{array}\right]^{\top}\left[\begin{array}{ll}x_{11} & x_{12} \\ x_{21} & x_{22}\end{array}\right]\left[\begin{array}{l}y_{1} \\ y_{2}\end{array}\right] } &=\left[\begin{array}{l}y_{1} x_{11}+y_{2} x_{21} \\ y_{1} x_{12}+y_{2} x_{22}\end{array}\right]^{\top}\left[\begin{array}{l}y_{1} \\ y_{2}\end{array}\right] \\ &=y_{1}^{2} x_{11}+y_{1} y_{2} x_{21}+y_{1} y_{2} x_{12}+y_{2}^{2} x_{22} \end{aligned}$

#### Problem3. 

(a) (n x d)(d x 1) = (n x 1)

(b) {(d x n)(n x d)}<sup>-1</sup> = (d x d)

(c) (d x d)(d x n)(n x 1) = (d x 1)

#### Problem4. 

(a) 

$\begin{aligned} \operatorname{Var}(a X+b) &=E\left(a^{2} X^{2}+2 a b X+b^{2}\right)-\{E(a X+b)\}^{2} \\ &=\left\{a^{2} E\left(X^{2}\right)+2 a b E(X)+b^{2}\right\}-\{a E(X)+b\}^{2} \\ &=\left\{a^{2} E\left(X^{2}\right)+2 a b E(X)+b^{2}\right\}-\left\{a^{2} E(X)^{2}+2 a b E(X)+b^{2}\right\} \\ &=a^{2}\left\{E\left(X^{2}\right)-E(X)^{2}\right\} \\ &=a^{2} \operatorname{Var}(X) \end{aligned}$

(b)

$\begin{aligned} \bar{X} &=\frac{1}{n} \sum_{i=1}^{n} X_{i} \\ E(\bar{X}) &=E\left(\frac{1}{n} \sum_{i=1}^{n} X_{i}\right)=\frac{1}{n} E\left(\sum_{i=1}^{n} X_{i}\right) \\ &=\frac{1}{n} E\left(X_{1}+X_{2}+\cdots+X_{n}\right)=\frac{1}{n} \cdot n \cdot \mu=\mu \\ V(\bar{X}) &=V\left(\frac{1}{n} \sum_{i=1}^{n} X_{i}\right)=\left(\frac{1}{n}\right)^{2} V\left(X_{1}+X_{2}+\cdots+X_{n}\right) \\ &=\frac{1}{n^{2}} \cdot n \cdot \sigma^{2}=\frac{\sigma^{2}}{n} \quad \text{(assuming the } X_{i} \text{ are independent, so their variances add)} \end{aligned}$

#### Problem5.

(a)

$\begin{aligned} P(y=1) &=\sum_{k} P(y=1, x=k) \\ &=\frac{10}{100}+\frac{15}{100}=\frac{1}{4}\end{aligned}$

(b)

$\begin{aligned} P(y=1 \mid x=1)=\frac{P(x=1, y=1)}{P(x=1)}=\frac{\frac{10}{100}}{\frac{15}{100}}=\frac{2}{3}\end{aligned}$

(c)

$\begin{aligned}P(x=1, y=1)=\frac{10}{100} \neq \frac{15}{100} \cdot \frac{25}{100}=P(x=1) P(y=1) \end{aligned}$

(a) 확률변수 X의 값과 관계없이 사건 Y=1이 발생할 확률, 즉 Y의 주변확률(marginal probability)

(b) X=1이 주어졌을 때 Y=1이 발생할 조건부확률(conditional probability)

(c) X, Y는 독립이 아니다. 특정 사건이 발생했을 때, 그 사건의 발생이 다른 사건의 발생확률에 영향을 미치기 때문이다. 