In [2]:
import pandas as pd

# Load the housing dataset from a relative CSV path and preview its first rows.
housing = pd.read_csv('../../datasets/housing/housing.csv')
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# Column dtypes and non-null counts. In the output below, total_bedrooms is the
# only column with missing values (20433 non-null out of 20640 rows).
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
import numpy as np
# Bucket median_income into five labelled bins (1-5); the resulting income_cat
# column is what the train/test split is stratified on.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified 80/20 shuffle split keyed on income_cat; random_state pins the
# shuffle so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1 produces exactly one (train, test) pair, so take it directly.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# The row numbers returned by split() line up with the labels here because
# `housing` still has the default RangeIndex reported by housing.info().
strat_train_set = housing.loc[train_index]
strat_test_set = housing.loc[test_index]

In [6]:
# Remove the helper income_cat column now that the stratified split is done.
# Rebinding the names (instead of drop(..., inplace=True) inside a loop) avoids
# mutating the split frames in place, and errors="ignore" lets this cell be
# re-run without a KeyError once the column is already gone.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")


In [7]:
# Separate the target from the predictors for the training set.
# NOTE: from here on `housing` refers to the training features only, not the
# full dataset loaded at the top of the notebook.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

In [8]:
# Option 3 for the missing total_bedrooms values: impute with the column median.
median = housing["total_bedrooms"].median()
# Assign the filled Series back rather than calling fillna(inplace=True) on
# housing["total_bedrooms"]: that chained in-place call operates on an
# intermediate Series, triggers a warning on recent pandas versions, and does
# not update `housing` at all when Copy-on-Write is enabled.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)

In [9]:
from sklearn.impute import SimpleImputer
# Median imputer plus a numeric-only copy of the training features
# (ocean_proximity is the one non-float column, so it is left out).
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop(columns="ocean_proximity")


In [10]:
from sklearn.preprocessing import OneHotEncoder
# Single-column DataFrame holding the categorical feature, plus a one-hot encoder.
# NOTE(review): neither name is referenced again in the visible cells; the
# ColumnTransformer further down builds its own OneHotEncoder.
housing_cat = housing[["ocean_proximity"]]
cat_encoder = OneHotEncoder()

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
# Column positions of the raw attributes inside the 2-D array given to transform().
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append per-household (and optionally per-room) ratio columns to an array.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a bedrooms / rooms column is appended after the two
        per-household columns.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return self unchanged; there is nothing to learn from the data."""
        return self

    def transform(self, X, y=None):
        """Return X with the ratio columns appended on the right.

        X is indexed as X[:, column], using the module-level *_ix positions.
        The appended columns are, in order: rooms per household, population
        per household and, if enabled, bedrooms per room.
        """
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_[a, b, ...] receives a tuple, so passing one explicitly is equivalent.
        return np.c_[tuple([X] + extra_columns)]
# Try the transformer on the raw training values, with the optional
# bedrooms_per_room column switched off.
# NOTE(review): `housing` still contains the ocean_proximity text column here,
# so housing.values is presumably an object-dtype array - confirm.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Numeric preprocessing, applied in order: median-impute missing values,
# append the ratio features, then standardize the columns.
num_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)


In [13]:
from sklearn.compose import ColumnTransformer
# Route the numeric columns through num_pipeline and one-hot encode the
# categorical column, then fit the combined transformer on the training features.
cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()
column_transformers = [
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
]
full_pipeline = ColumnTransformer(transformers=column_transformers)
housing_prepared = full_pipeline.fit_transform(housing)


In [14]:
from sklearn.svm import SVR
# Linear-kernel support vector regressor trained on the prepared training set
# (fit() returns the estimator itself, so the calls can be chained).
model = SVR(kernel='linear', epsilon=5, C=0.01).fit(housing_prepared, housing_labels)

# Quick sanity check on the first five training rows; they must go through the
# already-fitted preprocessing (transform only) before predicting.
some_data, some_labels = housing.iloc[:5], housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", model.predict(some_data_prepared))

Predictions: [179344.55274184 179641.64641747 179372.2248784  179352.60394926
 179544.81784088]


In [16]:
from sklearn.metrics import mean_squared_error
# RMSE on the training set itself (housing_prepared / housing_labels are the
# training features and targets), so this measures fit, not generalization.
housing_predictions = model.predict(housing_prepared)
mse = mean_squared_error(housing_labels, housing_predictions)
rmse = np.sqrt(mse)  # square root puts the error back in the target's units
rmse

118846.38744018429

In [18]:
# Final evaluation on the held-out stratified test set.
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
# transform() only (no fit): the test set reuses the preprocessing fitted on the training data
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse) # => evaluates to about 117,181.1 for this SVR (see output below)
final_rmse

117181.09059357205

2.
(a)
$Xy$ = $\begin{pmatrix}x_{11}y_{1}+x_{12}y_{2}\\x_{21}y_{1}+x_{22}y_{2}\\ \end{pmatrix}$, $z$ = $\begin{pmatrix}z_1\\z_2\\ \end{pmatrix}$

$Xy+z$ = $\begin{pmatrix}x_{11}y_{1}+x_{12}y_{2}+z_1\\x_{21}y_{1}+x_{22}y_{2}+z_2\\ \end{pmatrix}$

(b)
$y^T$=$\begin{pmatrix}y_1&y_2\\ \end{pmatrix}$

$y^TXy$ = $x_{11}y_1^2+x_{12}y_1y_2+x_{21}y_1y_2+x_{22}y_2^2$ (a scalar)

3.
(a) $y = Xw$ shape = $(n$ x $d) * (d$ x $1) = (n$ x $1)$

(b) $X^T$ shape = $(d$ x $n)$

$X^TX$ shape = 
$(d$ x $n) * (n$ x $d) = (d$ x $d)$
$(X^TX)^{-1}$ shape = $(d$ x $d)$

(c)
$(X^TX)^{-1}X^Ty$ shape = $(d$ x $d) * (d$ x $n) * (n$ x $1) = (d$ x $1)$

4.
(a)
$Var(aX + b) = E((aX + b)^2)-(E(aX+b))^2$
$=E(a^2X^2+2abX+b^2)-(a^2(E(X))^2+2abE(X)+b^2)$
$=a^2E(X^2)+2abE(X)+b^2-(a^2(E(X))^2+2abE(X)+b^2)$
$=a^2E(X^2)-a^2E(X)^2$
$=a^2Var(X)$

(b)
$E(\bar{X}) = E(\frac{1}{n}(X_1+X_2+...+X_n)) $
$= \frac{1}{n}E(X_1+X_2+...+X_n) $
$= \frac{1}{n}(E(X_1)+E(X_2)+...+E(X_n))$
$= \frac{1}{n}*n*\mu = \mu$

$Var(\bar{X}) = Var(\frac{X_1+X_2+...+X_n}{n})$
$=\frac{1}{n^2}Var(X_1+X_2+...+X_n)$
$=\frac{1}{n^2}(Var(X_1)+Var(X_2)+...+Var(X_n))$ (using the independence of the $X_i$)
$=\frac{1}{n^2}*n*\sigma^2$
$=\frac{\sigma^2}{n}$




5.
(a)
Y가 일어날 확률 

$P(Y=1)=P(X=0,Y=1)+P(X=1,Y=1)=\frac{15}{100}+\frac{10}{100}=\frac{1}{4}$

(b) X가 일어났을때 Y가 일어날 확률

$P(Y=1|X=1)=\frac{P(Y=1\cap X=1)}{P(X=1)} = \frac{\frac{10}{100}}{\frac{10}{100}+\frac{5}{100}} = \frac{2}{3}$

(c)
$P(X\cap Y)=\frac{10}{100}$

$P(X)=\frac{15}{100}$, $P(Y)=\frac{25}{100}$

$P(X\cap Y)\neq P(X)P(Y)$
-> not independent
