### 데이터 훑어보기

In [None]:
import numpy as np
import pandas as pd

# Load the housing CSV directly from its GitHub raw URL into a DataFrame.
housing = pd.read_csv(
    "https://raw.githubusercontent.com/JRLearning/Machine-Learning/master/datasets/housing.csv"
)

# First look at the data: leading rows, column dtypes / non-null counts,
# category frequencies of `ocean_proximity`, and summary statistics.
# NOTE(review): when run as a single notebook cell, presumably only the
# printed output of info() and the value of the final describe() are shown;
# split into separate cells if every result should be visible — confirm.
housing.head()
housing.info()
housing["ocean_proximity"].value_counts()
housing.describe()

### 그래프 그리기

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50,figsize = (20,15))
plt.show

# Geographic scatter plot; the low alpha makes dense areas stand out.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

# Same map with two extra encodings: marker size follows population
# (scaled down by 100) and marker colour follows median_house_value
# through the "jet" colormap.
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    sharex=False,
)
plt.legend()

### 테스트 세트 만들기

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

# Purely random 80/20 split, seeded for reproducibility.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Stratified 80/20 split on an income category.
# The original cell read housing["income_cat"], but nothing visible in this
# notebook creates that column, so the split raised KeyError. Reuse the
# column when it already exists; otherwise derive the categories here:
# median_income bucketed in steps of 1.5, with everything from category 5
# upward merged into 5.
if "income_cat" in housing.columns:
    income_cat = housing["income_cat"]
else:
    income_cat = np.ceil(housing["median_income"] / 1.5)
    income_cat = income_cat.where(income_cat < 5, 5.0)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, income_cat):
    # split() yields positional row indices, so select with .iloc; .loc only
    # coincides with it while the frame keeps its default 0..n-1 index.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

### 상관계수 확인하기

In [None]:
# Correlation matrix over the numeric columns only.
# `housing` still holds `ocean_proximity` (presumably a text column — it is
# inspected with value_counts() earlier). Recent pandas versions raise on
# non-numeric data in DataFrame.corr() instead of silently skipping it, so
# the numeric columns are selected explicitly.
corr_matrix = housing.select_dtypes(include="number").corr()

# Correlation of every numeric attribute with the target, strongest first.
corr_matrix["median_house_value"].sort_values(ascending=False)

from pandas.plotting import scatter_matrix

# Pairwise scatter plots for the target and three chosen attributes.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(housing[attributes], figsize=(12, 8))
plt.show()

### X변수와 Y변수를 떼어놓고 작업하면 편함

In [None]:
# Separate the stratified training set into the target (an independent copy
# of the median_house_value column) and the predictors (every other column).
# From here on `housing` refers to the training predictors only.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

### 파이프라인

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: median imputation -> derived attributes -> scaling.
# `Imputer` was removed from scikit-learn; SimpleImputer is its replacement
# and accepts the same strategy="median" argument.
# NOTE(review): CombinedAttributesAdder is not defined or imported anywhere
# in the visible notebook — it must be defined before this cell runs.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Concatenate the outputs of the numeric and categorical pipelines.
# NOTE(review): cat_pipeline is not defined in the visible notebook either,
# and the later cells use `housing_prepared`, which is never assigned here —
# presumably full_pipeline.fit_transform(housing); confirm.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

### 모델링

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit ordinary least squares on the prepared training data
# (fit() returns the estimator itself, so it can be chained).
lin_reg = LinearRegression().fit(housing_prepared, housing_labels)

# RMSE measured on the same data the model was trained on.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Fit a decision tree with default hyper-parameters on the training data.
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

# RMSE on the training data itself. The results are stored under tree_*
# names: the copy-pasted lin_mse / lin_rmse names overwrote the
# linear-regression metrics with values from a different model.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Fit a random forest with default hyper-parameters on the training data.
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)

# RMSE on the training data itself. The results are stored under forest_*
# names: the copy-pasted lin_mse / lin_rmse names overwrote the
# linear-regression metrics with values from a different model.
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

### 교차검증

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree. The scorer returns the
# negated MSE, so the sign is flipped before taking the square root to
# obtain one RMSE per fold.
scores = cross_val_score(
    tree_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(-scores)

# NOTE(review): display_scores is not defined in the visible notebook;
# it must be defined before this cell runs.
display_scores(tree_rmse_scores)

### 그리드 탐색

In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 3 x 4 combinations with the default bootstrap setting,
# then 2 x 3 combinations with bootstrap switched off.
param_grid = [
    {
        "n_estimators": [3, 10, 30],
        "max_features": [2, 4, 6, 8],
    },
    {
        "bootstrap": [False],
        "n_estimators": [3, 10],
        "max_features": [2, 3, 4],
    },
]

# Fresh, unfitted forest used as the template estimator for the search.
forest_reg = RandomForestRegressor()

# 5-fold search scored by negated MSE; training scores are recorded too.
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    return_train_score=True,
)
grid_search.fit(housing_prepared, housing_labels)

# Best hyper-parameter combination, and the template estimator that was
# passed in (the refitted winner is grid_search.best_estimator_).
grid_search.best_params_
grid_search.estimator

cvres = grid_search.cv_results_

# Print the cross-validated RMSE next to each hyper-parameter combination;
# mean_test_score holds negated MSE values, hence the sign flip.
rmse_per_candidate = np.sqrt(-cvres["mean_test_score"])
for rmse, params in zip(rmse_per_candidate, cvres["params"]):
    print(rmse, params)
    
    
# Importance of each input feature according to the best forest found.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

# Pair every importance with an attribute name and list them, largest first.
# NOTE(review): num_attribs is not defined in the visible notebook. zip()
# stops at the shorter sequence, so any importances beyond these names
# (e.g. for encoded categorical columns, if present) are dropped — confirm
# the name list matches the model's feature order.
extra_attribs = ["rooms_per_hold", "pop_per_hold", "bedrooms_per_room"]
attributes = num_attribs + extra_attribs
sorted(
    zip(feature_importances, attributes),
    reverse=True,
)

### 테스트 세트로 시스템 평가하기

In [None]:
final_model = grid_search.best_estimator_

# Evaluate on the held-out stratified TEST set. The original cell sliced
# strat_train_set here, so the "test" RMSE was really a training-set score.
X_test = strat_test_set.drop("median_house_value", axis=1)
Y_test = strat_test_set["median_house_value"].copy()

# Only transform the test data — calling fit_transform would re-learn the
# imputation and scaling statistics from the evaluation data.
# NOTE(review): full_pipeline is used on the assumption that
# housing_prepared (the training input) was produced by it; the original
# used num_pipeline alone — confirm against how housing_prepared is built.
X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse