In [119]:
import os
import tarfile
import zipfile
import urllib.request
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion

import warnings

warnings.filterwarnings('ignore')
TEST_PATH = os.path.join("datasets","lab06")

SecondFlrSF_ix,BsmtFullBath_ix,FullBath_ix,HalfBath_ix = 43,46,48,49
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_FullBath_per_BsmtFullBath = True):
        self.add_FullBath_per_BsmtFullBath = add_FullBath_per_BsmtFullBath
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        SecondFlrSF_per_HalfBath = X[:, SecondFlrSF_ix] / X[:, HalfBath_ix]
        if self.add_FullBath_per_BsmtFullBath:           
            FullBath_per_BsmtFullBath = X[:, FullBath_ix] / X[:, BsmtFullBath_ix]
            return np.c_[X,SecondFlrSF_per_HalfBath,FullBath_per_BsmtFullBath]
        else:
            return np.c_[X,SecondFlrSF_per_HalfBath]

        
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
    
    
def load_housing_data(filename, test_path = TEST_PATH): # 주어진 filename 을 읽는 함수
    csv_path = os.path.join(test_path,filename)
    return pd.read_csv(csv_path)
   

if __name__ == "__main__":
    # 주어진 링크에서 all.zip 을 다운받아 압축을 푼 후 진행하였습니다.
    
    train_set = load_housing_data("train.csv")
    test_set = load_housing_data("test.csv")
    sample_set = load_housing_data("sample_submission.csv")
    
    #train_set["SalePrice"].hist()
    #plt.show()
    #데어터 셋 내용보기.
    
   # print(train_set)
    #print(train_set.info())
    #print(train_set.head())
    
    train = train_set.drop("SalePrice", axis =1)
   
    #train_set.hist(bins = 50, figsize = (20,15))
    #plt.show()
    
    # 상관관계
    train_copy = train_set.copy()
    train_copy["FullBath_per_BsmtFullBath"] = train_copy["FullBath"]/train_copy["BsmtFullBath"]
    train_copy["2ndFlrSF_per_HalfBath"] =train_copy["2ndFlrSF"]/train_copy["HalfBath"]

    #corr_matrix = train_copy.corr()
   # print(corr_matrix["SalePrice"].sort_values(ascending = False))
    #2ndFlrSF_per_HalfBath 특성은 2ndFlrSF 특성이나 HalfBath 특성보다 높은 상관관게를 갖는다. 
    #FullBath_per_BsmtFullBath 특성은 FullBath 특성이나 BsmtFullBath 특성보다 높은 상관관계를 갖는다.
    
    train_labels = train_set["SalePrice"].copy()
    """
    num_attr = list()
    cat_attr = list()
    for temp in train:
        if type(train[temp][0]) ==  str:
            cat_attr.append(temp)
        else:
            num_attr.append(temp)
    """
    
    num_attr = train.select_dtypes(exclude = 'object').columns.values.tolist()
    cat_attr = train.select_dtypes(include = 'object').columns.values.tolist()
   
    for temp in cat_attr:
        if train[temp].isnull().sum() > 100:
            train = train.drop(temp, axis = 1)
            cat_attr.remove(temp)
            
    train=train.dropna(subset = cat_attr)
    
    for temp in train:
        print(temp)
        
    num_pipeline = Pipeline([
                            ('selector', DataFrameSelector(num_attr)),
                            ('imputer', Imputer(strategy="median")),
                            ('attribs_adder', CombinedAttributesAdder()),
                            ('std_scaler', StandardScaler()),
                            ])
    
    cat_pipeline = Pipeline([
                            ('selector', DataFrameSelector(cat_attr)),
                            ('cat_encoder', OneHotEncoder(sparse=False)),
                            ])
    
    full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", num_pipeline),
            ("cat_pipeline", cat_pipeline),
        ])
    
    #housing_prepared = full_pipeline.fit_transform(train)
   
    


Id
MSSubClass
MSZoning
LotFrontage
LotArea
Street
LotShape
LandContour
Utilities
LotConfig
LandSlope
Neighborhood
Condition1
Condition2
BldgType
HouseStyle
OverallQual
OverallCond
YearBuilt
YearRemodAdd
RoofStyle
RoofMatl
Exterior1st
Exterior2nd
MasVnrType
MasVnrArea
ExterQual
ExterCond
Foundation
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinSF1
BsmtFinType2
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
Heating
HeatingQC
CentralAir
Electrical
1stFlrSF
2ndFlrSF
LowQualFinSF
GrLivArea
BsmtFullBath
BsmtHalfBath
FullBath
HalfBath
BedroomAbvGr
KitchenAbvGr
KitchenQual
TotRmsAbvGrd
Functional
Fireplaces
GarageType
GarageYrBlt
GarageFinish
GarageCars
GarageArea
GarageQual
GarageCond
PavedDrive
WoodDeckSF
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
Fence
MiscVal
MoSold
YrSold
SaleType
SaleCondition
