# Module Load

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# 파일 설명
- train.csv - 교육 세트
- test.csv - 테스트 세트
- data_description.txt - 원래 Dean De Cock이 준비했지만 여기에서 사용 된 열 이름과 일치하도록 가볍게 편집 된 각 열의 전체 설명
- sample_submission.csv - 판매 연도 및 월별 선형 회귀 분석의 벤치 마크 제출, 로트 평방 피트 수 및 침실 수
### 데이터 필드
다음은 데이터 설명 파일에서 찾을 수있는 간단한 버전입니다.

- SalePrice - 부동산 판매 가격입니다. 이것은 예측하려는 대상 변수입니다.
- MSSubClass : 건물 클래스
- MSZoning : 일반 구역 분류
- LotFrontage : 재산에 연결된 거리의 선형 다리
- LotArea : 로트 크기 (스퀘어 피트)
- Street : 도로 접근의 유형
- Alley : 골목길의 유형
- LotShape : 재산의 일반적인 모양
- LandContour : 부동산의 평평함
- Utilities : 사용 가능한 유틸리티 유형
- LotConfig : 로트 구성
- LandSlope : 부동산의 경사
- Neighborhood : Ames시의 한도 내 물리적 위치
- Condition1 : 주요 도로 또는 철도까지의 거리
- Condition2 : 주요 도로 또는 철도까지의 거리 (초가있는 경우)
- BldgType : 거주 유형
- HouseStyle : 주거 스타일
- OverallQual : 전체 재질 및 마감 품질
- OverallCond : 전체 조건 등급
- YearBuilt : 원래 건설 날짜
- YearRemodAdd : 개장 날짜
- RoofStyle : 지붕 유형
- RoofMatl : 지붕 재료
- Exterior1st : 집안의 외장
- Exterior2nd : 주택의 외장 (둘 이상의 재료 인 경우)
- MasVnrType : 석조 베니어 유형
- MasVnrArea : 벽돌 베니어 면적 (평방 피트)
- ExterQual : 외관 재질
- ExterCond : 외부 물질의 현재 상태
- Foundation : 기초 유형
- BsmtQual : 지하실의 높이
- BsmtCond : 지하실의 일반적인 상태
- BsmtExposure : 도보 또는 정원 수준 지하 벽
- BsmtFinType1 : 지하실 마감 지역의 품질
- BsmtFinSF1 : 유형 1 평방 피트 완료
- BsmtFinType2 : 두 번째 완성 된 영역의 품질 (있는 경우)
- BsmtFinSF2 : 제 2 형 제곱 피트
- BsmtUnfSF : 지하 공간의 미완성 된 평방 피트
- TotalBsmtSF : 지하실 면적의 총 평방 피트
- Heating : 난방의 종류
- HeatingQC : 난방 품질 및 상태
- CentralAir : 중앙 냉난방 장치
- Electrical : 전기 시스템
- 1stFlrSF : 1 층 평방 피트
- 2ndFlrSF : 2 층 평방 피트
- LowQualFinSF : 낮은 품질의 완성 된 평방 피트 (모든 층)
- GrLivArea : 위의 (지상) 생활 면적 스퀘어 피트
- BsmtFullBath : 지하실 가득한 욕실
- BsmtHalfBath : 지하 욕실 반 화장실
- FullBath : 고급 욕실
- HalfBath : 학년 반 욕조
- Bedroom : 지하층 위 침실 수
- Kitchen : 부엌 수 :
- KitchenQual : 부엌 품질
- TotRmsAbvGrd : 등급 이상의 총 객실 (욕실은 포함되지 않음)
- Functional : 홈 기능 등급
- Fireplaces : 벽난로 수 :
- FireplaceQu : 벽난로 품질
- GarageType : 차고 위치
- GarageYrBlt : 1 년 차고가 건조되었습니다.
- GarageFinish : 차고 내부 마무리
- GarageCars : 자동차 용량의 차고 크기
- GarageArea : 평방 피트 단위의 차고 크기
- GarageQual : 차고 품질
- GarageCond : 차고 조건
- PavedDrive : 포장 도로
- WoodDeckSF : 목재 갑판 면적 (평방 피트)
- OpenPorchSF : 오픈 베란다 면적 (평방 피트)
- EnclosedPorch : 동봉 된 현관 면적
- 3SsnPorch : 세 자리 현관 면적 (평방 피트)
- ScreenPorch : 스크린 현관 면적 (평방 피트)
- PoolArea : 풀 면적 (스퀘어 피트)
- PoolQC : 풀 품질
- Fence : 울타리 품질
- MiscFeature : 기타 범주에서 다루지 않는 기타 기능
- MiscVal : 기타 기능의 가치
- MoSold : 매월 팔렸습니다.
- YrSold : 판매 된 연도
- SaleType : 판매 유형
- SaleCondition : 판매 조건

# Data Load
### Data shape
* train shape :  (1460, 81)  
* test shape :  (1459, 80)

In [3]:
df_sample = pd.read_csv('./data/sample_submission.csv')
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

In [4]:
print('train shape : ',df_train.shape, '\n',
      'test shape : ' ,df_test.shape)

train shape :  (1460, 81) 
 test shape :  (1459, 80)


In [5]:
df_sample.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [6]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
df_train.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125
1459,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,147500


In [8]:
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


# 결측치 파악

In [9]:
sr_train_null = df_train.isnull().sum().sort_values(ascending=False)
sr_test_null = df_test.isnull().sum().sort_values(ascending=False)

In [10]:
df_nan_info = pd.DataFrame(data=[sr_train_null[sr_train_null>0],sr_test_null[sr_test_null>0]],index=['Test','Train']).T
df_nan_info.sort_values(ascending=False,by='Train')

Unnamed: 0,Test,Train
PoolQC,1453.0,1456.0
MiscFeature,1406.0,1408.0
Alley,1369.0,1352.0
Fence,1179.0,1169.0
FireplaceQu,690.0,730.0
LotFrontage,259.0,227.0
GarageFinish,81.0,78.0
GarageQual,81.0,78.0
GarageYrBlt,81.0,78.0
GarageCond,81.0,78.0


In [11]:
df_nan_info['Dtype'] = df_train[df_nan_info.index].dtypes

In [12]:
df_nan_info['Not_nan_Test'] = df_test[df_nan_info.index].notnull().sum()
df_nan_info['Not_nan_Train'] = df_train[df_nan_info.index].notnull().sum()

df_nan_info['Nan_propotion_Train'] = df_nan_info['Test']/1459*100
df_nan_info['Nan_propotion_Train'] = df_nan_info['Train'] / 1460*100


In [13]:
df_nan_info.sort_values(by=['Test','Train'],ascending=False).head()

Unnamed: 0,Test,Train,Dtype,Not_nan_Test,Not_nan_Train,Nan_propotion_Train
PoolQC,1453.0,1456.0,object,3,7,99.726027
MiscFeature,1406.0,1408.0,object,51,54,96.438356
Alley,1369.0,1352.0,object,107,91,92.60274
Fence,1179.0,1169.0,object,290,281,80.068493
FireplaceQu,690.0,730.0,object,729,770,50.0


In [14]:
print('PoolQC_unique : ',df_train[df_train['PoolQC'].notnull()].PoolQC.unique(),'\n',
    'MiscFeature_unique : ',df_train[df_train['MiscFeature'].notnull()].MiscFeature.unique(),'\n',
     'Alley_unique : ',df_train[df_train['Alley'].notnull()].Alley.unique(),'\n',
     'Fence_unique : ',df_train[df_train['Fence'].notnull()].Fence.unique(),'\n',
     'FireplaceQu_unique : ',df_train[df_train['FireplaceQu'].notnull()].FireplaceQu.unique())

PoolQC_unique :  ['Ex' 'Fa' 'Gd'] 
 MiscFeature_unique :  ['Shed' 'Gar2' 'Othr' 'TenC'] 
 Alley_unique :  ['Grvl' 'Pave'] 
 Fence_unique :  ['MnPrv' 'GdWo' 'GdPrv' 'MnWw'] 
 FireplaceQu_unique :  ['TA' 'Gd' 'Fa' 'Ex' 'Po']


In [15]:
# f = open('./data/data_description.txt','r',newline='\n')
# f.readlines()

In [16]:
df_train.corr()['SalePrice'].sort_values()

KitchenAbvGr    -0.135907
EnclosedPorch   -0.128578
MSSubClass      -0.084284
OverallCond     -0.077856
YrSold          -0.028923
LowQualFinSF    -0.025606
Id              -0.021917
MiscVal         -0.021190
BsmtHalfBath    -0.016844
BsmtFinSF2      -0.011378
3SsnPorch        0.044584
MoSold           0.046432
PoolArea         0.092404
ScreenPorch      0.111447
BedroomAbvGr     0.168213
BsmtUnfSF        0.214479
BsmtFullBath     0.227122
LotArea          0.263843
HalfBath         0.284108
OpenPorchSF      0.315856
2ndFlrSF         0.319334
WoodDeckSF       0.324413
LotFrontage      0.351799
BsmtFinSF1       0.386420
Fireplaces       0.466929
MasVnrArea       0.477493
GarageYrBlt      0.486362
YearRemodAdd     0.507101
YearBuilt        0.522897
TotRmsAbvGrd     0.533723
FullBath         0.560664
1stFlrSF         0.605852
TotalBsmtSF      0.613581
GarageArea       0.623431
GarageCars       0.640409
GrLivArea        0.708624
OverallQual      0.790982
SalePrice        1.000000
Name: SalePr

In [17]:
df_nan_info['Corr_SalePrice'] = df_train.corr()['SalePrice'][df_nan_info.index]


In [18]:
df_nan_info.sort_values(by=['Test','Train'],ascending=False)

Unnamed: 0,Test,Train,Dtype,Not_nan_Test,Not_nan_Train,Nan_propotion_Train,Corr_SalePrice
PoolQC,1453.0,1456.0,object,3,7,99.726027,
MiscFeature,1406.0,1408.0,object,51,54,96.438356,
Alley,1369.0,1352.0,object,107,91,92.60274,
Fence,1179.0,1169.0,object,290,281,80.068493,
FireplaceQu,690.0,730.0,object,729,770,50.0,
LotFrontage,259.0,227.0,float64,1232,1201,15.547945,0.351799
GarageCond,81.0,78.0,object,1381,1379,5.342466,
GarageFinish,81.0,78.0,object,1381,1379,5.342466,
GarageQual,81.0,78.0,object,1381,1379,5.342466,
GarageYrBlt,81.0,78.0,float64,1381,1379,5.342466,0.486362


In [19]:
df_nan_info[df_nan_info['Dtype']!=object].sort_values(by=['Test','Train'],ascending=False)

Unnamed: 0,Test,Train,Dtype,Not_nan_Test,Not_nan_Train,Nan_propotion_Train,Corr_SalePrice
LotFrontage,259.0,227.0,float64,1232,1201,15.547945,0.351799
GarageYrBlt,81.0,78.0,float64,1381,1379,5.342466,0.486362
MasVnrArea,8.0,15.0,float64,1444,1452,1.027397,0.477493
BsmtFullBath,,2.0,int64,1457,1460,0.136986,0.227122
BsmtHalfBath,,2.0,int64,1457,1460,0.136986,-0.016844
BsmtFinSF1,,1.0,int64,1458,1460,0.068493,0.38642
BsmtFinSF2,,1.0,int64,1458,1460,0.068493,-0.011378
BsmtUnfSF,,1.0,int64,1458,1460,0.068493,0.214479
GarageArea,,1.0,int64,1458,1460,0.068493,0.623431
GarageCars,,1.0,int64,1458,1460,0.068493,0.640409


In [20]:
df_nan_info[df_nan_info['Dtype']==object].sort_values(by=['Test','Train'],ascending=False)

Unnamed: 0,Test,Train,Dtype,Not_nan_Test,Not_nan_Train,Nan_propotion_Train,Corr_SalePrice
PoolQC,1453.0,1456.0,object,3,7,99.726027,
MiscFeature,1406.0,1408.0,object,51,54,96.438356,
Alley,1369.0,1352.0,object,107,91,92.60274,
Fence,1179.0,1169.0,object,290,281,80.068493,
FireplaceQu,690.0,730.0,object,729,770,50.0,
GarageCond,81.0,78.0,object,1381,1379,5.342466,
GarageFinish,81.0,78.0,object,1381,1379,5.342466,
GarageQual,81.0,78.0,object,1381,1379,5.342466,
GarageType,81.0,76.0,object,1383,1379,5.205479,
BsmtExposure,38.0,44.0,object,1415,1422,3.013699,


## Concept

### 1. Only Numeric 
### 2. Only Categoric
### 3. Mixing type


#### 1. Only Numeric
* 상관계수 기준으로 일정 수준 미만 칼럼 드랍
* 
* Sub. 평균 - 이탈값 과도하게 반영하는 문제, 중위값은 이탈값을 제대로 반영하지 못함

* 속성/ 사례수 / 누락값 백분율/ 원소 갯수/ 최솟값/ 1사분위수/ 평균/ 중앙값/ 3사분위수/ 표준편차

In [21]:
df_numeric_train_info = df_train.describe().T
df_numeric_train_info.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0


In [22]:
# df_train.isnull().sum()/len(df_train)

In [23]:
df_numeric_train_info['Null_propotion'] = df_train.isnull().sum()/len(df_train)
df_numeric_train_info.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Null_propotion
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0,0.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0,0.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0,0.177397
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0,0.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0,0.0


In [24]:
len(df_train['LotFrontage'].value_counts())

110

#### Element_count Function

In [25]:
##### element count function

def element_count(index_list,data_frame):
    value_dict={}
    
    for index in index_list:
        value_dict[index] = len(data_frame[index].value_counts())
    
    series_element = pd.Series(data=value_dict)
    return series_element
    


In [26]:
train_numeric_element = element_count(df_numeric_train_info.index.tolist(), df_train)
train_numeric_element.head()

1stFlrSF        753
2ndFlrSF        417
3SsnPorch        20
BedroomAbvGr      8
BsmtFinSF1      637
dtype: int64

## Train_Numeric_Data Info

In [27]:
df_numeric_train_info['element_count'] = train_numeric_element
df_numeric_train_info.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Null_propotion,element_count
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0,0.0,1460
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0,0.0,15
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0,0.177397,110
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0,0.0,1073
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0,0.0,10


## Train_Category_Data_info

In [28]:
from pandas import Categorical

In [29]:
Categorical.describe(df_train['FireplaceQu'])

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
,690,0.472603
Gd,380,0.260274
TA,313,0.214384
Fa,33,0.022603
Ex,24,0.016438
Po,20,0.013699


In [30]:
df_train.loc[:,df_train.dtypes=='object'].count()[:5]

MSZoning       1460
Street         1460
Alley            91
LotShape       1460
LandContour    1460
dtype: int64

In [31]:
df_train['MSZoning'].value_counts().idxmax() # .idxmax(값중 최대의 인덱스 반환)

'RL'

In [32]:
len(df_train)

1460

#### Categries Max Min Name, Count Function

In [33]:
# categories info fucntion
def cat_info_func(index_list, data_frame):
    value_dict={}
    
    for index in index_list:
        
        try:
            value_dict['Max_cat_name'].append( str(data_frame[index].value_counts().idxmax()))
            value_dict['Max_cat_count'].append((data_frame[index].value_counts().max()))
            value_dict['Max_cat_propotion'].append((data_frame[index].value_counts().max())/len(data_frame))
            value_dict['Min_cat_name'].append(str(data_frame[index].value_counts().idxmin()))
            value_dict['Min_cat_count'].append((data_frame[index].value_counts().min()))
            value_dict['Min_cat_propotion'].append((data_frame[index].value_counts().min())/len(data_frame))
        
        
        except KeyError as e:
            value_dict['Max_cat_name'] = [str(data_frame[index].value_counts().idxmax())]
            value_dict['Max_cat_count'] = [((data_frame[index].value_counts().max()))]
            value_dict['Max_cat_propotion'] =[(data_frame[index].value_counts().max()/len(data_frame))]
            value_dict['Min_cat_name'] =[str(data_frame[index].value_counts().idxmin())]
            value_dict['Min_cat_count'] = [((data_frame[index].value_counts().min()))]
            value_dict['Min_cat_propotion'] =[(data_frame[index].value_counts().min()/len(data_frame))]
    
    
    dframe_cat_info = pd.DataFrame(data = value_dict,
                                  index= index_list,
                                  columns = ['Max_cat_name','Max_cat_count','Max_cat_propotion',
                                            'Min_cat_name','Min_cat_count','Min_cat_propotion'])
    return dframe_cat_info

In [34]:
df_category_train_info = pd.DataFrame(data = df_train.loc[:,df_train.dtypes=='object'].count(),
                                      columns=['count'])
df_category_train_info['element_count'] = element_count(df_category_train_info.index.tolist(),
                                                        df_train)

df_category_train_info['Null_propotion'] = df_train.isnull().sum()/len(df_train)

df_category_train_info.head()

Unnamed: 0,count,element_count,Null_propotion
MSZoning,1460,5,0.0
Street,1460,2,0.0
Alley,91,2,0.937671
LotShape,1460,4,0.0
LandContour,1460,4,0.0


In [35]:
cat_info_func(df_category_train_info.index.tolist(),df_train).head()

Unnamed: 0,Max_cat_name,Max_cat_count,Max_cat_propotion,Min_cat_name,Min_cat_count,Min_cat_propotion
MSZoning,RL,1151,0.788356,C (all),10,0.006849
Street,Pave,1454,0.99589,Grvl,6,0.00411
Alley,Grvl,50,0.034247,Pave,41,0.028082
LotShape,Reg,925,0.633562,IR3,10,0.006849
LandContour,Lvl,1311,0.897945,Low,36,0.024658


In [36]:
df_category_train_info=pd.concat([df_category_train_info,
                                  cat_info_func(df_category_train_info.index.tolist(),df_train)],axis=1)
df_category_train_info.head()

Unnamed: 0,count,element_count,Null_propotion,Max_cat_name,Max_cat_count,Max_cat_propotion,Min_cat_name,Min_cat_count,Min_cat_propotion
MSZoning,1460,5,0.0,RL,1151,0.788356,C (all),10,0.006849
Street,1460,2,0.0,Pave,1454,0.99589,Grvl,6,0.00411
Alley,91,2,0.937671,Grvl,50,0.034247,Pave,41,0.028082
LotShape,1460,4,0.0,Reg,925,0.633562,IR3,10,0.006849
LandContour,1460,4,0.0,Lvl,1311,0.897945,Low,36,0.024658


In [37]:
df_category_train_info

Unnamed: 0,count,element_count,Null_propotion,Max_cat_name,Max_cat_count,Max_cat_propotion,Min_cat_name,Min_cat_count,Min_cat_propotion
MSZoning,1460,5,0.0,RL,1151,0.788356,C (all),10,0.006849
Street,1460,2,0.0,Pave,1454,0.99589,Grvl,6,0.00411
Alley,91,2,0.937671,Grvl,50,0.034247,Pave,41,0.028082
LotShape,1460,4,0.0,Reg,925,0.633562,IR3,10,0.006849
LandContour,1460,4,0.0,Lvl,1311,0.897945,Low,36,0.024658
Utilities,1460,2,0.0,AllPub,1459,0.999315,NoSeWa,1,0.000685
LotConfig,1460,5,0.0,Inside,1052,0.720548,FR3,4,0.00274
LandSlope,1460,3,0.0,Gtl,1382,0.946575,Sev,13,0.008904
Neighborhood,1460,25,0.0,NAmes,225,0.15411,Blueste,2,0.00137
Condition1,1460,9,0.0,Norm,1260,0.863014,RRNe,2,0.00137


In [38]:
df_numeric_train_info.to_pickle('./data/df_numeric_train_info')

In [39]:
df_category_train_info.to_pickle('./data/df_category_train_info')

# Test_Numeric_Data Info

In [40]:
df_numeric_test_info = df_test.describe().T
df_numeric_test_info['Null_propotion'] = df_test.isnull().sum()/len(df_test)
df_numeric_test_info['element_count']=element_count(df_numeric_test_info.index.tolist(), df_test)
df_numeric_test_info.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Null_propotion,element_count
Id,1459.0,2190.0,421.321334,1461.0,1825.5,2190.0,2554.5,2919.0,0.0,1459
MSSubClass,1459.0,57.378341,42.74688,20.0,20.0,50.0,70.0,190.0,0.0,16
LotFrontage,1232.0,68.580357,22.376841,21.0,58.0,67.0,80.0,200.0,0.155586,115
LotArea,1459.0,9819.161069,4955.517327,1470.0,7391.0,9399.0,11517.5,56600.0,0.0,1106
OverallQual,1459.0,6.078821,1.436812,1.0,5.0,6.0,7.0,10.0,0.0,10


# Test_Category_Data_info

In [41]:
df_category_test_info = pd.DataFrame(data = df_test.loc[:,df_test.dtypes=='object'].count(),
                                      columns=['count'])
df_category_test_info['element_count'] = element_count(df_category_test_info.index.tolist(),
                                                        df_test)

df_category_test_info['Null_propotion'] = df_test.isnull().sum()/len(df_test)

df_category_test_info=pd.concat([df_category_test_info,
                                  cat_info_func(df_category_test_info.index.tolist(),df_test)],axis=1)
df_category_test_info.head()

Unnamed: 0,count,element_count,Null_propotion,Max_cat_name,Max_cat_count,Max_cat_propotion,Min_cat_name,Min_cat_count,Min_cat_propotion
MSZoning,1455,5,0.002742,RL,1114,0.763537,RH,10,0.006854
Street,1459,2,0.0,Pave,1453,0.995888,Grvl,6,0.004112
Alley,107,2,0.926662,Grvl,70,0.047978,Pave,37,0.02536
LotShape,1459,4,0.0,Reg,934,0.640164,IR3,6,0.004112
LandContour,1459,4,0.0,Lvl,1311,0.898561,Low,24,0.01645


In [42]:
df_numeric_test_info.to_pickle('./data/df_numeric_test_info')
df_category_test_info.to_pickle('./data/df_category_test_info')

# Concat data info

In [43]:
df_concat = pd.concat([df_train.iloc[:,:-1],df_test],axis=0)
df_concat.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


# Concat numeric info

In [44]:
df_numeric_concat_info = df_concat.describe().T
df_numeric_concat_info['Null_propotion'] = df_concat.isnull().sum()/len(df_concat)
df_numeric_concat_info['element_count']=element_count(df_numeric_concat_info.index.tolist(), df_concat)
df_numeric_concat_info.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Null_propotion,element_count
Id,2919.0,1460.0,842.787043,1.0,730.5,1460.0,2189.5,2919.0,0.0,2919
MSSubClass,2919.0,57.137718,42.517628,20.0,20.0,50.0,70.0,190.0,0.0,16
LotFrontage,2433.0,69.305795,23.344905,21.0,59.0,68.0,80.0,313.0,0.166495,128
LotArea,2919.0,10168.11408,7886.996359,1300.0,7478.0,9453.0,11570.0,215245.0,0.0,1951
OverallQual,2919.0,6.089072,1.409947,1.0,5.0,6.0,7.0,10.0,0.0,10


In [45]:
df_category_concat_info = pd.DataFrame(data = df_concat.loc[:,df_concat.dtypes=='object'].count(),
                                      columns=['count'])
df_category_concat_info['element_count'] = element_count(df_category_concat_info.index.tolist(),
                                                        df_concat)

df_category_concat_info['Null_propotion'] = df_concat.isnull().sum()/len(df_concat)

df_category_concat_info=pd.concat([df_category_concat_info,
                                  cat_info_func(df_category_concat_info.index.tolist(),df_concat)],axis=1)
df_category_concat_info.head()

Unnamed: 0,count,element_count,Null_propotion,Max_cat_name,Max_cat_count,Max_cat_propotion,Min_cat_name,Min_cat_count,Min_cat_propotion
MSZoning,2915,5,0.00137,RL,2265,0.775951,C (all),25,0.008565
Street,2919,2,0.0,Pave,2907,0.995889,Grvl,12,0.004111
Alley,198,2,0.932169,Grvl,120,0.04111,Pave,78,0.026721
LotShape,2919,4,0.0,Reg,1859,0.636862,IR3,16,0.005481
LandContour,2919,4,0.0,Lvl,2622,0.898253,Low,60,0.020555


In [46]:
df_numeric_concat_info.to_pickle('./data/df_numeric_concat_info')
df_category_concat_info.to_pickle('./data/df_category_concat_info')