In [1]:
#採用PCA降維，找出特徵變數，收斂速度較快
#變數變換: log, one-hot-encoding，可增加準確度
#資料預處理: 移除空值，而不是塞平均數，準確度更好 (要看資料而定)
#離群值處理: 使用Huber方法

In [7]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.metrics import make_scorer
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.metrics import r2_score
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
#另外一種高階API，類似keras
import tflearn
import tensorflow as tf
import seaborn
import warnings
warnings.filterwarnings('ignore')

In [10]:
#資料集是一個房價的資訊


import os

os.listdir("./")

['.ipynb_checkpoints', 'svm_svr_2.ipynb', 'test.csv', 'train.csv']

In [85]:
train=pd.read_csv('train.csv')
labels=train["SalePrice"]
test=pd.read_csv('test.csv')

#合併兩個欄位一樣的df
#用concat
df=pd.concat([train,test],ignore_index=True)
df=df.drop(["SalePrice"],axis=1)

ids=test["Id"]

In [86]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [87]:
train.shape

(1460, 81)

In [88]:
df.shape

(2919, 80)

In [99]:
#查看多少個null
nans=pd.isnull(df).sum()
nans[nans>0]

array([], dtype=int32)

In [90]:
#大於1000的刪除

df=df.drop("Id", 1)
df=df.drop("Alley", 1)
df=df.drop("Fence", 1)
df=df.drop("MiscFeature", 1)
df=df.drop("PoolQC", 1)
df=df.drop("FireplaceQu", 1)

In [91]:
#查看欄位型態
df.dtypes.value_counts()

object     38
int64      25
float64    11
dtype: int64

In [93]:
df.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0,3,1Fam,TA,No,706.0,0.0,GLQ,...,WD,0,Pave,8,856.0,AllPub,0,2003,2003,2008
1,1262,0,0,3,1Fam,TA,Gd,978.0,0.0,ALQ,...,WD,0,Pave,6,1262.0,AllPub,298,1976,1976,2007
2,920,866,0,3,1Fam,TA,Mn,486.0,0.0,GLQ,...,WD,0,Pave,6,920.0,AllPub,0,2001,2002,2008
3,961,756,0,3,1Fam,Gd,No,216.0,0.0,ALQ,...,WD,0,Pave,7,756.0,AllPub,0,1915,1970,2006
4,1145,1053,0,4,1Fam,TA,Av,655.0,0.0,GLQ,...,WD,0,Pave,9,1145.0,AllPub,192,2000,2000,2008


In [94]:
#將進行one-hot-encoding轉換類別資料
#遺缺值填入中位數
#log變數變換
#無限大(-inf)轉成0

all_columns = df.columns.values
non_categorical=df.describe().columns.values

categorical=[value for value in all_columns if value not in non_categorical]

In [95]:
#檢視
df[categorical].head()

Unnamed: 0,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtQual,CentralAir,Condition1,Condition2,Electrical,...,MSZoning,MasVnrType,Neighborhood,PavedDrive,RoofMatl,RoofStyle,SaleCondition,SaleType,Street,Utilities
0,1Fam,TA,No,GLQ,Unf,Gd,Y,Norm,Norm,SBrkr,...,RL,BrkFace,CollgCr,Y,CompShg,Gable,Normal,WD,Pave,AllPub
1,1Fam,TA,Gd,ALQ,Unf,Gd,Y,Feedr,Norm,SBrkr,...,RL,,Veenker,Y,CompShg,Gable,Normal,WD,Pave,AllPub
2,1Fam,TA,Mn,GLQ,Unf,Gd,Y,Norm,Norm,SBrkr,...,RL,BrkFace,CollgCr,Y,CompShg,Gable,Normal,WD,Pave,AllPub
3,1Fam,Gd,No,ALQ,Unf,TA,Y,Norm,Norm,SBrkr,...,RL,,Crawfor,Y,CompShg,Gable,Abnorml,WD,Pave,AllPub
4,1Fam,TA,Av,GLQ,Unf,Gd,Y,Norm,Norm,SBrkr,...,RL,BrkFace,NoRidge,Y,CompShg,Gable,Normal,WD,Pave,AllPub


In [96]:
#one-hot-encoding

df=pd.get_dummies(df)

In [98]:
#塞值

imp=Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
df=imp.fit_transform(df)

#變數變換
df=np.log(df)
labels=np.log(labels)

df[df==-np.inf]=0

In [116]:
#切割

train=df[:1460]
test=df[1460:]

print(train.shape)
print(labels.shape)
print(test.shape)

(1460, 270)
(1460,)
(1459, 270)


In [111]:
clf=svm.SVR()
clf.fit(train,labels)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [131]:
test_labels=clf.predict(test)

In [132]:
test_labels.shape

(1459,)

In [136]:
price_lable=np.exp(test_labels)

In [137]:
price_lable

array([135434.9400385 , 205808.52634403, 187911.45262461, ...,
       176464.29286235, 109135.94024166, 254421.42909117])

In [138]:
print(price_lable)

[135434.9400385  205808.52634403 187911.45262461 ... 176464.29286235
 109135.94024166 254421.42909117]


In [139]:
sub=pd.DataFrame({
    "SalePrice":price_lable
})

In [141]:
sub.head()

Unnamed: 0,SalePrice
0,135434.940039
1,205808.526344
2,187911.452625
3,202696.372009
4,179084.218149


In [142]:
test_data = pd.read_csv('test.csv')

In [144]:
result =  test_data.append(sub)

In [146]:
result['SalePrice'] = sub['SalePrice']

In [147]:
result['SalePrice'].head()

0    135434.940039
1    205808.526344
2    187911.452625
3    202696.372009
4    179084.218149
Name: SalePrice, dtype: float64

In [148]:
result.to_csv("sample_submission.csv", index=False)

In [149]:
#feature reduction
#縮減變數
#找出適合的n_component
#whiten: 讓每個變數都有相同的方差

pca=PCA(whiten=True)
pca.fit(df)
variance=pd.DataFrame(pca)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,896.0,0.0,0.0,,2.0,1Fam,TA,No,468.0,144.0,...,WD,120.0,Pave,5.0,882.0,AllPub,140.0,1961.0,1961.0,2010.0
1,1329.0,0.0,0.0,,3.0,1Fam,TA,No,923.0,0.0,...,WD,0.0,Pave,6.0,1329.0,AllPub,393.0,1958.0,1958.0,2010.0
2,928.0,701.0,0.0,,3.0,1Fam,TA,No,791.0,0.0,...,WD,0.0,Pave,6.0,928.0,AllPub,212.0,1997.0,1998.0,2010.0
3,926.0,678.0,0.0,,3.0,1Fam,TA,No,602.0,0.0,...,WD,0.0,Pave,7.0,926.0,AllPub,360.0,1998.0,1998.0,2010.0
4,1280.0,0.0,0.0,,2.0,TwnhsE,TA,No,263.0,0.0,...,WD,144.0,Pave,5.0,1280.0,AllPub,0.0,1992.0,1992.0,2010.0
5,763.0,892.0,0.0,,3.0,1Fam,TA,No,0.0,0.0,...,WD,0.0,Pave,7.0,763.0,AllPub,157.0,1993.0,1994.0,2010.0
6,1187.0,0.0,0.0,,3.0,1Fam,TA,No,935.0,0.0,...,WD,0.0,Pave,6.0,1168.0,AllPub,483.0,1992.0,2007.0,2010.0
7,789.0,676.0,0.0,,3.0,1Fam,TA,No,0.0,0.0,...,WD,0.0,Pave,7.0,789.0,AllPub,0.0,1998.0,1998.0,2010.0
8,1341.0,0.0,0.0,,2.0,1Fam,TA,Gd,637.0,0.0,...,WD,0.0,Pave,5.0,1300.0,AllPub,192.0,1990.0,1990.0,2010.0
9,882.0,0.0,0.0,,2.0,1Fam,TA,No,804.0,78.0,...,WD,0.0,Pave,4.0,882.0,AllPub,240.0,1970.0,1970.0,2010.0
