In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import LibrairiePerso_v4_5 as ownLibrary

In [2]:
# Let wide comparison tables render up to 500 columns before pandas truncates them.
pd.options.display.max_columns = 500

In [3]:
# NOTE(review): absolute, machine-specific directory; the notebook only runs
# where this folder exists.
path = 'C:/Users/Julie/Documents/Big_Data/github/'

# Training split: read the reworked CSV, then move the 'SalePrice' column out of
# the feature frame (pop returns the column and removes it from the frame).
X_train = pd.read_csv(path + "Initial_train_rwrk.csv", sep=",")
y_train = X_train.pop('SalePrice')

# Test split: same treatment as the training split.
X_test = pd.read_csv(path + "Initial_test_rwrk.csv", sep=",")
y_test = X_test.pop('SalePrice')

# Submission frame is loaded as-is.
submission = pd.read_csv(path + "Initial_submission_rwrk.csv", sep=",")

## Scale Features and transformations

In [4]:
# Name of the column to predict.
target = 'SalePrice'

# Continuous features whose transformations are compared below.
continuous_features = [
    # lot
    'LotFrontage',
    'LotArea',
    # construction / remodel years
    'YearBuilt',
    'YearRemodAdd',
    # masonry veneer
    'MasVnrArea',
    # basement surfaces
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    # living surfaces
    '1stFlrSF',
    '2ndFlrSF',
    'GrLivArea',
    # garage
    'GarageYrBlt',
    'GarageArea',
    # decks and porches
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    'ScreenPorch',
]

In [5]:
def transformationComparisonTable(X_train, X_test, y_train, y_test, scaleMethod) :
    """Build the per-feature comparison table for one scaling method.

    The train and test features are each passed through
    ``ownLibrary.scale_features`` with ``scaleMethod``, then four evaluations
    from ``ownLibrary`` are run on the scaled splits and their results are
    joined side by side.

    Parameters
    ----------
    X_train, X_test : feature frames for the train and test splits.
    y_train, y_test : target values matching the two splits.
    scaleMethod : value forwarded to ``ownLibrary.scale_features`` (the
        notebook calls this function with 0 to 4).

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation, in this order, of the results of
        ``linearRegressionCorrelation``, ``discretisationSupervise``,
        ``PolynomialFeaturesCorr`` with 2 and ``PolynomialFeaturesCorr`` with 3.
    """
    # NOTE(review): train and test are scaled by two independent calls; whether
    # scale_features fits its parameters on each frame separately cannot be
    # told from here. Confirm in ownLibrary.
    train_scaled = ownLibrary.scale_features(X_train, [], scaleMethod)
    test_scaled = ownLibrary.scale_features(X_test, [], scaleMethod)
    splits = (train_scaled, test_scaled, y_train, y_test)

    # Helpers are invoked in the same order as before (discretisation first),
    # in case they print or keep state.
    # The trailing (False, False, 0) flags are passed through as-is; their
    # meaning is defined in ownLibrary. TODO: confirm.
    dtr_table = ownLibrary.discretisationSupervise(*splits, False, False, 0)
    lr_table = ownLibrary.linearRegressionCorrelation(*splits)
    pr2_table, pr3_table = (
        ownLibrary.PolynomialFeaturesCorr(*splits, order) for order in (2, 3)
    )

    # Column order of the final table: LR, DTR, PR2, PR3.
    return pd.concat([lr_table, dtr_table, pr2_table, pr3_table], axis=1)

### Comparison table of each feature transformation

From left to right, five tables are concatenated, one for each scale method from 0 to 4.

Each table corresponds to one scale method and reports the R² score obtained with no transformation (LR), with the decision tree regressor discretisation (DTR0), and with polynomial transformations of order 2 (PR2) and order 3 (PR3).

In [6]:
# Build one comparison table per scale method (0 to 4), then join them side by
# side with a single concat. Collecting the tables first avoids re-copying the
# growing frame on every pass and avoids concatenating with an empty DataFrame.
tables_by_scale_method = [
    transformationComparisonTable(
        X_train[continuous_features],
        X_test[continuous_features],
        y_train,
        y_test,
        scale_method,
    )
    for scale_method in range(0, 5)
]
result = pd.concat(tables_by_scale_method, axis=1)

# Last expression of the cell: display the combined table.
result

Unnamed: 0,LR R²,DTR0_R²,PR2R²,PR3R²,LR R².1,DTR0_R².1,PR2R².1,PR3R².1,LR R².2,DTR0_R².2,PR2R².2,PR3R².2,LR R².3,DTR0_R².3,PR2R².3,PR3R².3,LR R².4,DTR0_R².4,PR2R².4,PR3R².4
LotFrontage,12.21,14.72,12.72,12.95,12.21,14.72,12.72,12.95,12.21,14.72,12.72,12.95,12.21,14.68,12.68,12.83,12.21,14.68,12.68,12.83
LotArea,8.54,20.2,16.09,18.45,8.54,16.78,17.29,19.15,8.54,17.07,17.28,19.15,8.54,20.71,16.87,18.89,8.54,20.71,16.87,18.89
YearBuilt,24.97,36.31,34.58,34.39,24.97,35.07,34.55,34.35,24.97,35.07,34.5,34.3,24.97,35.79,34.64,34.44,24.97,35.79,34.64,34.44
YearRemodAdd,25.46,29.07,26.35,26.69,25.46,29.07,26.35,26.69,25.46,29.07,26.35,26.69,25.46,29.07,26.34,26.71,25.46,29.07,26.34,26.71
MasVnrArea,26.37,25.78,26.39,26.81,26.37,25.78,26.39,26.81,26.37,25.78,26.39,26.81,26.37,25.0,26.4,26.95,26.37,25.0,26.4,26.95
BsmtFinSF1,16.8,28.46,15.48,31.14,16.8,5.9,12.79,15.35,16.8,5.9,12.79,15.35,16.8,23.84,15.38,31.0,16.8,23.84,15.38,31.0
BsmtFinSF2,0.21,0.78,0.98,1.0,0.21,1.16,0.55,0.58,0.21,1.16,0.55,0.58,0.21,1.13,0.62,0.66,0.21,1.13,0.62,0.66
BsmtUnfSF,2.73,6.86,5.29,7.11,2.73,6.97,5.18,7.5,2.73,6.97,5.18,7.5,2.73,7.56,5.38,7.07,2.73,7.33,5.38,7.08
TotalBsmtSF,38.95,39.13,41.66,42.22,38.95,39.33,41.66,42.22,38.95,39.33,41.66,42.22,38.95,38.14,41.61,42.19,38.95,38.14,41.61,42.19
1stFlrSF,37.12,36.09,38.36,38.23,37.12,37.0,38.37,38.25,37.12,36.09,38.36,38.23,37.12,35.55,38.35,38.19,37.12,35.55,38.35,38.19


## Best-performing transformation for each feature (scale method = 0)

In [7]:
# Features for which the decision-tree discretisation (DTR) is retained.
# NOTE(review): for scale method 0 the comparison table shows BsmtFinSF2 with
# PR3 (1.0) above DTR0 (0.78); confirm that listing it here is intended.
dtr_best_performance = [
    'LotFrontage',
    'LotArea',
    'YearBuilt',
    'YearRemodAdd',
    'BsmtFinSF2',
    'GarageYrBlt',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
]

# Features for which the order-2 polynomial transformation is retained.
pr2_best_performance = [
    '1stFlrSF',
    'GarageArea',
]

# Features for which the order-3 polynomial transformation is retained.
pr3_best_performance = [
    'BsmtFinSF1',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '2ndFlrSF',
    'GrLivArea',
]

# The accuracy gain is too small to perform a polynomial transformation
no_modification = [
    'MasVnrArea',
    'ScreenPorch',
]


## Bin lists identified by the Decision Tree Regressor method

In [8]:
# Scale the full train and test feature frames with scale method 0.
scaled_train0 = ownLibrary.scale_features(X_train, [], 0)
scaled_test0 = ownLibrary.scale_features(X_test, [], 0)

# Run the supervised discretisation on the DTR-selected features only.
# Compared with the call inside transformationComparisonTable, the fifth
# positional flag is True here; presumably this is what prints the bin lists
# shown in the output. TODO: confirm in ownLibrary.
# The call stays the last expression so its returned table is displayed.
ownLibrary.discretisationSupervise(
    scaled_train0[dtr_best_performance],
    scaled_test0[dtr_best_performance],
    y_train,
    y_test,
    False,
    True,
    0,
)

'LotFrontage': {
	1: [0,60.0],
	2: [61.0,74.0],
	3: [75.0,90.0],
	4: [91.0,220.0]
},
'LotArea': {
	1: [0,8635],
	2: [8640,10970],
	3: [10991,13680],
	4: [13682,430490]
},
'YearBuilt': {
	1: [0,1956],
	2: [1957,1984],
	3: [1985,2005],
	4: [2006,4020]
},
'YearRemodAdd': {
	1: [0,1956],
	2: [1957,1983],
	3: [1984,2007],
	4: [2008,4020]
},
'BsmtFinSF2': {
	1: [0,80],
	2: [81,1029],
	3: [1031,1057],
	4: [1063,2948]
},
'GarageYrBlt': {
	1: [0,1958.0],
	2: [1959.0,1987.0],
	3: [1988.0,2005.0],
	4: [2006.0,4020.0]
},
'WoodDeckSF': {
	1: [0,84],
	2: [85,96],
	3: [97,158],
	4: [159,1714]
},
'OpenPorchSF': {
	1: [0,8],
	2: [10,29],
	3: [30,43],
	4: [44,1046]
},
'EnclosedPorch': {
	1: [0,36],
	2: [37,42],
	3: [48,140],
	4: [143,1104]
},


Unnamed: 0,DTR0_R²
LotFrontage,14.72
LotArea,20.2
YearBuilt,36.31
YearRemodAdd,29.07
BsmtFinSF2,0.78
GarageYrBlt,34.53
WoodDeckSF,8.28
OpenPorchSF,21.29
EnclosedPorch,2.29
