In [139]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.stats import skew
from scipy.stats import boxcox
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import Lasso , Ridge, ElasticNet, BayesianRidge, LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split



In [140]:
test = pd.read_csv(r"C:\Users\longv\Downloads\house-prices-advanced-regression-techniques (1)\test.csv")
train = pd.read_csv(r"C:\Users\longv\Downloads\house-prices-advanced-regression-techniques (1)\train.csv")

In [141]:
y = train['SalePrice']

In [142]:
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [143]:
x = train.drop(columns=['SalePrice'])

In [144]:
x

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,8,2007,WD,Normal
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal


In [145]:
combined_df = pd.concat([x, test], axis=0, ignore_index=True)

In [146]:
combined_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
2915,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
2916,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
2917,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [147]:
# Split Data into numerical data and object data

def Split_DF(df):
    numeric_features = df.dtypes[df.dtypes != "object"].index
    object_features = df.dtypes[df.dtypes == "object"].index
    return df[numeric_features], df[object_features]

# One hot encoding for object data

def one_hot_encoding(df):
    for col in df.columns:
        dummy_columns = pd.get_dummies(df[col], prefix=col)
        df = pd.concat([df, dummy_columns], axis=1)
        df.drop(columns=[col], inplace=True)
    col = df.columns
    df[col] = df[col].astype(int)
    return df
def Processing_Data(df):
    # ----- Handle Y ---------#

    #---------Handle X (feature) boxcox transform and onehotencoding------------#
    numeric_data, object_data = Split_DF(df)
    scaler = StandardScaler()
    numeric_data = scaler.fit_transform(numeric_data)
    object_data = one_hot_encoding(object_data)
    data = pd.concat([numeric_data, object_data], axis = 1)

    #------Fill N/A = 0---------#
    data = data.fillna(0)

    # --------- Drop and add new feature -------#
    data = data.drop(['Id'], axis=1)
    data['TotalSF'] = (data['TotalBsmtSF'] 
                       + data['1stFlrSF'] 
                       + data['2ndFlrSF'])
    data['YrBltAndRemod'] = data['YearBuilt'] +data['YearRemodAdd']
    data['Total_sqr_footage'] = (data['BsmtFinSF1'] 
                                 + data['BsmtFinSF2'] 
                                 + data['1stFlrSF'] 
                                 + data['2ndFlrSF']
                                )
    data['Total_Bathrooms'] = (data['FullBath'] 
                               + (0.5 * data['HalfBath']) 
                               + data['BsmtFullBath'] 
                               + (0.5 * data['BsmtHalfBath'])
                              )
    data['Total_porch_sf'] = (data['OpenPorchSF'] 
                              + data['3SsnPorch'] 
                              + data['EnclosedPorch'] 
                              + data['ScreenPorch'] 
                              + data['WoodDeckSF']
                             )
    data['haspool'] = data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
    data['has2ndfloor'] = data['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
    data['hasgarage'] = data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
    data['hasbsmt'] = data['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
    data['hasfireplace'] = data['Fireplaces'].apply(lambda x: 1 if x > 0 else 0) 
    return data 

In [158]:
combine_num , combine_obj = Split_DF(combined_df)
id_column = combine_num[['Id']]


# Separate the features to be scaled
features_to_scale = combine_num.drop(columns=['Id'],axis = 1)

# Apply the scaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_to_scale)

# Create a DataFrame with the scaled features
scaled_features_df = pd.DataFrame(scaled_features, columns=features_to_scale.columns)

# Combine the 'Id' column with the scaled features
combined_num_df = pd.concat([id_column, scaled_features_df], axis=1)


In [159]:
combined_num_df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1,0.067331,-0.184481,-0.217879,0.646183,-0.507284,1.046258,0.896833,0.523129,0.580807,...,0.348840,-0.740760,0.200006,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-1.552184,0.157646
1,2,-0.873616,0.458190,-0.072044,-0.063185,2.188279,0.154764,-0.395604,-0.569991,1.177910,...,-0.059782,1.614879,-0.702843,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-0.446925,-0.602962
2,3,0.067331,-0.055946,0.137197,0.646183,-0.507284,0.980221,0.848965,0.333506,0.097856,...,0.627446,-0.740760,-0.081209,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,1.026753,0.157646
3,4,0.302568,-0.398704,-0.078385,0.646183,-0.507284,-1.859351,-0.682812,-0.569991,-0.494856,...,0.785323,-0.740760,-0.184815,3.874967,-0.103331,-0.285935,-0.06315,-0.089592,-1.552184,-1.363569
4,5,0.067331,0.629569,0.518903,1.355551,-0.507284,0.947203,0.753229,1.382009,0.468851,...,1.686149,0.776967,0.540424,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,2.132012,0.157646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,2.419700,-2.069648,-1.043937,-1.481920,1.289758,-0.043346,-0.682812,-0.569991,-0.969026,...,-2.195761,-0.740760,-0.702843,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-0.078505,-1.363569
2915,2916,2.419700,-2.069648,-1.049263,-1.481920,-0.507284,-0.043346,-0.682812,-0.569991,-0.415828,...,-0.867740,-0.740760,-0.347624,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-0.815344,-1.363569
2916,2917,-0.873616,3.885767,1.246808,-0.772552,1.289758,-0.373528,0.561757,-0.569991,1.717937,...,0.478856,3.006130,-0.702843,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,1.026753,-1.363569
2917,2918,0.655424,-0.313015,0.034605,-0.772552,-0.507284,0.683057,0.370284,-0.569991,-0.229233,...,-2.195761,-0.108374,-0.229217,-0.359601,-0.103331,-0.285935,-0.06315,1.144312,0.289914,-1.363569


In [160]:
combine_obj = one_hot_encoding(combine_obj)

In [161]:
combine_obj

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_Pave,LotShape_IR1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,0,1,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,0,0,0,1,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,0,0,1,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,1,0
3,0,0,0,1,0,0,1,0,0,1,...,0,0,0,1,1,0,0,0,0,0
4,0,0,0,1,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,0,0,0,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2915,0,0,0,0,1,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,0
2916,0,0,0,1,0,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,0
2917,0,0,0,1,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [162]:
combine_full = pd.concat([combined_num_df, combine_obj], axis = 1)

In [163]:
combine_full

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,0.067331,-0.184481,-0.217879,0.646183,-0.507284,1.046258,0.896833,0.523129,0.580807,...,0,0,0,1,0,0,0,0,1,0
1,2,-0.873616,0.458190,-0.072044,-0.063185,2.188279,0.154764,-0.395604,-0.569991,1.177910,...,0,0,0,1,0,0,0,0,1,0
2,3,0.067331,-0.055946,0.137197,0.646183,-0.507284,0.980221,0.848965,0.333506,0.097856,...,0,0,0,1,0,0,0,0,1,0
3,4,0.302568,-0.398704,-0.078385,0.646183,-0.507284,-1.859351,-0.682812,-0.569991,-0.494856,...,0,0,0,1,1,0,0,0,0,0
4,5,0.067331,0.629569,0.518903,1.355551,-0.507284,0.947203,0.753229,1.382009,0.468851,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,2.419700,-2.069648,-1.043937,-1.481920,1.289758,-0.043346,-0.682812,-0.569991,-0.969026,...,0,0,0,1,0,0,0,0,1,0
2915,2916,2.419700,-2.069648,-1.049263,-1.481920,-0.507284,-0.043346,-0.682812,-0.569991,-0.415828,...,0,0,0,1,1,0,0,0,0,0
2916,2917,-0.873616,3.885767,1.246808,-0.772552,1.289758,-0.373528,0.561757,-0.569991,1.717937,...,0,0,0,1,1,0,0,0,0,0
2917,2918,0.655424,-0.313015,0.034605,-0.772552,-0.507284,0.683057,0.370284,-0.569991,-0.229233,...,0,0,0,1,0,0,0,0,1,0


In [240]:
label_x = combine_full[:1460]
unlabel_x = combine_full[1461:]
label_x.fillna(0)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,0.067331,-0.184481,-0.217879,0.646183,-0.507284,1.046258,0.896833,0.523129,0.580807,...,0,0,0,1,0,0,0,0,1,0
1,2,-0.873616,0.458190,-0.072044,-0.063185,2.188279,0.154764,-0.395604,-0.569991,1.177910,...,0,0,0,1,0,0,0,0,1,0
2,3,0.067331,-0.055946,0.137197,0.646183,-0.507284,0.980221,0.848965,0.333506,0.097856,...,0,0,0,1,0,0,0,0,1,0
3,4,0.302568,-0.398704,-0.078385,0.646183,-0.507284,-1.859351,-0.682812,-0.569991,-0.494856,...,0,0,0,1,1,0,0,0,0,0
4,5,0.067331,0.629569,0.518903,1.355551,-0.507284,0.947203,0.753229,1.382009,0.468851,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,0.067331,-0.313015,-0.285470,-0.063185,-0.507284,0.914184,0.753229,-0.569991,-0.969026,...,0,0,0,1,0,0,0,0,1,0
1456,1457,-0.873616,0.672414,0.381311,-0.063185,0.391237,0.220801,0.178812,0.093689,0.765207,...,0,0,0,1,0,0,0,0,1,0
1457,1458,0.302568,-0.141636,-0.142806,0.646183,3.086800,-1.000876,1.040437,-0.569991,-0.365338,...,0,0,0,1,0,0,0,0,1,0
1458,1459,-0.873616,-0.055946,-0.057207,-0.772552,0.391237,-0.703711,0.561757,-0.569991,-0.861460,...,0,0,0,1,0,0,0,0,1,0


In [170]:
y = pd.DataFrame(y)

In [172]:

log_objective = np.log(y)

In [322]:
X_train, X_test, y_train, y_test = train_test_split(label_x, log_objective, test_size=0.15, random_state=500)


In [323]:
X_test

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
993,994,0.067331,-0.055946,-0.167661,-0.063185,-0.507284,1.112294,1.040437,-0.569991,-0.969026,...,0,1,0,0,0,0,0,0,0,1
1111,1112,0.067331,0.458190,0.039551,0.646183,0.391237,0.154764,-0.395604,3.110923,-0.084348,...,0,0,0,1,0,0,0,0,1,0
467,468,0.302568,0.415345,-0.087262,-0.772552,1.289758,-0.967858,0.513889,0.679289,-0.121667,...,0,0,0,1,0,0,0,0,1,0
996,997,-0.873616,,0.062251,-0.772552,0.391237,-0.340510,-1.113625,-0.569991,1.039611,...,0,0,0,0,0,0,0,0,1,0
86,87,0.067331,2.257668,0.221020,-0.063185,-0.507284,1.112294,0.992569,-0.569991,-0.969026,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340,341,0.067331,0.672414,0.510153,1.355551,-0.507284,1.013239,0.848965,-0.569991,-0.969026,...,0,0,0,1,0,0,0,0,1,0
667,668,-0.873616,-0.184481,-0.259093,-0.063185,-0.507284,0.749093,0.657493,0.868911,1.529147,...,0,0,0,1,0,0,0,0,1,0
1216,1217,0.773042,-0.055946,-0.157009,-0.063185,-0.507284,0.220801,-0.299868,-0.569991,-0.969026,...,0,0,0,1,0,0,0,0,1,0
913,914,0.773042,0.543879,-0.494330,-0.772552,0.391237,-0.736730,-1.640173,-0.569991,-0.345580,...,0,0,0,1,0,0,0,0,1,0


In [324]:
X_train_noID = X_train.drop(['Id'], axis = 1)

In [325]:
X_test_noID = X_test.drop(['Id'], axis = 1)
unla_x_noID = unlabel_x.drop(['Id'], axis = 1)

In [326]:
X_train_noID = X_train_noID.fillna(0)
X_test_noID = X_test_noID.fillna(0)
unla_x_noID = unla_x_noID.fillna(0)

In [327]:
regressor = LinearRegression()
regressor.fit(X_train_noID, y_train)

In [328]:
pred = regressor.predict(X_test_noID)

In [329]:
pred

array([[ 1.21047745e+01],
       [ 1.21798325e+01],
       [ 1.18838882e+01],
       [ 1.17949829e+01],
       [ 1.20885239e+01],
       [ 1.19380722e+01],
       [ 1.23440628e+01],
       [ 1.16128616e+01],
       [ 1.22695847e+01],
       [ 1.19559097e+01],
       [ 1.16234436e+01],
       [ 1.23225327e+01],
       [ 1.18037796e+01],
       [ 1.17501907e+01],
       [ 1.20461349e+01],
       [ 1.24438400e+01],
       [ 1.21896286e+01],
       [ 1.18251114e+01],
       [ 1.19256363e+01],
       [ 1.16364212e+01],
       [ 1.21543503e+01],
       [ 1.21559067e+01],
       [ 1.24547195e+01],
       [ 1.22378006e+01],
       [ 1.22084274e+01],
       [ 1.17245789e+01],
       [ 1.20125046e+01],
       [ 1.15393906e+01],
       [ 1.17524490e+01],
       [ 1.18627243e+01],
       [ 1.22079391e+01],
       [ 1.21026993e+01],
       [ 1.28418808e+01],
       [ 1.23272934e+01],
       [ 1.18419876e+01],
       [ 1.15164108e+01],
       [ 1.20092545e+01],
       [ 1.19809189e+01],
       [ 1.1

In [315]:
pseudo_labels = regressor.predict(unla_x_noID)


In [316]:
X_combined = np.vstack([X_train_noID, unla_x_noID])
y_combined = np.concatenate([y_train, pseudo_labels])
X_combined = pd.DataFrame(X_combined)
y_combined =  pd.DataFrame(y_combined)

In [317]:
regressor.fit(X_combined, y_combined)

In [318]:
X_test_noID = X_test_noID.to_numpy()


In [319]:
X_test_noID = pd.DataFrame(X_test_noID)

In [320]:
pred = regressor.predict(X_test_noID)

In [321]:
pred

array([[12.25319672],
       [12.13822174],
       [11.67765808],
       [11.68804169],
       [12.58355713],
       [12.12858582],
       [11.91812897],
       [11.98593903],
       [11.99511719],
       [12.08804321],
       [11.85808563],
       [12.01646423],
       [11.18141937],
       [12.04090118],
       [12.2527771 ]])

array([[12.01430893],
       [12.11770725],
       [12.20233059],
       ...,
       [12.07727242],
       [11.68072891],
       [12.29785252]])

In [234]:
regressor = LinearRegression()
regressor.fit(X_train_noID, y_train)

In [237]:
pred = regressor.predict(X_test_noID)
X_test_noID

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
892,-0.873616,0.029743,-0.222444,-0.063185,2.188279,-0.274474,0.896833,-0.569991,0.486412,-0.29308,...,0,0,0,1,0,0,0,0,1,0
1105,0.067331,1.229395,0.264770,1.355551,-0.507284,0.749093,0.513889,1.448934,1.296453,-0.29308,...,0,0,0,1,0,0,0,0,1,0
413,-0.638379,-0.570083,-0.153204,-0.772552,0.391237,-1.463132,-1.640173,-0.569991,-0.969026,-0.29308,...,0,0,0,1,0,0,0,0,1,0
522,-0.167905,-0.827151,-0.655383,-0.063185,1.289758,-0.802766,-1.640173,-0.569991,-0.093129,-0.29308,...,0,0,0,1,0,0,0,0,1,0
1036,-0.873616,0.843792,0.346184,2.064919,-0.507284,1.178331,1.136173,-0.179591,1.274501,-0.29308,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479,-0.638379,-0.827151,-0.538081,-1.481920,1.289758,-1.132949,0.753229,1.856066,-0.600227,-0.29308,...,0,0,0,1,0,0,1,0,0,0
1361,-0.873616,2.343357,0.759594,0.646183,-0.507284,1.112294,0.992569,-0.480757,1.827699,-0.29308,...,0,0,0,1,0,0,0,0,1,0
802,0.067331,-0.270170,-0.249709,0.646183,-0.507284,1.112294,0.992569,-0.569991,0.453484,-0.29308,...,0,0,0,1,0,0,0,0,1,0
651,0.302568,-0.398704,-0.137479,-1.481920,-0.507284,-1.033894,-1.640173,-0.569991,-0.969026,-0.29308,...,0,0,0,1,0,0,0,0,1,0


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,0.067331,-0.184481,-0.217879,0.646183,-0.507284,1.046258,0.896833,0.523129,0.580807,...,0,0,0,1,0,0,0,0,1,0
1,2,-0.873616,0.458190,-0.072044,-0.063185,2.188279,0.154764,-0.395604,-0.569991,1.177910,...,0,0,0,1,0,0,0,0,1,0
2,3,0.067331,-0.055946,0.137197,0.646183,-0.507284,0.980221,0.848965,0.333506,0.097856,...,0,0,0,1,0,0,0,0,1,0
3,4,0.302568,-0.398704,-0.078385,0.646183,-0.507284,-1.859351,-0.682812,-0.569991,-0.494856,...,0,0,0,1,1,0,0,0,0,0
4,5,0.067331,0.629569,0.518903,1.355551,-0.507284,0.947203,0.753229,1.382009,0.468851,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,0.067331,-0.313015,-0.285470,-0.063185,-0.507284,0.914184,0.753229,-0.569991,-0.969026,...,0,0,0,1,0,0,0,0,1,0
1456,1457,-0.873616,0.672414,0.381311,-0.063185,0.391237,0.220801,0.178812,0.093689,0.765207,...,0,0,0,1,0,0,0,0,1,0
1457,1458,0.302568,-0.141636,-0.142806,0.646183,3.086800,-1.000876,1.040437,-0.569991,-0.365338,...,0,0,0,1,0,0,0,0,1,0
1458,1459,-0.873616,-0.055946,-0.057207,-0.772552,0.391237,-0.703711,0.561757,-0.569991,-0.861460,...,0,0,0,1,0,0,0,0,1,0


In [239]:
train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [331]:
label_x

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,0.067331,-0.184481,-0.217879,0.646183,-0.507284,1.046258,0.896833,0.523129,0.580807,...,0,0,0,1,0,0,0,0,1,0
1,2,-0.873616,0.458190,-0.072044,-0.063185,2.188279,0.154764,-0.395604,-0.569991,1.177910,...,0,0,0,1,0,0,0,0,1,0
2,3,0.067331,-0.055946,0.137197,0.646183,-0.507284,0.980221,0.848965,0.333506,0.097856,...,0,0,0,1,0,0,0,0,1,0
3,4,0.302568,-0.398704,-0.078385,0.646183,-0.507284,-1.859351,-0.682812,-0.569991,-0.494856,...,0,0,0,1,1,0,0,0,0,0
4,5,0.067331,0.629569,0.518903,1.355551,-0.507284,0.947203,0.753229,1.382009,0.468851,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,0.067331,-0.313015,-0.285470,-0.063185,-0.507284,0.914184,0.753229,-0.569991,-0.969026,...,0,0,0,1,0,0,0,0,1,0
1456,1457,-0.873616,0.672414,0.381311,-0.063185,0.391237,0.220801,0.178812,0.093689,0.765207,...,0,0,0,1,0,0,0,0,1,0
1457,1458,0.302568,-0.141636,-0.142806,0.646183,3.086800,-1.000876,1.040437,-0.569991,-0.365338,...,0,0,0,1,0,0,0,0,1,0
1458,1459,-0.873616,-0.055946,-0.057207,-0.772552,0.391237,-0.703711,0.561757,-0.569991,-0.861460,...,0,0,0,1,0,0,0,0,1,0


In [332]:
log_objective

Unnamed: 0,SalePrice
0,12.247694
1,12.109011
2,12.317167
3,11.849398
4,12.429216
...,...
1455,12.072541
1456,12.254863
1457,12.493130
1458,11.864462


In [333]:
combine_full

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,0.067331,-0.184481,-0.217879,0.646183,-0.507284,1.046258,0.896833,0.523129,0.580807,...,0,0,0,1,0,0,0,0,1,0
1,2,-0.873616,0.458190,-0.072044,-0.063185,2.188279,0.154764,-0.395604,-0.569991,1.177910,...,0,0,0,1,0,0,0,0,1,0
2,3,0.067331,-0.055946,0.137197,0.646183,-0.507284,0.980221,0.848965,0.333506,0.097856,...,0,0,0,1,0,0,0,0,1,0
3,4,0.302568,-0.398704,-0.078385,0.646183,-0.507284,-1.859351,-0.682812,-0.569991,-0.494856,...,0,0,0,1,1,0,0,0,0,0
4,5,0.067331,0.629569,0.518903,1.355551,-0.507284,0.947203,0.753229,1.382009,0.468851,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,2.419700,-2.069648,-1.043937,-1.481920,1.289758,-0.043346,-0.682812,-0.569991,-0.969026,...,0,0,0,1,0,0,0,0,1,0
2915,2916,2.419700,-2.069648,-1.049263,-1.481920,-0.507284,-0.043346,-0.682812,-0.569991,-0.415828,...,0,0,0,1,1,0,0,0,0,0
2916,2917,-0.873616,3.885767,1.246808,-0.772552,1.289758,-0.373528,0.561757,-0.569991,1.717937,...,0,0,0,1,1,0,0,0,0,0
2917,2918,0.655424,-0.313015,0.034605,-0.772552,-0.507284,0.683057,0.370284,-0.569991,-0.229233,...,0,0,0,1,0,0,0,0,1,0


In [334]:
combine_full.to_csv(r"C:\Users\longv\Prediction-House-Price\Data\data for semisupervise\combine_full.csv", index = False)

In [335]:
log_objective.to_csv(r"C:\Users\longv\Prediction-House-Price\Data\data for semisupervise\log_price.csv", index = False)

In [336]:
label_x.to_csv(r"C:\Users\longv\Prediction-House-Price\Data\data for semisupervise\label_x.csv", index = False)

In [None]:
un.to_csv(r"C:\Users\longv\Prediction-House-Price\Data\data for semisupervise\combine_full.csv", index = False)