In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Ridge, SGDRegressor, LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor,AdaBoostRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
 
from tpot import TPOTRegressor
    
    
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

In [2]:
train_df = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test_df = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
sample = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv")

In [3]:
test = test_df.copy()
train = train_df.copy()

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [6]:
train_id = train[["Id"]]
test_id = test[["Id"]]

In [7]:
train = train.set_index("Id")
test = test.set_index("Id")

In [8]:
numeric_columns = train.select_dtypes(include=[np.number]).columns

# Calculate Q1 (25th percentile) and Q3 (75th percentile) for the numeric columns
Q1 = train[numeric_columns].quantile(0.05)
Q3 = train[numeric_columns].quantile(0.95)
IQR = Q3 - Q1

# Identify outliers
outliers = (train[numeric_columns] < (Q1 - 1.5 * IQR)) | (train[numeric_columns] > (Q3 + 1.5 * IQR))

# Create a boolean mask to filter out rows with any outliers
outliers_mask = outliers.any(axis=1)

# Drop outliers
train = train[~outliers_mask]

In [9]:
X = pd.DataFrame(train.drop(columns=["SalePrice"]))
y = pd.DataFrame(train["SalePrice"])

In [10]:
X

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,20,RL,90.0,17217,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,7,2006,WD,Abnorml
1455,20,FV,62.0,7500,Pave,Pave,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,10,2009,WD,Normal
1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,2,2010,WD,Normal


In [11]:
test

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [12]:
def FillNa_mean(data,column):
    data[[column]] = data[[column]].fillna(data[column].mean())
    return data[[column]]

In [13]:
def FillNa_mode(data,column):
    data[[column]] = data[[column]].fillna(data[column].mode()[0])
    return data[[column]]

In [14]:
def FillNa_Na(data,column):
    data[[column]] = data[[column]].fillna("Na")
    return data[[column]]

In [15]:
column_mean = ["LotFrontage","MasVnrArea"]
column_mode = ["MasVnrType", "GarageYrBlt"]
column_Na = ["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", 
             "BsmtFinType2", "Electrical", "FireplaceQu", "GarageType", 
             "GarageFinish", "GarageCond", "GarageQual", "PoolQC", "Fence",
             "MiscFeature", "Alley"]

In [16]:
for i in column_mean:
    FillNa_mean(X, i)
    FillNa_mean(test, i)

In [17]:
for i in column_mode:
    FillNa_mode(X,i)
    FillNa_mode(test,i)

In [18]:
for i in column_Na:
    FillNa_Na(X, i)  
    FillNa_Na(test, i)

In [19]:
full_df = pd.concat([X, test])

In [20]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2725 entries, 1 to 2919
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     2725 non-null   int64  
 1   MSZoning       2721 non-null   object 
 2   LotFrontage    2725 non-null   float64
 3   LotArea        2725 non-null   int64  
 4   Street         2725 non-null   object 
 5   Alley          2725 non-null   object 
 6   LotShape       2725 non-null   object 
 7   LandContour    2725 non-null   object 
 8   Utilities      2723 non-null   object 
 9   LotConfig      2725 non-null   object 
 10  LandSlope      2725 non-null   object 
 11  Neighborhood   2725 non-null   object 
 12  Condition1     2725 non-null   object 
 13  Condition2     2725 non-null   object 
 14  BldgType       2725 non-null   object 
 15  HouseStyle     2725 non-null   object 
 16  OverallQual    2725 non-null   int64  
 17  OverallCond    2725 non-null   int64  
 18  YearBuilt    

In [21]:
full_df

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,Na,Reg,Lvl,AllPub,Inside,...,0,0,Na,Na,Na,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,Na,Reg,Lvl,AllPub,FR2,...,0,0,Na,Na,Na,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,Na,IR1,Lvl,AllPub,Inside,...,0,0,Na,Na,Na,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,Na,IR1,Lvl,AllPub,Corner,...,0,0,Na,Na,Na,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,Na,IR1,Lvl,AllPub,FR2,...,0,0,Na,Na,Na,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,RM,21.0,1936,Pave,Na,Reg,Lvl,AllPub,Inside,...,0,0,Na,Na,Na,0,6,2006,WD,Normal
2916,160,RM,21.0,1894,Pave,Na,Reg,Lvl,AllPub,Inside,...,0,0,Na,Na,Na,0,4,2006,WD,Abnorml
2917,20,RL,160.0,20000,Pave,Na,Reg,Lvl,AllPub,Inside,...,0,0,Na,Na,Na,0,9,2006,WD,Abnorml
2918,85,RL,62.0,10441,Pave,Na,Reg,Lvl,AllPub,Inside,...,0,0,Na,MnPrv,Shed,700,7,2006,WD,Normal


In [22]:
# train[["LotFrontage"]] = train[["LotFrontage"]].fillna(train["LotFrontage"].mean())
# train[["MasVnrType"]] = train[["MasVnrType"]].fillna(train["MasVnrType"].mode()[0])
# train[["MasVnrArea"]] = train[["MasVnrArea"]].fillna(train["MasVnrArea"].mean())
# train[["GarageYrBlt"]] = train[["GarageYrBlt"]].fillna(train["GarageYrBlt"].mode()[0])


In [23]:
# train[["BsmtQual"]] = train[["BsmtQual"]].fillna("Na")
# train[["BsmtCond"]] = train[["BsmtCond"]].fillna("Na")
# train[["BsmtExposure"]] = train[["BsmtExposure"]].fillna("Na")
# train[["BsmtFinType1"]] = train[["BsmtFinType1"]].fillna("Na")
# train[["BsmtFinType2"]] = train[["BsmtExposure"]].fillna("Na")
# train[["Electrical"]] = train[["Electrical"]].fillna("Na")
# train[["FireplaceQu"]] = train[["FireplaceQu"]].fillna("Na")
# train[["GarageType"]] = train[["GarageType"]].fillna("Na")
# train[["GarageFinish"]] = train[["GarageFinish"]].fillna("Na")
# train[["GarageCond"]] = train[["GarageCond"]].fillna("Na")
# train[["GarageQual"]] = train[["GarageQual"]].fillna("Na")
# train[["PoolQC"]] = train[["PoolQC"]].fillna("Na")
# train[["Fence"]] = train[["Fence"]].fillna("Na")
# train[["MiscFeature"]] = train[["MiscFeature"]].fillna("Na")
# train[["Alley"]] = train[["Alley"]].fillna("Na")

In [24]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Identify numeric columns
numeric_columns = full_df.select_dtypes(include=['int64', 'float64']).columns

# Fit the scaler to the numeric columns and transform them
full_df[numeric_columns] = scaler.fit_transform(full_df[numeric_columns])

In [25]:
full_df

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.094463,RL,-0.188366,-0.280099,Pave,Na,Reg,Lvl,AllPub,Inside,...,-0.290799,-0.041842,Na,Na,Na,-0.067363,-1.546684,0.163484,WD,Normal
2,-0.862456,RL,0.551059,-0.029623,Pave,Na,Reg,Lvl,AllPub,FR2,...,-0.290799,-0.041842,Na,Na,Na,-0.067363,-0.439782,-0.599347,WD,Normal
3,0.094463,RL,-0.040481,0.329755,Pave,Na,IR1,Lvl,AllPub,Inside,...,-0.290799,-0.041842,Na,Na,Na,-0.067363,1.036087,0.163484,WD,Normal
4,0.333693,RL,-0.434841,-0.040513,Pave,Na,IR1,Lvl,AllPub,Corner,...,-0.290799,-0.041842,Na,Na,Na,-0.067363,-1.546684,-1.362179,WD,Abnorml
5,0.094463,RL,0.748240,0.985348,Pave,Na,IR1,Lvl,AllPub,FR2,...,-0.290799,-0.041842,Na,Na,Na,-0.067363,2.142989,0.163484,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,2.486761,RM,-2.357348,-1.698880,Pave,Na,Reg,Lvl,AllPub,Inside,...,-0.290799,-0.041842,Na,Na,Na,-0.067363,-0.070815,-1.362179,WD,Normal
2916,2.486761,RM,-2.357348,-1.708028,Pave,Na,Reg,Lvl,AllPub,Inside,...,-0.290799,-0.041842,Na,Na,Na,-0.067363,-0.808749,-1.362179,WD,Abnorml
2917,-0.862456,RL,4.494662,2.235549,Pave,Na,Reg,Lvl,AllPub,Inside,...,-0.290799,-0.041842,Na,Na,Na,-0.067363,1.036087,-1.362179,WD,Abnorml
2918,0.692537,RL,-0.336251,0.153551,Pave,Na,Reg,Lvl,AllPub,Inside,...,-0.290799,-0.041842,Na,MnPrv,Shed,1.446718,0.298153,-1.362179,WD,Normal


In [26]:
encoder = LabelEncoder()

for column in full_df.columns:
    if full_df[column].dtype == 'object':
        # Fit the LabelEncoder and transform the column
        full_df[column] = encoder.fit_transform(full_df[column])

In [27]:
full_df

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.094463,3,-0.188366,-0.280099,1,1,3,3,0,4,...,-0.290799,-0.041842,2,4,1,-0.067363,-1.546684,0.163484,8,4
2,-0.862456,3,0.551059,-0.029623,1,1,3,3,0,2,...,-0.290799,-0.041842,2,4,1,-0.067363,-0.439782,-0.599347,8,4
3,0.094463,3,-0.040481,0.329755,1,1,0,3,0,4,...,-0.290799,-0.041842,2,4,1,-0.067363,1.036087,0.163484,8,4
4,0.333693,3,-0.434841,-0.040513,1,1,0,3,0,0,...,-0.290799,-0.041842,2,4,1,-0.067363,-1.546684,-1.362179,8,0
5,0.094463,3,0.748240,0.985348,1,1,0,3,0,2,...,-0.290799,-0.041842,2,4,1,-0.067363,2.142989,0.163484,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,2.486761,4,-2.357348,-1.698880,1,1,3,3,0,4,...,-0.290799,-0.041842,2,4,1,-0.067363,-0.070815,-1.362179,8,4
2916,2.486761,4,-2.357348,-1.708028,1,1,3,3,0,4,...,-0.290799,-0.041842,2,4,1,-0.067363,-0.808749,-1.362179,8,0
2917,-0.862456,3,4.494662,2.235549,1,1,3,3,0,4,...,-0.290799,-0.041842,2,4,1,-0.067363,1.036087,-1.362179,8,0
2918,0.692537,3,-0.336251,0.153551,1,1,3,3,0,4,...,-0.290799,-0.041842,2,2,3,1.446718,0.298153,-1.362179,8,4


In [28]:
X = full_df[0:1266]

In [29]:
test = full_df[1266:]

In [30]:
test

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,-0.862456,2,0.551059,0.410779,1,1,3,3,0,4,...,1.888692,-0.041842,2,2,1,-0.067363,-0.070815,1.689146,8,4
1462,-0.862456,3,0.600354,0.986873,1,1,0,3,0,0,...,-0.290799,-0.041842,2,4,0,26.969811,-0.070815,1.689146,8,4
1463,0.094463,3,0.255289,0.891692,1,1,0,3,0,4,...,-0.290799,-0.041842,2,2,1,-0.067363,-1.177716,1.689146,8,4
1464,0.094463,3,0.452469,0.052707,1,1,0,3,0,4,...,-0.290799,-0.041842,2,4,1,-0.067363,-0.070815,1.689146,8,4
1465,1.529842,3,-1.272857,-1.030437,1,1,0,1,0,4,...,2.324590,-0.041842,2,4,1,-0.067363,-1.915651,1.689146,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,2.486761,4,-2.357348,-1.698880,1,1,3,3,0,4,...,-0.290799,-0.041842,2,4,1,-0.067363,-0.070815,-1.362179,8,4
2916,2.486761,4,-2.357348,-1.708028,1,1,3,3,0,4,...,-0.290799,-0.041842,2,4,1,-0.067363,-0.808749,-1.362179,8,0
2917,-0.862456,3,4.494662,2.235549,1,1,3,3,0,4,...,-0.290799,-0.041842,2,4,1,-0.067363,1.036087,-1.362179,8,0
2918,0.692537,3,-0.336251,0.153551,1,1,3,3,0,4,...,-0.290799,-0.041842,2,2,3,1.446718,0.298153,-1.362179,8,4


### valdiating

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [32]:
linear = LinearRegression()

linear.fit(X_train,y_train)
print(f"Training score {linear.score(X_train,y_train)}")
y_pred = linear.predict(X_test)
print(f"R2: {r2_score(y_test,y_pred)}")

Training score 0.9065362849492203
R2: 0.903708885625857


In [33]:
cat_model = CatBoostRegressor()
cat_model.fit(X_train, y_train)

y_pred = cat_model.predict(X_test)
print(f"Training score {cat_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")

Learning rate set to 0.04102
0:	learn: 74179.9475579	total: 64.8ms	remaining: 1m 4s
1:	learn: 72252.4791609	total: 70.4ms	remaining: 35.1s
2:	learn: 70580.5718006	total: 73.8ms	remaining: 24.5s
3:	learn: 68685.7149695	total: 76.6ms	remaining: 19.1s
4:	learn: 66883.9327109	total: 79.6ms	remaining: 15.8s
5:	learn: 65226.1167912	total: 82.7ms	remaining: 13.7s
6:	learn: 63485.6706185	total: 85.7ms	remaining: 12.2s
7:	learn: 61860.1082334	total: 88.6ms	remaining: 11s
8:	learn: 60311.4480150	total: 91.7ms	remaining: 10.1s
9:	learn: 59050.0350621	total: 94.8ms	remaining: 9.38s
10:	learn: 57535.6344369	total: 97.7ms	remaining: 8.78s
11:	learn: 56091.6076250	total: 101ms	remaining: 8.29s
12:	learn: 54830.5774195	total: 104ms	remaining: 7.88s
13:	learn: 53466.9404355	total: 107ms	remaining: 7.51s
14:	learn: 52353.8895996	total: 110ms	remaining: 7.21s
15:	learn: 51266.5050447	total: 113ms	remaining: 6.94s
16:	learn: 50008.4927381	total: 116ms	remaining: 6.7s
17:	learn: 48883.7397925	total: 119ms	

In [34]:
xg_model = XGBRegressor()
xg_model.fit(X_train, y_train)

y_pred = xg_model.predict(X_test)
print(f"Training score {cat_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")

Training score 0.9958306766657639
R2: 0.8777699236320589


In [35]:
LG_model = LGBMRegressor()
LG_model.fit(X_train, y_train)

y_pred = LG_model.predict(X_test)
print(f"Training score {LG_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004564 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2976
[LightGBM] [Info] Number of data points in the train set: 1012, number of used features: 67
[LightGBM] [Info] Start training from score 182020.015810
Training score 0.9884593945308559
R2: 0.9158918639050482


### Test

In [36]:
xg_model = XGBRegressor()
xg_model.fit(X, y)

y_pred = xg_model.predict(test)
print(f"Training score {cat_model.score(X,y)}")

Training score 0.9829290728740031


In [37]:
cat_model = CatBoostRegressor()
cat_model.fit(X, y)

y_pred = cat_model.predict(test)
print(f"Training score {cat_model.score(X,y)}")

Learning rate set to 0.042498
0:	learn: 73503.7783711	total: 7.23ms	remaining: 7.23s
1:	learn: 71519.7119288	total: 10.6ms	remaining: 5.28s
2:	learn: 69753.8074090	total: 13.8ms	remaining: 4.6s
3:	learn: 67744.5318521	total: 17.1ms	remaining: 4.25s
4:	learn: 65904.1035802	total: 21.8ms	remaining: 4.34s
5:	learn: 64177.7998832	total: 25.1ms	remaining: 4.16s
6:	learn: 62373.8376921	total: 28.4ms	remaining: 4.03s
7:	learn: 60746.0386400	total: 31.4ms	remaining: 3.89s
8:	learn: 59196.9543125	total: 34.8ms	remaining: 3.83s
9:	learn: 57800.0167603	total: 37.9ms	remaining: 3.75s
10:	learn: 56253.2990604	total: 40.9ms	remaining: 3.67s
11:	learn: 54818.2494467	total: 44.1ms	remaining: 3.63s
12:	learn: 53434.9935042	total: 47.3ms	remaining: 3.59s
13:	learn: 52124.8717792	total: 50.2ms	remaining: 3.54s
14:	learn: 50907.3117248	total: 53.6ms	remaining: 3.52s
15:	learn: 49753.9061597	total: 56.7ms	remaining: 3.49s
16:	learn: 48495.7565702	total: 59.8ms	remaining: 3.46s
17:	learn: 47423.3389153	tota

In [38]:
output = test_id

In [39]:
output["SalePrice"] = pd.DataFrame(y_pred)

In [40]:
output = output.set_index("Id")

In [41]:
sample

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.683570
3,1464,179317.477511
4,1465,150730.079977
...,...,...
1454,2915,167081.220949
1455,2916,164788.778231
1456,2917,219222.423400
1457,2918,184924.279659


In [42]:
output.to_csv("House_prices_cat.csv")