In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
def replace_nan(df):
    for column in df.columns:
        if df[column].dtype == 'float64' or df[column].dtype == 'int64':
            df[column].fillna(0, inplace=True)
        elif df[column].dtype == 'object':
            df[column].fillna("Not", inplace=True)
            # Convert the column to string
            df[column] = df[column].astype(str)
    return df

df_cleaned = replace_nan(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna("Not", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a

In [4]:
train_set, test_set = train_test_split(df_cleaned, test_size=0.2, random_state=42)

X_train = train_set.drop("SalePrice", axis=1)
y = train_set["SalePrice"].copy()

In [5]:
def separate_numeric_and_non_numeric(X_train):
    numeric_columns = X_train.select_dtypes(include='number').columns
    non_numeric_columns = X_train.select_dtypes(exclude='number').columns
    numeric_df = X_train[numeric_columns]
    non_numeric_df = X_train[non_numeric_columns]
    return numeric_df, non_numeric_df

# Separate numeric and non-numeric columns
numeric_df, non_numeric_df = separate_numeric_and_non_numeric(X_train)

# Define preprocessing pipelines
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('ordinal_encoder', OrdinalEncoder())
])

# Define which columns will be processed by each pipeline
num_attribs = list(numeric_df.columns)
cat_attribs = list(non_numeric_df.columns)

# Create a full preprocessing pipeline
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs)
])

# Apply the full pipeline to the training data
X_prepared = full_pipeline.fit_transform(X_train)
X_prepared

array([[-1.11928402, -0.8667643 ,  0.35953495, ...,  1.        ,
         8.        ,  4.        ],
       [ 0.79046412,  0.07410996,  0.04874271, ...,  1.        ,
         8.        ,  4.        ],
       [-0.21615189, -0.63154574,  0.27477343, ...,  1.        ,
         8.        ,  4.        ],
       ...,
       [ 1.32669882, -0.8667643 ,  0.07699655, ...,  1.        ,
         8.        ,  4.        ],
       [ 0.30597137, -0.16110861, -0.06427265, ...,  1.        ,
         8.        ,  4.        ],
       [ 0.93157852,  1.48542135, -0.12078033, ...,  1.        ,
         8.        ,  4.        ]])

In [6]:
from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()

In [7]:
LR_model.fit(X_prepared, y)

In [8]:
test_data = X_train.sample(5)
test_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1231,1232,90,RL,70.0,7728,Pave,Not,Reg,Lvl,AllPub,...,0,0,Not,GdWo,Not,0,5,2006,WD,Normal
194,195,20,RL,60.0,7180,Pave,Not,IR1,Lvl,AllPub,...,0,0,Not,Not,Not,0,5,2008,WD,Normal
258,259,60,RL,80.0,12435,Pave,Not,Reg,Lvl,AllPub,...,0,0,Not,Not,Not,0,5,2008,WD,Normal
944,945,20,RL,0.0,14375,Pave,Not,IR1,Lvl,NoSeWa,...,233,0,Not,Not,Not,0,1,2009,COD,Abnorml
466,467,20,RL,85.0,10628,Pave,Not,Reg,Lvl,AllPub,...,176,0,Not,GdWo,Not,0,4,2007,WD,Normal


In [9]:
test_label = y.loc[test_data.index]
test_label

1231    132500
194     127000
258     231500
944     137500
466     167000
Name: SalePrice, dtype: int64

In [10]:
test_data_prepared = full_pipeline.transform(test_data)
test_data_prepared

array([[ 1.17852871e+00,  7.79765657e-01,  3.59534945e-01,
        -2.75379675e-01, -8.20444558e-01,  3.72217301e-01,
        -2.92402586e-01, -1.10480940e+00,  9.70689656e-02,
         7.77938398e-01, -2.85504061e-01, -5.99755648e-01,
         1.00408255e-01,  5.34153418e-02, -8.01922924e-01,
        -1.18998664e-01, -6.43640411e-01,  1.10531958e+00,
        -2.42870023e-01, -1.05556573e+00, -7.64097523e-01,
         1.36218320e-01, -2.12757112e-01, -3.46905283e-01,
        -9.58592150e-01,  2.03479972e-01,  2.95091654e-01,
         3.02011948e-01, -7.40157477e-01, -4.55000217e-01,
        -3.51921074e-01, -1.21008050e-01, -2.75837824e-01,
        -7.09928378e-02, -9.27403321e-02, -5.08009704e-01,
        -1.37548612e+00,  3.00000000e+00,  1.00000000e+00,
         1.00000000e+00,  3.00000000e+00,  3.00000000e+00,
         0.00000000e+00,  4.00000000e+00,  0.00000000e+00,
         1.20000000e+01,  2.00000000e+00,  2.00000000e+00,
         2.00000000e+00,  7.00000000e+00,  3.00000000e+0

In [11]:
predicted_data = LR_model.predict(test_data_prepared)
predicted_data

array([134923.52882537, 125689.52882537, 217357.52882537, 134045.52882537,
       188645.52882537])

In [12]:
pd.DataFrame({'Prognoz':predicted_data, 'Real baxosi': test_label})

Unnamed: 0,Prognoz,Real baxosi
1231,134923.528825,132500
194,125689.528825,127000
258,217357.528825,231500
944,134045.528825,137500
466,188645.528825,167000


In [13]:
test_set

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
892,893,20,RL,70.0,8414,Pave,Not,Reg,Lvl,AllPub,...,0,Not,MnPrv,Not,0,2,2006,WD,Normal,154500
1105,1106,60,RL,98.0,12256,Pave,Not,IR1,Lvl,AllPub,...,0,Not,Not,Not,0,4,2010,WD,Normal,325000
413,414,30,RM,56.0,8960,Pave,Grvl,Reg,Lvl,AllPub,...,0,Not,Not,Not,0,3,2010,WD,Normal,115000
522,523,50,RM,50.0,5000,Pave,Not,Reg,Lvl,AllPub,...,0,Not,Not,Not,0,10,2006,WD,Normal,159000
1036,1037,20,RL,89.0,12898,Pave,Not,IR1,HLS,AllPub,...,0,Not,Not,Not,0,9,2009,WD,Normal,315500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479,480,30,RM,50.0,5925,Pave,Not,Reg,Bnk,AllPub,...,0,Not,MnPrv,Not,0,3,2007,WD,Alloca,89471
1361,1362,20,RL,124.0,16158,Pave,Not,IR1,Low,AllPub,...,0,Not,Not,Not,0,6,2009,WD,Normal,260000
802,803,60,RL,63.0,8199,Pave,Not,Reg,Lvl,AllPub,...,0,Not,Not,Not,0,10,2008,WD,Normal,189000
651,652,70,RL,60.0,9084,Pave,Not,Reg,Lvl,AllPub,...,0,Not,MnPrv,Not,0,10,2009,WD,Normal,108000


In [14]:
X_test = test_set.drop('SalePrice', axis=1)
X_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
892,893,20,RL,70.0,8414,Pave,Not,Reg,Lvl,AllPub,...,0,0,Not,MnPrv,Not,0,2,2006,WD,Normal
1105,1106,60,RL,98.0,12256,Pave,Not,IR1,Lvl,AllPub,...,0,0,Not,Not,Not,0,4,2010,WD,Normal
413,414,30,RM,56.0,8960,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,Not,Not,Not,0,3,2010,WD,Normal
522,523,50,RM,50.0,5000,Pave,Not,Reg,Lvl,AllPub,...,0,0,Not,Not,Not,0,10,2006,WD,Normal
1036,1037,20,RL,89.0,12898,Pave,Not,IR1,HLS,AllPub,...,0,0,Not,Not,Not,0,9,2009,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479,480,30,RM,50.0,5925,Pave,Not,Reg,Bnk,AllPub,...,0,0,Not,MnPrv,Not,0,3,2007,WD,Alloca
1361,1362,20,RL,124.0,16158,Pave,Not,IR1,Low,AllPub,...,0,0,Not,Not,Not,0,6,2009,WD,Normal
802,803,60,RL,63.0,8199,Pave,Not,Reg,Lvl,AllPub,...,0,0,Not,Not,Not,0,10,2008,WD,Normal
651,652,70,RL,60.0,9084,Pave,Not,Reg,Lvl,AllPub,...,0,0,Not,MnPrv,Not,0,10,2009,WD,Normal


In [15]:
y_test = test_set['SalePrice'].copy()
y_test

892     154500
1105    325000
413     115000
522     159000
1036    315500
         ...  
479      89471
1361    260000
802     189000
651     108000
722     124500
Name: SalePrice, Length: 292, dtype: int64

In [16]:
X_test_prepared = full_pipeline.fit_transform(X_test)
y_predicted = LR_model.predict(X_test_prepared)

In [17]:
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(y_test, y_predicted)
# RMSE hisoblaymiz
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

4171012858567299.0


In [18]:
from sklearn.tree import DecisionTreeRegressor
Tree_model = DecisionTreeRegressor()
Tree_model.fit(X_prepared, y)

In [19]:
y_predicted = Tree_model.predict(X_test_prepared)

In [20]:
lin_mse = mean_squared_error(y_test, y_predicted)
# RMSE hisoblaymiz
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

44572.19600681419


In [21]:
from sklearn.ensemble import RandomForestRegressor
RF_model = RandomForestRegressor()
RF_model.fit(X_prepared, y)

In [22]:
y_predicted = RF_model.predict(X_test_prepared)
lin_mse = mean_squared_error(y_test, y_predicted)
# RMSE hisoblaymiz
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

28783.22278168475


In [23]:
from xgboost import XGBRegressor
XGB_model = XGBRegressor()
XGB_model.fit(X_prepared, y)

In [24]:
y_predicted = XGB_model.predict(X_test_prepared)
lin_mse = mean_squared_error(y_test, y_predicted)
# RMSE hisoblaymiz
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

30203.955229394982


In [25]:
from sklearn.neighbors import KNeighborsRegressor 
KNN_model = KNeighborsRegressor()
KNN_model.fit(X_prepared, y)

In [26]:
y_predicted = KNN_model.predict(X_test_prepared)
lin_mse = mean_squared_error(y_test, y_predicted)
# RMSE hisoblaymiz
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

43819.97175345932


In [27]:
X = df.drop("SalePrice", axis=1)
y = df["SalePrice"].copy()

X_prepared = full_pipeline.fit_transform(X)

In [28]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std.dev:", scores.std())

In [29]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(LR_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [2.45718599e+04 3.22397090e+04 3.03469284e+04 4.37977523e+04
 3.65230872e+04 2.81366077e+04 7.80252147e+11 2.73301255e+04
 6.58164319e+04 3.54159338e+04]
Mean: 78025247150.14136
Std.dev: 234075633390.94577


In [30]:
scores = cross_val_score(Tree_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [37326.95582063 37325.10590568 32451.79758007 52646.45847258
 40310.08620782 27857.0350395  32778.19682632 40884.6617979
 52822.70410119 36884.56178125]
Mean: 39128.756353294244
Std.dev: 7734.53524898703


In [31]:
scores = cross_val_score(RF_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [25512.42251499 26575.69717281 21612.60084733 38331.85632038
 34125.97104858 26108.4730084  24189.91294438 24092.701776
 39478.51731289 28063.32607   ]
Mean: 28809.147901575452
Std.dev: 5931.821881225616


In [32]:
scores = cross_val_score(XGB_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [23437.44559822 25101.84931749 22734.21456633 40725.29098788
 31074.55141785 25313.53827175 24022.9864022  24958.79280016
 36297.41236315 26226.49592878]
Mean: 27989.257765381695
Std.dev: 5767.186862709019


In [33]:
scores = cross_val_score(KNN_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [28415.40134387 37145.43387409 34272.27055589 42937.05874349
 52054.77851207 41650.90198226 33162.70363151 37882.46825323
 57602.06282    30969.39694939]
Mean: 39609.24766658034
Std.dev: 8791.575116063674


In [34]:
URL = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
df_test = pd.read_csv(URL)
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [35]:
def replace_nan(df_test):
    for column in df_test.columns:
        if df_test[column].dtype == 'float64' or df_test[column].dtype == 'int64':
            df_test[column].fillna(0, inplace=True)
        elif df_test[column].dtype == 'object':
            df_test[column].fillna("Not", inplace=True)
            # Convert the column to string
            df_test[column] = df[column].astype(str)
    return df_test

df_cleaned = replace_nan(df_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[column].fillna("Not", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always be

In [36]:
aviaprice_prepared = full_pipeline.fit_transform(df_cleaned)
aviaprice_prepared

array([[-1.73086406, -0.87471081,  0.68484859, ...,  1.        ,
         8.        ,  4.        ],
       [-1.72848977, -0.87471081,  0.71585154, ...,  1.        ,
         8.        ,  4.        ],
       [-1.72611547,  0.06135085,  0.49883089, ...,  1.        ,
         8.        ,  4.        ],
       ...,
       [ 1.72611547, -0.87471081,  3.16508463, ...,  1.        ,
         8.        ,  4.        ],
       [ 1.72848977,  0.64638939,  0.12679548, ...,  3.        ,
         8.        ,  4.        ],
       [ 1.73086406,  0.06135085,  0.49883089, ...,  1.        ,
         8.        ,  4.        ]])

In [37]:
y_predicted = RF_model.predict(aviaprice_prepared)

In [38]:
URL = "/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv"
df_sample = pd.read_csv(URL)
df_sample['price'] = pd.DataFrame(y_predicted)
df_sample

Unnamed: 0,Id,SalePrice,price
0,1461,169277.052498,131618.00
1,1462,187758.393989,158498.50
2,1463,183583.683570,189988.40
3,1464,179317.477511,192159.17
4,1465,150730.079977,204978.37
...,...,...,...
1454,2915,167081.220949,93752.00
1455,2916,164788.778231,98535.00
1456,2917,219222.423400,154776.61
1457,2918,184924.279659,122520.33


In [39]:
df_sample.to_csv('submission.csv')