## Data cleaning 

In [572]:
#Import all relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics

 
## This statement allows the visuals to render within your Jupyter Notebook.
%matplotlib inline

In [573]:
houses_train = pd.read_csv("data/train.csv")

In [574]:
test_dataset = pd.read_csv("data/test.csv")

In [575]:
sale_price = pd.read_csv("data/sample_submission.csv")

In [576]:
houses_train.isnull().sum().sort_values(ascending=False).head(30)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
Id                 0
Functional         0
Fireplaces         0
KitchenQual        0
KitchenAbvGr       0
BedroomAbvGr       0
HalfBath           0
FullBath           0
BsmtHalfBath       0
TotRmsAbvGrd       0
GarageCars         0
dtype: int64

In [577]:
columns_to_drop = ["MiscFeature", "Alley", "Fence","PoolQC"]
houses_train.drop(columns=columns_to_drop, inplace=True)

In [578]:
houses_train["MasVnrType"].fillna(houses_train["MasVnrType"].mode(),inplace=True)
houses_train["BsmtExposure"].fillna(houses_train["BsmtExposure"].mode(),inplace=True)
houses_train["FireplaceQu"].fillna(houses_train["FireplaceQu"].mode(),inplace=True)
houses_train["BsmtFinType2"].fillna(houses_train["BsmtFinType2"].mode(),inplace=True)
houses_train.fillna(0,inplace=True)
       

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  houses_train["MasVnrType"].fillna(houses_train["MasVnrType"].mode(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  houses_train["BsmtExposure"].fillna(houses_train["BsmtExposure"].mode(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will

### Outliers

-------------------

In [579]:
# Compute Z-scores for the "GrLivArea" column
houses_train["p_z_score"] = zscore(houses_train["GrLivArea"])



# Identify outliers (|Z| > 3)
# I assume this is outliers so i will drop it from the dataset

indexs = houses_train[houses_train["p_z_score"].abs() > 3].sort_values('GrLivArea').index

houses_train.drop(indexs,inplace=True)
del houses_train["p_z_score"]


In [580]:
# Compute Z-scores for the "TotRmsAbvGrd" column
houses_train["p_z_score"] = zscore(houses_train["TotRmsAbvGrd"])



# Identify outliers (|Z| > 3)
# I assume this is outliers so i will drop it from the dataset

indexs = houses_train[houses_train["p_z_score"].abs() > 3].sort_values('TotRmsAbvGrd').index

houses_train.drop(indexs,inplace=True)
del houses_train["p_z_score"]


In [581]:
# Compute Z-scores for the "TotRmsAbvGrd" column
houses_train["p_z_score"] = zscore(houses_train["WoodDeckSF"])



# Identify outliers (|Z| > 3)
# I assume this is outliers so i will drop it from the dataset

indexs =houses_train[houses_train["p_z_score"].abs() > 4]['WoodDeckSF'].sort_values().index

houses_train.drop(indexs,inplace=True)
del houses_train["p_z_score"]


In [582]:
# Compute Z-scores for the "YearRemodAdd" column
houses_train["p_z_score"] = zscore(houses_train["YearRemodAdd"])



# Identify outliers (|Z| > 3)
# I assume this is outliers so i will drop it from the dataset

indexs = houses_train[houses_train["p_z_score"].abs() > 3].sort_values('YearRemodAdd').index

houses_train.drop(indexs,inplace=True)
del houses_train["p_z_score"]

In [583]:
houses_train.shape

(1430, 77)

In [584]:
# Identify numerical columns
numerical_features = houses_train.select_dtypes(include=['number']).columns.tolist()

# Display the numerical columns
#numerical_features

## Feature engineering: 


In [585]:
categorical_features = houses_train.select_dtypes(include=object).columns
numeric_features = houses_train.select_dtypes(include=np.number).columns
categorical_features

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

In [586]:
count_line = 5

In [587]:
counts = houses_train['SaleType'].value_counts()


houses_train = houses_train[houses_train['SaleType'].isin(counts[counts >count_line].index)]

In [588]:
houses_train.shape

(1411, 77)

In [589]:
counts = houses_train['GarageCond'].value_counts()


houses_train = houses_train[houses_train['GarageCond'].isin(counts[counts >count_line].index)]

In [590]:
counts = houses_train['Functional'].value_counts()


houses_train = houses_train[houses_train['Functional'].isin(counts[counts >count_line].index)]

In [591]:
counts = houses_train['Electrical'].value_counts()


houses_train = houses_train[houses_train['Electrical'].isin(counts[counts >count_line].index)]

In [592]:
counts = houses_train['HeatingQC'].value_counts()


houses_train = houses_train[houses_train['HeatingQC'].isin(counts[counts >count_line].index)]

In [593]:
counts = houses_train['Heating'].value_counts()


houses_train = houses_train[houses_train['Heating'].isin(counts[counts >count_line].index)]

In [594]:
counts = houses_train['BsmtCond'].value_counts()


houses_train = houses_train[houses_train['BsmtCond'].isin(counts[counts >count_line].index)]

In [595]:
counts = houses_train['Foundation'].value_counts()


houses_train = houses_train[houses_train['Foundation'].isin(counts[counts >count_line].index)]

In [596]:
counts = houses_train['ExterCond'].value_counts()


houses_train = houses_train[houses_train['ExterCond'].isin(counts[counts >count_line].index)]

In [597]:
counts = houses_train['Exterior2nd'].value_counts()


houses_train = houses_train[houses_train['Exterior2nd'].isin(counts[counts >count_line].index)]

In [598]:
counts = houses_train['Exterior1st'].value_counts()


houses_train = houses_train[houses_train['Exterior1st'].isin(counts[counts >count_line].index)]

In [599]:
counts = houses_train['RoofMatl'].value_counts()


houses_train = houses_train[houses_train['RoofMatl'].isin(counts[counts >count_line].index)]

In [600]:
counts = houses_train['RoofStyle'].value_counts()


houses_train = houses_train[houses_train['RoofStyle'].isin(counts[counts >count_line].index)]

In [601]:
counts = houses_train['HouseStyle'].value_counts()


houses_train = houses_train[houses_train['HouseStyle'].isin(counts[counts >count_line].index)]

In [602]:
counts = houses_train['Condition2'].value_counts()


houses_train = houses_train[houses_train['Condition2'].isin(counts[counts >count_line].index)]

In [603]:
counts = houses_train['Condition1'].value_counts()


houses_train = houses_train[houses_train['Condition1'].isin(counts[counts >count_line].index)]

In [604]:
counts = houses_train['Neighborhood'].value_counts()


houses_train = houses_train[houses_train['Neighborhood'].isin(counts[counts >count_line].index)]

In [605]:
counts = houses_train['Utilities'].value_counts()


houses_train = houses_train[houses_train['Utilities'].isin(counts[counts >count_line].index)]

In [606]:
counts = houses_train['GarageCond'].value_counts()


houses_train = houses_train[houses_train['GarageCond'].isin(counts[counts >count_line].index)]

In [607]:
houses_train.shape

(1327, 77)

## 3. Feature selection


In [608]:
# one hot coding for categorical_features
houses_train = pd.get_dummies(houses_train,columns=categorical_features)

In [609]:
correlation = houses_train.corr()
correlation['SalePrice'].sort_values(ascending=False)

SalePrice           1.000000
OverallQual         0.799302
GrLivArea           0.732907
GarageCars          0.658731
TotalBsmtSF         0.642926
                      ...   
FireplaceQu_0      -0.470843
BsmtQual_TA        -0.495020
KitchenQual_TA     -0.538355
ExterQual_TA       -0.619393
Utilities_AllPub         NaN
Name: SalePrice, Length: 237, dtype: float64

In [610]:
# Set the correlation threshold
threshold = 0.3  # You can change this value based on your requirement

# Filter the correlations
# We use `abs()` for absolute value to consider both strong positive and negative correlations
selected_features = correlation[abs(correlation['SalePrice']) > threshold]['SalePrice'].index
selected_features

Index(['OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'SalePrice', 'MSZoning_RM', 'Neighborhood_NridgHt',
       'Exterior1st_VinylSd', 'Exterior2nd_VinylSd', 'MasVnrType_0',
       'MasVnrType_Stone', 'ExterQual_Ex', 'ExterQual_Gd', 'ExterQual_TA',
       'Foundation_CBlock', 'Foundation_PConc', 'BsmtQual_Ex', 'BsmtQual_TA',
       'BsmtFinType1_GLQ', 'HeatingQC_Ex', 'HeatingQC_TA', 'KitchenQual_Ex',
       'KitchenQual_Gd', 'KitchenQual_TA', 'FireplaceQu_0', 'FireplaceQu_Gd',
       'GarageType_Attchd', 'GarageType_Detchd', 'GarageFinish_Fin',
       'GarageFinish_Unf', 'SaleType_New', 'SaleCondition_Partial'],
      dtype='object')

In [611]:
df_houses_train = houses_train[selected_features]
df_houses_train.head()

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,...,KitchenQual_Gd,KitchenQual_TA,FireplaceQu_0,FireplaceQu_Gd,GarageType_Attchd,GarageType_Detchd,GarageFinish_Fin,GarageFinish_Unf,SaleType_New,SaleCondition_Partial
0,7,2003,2003,196.0,706,856,856,1710,2,8,...,True,False,False,True,True,False,False,False,False,False
1,6,1976,1976,0.0,978,1262,1262,1262,2,6,...,False,True,False,False,True,False,False,False,False,False
2,7,2001,2002,162.0,486,920,920,1786,2,6,...,True,False,False,False,True,False,False,False,False,False
3,7,1915,1970,0.0,216,756,961,1717,1,7,...,True,False,False,True,False,True,False,True,False,False
4,8,2000,2000,350.0,655,1145,1145,2198,2,9,...,True,False,False,False,True,False,False,False,False,False


In [612]:
df_houses_train.shape

(1327, 43)

## Prepare train and test data


In [615]:
# Prepare data
X = df_houses_train.drop(["SalePrice"], axis=1)
y = df_houses_train["SalePrice"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    random_state=99)

# sacle the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

KeyError: "['Id'] not found in axis"

In [None]:
X.shape

(1327, 42)

In [None]:
y.shape

(1327,)

## Model

In [None]:
from sklearn.ensemble import GradientBoostingRegressor



In [None]:
gbr = GradientBoostingRegressor(
    n_estimators=100,  # Number of boosting stages
    learning_rate=0.1,  # Learning rate
    max_depth=3,  # Maximum depth of the individual regression estimators
    random_state=42
)

In [None]:
gbr.fit(X_train, y_train)

In [None]:
y_pred = gbr.predict(X_test)


In [None]:
mse = mean_squared_error(y_test, y_pred)
r2_train = r2_score(y_test, y_pred)
r2_test = r2_score(y_train, gbr.predict(X_train))


print(f"Kernel Ridge Regression R² Score (Train): {r2_train:.4f}")
print(f"Kernel Ridge Regression R² Score (Test): {r2_test:.4f}")
print(f"Mean Squared Error: {mse}")

Kernel Ridge Regression R² Score (Train): 0.9139
Kernel Ridge Regression R² Score (Test): 0.9578
Mean Squared Error: 480324228.1387551


## Submission


In [None]:
X_test_dataset = test_dataset.drop(columns=['Id'])
X_test_dataset["MasVnrType"].fillna(X_test_dataset["MasVnrType"].mode(),inplace=True)
X_test_dataset["BsmtExposure"].fillna(X_test_dataset["BsmtExposure"].mode(),inplace=True)
X_test_dataset["FireplaceQu"].fillna(X_test_dataset["FireplaceQu"].mode(),inplace=True)
X_test_dataset["BsmtFinType2"].fillna("NA",inplace=True)
X_test_dataset.fillna(0,inplace=True)

columns_to_drop = ["MiscFeature", "Alley", "Fence",'PoolQC']
X_test_dataset.drop(columns=columns_to_drop, inplace=True)

#y_pred = best_rf.predict(X_test_dataset)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test_dataset["MasVnrType"].fillna(X_test_dataset["MasVnrType"].mode(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test_dataset["BsmtExposure"].fillna(X_test_dataset["BsmtExposure"].mode(),inplace=True)
The behavior will change in pandas 3.0. This inplace met

In [None]:
X_test_dataset = pd.get_dummies(X_test_dataset,columns=X_test_dataset.select_dtypes(include=object).columns)

In [None]:
X_test_dataset = X_test_dataset[selected_features.drop('SalePrice',)]

In [None]:
y_pred = gbr.predict(X_test_dataset)

In [None]:
y = sale_price['SalePrice']

mse = mean_squared_error(y, y_pred)
r2_train = r2_score(y, y_pred)
#r2_test = r2_score(y_train, gbr.predict(X_train))


print(f"Kernel Ridge Regression R² Score (Train): {r2_train:.4f}")
#print(f"Kernel Ridge Regression R² Score (Test): {r2_test:.4f}")
print(f"Mean Squared Error: {mse}")

Kernel Ridge Regression R² Score (Train): -18.7657
Mean Squared Error: 5389467903.469452


In [None]:
y_pred = gbr.predict(X_test_dataset)

In [None]:
predictions = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': y_pred})
predictions.to_csv('submission.csv',index=False)