<h1>Kwesi's House Price Prediction</h1>

**Step 1: Importing libraries and metrics**

In [57]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures

**Step 2: Loading the data**


In [58]:
# Read the training split of the Kaggle house-prices competition into a DataFrame.
train_csv = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
df = pd.read_csv(train_csv)


**step 3:EDA (Exploratory Data Analysis)**

In [59]:
# Preview the first 10 rows of the loaded data.
df.head(n=10)

In [60]:
# Preview the last 10 rows of the loaded data.
df.tail(n=10)

In [61]:
# Size of the data as a (rows, columns) tuple.
df.shape

In [62]:
# Per-column dtype and non-null count; a column's missing values are the
# difference between its non-null count and the total number of rows.
df.info()

In [63]:
# Statistical summary of the numerical columns, transposed so that each
# row of the output describes one column of df.
df.describe().transpose()

In [64]:
# Heatmap of the pairwise correlations between the numerical columns.
# The frame also holds categorical (text) columns such as MSZoning, and
# DataFrame.corr() raises on those in pandas >= 2.0, so restrict the frame
# to numeric dtypes explicitly before correlating.
numeric_corr = df.select_dtypes(include="number").corr()

plt.figure(figsize=(10,10))
sns.heatmap(numeric_corr, cmap="RdBu")
plt.title("Correlations between numerical variables", size=15)
plt.show()

**step 4:Feature selection**

In [65]:

# Feature selection.
#   n_cols: numerical columns strongly correlated with SalePrice
#           (correlation > 0.5 or < -0.5). SalePrice itself is part of this
#           list because its correlation with itself is 1.
#   c_cols: categorical columns, hand-picked as the ones expected to affect
#           the sale price in real life.
# The correlation with the target is computed once, on numeric dtypes only:
# DataFrame.corr() raises on text columns in pandas >= 2.0.
target_corr = df.select_dtypes(include="number").corr()["SalePrice"]
n_cols = list(target_corr[target_corr.abs() > 0.50].index)
c_cols = ["MSZoning", "Utilities","BldgType","Heating","KitchenQual","SaleCondition","LandSlope"]
final_cols = n_cols + c_cols

final_cols

In [66]:
# Keep only the selected numerical and categorical columns (target included).
df = df.loc[:, final_cols]

In [67]:
# Count the missing values per column now that only the selected features remain.
df.isna().sum()

**step 5:Data Visualization again after getting important features**

In [68]:
# Pairwise plots of the selected numerical columns.
# pairplot returns a grid object that owns its own figure. plt.title() would
# only label a single subplot of that grid, so the title is set on the
# figure itself (y > 1 keeps it clear of the top row of subplots).
pair_grid = sns.pairplot(df[n_cols])
pair_grid.fig.suptitle("Correlations between numerical variables after getting important features", size=15, y=1.02)
plt.show()


In [69]:
# KDE joint plots of each selected numerical feature against the target.
# jointplot builds a new figure on every call, so no plt.figure() is created
# up front: it would only show up as an extra, empty figure in the output.
joint_features = [
    "OverallQual",
    "YearBuilt",
    "YearRemodAdd",
    "TotalBsmtSF",
    "1stFlrSF",
    "GrLivArea",
    "FullBath",
    "TotRmsAbvGrd",
    "GarageCars",
    "GarageArea",
]
for feature in joint_features:
    sns.jointplot(x=df[feature], y=df["SalePrice"], kind="kde")
plt.show()

**Step 6: Splitting the data into the target variable (y) and the predictor variables (x)**

In [70]:
# Separate the target column (y) from the predictor columns (x).
y = df["SalePrice"]
x = df.drop(columns="SalePrice")

**step 7: one hot encoding for categorical variables**

In [71]:
# One-hot encode the categorical columns of x; every other column is kept as is.
X = pd.get_dummies(data=x, columns=c_cols)

**step 8: standardizing the data**

In [72]:
# Standardize the numerical predictors with StandardScaler.
# SalePrice is the target and is not a column of X, so it is dropped from the
# list of columns to scale. The list is rebuilt instead of calling
# list.remove(), which raised ValueError when this cell was run a second time.
n_cols = [col for col in n_cols if col != "SalePrice"]

# NOTE(review): the scaler is fitted on every row before the train/test split
# in step 9, so statistics of the test rows leak into training. Consider
# fitting it on the training rows only.
Sscaler = StandardScaler()
X[n_cols] = Sscaler.fit_transform(X[n_cols])

In [73]:
# Preview the first 5 rows of the encoded and scaled feature matrix.
X.head(n=5)

**step 9: Train and Test Splitting**

In [74]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**Step 10: ML models (linear and non-linear)**

In [75]:
#functions for model comparison and evaluations

# Cross-validated RMSE
def rmsecv(mlmodel):
    """Return the mean RMSE of ``mlmodel`` over 5-fold cross-validation.

    The folds are taken from the notebook-level ``X`` and ``y`` (all rows,
    not only the training split). The scorer yields negated MSE values, one
    per fold, so the sign is flipped before the square root is taken; the
    per-fold RMSE values are then averaged.
    """
    neg_mse_per_fold = cross_val_score(
        mlmodel, X, y, scoring="neg_mean_squared_error", cv=5
    )
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()
    
# Evaluation metrics for a set of predictions
def evalu(y, predics):
    """Score the predictions ``predics`` against the true values ``y``.

    Returns the tuple ``(mae, mse, rmse, r_squared)``, where ``rmse`` is the
    square root of ``mse``.
    """
    mse = mean_squared_error(y, predics)
    return (
        mean_absolute_error(y, predics),
        mse,
        np.sqrt(mse),
        r2_score(y, predics),
    )

In [76]:
# Empty comparison table; each model cell adds one row of scores to it.
result_columns = [
    "Model",
    "MAE",
    "MSE",
    "RMSE",
    "R2 Score",
    "RMSE (Cross-Validation)",
]
models = pd.DataFrame(columns=result_columns)

**Model 1: Linear Regression**

In [77]:
# Model 1: ordinary linear regression, fitted on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

# Metrics on the held-out test split.
mae, mse, rmse, r_squared = evalu(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)

# Mean RMSE over 5-fold cross-validation.
rmse_cross_val = rmsecv(lr)
print("RMSE Cross-Validation:", rmse_cross_val)

# Add the scores to the comparison table. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "LinearRegression","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

**Model 2: Ridge Regression**

In [78]:
# Model 2: ridge regression with scikit-learn's default settings.
ridge = Ridge()
ridge.fit(X_train, y_train)
predictions = ridge.predict(X_test)

# Metrics on the held-out test split.
mae, mse, rmse, r_squared = evalu(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)

# Mean RMSE over 5-fold cross-validation.
rmse_cross_val = rmsecv(ridge)
print("RMSE Cross-Validation:", rmse_cross_val)

# Add the scores to the comparison table. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "Ridge","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

**Model 3: Lasso Regression**

In [79]:
# Model 3: lasso regression with scikit-learn's default settings.
lasso = Lasso()
lasso.fit(X_train, y_train)
predictions = lasso.predict(X_test)

# Metrics on the held-out test split.
mae, mse, rmse, r_squared = evalu(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)

# Mean RMSE over 5-fold cross-validation.
rmse_cross_val = rmsecv(lasso)
print("RMSE Cross-Validation:", rmse_cross_val)

# Add the scores to the comparison table. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "Lasso","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

**Model 4: Elastic Net**

In [80]:
# Model 4: elastic net with scikit-learn's default settings.
elastic_net = ElasticNet()
elastic_net.fit(X_train, y_train)
predictions = elastic_net.predict(X_test)

# Metrics on the held-out test split.
mae, mse, rmse, r_squared = evalu(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)

# Mean RMSE over 5-fold cross-validation.
rmse_cross_val = rmsecv(elastic_net)
print("RMSE Cross-Validation:", rmse_cross_val)

# Add the scores to the comparison table. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "ElasticNet","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

**Model 5: Support Vector Machines**

In [81]:
# Model 5: support vector regression.
# NOTE(review): C=100000 is a very large penalty (weak regularization),
# presumably because the target is left unscaled — confirm the choice.
svr = SVR(C=100000)
svr.fit(X_train, y_train)
predictions = svr.predict(X_test)

# Metrics on the held-out test split.
mae, mse, rmse, r_squared = evalu(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)

# Mean RMSE over 5-fold cross-validation.
rmse_cross_val = rmsecv(svr)
print("RMSE Cross-Validation:", rmse_cross_val)

# Add the scores to the comparison table. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "SVR","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

**Model 6: Random Forest Regressor**

In [82]:
# Random forest with 100 trees. The forest is randomized, so the seed is
# fixed (same value as the train/test split) to make the reported scores
# reproducible from one run of the notebook to the next.
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)
predictions = random_forest.predict(X_test)

# Metrics on the held-out test split.
mae, mse, rmse, r_squared = evalu(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)

# Mean RMSE over 5-fold cross-validation.
rmse_cross_val = rmsecv(random_forest)
print("RMSE Cross-Validation:", rmse_cross_val)

# Add the scores to the comparison table. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "RandomForestRegressor","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

**Model 7: XGBoost Regressor**


In [83]:
# Gradient-boosted trees: 1000 boosting rounds with a learning rate of 0.01.
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.01)
xgb.fit(X_train, y_train)
predictions = xgb.predict(X_test)

# Metrics on the held-out test split.
mae, mse, rmse, r_squared = evalu(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)

# Mean RMSE over 5-fold cross-validation.
rmse_cross_val = rmsecv(xgb)
print("RMSE Cross-Validation:", rmse_cross_val)

# Add the scores to the comparison table. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "XGBRegressor","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

**Model 8: Polynomial Regression of Degree=2**


In [84]:
# Degree-2 polynomial regression: expand the features into all terms up to
# degree 2, then fit an ordinary linear regression on the expanded matrix.
poly_reg = PolynomialFeatures(degree=2)
X_train_2d = poly_reg.fit_transform(X_train)
X_test_2d = poly_reg.transform(X_test)

lrgg = LinearRegression()
lrgg.fit(X_train_2d, y_train)
predictions = lrgg.predict(X_test_2d)

# Metrics on the held-out test split.
mae, mse, rmse, r_squared = evalu(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)

# Cross-validate on the polynomial features. rmsecv() always scores a model
# against the un-expanded X, which here would report the cross-validated RMSE
# of a plain linear regression under the polynomial model's name. The same
# scoring (5 folds, mean of per-fold RMSE) is therefore applied directly to
# the expanded matrix. The expansion does not depend on the data values, so
# transforming all rows up front does not leak information between folds.
X_2d = poly_reg.transform(X)
rmse_cross_val = np.sqrt(-cross_val_score(lrgg, X_2d, y, scoring="neg_mean_squared_error", cv=5)).mean()
print("RMSE Cross-Validation:", rmse_cross_val)

# Add the scores to the comparison table. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "Polynomial Regression (degree=2)","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

**Step 11: Comparing all 8 models used**


In [85]:
# Rank the models by cross-validated RMSE; the lower the RMSE, the better the model.
cv_rmse_column = "RMSE (Cross-Validation)"
models.sort_values(cv_rmse_column, ascending=True)

In [86]:
# Bar chart of the cross-validated RMSE of every model in the comparison table.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=models["Model"], y=models["RMSE (Cross-Validation)"], ax=ax)
ax.set_title("Bar chart based on Models' RMSE Scores (Cross-Validated)", size=15)
plt.xticks(rotation=30, size=12)
plt.show()