In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

**Exploring Data**

In [None]:
# Read the training CSV and use its 'Id' column as the index, in one chained expression.
train = (
    pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv')
    .set_index('Id')
)

In [None]:
train.head()

In [None]:
# Read the test CSV from the same Kaggle input directory as train.csv.
test = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')

In [None]:
# Use the 'Id' column as the index, as was done for `train`.
# NOTE: re-running this cell on its own fails, because 'Id' is no longer a column.
test = test.set_index('Id')

In [None]:
test.head()

**Concatenating x and x_test** - so we don't need to repeat the same work for the train and test data individually

In [None]:
# Stack the train features (target removed) on top of the test rows so that
# every cleaning step below is applied to both at once; keep the target aside.
x = pd.concat([train.drop(columns='SalePrice'), test])
y = train['SalePrice']

In [None]:
x.shape

In [None]:
# Names of the columns of the combined frame that contain at least one missing value.
has_missing = x.isnull().any()
nan_cols = has_missing[has_missing].index.tolist()
nan_cols

See the data_description.txt file, which already specifies what a missing value means for the columns listed below. [Dataset description](https://www.kaggle.com/competitions/home-data-for-ml-course/data)

In [None]:
# Columns in which a NaN is treated as "the feature is absent" and is later
# filled with the string 'None' (per the notebook's reading of data_description.txt).
none_cols = [
    'Alley', 'PoolQC', 'MiscFeature', 'Fence', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType'
]

# Columns in which a NaN is treated as the number 0 and is later filled with 0.
zero_cols = [
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
     'GarageArea', 'GarageCars', 'MasVnrArea'
]

In [None]:
# Fill the columns listed above with their placeholder values in x.
# The filled columns are assigned back to `x` rather than calling an in-place
# method on `x[col]`: an in-place call on a selected column is a chained
# assignment, which pandas warns about and which leaves `x` untouched once
# Copy-on-Write is enabled.
x[zero_cols] = x[zero_cols].fillna(0)
x[none_cols] = x[none_cols].fillna('None')

In [None]:
# MSSubClass - data_description.txt specifies that this is the type of dwelling
# involved in the sale (i.e. it is a categorical column, not a numerical one),
# so its values are stored as strings.
x['MSSubClass'] = x['MSSubClass'].astype(str)

In [None]:
# Re-check which columns still contain missing values after the fills above.
still_missing = x.isnull().any()
nan_cols = still_missing[still_missing].index.tolist()
nan_cols

In [None]:
x.shape

**Numerical cols**

In [None]:
# One histogram per int64/float64 column of the combined frame, on a 10x4 grid.
numerical_ = [name for name, dtype in x.dtypes.items() if dtype in ('float64', 'int64')]
print(len(numerical_))
fig = plt.figure(figsize=(18,16))
for position, col in enumerate(numerical_, start=1):
    plt.subplot(10, 4, position)
    sns.histplot(x[col], kde=False)
    plt.xticks(rotation=90)
fig.tight_layout(pad=1.0)

In [None]:
# LowQualFinSF - will drop
# 3SsnPorch - will drop
# PoolArea - will drop and PoolQC will handle
# MiscVal - will drop

**Categorical cols**

In [None]:
# One count plot per object-dtype column of the combined frame, on a 9x5 grid.
cat_cols = [name for name, dtype in x.dtypes.items() if dtype == 'object']
cat_train = x[cat_cols]
fig = plt.figure(figsize=(18,20))
for position, col in enumerate(cat_cols, start=1):
    plt.subplot(9, 5, position)
    sns.countplot(x=cat_train[col], data=cat_train)
    plt.xticks(rotation=90)
fig.tight_layout(pad=1.0)

In [None]:
#Utilities - will drop
#Condition2 - will drop
#PoolQC - will drop
#Street -will drop

**Bi-variate Analysis**

In [None]:
# Correlation heatmap of the numeric features; every cell whose correlation is
# below 0.8 is masked out, so only the strongly correlated pairs remain visible.
numerical_ = [name for name, dtype in x.dtypes.items() if dtype in ('int64', 'float64')]
print(len(numerical_))
plt.figure(figsize=(9,9))
correlation = x[numerical_].corr()
below_threshold = correlation < 0.8
sns.heatmap(
    correlation,
    mask=below_threshold,
    linewidth=0.5,
    linecolor='#D3D3D3',
    cmap='Blues',
    vmin=0.6,
    vmax=1,
)

<!-- # GarageYrBlt and YearBuilt
# TotRmsAbvGrd and GrLivArea
# GarageArea and GarageCars -->

**highly correlated**
1. GarageYrBlt and YearBuilt
2. TotRmsAbvGrd and GrLivArea
3. GarageArea and GarageCars
4. TotalBsmtSF and 1stFlrSF

In [None]:
# Correlation of each member of the highly correlated pairs with SalePrice,
# computed on the training data and sorted from strongest to weakest.
cols = [
    'GarageYrBlt', 'YearBuilt',
    'TotRmsAbvGrd', 'GrLivArea',
    'GarageArea', 'GarageCars',
    'SalePrice',
    '1stFlrSF', 'TotalBsmtSF',
]
correlation = train[cols].corr()
correlation[['SalePrice']].sort_values(by='SalePrice', ascending=False)

Correlation with SalePrice
---
*  YearBuilt > GarageYrBlt
*  TotRmsAbvGrd < GrLivArea
*  GarageArea < GarageCars
*  1stFlrSF < TotalBsmtSF






Mutual information is a lot like correlation in that it measures a relationship between two quantities. The advantage of mutual information is that it can detect any kind of relationship, while correlation only detects linear
relationships.

In [None]:
# GarageYrBlt
# TotRmsAbvGrd
# GarageArea
# 1stFlrSF   -------------------------- these four columns will be removed

**Scatter**

In [None]:
# Scatter plot of every numeric training feature against the target, on a 10x5 grid.
fig = plt.figure(figsize=(20,20))
numeric_train = train[numerical_]
for position, col in enumerate(numerical_, start=1):
    plt.subplot(10, 5, position)
    sns.scatterplot(x=numeric_train[col], y=y, data=numeric_train)
fig.tight_layout(pad=1.0)

**We will also remove features that do not have any linear relationship with the target SalePrice.**

1. MoSold  
2. YrSold


In [None]:
# Box plot of every numeric training feature, on a 10x5 grid, to spot outliers.
fig = plt.figure(figsize=(20,20))
numeric_train = train[numerical_]
for position, col in enumerate(numerical_, start=1):
    plt.subplot(10, 5, position)
    sns.boxplot(y=numeric_train[col], data=numeric_train)
fig.tight_layout(pad=1.0)

**outliers:**
1. LotFrontage > 200
2. LotArea > 100000
3.  MasVnrArea > 1000 and SalePrice < 400000
4. BsmtFinSF1 > 2000 and SalePrice < 400000
5. TotalBsmtSF >4000
6. 1stFlrSF - will drop so no  need
7. GrLivArea > 4000

**Data preprocessing** - let's start the game :)

Dropping the columns in which a single value accounts for more than 96% of the rows.

In [None]:
#-----------------------Num_cols--------------------------------------
# A numeric column is listed when its single most frequent value
# (value_counts() is sorted by frequency, so row 0) covers more than 96% of all rows.
overfit_num = [
    col for col in numerical_
    if x[col].value_counts().iloc[0] / len(x) * 100 > 96
]

print('droped cols :', overfit_num)

In [None]:
#--------------------------cat_cols------------------------------------------
# A categorical column is listed when its single most frequent value
# (value_counts() is sorted by frequency, so row 0) covers more than 96% of all rows.
overfit_cat = [
    col for col in cat_cols
    if x[col].value_counts().iloc[0] / len(x) * 100 > 96
]

print('droped cols :', overfit_cat)

All the columns in overfit_cat are fine to drop, but it's worth noting that "PoolQC" and "MiscFeature" are significant features that can have a substantial impact on the price. Therefore, it may not be advisable to drop these two columns even though a single value dominates them.

In [None]:
# Columns removed from the combined frame because one value dominates them.
To_drop = [
    'LowQualFinSF', '3SsnPorch', 'PoolArea', 'MiscVal',
    'Street', 'Utilities', 'Condition2', 'RoofMatl', 'Heating',
]
x = x.drop(columns=To_drop)

In [None]:
#-----------------Bi-variate-------------------------------------------------
# Remove one member of each highly correlated feature pair found in the bi-variate analysis.
redundant_cols = ['GarageYrBlt', 'TotRmsAbvGrd', 'GarageArea', '1stFlrSF']
x = x.drop(columns=redundant_cols)

In [None]:
# Remove the sale month and sale year columns from the combined frame.
x = x.drop(columns=['MoSold', 'YrSold'])

**Outlier removal**

In [None]:
# Outlier rules (1stFlrSF needs none because that column is dropped):
#   LotFrontage > 200
#   LotArea > 100000
#   MasVnrArea > 1000 and SalePrice < 400000
#   BsmtFinSF1 > 2000 and SalePrice < 400000
#   TotalBsmtSF > 4000
#   GrLivArea > 4000
# The rows are dropped from the `train` frame only: dropping them from the
# combined frame `x` would also affect the test rows.
is_outlier = (
    (train['LotFrontage'] > 200)
    | (train['LotArea'] > 100000)
    | ((train['MasVnrArea'] > 1000) & (train['SalePrice'] < 400000))
    | ((train['BsmtFinSF1'] > 2000) & (train['SalePrice'] < 400000))
    | (train['TotalBsmtSF'] > 4000)
    | (train['GrLivArea'] > 4000)
)
train = train.drop(train[is_outlier].index)

**target log transformation**

In [None]:
# Replace the target by its natural log; predictions are mapped back with
# np.exp further down. NOTE: re-running this cell applies the log a second time.
y  = np.log(y)

**NaN values imputation**

In [None]:
# Split the columns that still contain NaNs into numeric and categorical ones.
missing_counts = x.isnull().sum()
cols_with_nan = missing_counts[missing_counts > 0].index
num_x_imp = [col for col in cols_with_nan if x[col].dtype in ['int64', 'float64']]
print('num cols:', num_x_imp)
cat_x_imp = [col for col in cols_with_nan if x[col].dtype in ['object']]
print('cat cols: ', cat_x_imp)

In [None]:
#KNN Imputer---------imputing for num_x_imp
from sklearn.impute import KNNImputer

# All numeric columns are handed to the imputer, not only those with NaNs.
# NOTE(review): `x` still holds train and test rows here, so the imputer is
# fitted on both - confirm this is intended.
num_cols = [name for name, dtype in x.dtypes.items() if dtype in ('int64', 'float64')]
imputer = KNNImputer(n_neighbors=2)
x[num_cols] = imputer.fit_transform(x[num_cols])


**Imputing for num_x_imp - you can also impute these values with the mean, like this:**


In [None]:
# for i in num_x_imp:
#   x[i] = x[i].fillna(x[i].mean())

In [None]:
# Fill each remaining categorical gap with that column's most frequent value.
for col in cat_x_imp:
    most_frequent = x[col].mode().iloc[0]
    x[col] = x[col].fillna(most_frequent)

In [None]:
x.shape

**Separating the train and test set**

In [None]:
# Split the combined frame back into its two parts by index label. `train.index`
# no longer contains the outlier rows, so they are left out of X and y here.
x_test = x.loc[test.index]
X = x.loc[train.index]
y = y.loc[train.index]
# From here on `x` refers to the training features only.
x = X

**num scaling , cat one hot encoding**

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Column names handed to the scaler (int64/float64) and to the encoder (object).
column_dtypes = x.dtypes
numerical_columns = [name for name, dtype in column_dtypes.items() if dtype in ('int64', 'float64')]
categorical_columns = [name for name, dtype in column_dtypes.items() if dtype == 'object']

**Just checking what will be The final size of the DataFrame after one-hot encoding**

In [None]:
# Number of distinct values per categorical column (one one-hot column each).
unique_counts = x[categorical_columns].nunique().to_dict()
print(unique_counts)

# Expected width after encoding: all one-hot columns plus the numeric columns.
total_unique_categories = sum(unique_counts.values())
final_num_columns = total_unique_categories + len(numerical_columns)
final_size = (x.shape[0], final_num_columns)

print(f"The final size of the DataFrame after one-hot encoding: {final_size}")


In [None]:
# Numeric branch: a single MinMaxScaler step.
numerical_scaler = Pipeline([('scaler', MinMaxScaler())])

# Categorical branch: a single OneHotEncoder step producing a dense array,
# with handle_unknown='ignore'.
categorical_encoder = Pipeline([
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])


In [None]:
# Route each group of columns to its own branch: scaler for the numeric
# columns, encoder for the categorical ones.
column_branches = [
    ('numerical_scaler', numerical_scaler, numerical_columns),
    ('categorical_encoder', categorical_encoder, categorical_columns),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [None]:
# Fit the preprocessor on `x` and replace `x` by the transformed result.
# NOTE: this overwrites the DataFrame, so the cell cannot be re-run on its own.
x = preprocessor.fit_transform(x)

In [None]:
x

In [None]:
x.shape

See same size :)

In [None]:
# Apply the already-fitted preprocessor to the test features (transform only, no refit).
x_test = preprocessor.transform(x_test)

In [None]:
# Names of the columns produced by the fitted preprocessor; used below to label the outputs.
output_column_names = preprocessor.get_feature_names_out()

In [None]:
x_test.shape

In [None]:
# Wrap the transformed outputs in DataFrames labelled with the preprocessor's column names.
# NOTE(review): no index is passed, so presumably both frames get a fresh
# 0..n-1 index instead of the original Ids - confirm nothing relies on the Ids.
x = pd.DataFrame(x, columns = output_column_names )
x_test = pd.DataFrame(x_test ,columns = output_column_names )

In [None]:
# Cast every column of both frames to float64, then display the training frame.
x = x.astype('float64')
x_test = x_test.astype('float64')
x

## **Model implementation**


In [None]:
# Empty results table: one row per model, filled in after the models are evaluated.
models = pd.DataFrame(columns=["Model","MAE","MSE","RMSE","R2 Score"])

**Different regression models**

In [None]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
#lightgbm
import lightgbm as lgb
from sklearn.model_selection import train_test_split


# Hold out 20% of the training rows for validation.
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

# One shared seed for every estimator that is given one (SVR is created without a seed).
random_state = 42

ridge_reg = Ridge(random_state=random_state)
lasso_reg = Lasso(random_state=random_state)
elastic_net_reg = ElasticNet(random_state=random_state)
decision_tree_reg = DecisionTreeRegressor(random_state=random_state)
random_forest_reg = RandomForestRegressor(random_state=random_state)
xgb_model = XGBRegressor(random_state=random_state)
svr_reg = SVR()
neural_network_reg = MLPRegressor(random_state=random_state)
lgb_reg = lgb.LGBMRegressor(random_state=random_state)

# Fit every baseline model on the training split, in creation order.
baseline_models = (
    ridge_reg,
    lasso_reg,
    elastic_net_reg,
    decision_tree_reg,
    random_forest_reg,
    xgb_model,
    svr_reg,
    neural_network_reg,
    lgb_reg,
)
for baseline_model in baseline_models:
    baseline_model.fit(x_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def evaluate_regressor(name, model, features, log_target):
    """Score one fitted model on the original price scale.

    Parameters
    ----------
    name : str
        Label stored in the "Model" column of the results table.
    model : fitted estimator
        Any object with a ``predict`` method that returns log-scale predictions.
    features : array-like
        Validation features passed to ``model.predict``.
    log_target : array-like
        Validation target on the log scale.

    Returns
    -------
    dict
        One results row with the keys "Model", "MAE", "MSE", "RMSE" and "R2 Score".
    """
    # Both sides are exponentiated so that the errors are expressed on the
    # scale of the original target rather than on its log.
    actual = np.exp(log_target)
    predicted = np.exp(model.predict(features))
    mse = mean_squared_error(actual, predicted)
    return {
        "Model": name,
        "MAE": mean_absolute_error(actual, predicted),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "R2 Score": r2_score(actual, predicted),
    }


# (label, fitted model) pairs, in the order the rows appear in the table.
fitted_models = [
    ("Ridge Regression", ridge_reg),
    ("Lasso Regression", lasso_reg),
    ("ElasticNet Regression", elastic_net_reg),
    ("Decision Tree Regression", decision_tree_reg),
    ("Random Forest Regression", random_forest_reg),
    ("Gradient Boosting Regression", xgb_model),
    ("Support Vector Regression", svr_reg),
    ("Neural Network Regression", neural_network_reg),
    ("LightGBM Regression", lgb_reg),
]

# Build the whole table in one go. DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row is quadratic anyway. Rebuilding the table from
# scratch also makes the cell safe to re-run (no duplicated rows).
models = pd.DataFrame(
    [evaluate_regressor(name, model, x_valid, y_valid) for name, model in fitted_models],
    columns=["Model", "MAE", "MSE", "RMSE", "R2 Score"],
)




In [None]:
models.sort_values(by="R2 Score", ascending=False)

In [None]:
from sklearn.model_selection import GridSearchCV

From here you can do hyperparameter tuning for each machine learning model. (I already did it once, so I know which combination of hyperparameters gives the best performance for each of my models; I don't want to run it again because it would be a waste of time.)

In [None]:
# # Define the parameter grid for Grid Search CV
# param_grid = {'alpha': [0.1,0.5,0.7 ,1.0,2,3,4,5,6,7,8,9, 10.0],
#               'solver': ['auto']}

# # Create a Ridge Regression model
# ridge = Ridge()

# # Create the Grid Search CV object
# grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=5)

# # Fit the Grid Search CV to the data
# grid_search.fit(x_train , y_train)

# # Get the best parameters and the best score
# best_params = grid_search.best_params_

# # Create a new Ridge Regression model with the best parameters
# best_ridge = Ridge(**best_params)

# # Fit the model with the best parameters to the data
# best_ridge.fit(x_train , y_train)

In [None]:
# Ridge model with hand-set hyperparameters (presumably the result of the
# commented-out grid search above), fitted on the training split.
best_ridge = Ridge(alpha = 2 , solver ='auto')
best_ridge.fit(x_train , y_train)

In [None]:
# # Define the parameter grid
# param_grid = {
#     'learning_rate': [0.05, 0.1, 0.2],
#     'n_estimators': [900,1000,1100],
#     'max_depth': [6, 8, 10],
#     'subsample': [0.8, 1.0],
#     'colsample_bytree': [0.8, 1.0]
# }

# model = XGBRegressor()
# # Create the GridSearchCV object
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# # Fit the GridSearchCV object to the data
# grid_search.fit(x_train, y_train)

# # Get the best parameters and best score
# best_params = grid_search.best_params_

# # Create a new Regression model with the best parameters
# best_XGB = XGBRegressor(**best_params)

# # Fit the model with the best parameters to the data
# best_XGB.fit(x_train , y_train)

In [None]:
# XGBoost model with hand-set hyperparameters (presumably the result of the
# commented-out grid search above), fitted on the training split.
best_XGB = XGBRegressor(learning_rate=0.05, n_estimators=1100 , max_depth=6 ,colsample_bytree=0.8 ,subsample = 0.8)
best_XGB.fit(x_train , y_train)

In [None]:
# # Define the LGBMRegressor model
# lgb_model = lgb.LGBMRegressor()

# # Define the parameter grid for GridSearchCV
# param_grid = {
#     'learning_rate': [0.05, 0.1, 0.2],
#     'n_estimators': [100,200,300],
#     'max_depth': [6, 8, 10],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0]
# }
# # Create the GridSearchCV object
# grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# # Fit the GridSearchCV object to the data
# grid_search.fit(x_train, y_train)

# # Get the best parameters and best score
# best_params = grid_search.best_params_

# # Create a new Regression model with the best parameters
# best_lgb = lgb.LGBMRegressor(**best_params)

# # Fit the model with the best parameters to the data
# best_lgb.fit(x_train , y_train)


In [None]:
# LightGBM model with hand-set hyperparameters (presumably the result of the
# commented-out grid search above), fitted on the training split.
best_lgb = lgb.LGBMRegressor(colsample_bytree=0.6, learning_rate=0.05, max_depth=10, n_estimators=200, subsample=0.6)
best_lgb.fit(x_train , y_train)

In [None]:
# svr = SVR()

# param_grid = {
#     'kernel': ['linear', 'rbf'],
#     'C': [0.1, 1, 10],
#     'epsilon': [0.1, 0.01, 0.001]
# }

# grid_search = GridSearchCV(svr, param_grid, cv=5)
# grid_search.fit(x_train, y_train)

# best_params = grid_search.best_params_

# # Create a new Regression model with the best parameters
# best_svr = SVR(**best_params)

# # Fit the model with the best parameters to the data
# best_svr.fit(x_train , y_train)

In [None]:
# Linear-kernel SVR with hand-set hyperparameters (presumably the result of the
# commented-out grid search above), fitted on the training split.
best_svr = SVR(C=0.1, epsilon=0.01, kernel='linear')
best_svr.fit(x_train , y_train)

In [None]:
# rf = RandomForestRegressor()

# param_grid = {
#     'n_estimators': [100, 200, 500],       # Number of trees in the forest
#     'max_depth': [None, 5, 10],             # Maximum depth of the trees
#     'min_samples_split': [2, 5, 10],        # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2, 4],          # Minimum number of samples required to be at a leaf node
#     'max_features': ['auto', 'sqrt']        # Number of features to consider when looking for the best split
# }

# grid_search = GridSearchCV(rf, param_grid, cv=5)
# grid_search.fit(x_train, y_train)

# best_params = grid_search.best_params_

# # Create a new Random Forest model with the best parameters
# best_rf = RandomForestRegressor(**best_params)

# # Fit the model with the best parameters to the data
# best_rf.fit(x_train, y_train)


In [None]:
# Random forest with hand-set hyperparameters (presumably the result of the
# commented-out grid search above), fitted on the training split.
# random_state is pinned (same seed as the baseline models) so that the forest,
# and therefore the blended prediction and the submission built from it, is
# reproducible across runs.
best_rf = RandomForestRegressor(min_samples_leaf=2, n_estimators=200, random_state=42)
best_rf.fit(x_train, y_train)

In [None]:
def blend_models_predict(x):
  """Return the weighted sum of the five tuned models' predictions for ``x``.

  Weights: ridge 0.6, XGBoost 0.15, LightGBM 0.15, SVR 0.05, random forest 0.05.
  The result is on whatever scale the models were fitted on; in this notebook
  that is the log of the target, so callers apply ``np.exp`` to it.
  """
  weighted_models = (
      (0.6, best_ridge),
      (0.15, best_XGB),
      (0.15, best_lgb),
      (0.05, best_svr),
      (0.05, best_rf),
  )
  blended = None
  for weight, model in weighted_models:
    contribution = weight * model.predict(x)
    # Accumulate left to right, starting from the first contribution itself.
    blended = contribution if blended is None else blended + contribution
  return blended


In [None]:
# Score the blended prediction on the validation split, on the original price scale.
actual_valid = np.exp(y_valid)
final_pred = np.exp(blend_models_predict(x_valid))
mae = mean_absolute_error(actual_valid, final_pred)
mse = mean_squared_error(actual_valid, final_pred)
rmse = np.sqrt(mse)
r2 = r2_score(actual_valid, final_pred)
print(mae, mse, rmse, r2)

# #---------------------------Tuning--------------------------------------

# You can tune it further from here --------------------------

In [None]:
# Blend the models on the test features and undo the log transform of the target.
predictions = np.exp( blend_models_predict(x_test))


In [None]:
# Load the sample submission file to use as the template for the output.
sub = pd.read_csv('/kaggle/input/home-data-for-ml-course/sample_submission.csv')
sub

In [None]:
# Overwrite the template's SalePrice column with the blended predictions.
# NOTE(review): values are assigned by position, so this assumes the sample
# submission rows are in the same order as test.csv - confirm.
sub['SalePrice'] = predictions

In [None]:
sub

In [None]:
# Write the submission to the working directory, without the DataFrame index.
output = sub
output.to_csv('Housing Prices.csv', index = False)