# House Prices Prediction using Regression 🏡
In this project, we predict house sale prices with a regularized linear regression (Ridge) model. The workflow covers data cleaning, feature engineering, cross-validated model evaluation, and writing a Kaggle submission file.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [2]:
# Both competition CSVs sit in the same Kaggle input directory.
input_dir = "/kaggle/input/house-prices-advanced-regression-techniques"
train = pd.read_csv(input_dir + "/train.csv")
test = pd.read_csv(input_dir + "/test.csv")

# Report (rows, columns) for each split, then preview the first training rows.
print("Train shape:", train.shape)
print("Test shape:", test.shape)

train.head()

Train shape: (1460, 81)
Test shape: (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## 1. Data Cleaning and Preprocessing 🧹

We start by checking for missing values and removing or imputing them where appropriate. We will also drop columns that are highly sparse or irrelevant to our models.


In [3]:
# Count nulls per column, keep only the columns that have any,
# and rank them from most to least incomplete.
missing = (
    train.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
missing.head(10)


PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
dtype: int64

In [4]:
# Columns to discard from both splits. errors='ignore' skips any label that is
# not present, so the cell is safe to run again after the columns are gone.
drop_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
train = train.drop(columns=drop_cols, errors='ignore')
test = test.drop(columns=drop_cols, errors='ignore')


## 2. Feature Engineering 🔧

We apply feature engineering to improve model performance:
- Convert numerical-looking categorical variables to strings
- One-hot encode categorical features
- Align training and testing data for consistency
- Fill the remaining missing values with column means


In [5]:
# Store MSSubClass as strings so it is one-hot encoded later as a categorical
# label instead of being used as a number. Skipped when the column is absent.
for frame in (train, test):
    if 'MSSubClass' in frame.columns:
        frame['MSSubClass'] = frame['MSSubClass'].astype(str)

# Capture the target and the Id columns only while they are still present.
# Once this cell has run, the columns are gone from the frames; rebinding
# unconditionally on a re-run would overwrite y / train_ids / test_ids with
# None and break the train/test split and the submission further down.
# If a column is missing on a fresh kernel, the name stays undefined and the
# next cell that needs it fails immediately instead of carrying None along.
if 'SalePrice' in train.columns:
    y = train['SalePrice']
if 'Id' in train.columns:
    train_ids = train['Id']
if 'Id' in test.columns:
    test_ids = test['Id']

# Remove the identifier and the target from the feature frames. drop() returns
# new frames (no in-place mutation) and errors='ignore' makes a re-run a no-op.
train = train.drop(columns=['Id', 'SalePrice'], errors='ignore')
test = test.drop(columns=['Id', 'SalePrice'], errors='ignore')


In [6]:
# One-hot encode train and test stacked together so both splits end up with
# exactly the same dummy columns.
combined = pd.get_dummies(pd.concat([train, test], axis=0))

# Split back by position: the first len(y) rows are the training rows,
# everything after them is the test set.
train, test = combined.iloc[:len(y)], combined.iloc[len(y):]


In [7]:
# Impute the remaining gaps with column means computed on the training set only.
# The same training statistics are applied to the test set so that both splits
# are preprocessed identically and no test-set information shapes the features
# the model is evaluated on.
train_means = train.mean()
train = train.fillna(train_means)
test = test.fillna(train_means)

In [8]:
# Model training: Ridge regression scored with 5-fold cross-validation.
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
import numpy as np

model = Ridge(alpha=1.0)

# The 'neg_root_mean_squared_error' scorer returns negated RMSE values,
# so flip the sign to report ordinary positive RMSE per fold.
rmse_scores = -cross_val_score(
    model,
    train,
    y,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
print("RMSE scores:", rmse_scores)
print("Average RMSE:", rmse_scores.mean())

RMSE scores: [23685.72491642 33659.06592099 31450.34177732 25467.98378117
 45754.84495953]
Average RMSE: 32003.592271083537


In [9]:
# Refit on every training row (fit returns the estimator itself),
# then produce predictions for the test set.
predictions = model.fit(train, y).predict(test)


In [10]:
# Pair each saved test Id with its predicted sale price.
submission = pd.DataFrame(data={'Id': test_ids, 'SalePrice': predictions})

# Write without the index so the CSV holds only the Id and SalePrice columns.
submission.to_csv('submission.csv', index=False)

In [11]:
import os

# Show the contents of the Kaggle working directory to confirm the
# submission file was written.
working_files = os.listdir('/kaggle/working')
print(working_files)

['submission.csv', '__notebook__.ipynb']


In [12]:
# Disabled alternative export cell, kept for reference only (nothing here runs).
# It rebuilds the same submission CSV as the submission cell above and then
# shows a download link through IPython's FileLink. Uncomment to use it.
#
# import pandas as pd
#
# submission = pd.DataFrame({
#     'Id': test_ids,
#     'SalePrice': predictions
# })
#
# submission.to_csv('submission.csv', index=False)
#
# from IPython.display import FileLink
# FileLink('submission.csv')