# Linear Regression Model to Predict House Prices

# Import dataset

In [1]:
import pandas as pd

# Location of the raw housing data, relative to the notebook's working directory.
HOUSING_CSV = 'Housing.csv'

# Read the whole CSV into memory and preview the first 20 rows.
df = pd.read_csv(HOUSING_CSV)
df.head(20)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
5,10850000,7500,3,3,1,yes,no,yes,no,yes,2,yes,semi-furnished
6,10150000,8580,4,3,4,yes,no,no,no,yes,2,yes,semi-furnished
7,10150000,16200,5,3,2,yes,no,no,no,no,0,no,unfurnished
8,9870000,8100,4,1,2,yes,yes,yes,no,yes,2,yes,furnished
9,9800000,5750,3,2,4,yes,yes,no,no,yes,1,yes,unfurnished


# Data Cleaning And Preprocessing

In [2]:
# Checking missing values
# Tally the null entries per column; `isna` is the canonical name of `isnull`.
missing_per_column = df.isna().sum()
print(missing_per_column)

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


In [3]:
# Discard the columns this model does not use.
# `df` is rebound instead of mutated with inplace=True, and errors='ignore'
# skips labels that are already gone, so re-running this cell is a no-op
# rather than a KeyError.
# NOTE(review): these columns are removed without a stated reason; they may
# carry predictive signal -- confirm the choice is intentional.
df = df.drop(
    columns=[
        'stories',
        'guestroom',
        'basement',
        'hotwaterheating',
        'airconditioning',
        'prefarea',
    ],
    errors='ignore',
)

In [4]:
# Converting categorical variables to numerical
# Each text category is replaced by an integer code. The furnishing codes
# impose the order unfurnished < semi-furnished < furnished.
FURNISHING_CODES = {'unfurnished': 0, 'semi-furnished': 1, 'furnished': 2}
YES_NO_CODES = {'no': 0, 'yes': 1}

for column, codes in (
    ('furnishingstatus', FURNISHING_CODES),
    ('mainroad', YES_NO_CODES),
):
    # Re-running the cell must not corrupt the data: Series.map turns every
    # value that is not a key of `codes` into NaN, which would wipe out a
    # column that has already been encoded.
    if df[column].isin(list(codes.values())).all():
        continue

    encoded = df[column].map(codes)

    # Fail loudly on labels missing from the mapping (typos, different casing,
    # missing values) instead of letting NaN flow into model training.
    unmapped = df.loc[encoded.isna(), column].unique().tolist()
    if unmapped:
        raise ValueError(f"Unexpected values in column '{column}': {unmapped}")

    df[column] = encoded

df.head(10)

Unnamed: 0,price,area,bedrooms,bathrooms,mainroad,parking,furnishingstatus
0,13300000,7420,4,2,1,2,2
1,12250000,8960,4,4,1,3,2
2,12250000,9960,3,2,1,2,1
3,12215000,7500,4,2,1,3,2
4,11410000,7420,4,1,1,2,2
5,10850000,7500,3,3,1,2,1
6,10150000,8580,4,3,1,2,1
7,10150000,16200,5,3,1,0,0
8,9870000,8100,4,1,1,2,2
9,9800000,5750,3,2,1,1,0


In [5]:
# Print the column names, non-null counts and dtypes of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   price             545 non-null    int64
 1   area              545 non-null    int64
 2   bedrooms          545 non-null    int64
 3   bathrooms         545 non-null    int64
 4   mainroad          545 non-null    int64
 5   parking           545 non-null    int64
 6   furnishingstatus  545 non-null    int64
dtypes: int64(7)
memory usage: 29.9 KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,mainroad,parking,furnishingstatus
count,545.0,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,0.858716,0.693578,0.930275
std,1870440.0,2170.141023,0.738064,0.50247,0.348635,0.861586,0.761373
min,1750000.0,1650.0,1.0,1.0,0.0,0.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0,0.0
50%,4340000.0,4600.0,3.0,1.0,1.0,0.0,1.0
75%,5740000.0,6360.0,3.0,2.0,1.0,1.0,2.0
max,13300000.0,16200.0,6.0,4.0,1.0,3.0,2.0


In [7]:
# Preview the first five rows of the DataFrame.
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,mainroad,parking,furnishingstatus
0,13300000,7420,4,2,1,2,2
1,12250000,8960,4,4,1,3,2
2,12250000,9960,3,2,1,2,1
3,12215000,7500,4,2,1,3,2
4,11410000,7420,4,1,1,2,2


In [8]:
# Relevant features and target variable (for reference only; X and y are
# built directly from df in a later cell):
#   features: 'area', 'bedrooms', 'bathrooms', 'mainroad', 'parking', 'furnishingstatus'
#   target:   'price'

# Implementing a Supervised Learning Model

In [9]:
# Splitting the Data & Training the Model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [10]:
# Prepare Feature Variables and Target Variable
# The target is the sale price; every other remaining column is a predictor.
TARGET_COLUMN = 'price'
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

In [11]:
# Split Data
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Train the Model
# Fit a linear regression on the training split only; the test split is
# kept aside for evaluation.
model = LinearRegression()
model.fit(X_train, y_train)

# Model Evaluation & Error Handling

In [13]:
# Measuring Performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [14]:
# Make prediction
# Predict prices for the held-out test rows so they can be compared with y_test.
y_pred = model.predict(X_test)

In [15]:
# Evaluate the Model
# Score the test-set predictions with each metric, in the order MAE, MSE, R2.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print('\nModel Performance:')
for label, score in (
    ("Mean Absolute Error: ", mae),
    ("Mean Squared Error: ", mse),
    ("R2: ", r2),
):
    print(label, score)


Model Performance:
Mean Absolute Error:  1211947.5740692772
Mean Squared Error:  2490569445043.81
R2:  0.5072638575282438


# Predicting House Prices & Testing the Model

In [34]:
# New House: a single row described by the same features the model was trained on.
# The model was fitted on a DataFrame with named columns. Passing a bare nested
# list makes scikit-learn warn that X has no valid feature names and relies on
# positional order alone. Building a one-row DataFrame aligned to
# X_train.columns keeps each value attached to its feature name.
new_house = pd.DataFrame(
    [{
        'area': 3000,
        'bedrooms': 3,
        'bathrooms': 4,
        'mainroad': 1,          # 1 = yes, 0 = no
        'parking': 1,
        'furnishingstatus': 2,  # 0 = unfurnished, 1 = semi-furnished, 2 = furnished
    }],
    columns=X_train.columns,
)
predicted_price = model.predict(new_house)

print("Predicted Price for the new house: ",predicted_price)

Predicted Price for the new house:  [8297762.3373303]


