In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pickle  # For saving the model

In [2]:
# Load the housing dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='house_data.csv')

In [3]:
# Display the loaded frame as the cell output.
data

Unnamed: 0,square_footage,num_bedrooms,num_bathrooms,location,price
0,1500,3,2,Urban,300000
1,2000,4,3,Suburban,400000
2,1800,3,2,Rural,250000
3,2500,4,3,Urban,500000
4,1200,2,1,Suburban,220000
5,2200,3,2,Urban,450000
6,1600,2,1,Rural,180000
7,3000,5,4,Suburban,600000
8,1700,3,2,Urban,320000
9,2100,4,3,Suburban,430000


In [4]:
# (rows, columns) of the loaded frame.
data.shape

(10, 5)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,square_footage,num_bedrooms,num_bathrooms,price
count,10.0,10.0,10.0,10.0
mean,1960.0,3.3,2.3,365000.0
std,523.237783,0.948683,0.948683,133687.030868
min,1200.0,2.0,1.0,180000.0
25%,1625.0,3.0,2.0,262500.0
50%,1900.0,3.0,2.0,360000.0
75%,2175.0,4.0,3.0,445000.0
max,3000.0,5.0,4.0,600000.0


In [6]:
# Count missing values in each column.
data.isna().sum()

square_footage    0
num_bedrooms      0
num_bathrooms     0
location          0
price             0
dtype: int64

In [7]:
# Number of distinct values in each column.
data.nunique()

square_footage    10
num_bedrooms       4
num_bathrooms      4
location           3
price             10
dtype: int64

In [8]:
# Per-column dtypes; `location` is the only non-numeric (object) column.
data.dtypes

square_footage     int64
num_bedrooms       int64
num_bathrooms      int64
location          object
price              int64
dtype: object

In [9]:
# Show the frame again before the categorical column is encoded.
data

Unnamed: 0,square_footage,num_bedrooms,num_bathrooms,location,price
0,1500,3,2,Urban,300000
1,2000,4,3,Suburban,400000
2,1800,3,2,Rural,250000
3,2500,4,3,Urban,500000
4,1200,2,1,Suburban,220000
5,2200,3,2,Urban,450000
6,1600,2,1,Rural,180000
7,3000,5,4,Suburban,600000
8,1700,3,2,Urban,320000
9,2100,4,3,Suburban,430000


In [10]:
# List the column labels of the frame.
data.columns

Index(['square_footage', 'num_bedrooms', 'num_bathrooms', 'location', 'price'], dtype='object')

In [11]:
# One-hot encode the categorical 'location' column. drop_first=True drops the
# first category level (Rural here), leaving location_Suburban and
# location_Urban as indicator columns.
# The guard keeps this cell idempotent: it reassigns `data`, so without it a
# second run would raise a KeyError because 'location' is already gone.
if 'location' in data.columns:
    data = pd.get_dummies(data, columns=['location'], drop_first=True)

# Check the encoded data
print(data.head())

   square_footage  num_bedrooms  num_bathrooms   price  location_Suburban  \
0            1500             3              2  300000              False   
1            2000             4              3  400000               True   
2            1800             3              2  250000              False   
3            2500             4              3  500000              False   
4            1200             2              1  220000               True   

   location_Urban  
0            True  
1           False  
2           False  
3            True  
4           False  


In [14]:
# Feature matrix (x): size, room counts and the one-hot location indicators.
# Target vector (y): the price column.
feature_columns = [
    'square_footage',
    'num_bedrooms',
    'num_bathrooms',
    'location_Suburban',
    'location_Urban',
]
x = data[feature_columns]
y = data['price']

In [15]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [16]:
# Training features: (rows, feature columns).
x_train.shape

(7, 5)

In [17]:
# Test features: (rows, feature columns).
x_test.shape

(3, 5)

In [18]:
# Training target: one price per training row.
y_train.shape

(7,)

In [19]:
# Shape of the test target; the test features were already checked in an
# earlier cell, so this cell completes the set of four split checks.
y_test.shape

(3,)

In [20]:
# Fit an ordinary least-squares linear regression on the training split.
model = LinearRegression()
model.fit(X=x_train, y=y_train)


In [21]:
# Predict prices for the held-out test features.
y_pred = model.predict(X=x_test)

In [22]:
# Show the predicted prices for the test rows.
y_pred

array([335808.62533688, 411954.17789758, 414986.52291109])

In [23]:
# R^2 on each split; a large train/test gap would indicate overfitting.
train_score = model.score(x_train, y_train)
test_score = model.score(x_test, y_test)
print('Train Score:', train_score)
print('Test Score:', test_score)

Train Score: 0.9995512785857166
Test Score: 0.8117722578948188


In [24]:
# Mean squared error of the predictions on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [25]:
# Display the test-set mean squared error.
mse

539586194.0348529

In [26]:
# Coefficient of determination (R^2) of the predictions on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)


In [27]:
# Display the test-set R^2 (matches the "Test Score" printed by model.score).
r2

0.8117722578948188

In [29]:
# Persist the fitted model to disk with pickle so it can be reloaded later.
with open('house_price_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)