In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import joblib


In [2]:
# Read the raw housing table from the CSV file (path is relative to the
# notebook's working directory).
csv_path = 'USA Housing Dataset.csv'
df = pd.read_csv(csv_path)

# Plain-text preview of the first five rows.
print(df.head(5))

                  date      price  bedrooms  bathrooms  sqft_living  sqft_lot  \
0  2014-05-09 00:00:00   376000.0       3.0       2.00         1340      1384   
1  2014-05-09 00:00:00   800000.0       4.0       3.25         3540    159430   
2  2014-05-09 00:00:00  2238888.0       5.0       6.50         7270    130017   
3  2014-05-09 00:00:00   324000.0       3.0       2.25          998       904   
4  2014-05-10 00:00:00   549900.0       5.0       2.75         3060      7015   

   floors  waterfront  view  condition  sqft_above  sqft_basement  yr_built  \
0     3.0           0     0          3        1340              0      2008   
1     2.0           0     0          3        3540              0      2007   
2     2.0           0     0          3        6420            850      2010   
3     2.0           0     0          3         798            200      2007   
4     1.0           0     0          5        1600           1460      1979   

   yr_renovated                   stre

In [3]:
# Structural summary: column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

# Descriptive statistics (describe() covers the numeric columns by default).
print(df.describe())

# Count of missing values in each column.
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4140 entries, 0 to 4139
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4140 non-null   object 
 1   price          4140 non-null   float64
 2   bedrooms       4140 non-null   float64
 3   bathrooms      4140 non-null   float64
 4   sqft_living    4140 non-null   int64  
 5   sqft_lot       4140 non-null   int64  
 6   floors         4140 non-null   float64
 7   waterfront     4140 non-null   int64  
 8   view           4140 non-null   int64  
 9   condition      4140 non-null   int64  
 10  sqft_above     4140 non-null   int64  
 11  sqft_basement  4140 non-null   int64  
 12  yr_built       4140 non-null   int64  
 13  yr_renovated   4140 non-null   int64  
 14  street         4140 non-null   object 
 15  city           4140 non-null   object 
 16  statezip       4140 non-null   object 
 17  country        4140 non-null   object 
dtypes: float

In [4]:
# Columns excluded from modelling: the five object-dtype (non-numeric) columns.
dropped_columns = ['date', 'street', 'city', 'statezip', 'country']

# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so on a
# second execution the columns are already gone and a plain drop() would raise
# KeyError. Trade-off: a misspelled column name is silently skipped, so keep
# the list above in sync with the dataset.
df = df.drop(columns=dropped_columns, errors='ignore')

# Discard any row that still contains a missing value.
df = df.dropna()
print(df.head())

       price  bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  \
0   376000.0       3.0       2.00         1340      1384     3.0           0   
1   800000.0       4.0       3.25         3540    159430     2.0           0   
2  2238888.0       5.0       6.50         7270    130017     2.0           0   
3   324000.0       3.0       2.25          998       904     2.0           0   
4   549900.0       5.0       2.75         3060      7015     1.0           0   

   view  condition  sqft_above  sqft_basement  yr_built  yr_renovated  
0     0          3        1340              0      2008             0  
1     0          3        3540              0      2007             0  
2     0          3        6420            850      2010             0  
3     0          3         798            200      2007             0  
4     0          5        1600           1460      1979             0  


In [5]:
# Predictor columns, in the order they are handed to the model.
features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'view', 'condition',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
]

# Design matrix (predictors) and regression target (sale price).
X = df.loc[:, features]
y = df.loc[:, 'price']

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Fit a linear regression on the training split. fit() returns the estimator
# itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

# Keep the fitted estimator as the cell's last expression so its repr is shown.
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [8]:
# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error, i.e. it is expressed in
# the same units as the target.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Coefficient of determination on the test split.
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'R²: {r2}')

RMSE: 256023.6356287235
R²: 0.37475248028196506


In [9]:
# One row per feature (indexed by feature name) holding its fitted weight.
# NOTE(review): in the rows printed by df.head(), sqft_living equals
# sqft_above + sqft_basement. If that holds for the whole dataset, these three
# columns are perfectly collinear and their individual coefficients should not
# be interpreted separately -- TODO confirm.
coefficients = pd.DataFrame({'Coefficient': model.coef_}, index=features)
print(coefficients)

                 Coefficient
bedrooms       -66699.925291
bathrooms       56804.913761
sqft_living       181.069381
sqft_lot           -0.701222
floors          25879.919093
waterfront     417154.850732
view            35362.824513
condition       29213.384390
sqft_above         99.564685
sqft_basement      81.504695
yr_built        -2131.939823
yr_renovated        4.952658


In [10]:
# Persist the fitted estimator to disk. dump() returns the list of file names
# it wrote, which is what the cell displays as its output.
joblib.dump(value=model, filename='house_price_model.pkl')

['house_price_model.pkl']

In [11]:
# Spot-check: predict the first five held-out rows and print each prediction
# next to the corresponding true price.
example = X_test.iloc[:5]
predictions = model.predict(example)

print('Example predictions:')
for i in range(len(predictions)):
    pred = predictions[i]
    print(f'Predicted: {pred:.2f}, Actual: {y_test.iloc[i]:.2f}')

Example predictions:
Predicted: 704393.05, Actual: 600000.00
Predicted: 375323.23, Actual: 370000.00
Predicted: 497316.03, Actual: 471000.00
Predicted: 379678.56, Actual: 240000.00
Predicted: 310000.54, Actual: 413000.00


In [16]:
# Reload the persisted estimator to check that the saved file is usable.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files from a trusted source (this one is written earlier in
# this notebook by joblib.dump).
loaded_model = joblib.load('house_price_model.pkl')

# One hypothetical listing; the keys are listed in the same order as the
# training feature list.
sample_values = {
    'bedrooms': 3,
    'bathrooms': 2.5,
    'sqft_living': 2000,
    'sqft_lot': 5000,
    'floors': 2.0,
    'waterfront': 0,
    'view': 0,
    'condition': 5,
    'sqft_above': 1800,
    'sqft_basement': 200,
    'yr_built': 1995,
    'yr_renovated': 0,
}

# Wrap every scalar in a one-element list so the frame has exactly one row.
sample_house = pd.DataFrame(
    {column: [value] for column, value in sample_values.items()}
)

predicted_price = loaded_model.predict(sample_house)
print(f'Predicted price for the sample house: ${predicted_price[0]:.2f}')


Predicted price for the sample house: $560562.83
