# Importing Packages 
#Step 1:Import the relevant libraries such as 
#a) numpy, b) pandas, c) train_test_split, d) Linear regression and e) metrics 

In [2]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score 

# Step 2: Read the house price dataset and check for null data

In [3]:
data=pd.read_csv("kc_house_data.csv")
data.isnull().sum()
data.drop(['sqft_above'], axis='columns', inplace=True)
conv_dates=[1 if values==2014 else 0 for values in data.date]
data['date']=conv_dates

In [40]:
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,0,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,0,180000.0,2,1.0,770,10000,1.0,0,0,3,6,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,0,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,0,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
data=data.drop(['date','view','grade','sqft_basement','yr_renovated','zipcode','lat','long','sqft_living15','sqft_lot15'],axis=1)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           21613 non-null  int64  
 1   price        21613 non-null  float64
 2   bedrooms     21613 non-null  int64  
 3   bathrooms    21613 non-null  float64
 4   sqft_living  21613 non-null  int64  
 5   sqft_lot     21613 non-null  int64  
 6   floors       21613 non-null  float64
 7   waterfront   21613 non-null  int64  
 8   condition    21613 non-null  int64  
 9   yr_built     21613 non-null  int64  
dtypes: float64(3), int64(7)
memory usage: 1.6 MB


# Step 3:Convert any string data to numerical data using suitable conversion 
# Step 4: Indentify the dependent variable (Output label) and independent variables (Input features)

In [6]:
data.shape

(21613, 10)

In [7]:
y=data[['price']].values
k=data.drop(['id','price'],axis=1)
x=data.drop(['id','price'], axis=1).values

In [8]:
k

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,yr_built
0,3,1.00,1180,5650,1.0,0,3,1955
1,3,2.25,2570,7242,2.0,0,3,1951
2,2,1.00,770,10000,1.0,0,3,1933
3,4,3.00,1960,5000,1.0,0,5,1965
4,3,2.00,1680,8080,1.0,0,3,1987
...,...,...,...,...,...,...,...,...
21608,3,2.50,1530,1131,3.0,0,3,2009
21609,4,2.50,2310,5813,2.0,0,3,2014
21610,2,0.75,1020,1350,2.0,0,3,2009
21611,3,2.50,1600,2388,2.0,0,3,2004


# Step 5: Split the data into train data and test data

In [9]:
x_train, x_test, y_train, y_test=train_test_split(x,y, test_size=0.2) 

# Step 6: Define and train (fit) the linear regression model using training data 

In [10]:
#Model Training 
LiRe=LinearRegression ()
LiRe.fit(x_train,y_train)

# Step 7: Test (predict) the output for the test data using the fitted model 

In [11]:
y_predict=LiRe.predict(x_test)
print(y_predict)
print(y_test) 

[[605417.21726072]
 [242562.19680805]
 [387652.46073704]
 ...
 [805689.38318241]
 [561613.94290202]
 [518507.9734081 ]]
[[527000.]
 [329999.]
 [270000.]
 ...
 [795000.]
 [319000.]
 [631625.]]


In [12]:
#Step 8: Determine the performance of the model using suitable metric 

In [13]:
print('coefficient of determination: %.3f' % r2_score(y_test,y_predict)) 

coefficient of determination: 0.593


In [15]:
def predict_price(bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, condition, yr_built):
    # Create a Linear Regression model
    LiRe = LinearRegression()
    
    # Train the model with the existing data
    LiRe.fit(x_train, y_train)
    
    # Create an input array with the user-provided data
    input_data = [[bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, condition, yr_built]]
    
    # Use the trained model to predict the price
    predicted_price = LiRe.predict(input_data)

    
    # Return the predicted price
    return predicted_price[0][0]

In [16]:
predicted_price = predict_price(3, 2, 2000, 8000, 2, 1, 3, 1990)
print(predicted_price)


1200666.7077685045


In [17]:
import pickle
pickle.dump(LiRe,open('HouseModel.pkl', 'wb'))