In [1]:
import numpy as np
import pandas as pd

#### Extracting Data:

In [2]:
# Read the housing-price CSV into a DataFrame and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path — the notebook only
# runs where this exact file exists.
csv_path = r"C:\Users\pc world\Streamlit\housing_price_dataset.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


#### Data Preparation:

In [3]:
# Print each column's dtype and non-null count to check for missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SquareFeet    50000 non-null  int64  
 1   Bedrooms      50000 non-null  int64  
 2   Bathrooms     50000 non-null  int64  
 3   Neighborhood  50000 non-null  object 
 4   YearBuilt     50000 non-null  int64  
 5   Price         50000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 2.3+ MB


In [4]:
# Count the missing values in every column.
missing_mask = df.isna()
missing_mask.sum()

SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64

In [5]:
# Count rows that are exact duplicates of an earlier row.
is_duplicate_row = df.duplicated()
is_duplicate_row.sum()

0

#### Transformation:

In [6]:
# Encode the Neighborhood labels as integer codes (Rural=0, Suburb=1, Urban=2).
# Series.map() turns every value that is not a key of the dict into NaN, so
# running this cell a second time on the already-encoded column would replace
# all codes with NaN. Encoding only while the column is still non-numeric
# makes the cell safe to re-run; the first run behaves exactly as before.
neighborhood_codes = {"Rural": 0, "Suburb": 1, "Urban": 2}
if not pd.api.types.is_numeric_dtype(df['Neighborhood']):
    df['Neighborhood'] = df['Neighborhood'].map(neighborhood_codes)

In [7]:
# Preview the frame again to confirm Neighborhood now holds the integer codes.
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,0,1969,215355.283618
1,2459,3,2,0,1980,195014.221626
2,1860,2,1,1,1970,306891.012076
3,2294,2,1,2,1996,206786.787153
4,2130,5,2,1,2001,272436.239065


#### Splitting:

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Separate the target (Price) from the feature columns.
y = df['Price']
X = df.drop(columns="Price")

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, train_size=0.8, random_state=23)
X_train, X_test, y_train, y_test = split_parts

In [13]:
import numpy as np
from sklearn.metrics import mean_squared_error

#### Linear Regression:

In [14]:
from sklearn.linear_model import LinearRegression

# Baseline: plain linear regression on the manually encoded features,
# scored by RMSE on the held-out test set.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(np.sqrt(test_mse))

49403.976927995944


#### Building ML Model:

In [2]:
# Re-read the raw CSV so Neighborhood holds its original string labels again:
# the earlier cell overwrote that column with integer codes, while the pipeline
# built below encodes the labels itself.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact file exists.
csv_path = r"C:\Users\pc world\Streamlit\housing_price_dataset.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


In [67]:
# Positional column indices into X (df without Price), grouped by the
# preprocessing branch of the ColumnTransformer that handles them.
# Per the df.info() listing: 0 SquareFeet, 1 Bedrooms, 2 Bathrooms,
# 3 Neighborhood, 4 YearBuilt.
# Continuous columns: mean-imputed and standardised.
cont = [0,1,2,4]
# Ordinal column (Neighborhood): mode-imputed and ordinal-encoded.
ordi = [3]
# No columns are routed to the nominal (one-hot) branch.
nomi = []

In [68]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

In [69]:
# Numeric branch: fill gaps with the column mean, then standardise.
num_pipe = Pipeline(
    steps=[
        ("Mean Imputation", SimpleImputer(strategy='mean')),
        ('Scaling', StandardScaler()),
    ]
)

# Nominal branch: fill gaps with the most frequent label, then one-hot encode
# (dropping the first level of each feature).
nom_pipe = Pipeline(
    steps=[
        ("Mode Imputation", SimpleImputer(strategy='most_frequent')),
        ("Encoder", OneHotEncoder(drop='first')),
    ]
)

# Ordinal branch: fill gaps with the most frequent label, then map labels to
# integer codes; labels not seen during fit are encoded as -1.
ord_pipe = Pipeline(
    steps=[
        ("Mode Imputation", SimpleImputer(strategy='most_frequent')),
        (
            "Encoder",
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
        ),
    ]
)

In [70]:
# Route each group of column indices to its preprocessing branch; any column
# not listed in a group is passed through unchanged.
column_branches = [
    ("Continuous", num_pipe, cont),
    ('Nominal', nom_pipe, nomi),
    ('Ordinal', ord_pipe, ordi),
]
preprocess = ColumnTransformer(transformers=column_branches, remainder='passthrough')
preprocess

In [82]:
# Full model: preprocessing followed by linear regression.
# NOTE(review): the step name "Algorithm " has a trailing space; it is kept
# as-is because step names are part of how the pipeline is addressed.
pipeline_steps = [
    ("Data Preprocessing", preprocess),
    ("Algorithm ", LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

In [83]:
# Display the assembled pipeline (preprocessing + LinearRegression).
model

In [73]:
# Separate the target (Price) from the raw, un-encoded feature columns.
y = df['Price']
X = df.drop(columns='Price')

In [74]:
# Same 80/20 split and seed as the baseline, so both models are scored on the same rows.
split_parts = train_test_split(X, y, train_size=0.8, random_state=23)
X_train, X_test, y_train, y_test = split_parts

In [75]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [76]:
# Fit the full pipeline on the training rows and report RMSE on the test rows.
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(np.sqrt(test_mse))

49403.976927995944


In [78]:
import pickle

In [79]:
# Persist the fitted pipeline to disk. The context manager closes the file
# handle so the pickle is fully flushed before anything tries to read it back.
with open("predict_pl.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [80]:
# Reload the saved pipeline to check that it round-trips. The context manager
# closes the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles from a trusted source.
with open("predict_pl.pkl", "rb") as model_file:
    model_pickled = pickle.load(model_file)