In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('C://Users//manik//OneDrive//Documents//Data_sets//Housing.csv')
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


features included : 
- price of the house
- area of the house
- bedrooms 
- bathrooms
- stories or number of floors
- mainroad : the house is near mainroad
- guestroom : is there any guestroom
- basement
- hotwater
- ac
- parking
- prefarea : means it's surrounding is good or not like schools,hospitals etc ..
- furnishingstatus : whether furniture is ready or not

In [3]:
df.size

7085

In [4]:
# checking if there are any missing values
df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [5]:
# renaming column names stories-floors , hotwaterheating-geyser , airconditioning-ac
df.rename(columns={'stories':'floors','hotwaterheating':'geyser','airconditioning':'ac'},inplace=True)

In [6]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,floors,mainroad,guestroom,basement,geyser,ac,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [7]:
df.dtypes == 'object'

price               False
area                False
bedrooms            False
bathrooms           False
floors              False
mainroad             True
guestroom            True
basement             True
geyser               True
ac                   True
parking             False
prefarea             True
furnishingstatus     True
dtype: bool

In [8]:
# categorical columns
df.select_dtypes(include=['object']).columns

Index(['mainroad', 'guestroom', 'basement', 'geyser', 'ac', 'prefarea',
       'furnishingstatus'],
      dtype='object')

In [9]:
df['mainroad'].unique()

array(['yes', 'no'], dtype=object)

In [10]:
df['guestroom'].unique()

array(['no', 'yes'], dtype=object)

In [11]:
df['basement'].unique()

array(['no', 'yes'], dtype=object)

In [12]:
df['geyser'].unique()

array(['no', 'yes'], dtype=object)

In [13]:
df['ac'].unique()

array(['yes', 'no'], dtype=object)

In [14]:
df['prefarea'].unique()

array(['yes', 'no'], dtype=object)

In [15]:
df['furnishingstatus'].unique()

array(['furnished', 'semi-furnished', 'unfurnished'], dtype=object)

In [16]:
newdf = pd.get_dummies(data=df)
newdf

Unnamed: 0,price,area,bedrooms,bathrooms,floors,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,...,basement_yes,geyser_no,geyser_yes,ac_no,ac_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,0,1,1,0,...,0,1,0,0,1,0,1,1,0,0
1,12250000,8960,4,4,4,3,0,1,1,0,...,0,1,0,0,1,1,0,1,0,0
2,12250000,9960,3,2,2,2,0,1,1,0,...,1,1,0,1,0,0,1,0,1,0
3,12215000,7500,4,2,2,3,0,1,1,0,...,1,1,0,0,1,0,1,1,0,0
4,11410000,7420,4,1,2,2,0,1,0,1,...,1,1,0,0,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,2,0,1,1,0,...,1,1,0,1,0,1,0,0,0,1
541,1767150,2400,3,1,1,0,1,0,1,0,...,0,1,0,1,0,1,0,0,1,0
542,1750000,3620,2,1,1,0,0,1,1,0,...,0,1,0,1,0,1,0,0,0,1
543,1750000,2910,3,1,1,0,1,0,1,0,...,0,1,0,1,0,1,0,1,0,0


In [17]:
newdf.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'floors', 'parking',
       'mainroad_no', 'mainroad_yes', 'guestroom_no', 'guestroom_yes',
       'basement_no', 'basement_yes', 'geyser_no', 'geyser_yes', 'ac_no',
       'ac_yes', 'prefarea_no', 'prefarea_yes', 'furnishingstatus_furnished',
       'furnishingstatus_semi-furnished', 'furnishingstatus_unfurnished'],
      dtype='object')

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score,r2_score

In [19]:
x = newdf.drop('price',axis=1)
y = newdf.price

In [20]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [21]:
model = LinearRegression()
model.fit(x_train,y_train)

In [22]:
y_pred = model.predict(x_test)
y_pred

array([3387927.5803923 , 4144154.90514273, 6601825.15766763,
       3967347.09405723, 3115727.8780767 , 4097482.26704523,
       4195101.85263253, 3150177.10695703, 4994395.77203073,
       1965929.12449958, 3904441.05898184, 2600294.52260425,
       2682673.06361825, 3684965.16228199, 3036969.44462884,
       2187197.60754198, 2637073.99519356, 4136671.01454333,
       7149313.04194443, 5196621.12918993, 5155704.26566517,
       3739674.20613439, 3964309.35793221, 4199666.22209734,
       7610723.93766839, 7757185.1517887 , 3176308.98406249,
       3140713.62204261, 6380241.72641424, 5890900.87443321,
       6355971.04170716, 2960683.98555994, 3742029.64604784,
       3691806.25875717, 2634852.41631233, 6397163.50398168,
       2909918.02226439, 2793490.70148807, 5223878.00103426,
       4925800.3855156 , 5773821.37815366, 6680021.78981345,
       6060061.11971833, 5979229.3706838 , 4901593.4187138 ,
       9051672.62430062, 6977665.69547194, 6630450.02836817,
       6888493.77712436,

In [23]:
y_test

275    4319000
406    3465000
162    5460000
204    4900000
214    4865000
        ...   
525    2345000
59     7210000
184    5110000
535    2100000
140    5740000
Name: price, Length: 109, dtype: int64

In [24]:
r2 = r2_score(y_test, y_pred)
print("R-squared (R2) Score:", r2)

R-squared (R2) Score: 0.7056373638589316


In [25]:
import pickle 
with open('housing_model.pkl' , 'wb') as f:
    pickle.dump(model,f)

In [250]:
with open('housing_model.pkl' , 'rb') as f:
    loaded_model = pickle.load(f)

In [251]:
loaded_model.predict(x_test)

array([5443810.85860028, 3018259.1898455 , 4858646.21399867,
       7425800.99828112, 3144197.10379671, 7061741.65530471,
       5029653.03109443, 2736795.96234124, 3771158.57565351,
       6074154.02481618, 4452113.00286201, 5962249.94325555,
       7133035.10218444, 4988750.04031497, 4042141.31688706,
       3485640.2145775 , 5433965.66611343, 3921391.34743163,
       6679956.03262506, 4736286.06467688, 5917579.7865247 ,
       4269216.19228025, 5150874.58064653, 5800893.45161046,
       3383509.39773119, 4688816.11580627, 6949131.16930223,
       3614203.37476505, 8373832.65518163, 3057821.32987438,
       5211636.34205577, 5762171.02391547, 7060845.66773146,
       5515770.63970325, 3324169.89423926, 5649428.95467337,
       6176589.55157027, 2314707.08493406, 4535982.50166653,
       3870711.57185203, 3185770.21367527, 3931710.88472217,
       9803178.78761004, 5223221.30097804, 4042541.60629702,
       6150846.22033044, 6584608.61546665, 6377850.34396942,
       3769337.72655687,