In [1]:
import  pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the raw listings from CSV and preview the first five rows.
data = pd.read_csv('guwahati_house_price.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,bhk,price,size,location
0,0,3,45.0,1500,CHRISTIAN BASTI
1,1,3,78.79,1751,LAL GANESH
2,2,2,31.5,750,BORAGAON
3,3,2,50.0,1100,BAGHARBARI
4,4,3,65.2,1630,BELTOLA


In [3]:
# (rows, columns) of the loaded frame.
data.shape

(3411, 5)

In [4]:
# Per-column dtype and non-null count, plus memory usage, for the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3411 entries, 0 to 3410
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  3411 non-null   int64  
 1   bhk         3411 non-null   object 
 2   price       3411 non-null   float64
 3   size        3411 non-null   int64  
 4   location    3411 non-null   object 
dtypes: float64(1), int64(2), object(2)
memory usage: 133.4+ KB


In [5]:
# Count missing (NaN) values in each column.
data.isnull().sum()

Unnamed: 0    0
bhk           0
price         0
size          0
location      0
dtype: int64

**There are no null values present in the dataset.**

### Different types of features present in the dataset

In [6]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV — it mirrors
# the positional index in the preview). Reassigning instead of using inplace=True, together
# with errors='ignore', keeps this cell safe to re-run: a second run is a no-op rather than
# a KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# Inspect the distinct raw 'bhk' values to spot entries that are not numbers.
data['bhk'].unique()

array(['3', '2', '4', '1', '6', '5', 'A', '8', '9', '7'], dtype=object)

In [8]:
# 'bhk' is stored as strings and contains non-numeric entries; convert the column to
# numbers and let errors='coerce' turn anything unparsable into NaN.
data['bhk']=pd.to_numeric(data['bhk'],errors='coerce')

In [9]:
# Categorical features are the columns stored with the object ('O') dtype.
cat_features = [column for column in data.columns if data[column].dtype == 'O']
# The printed label previously read "Clategorical"; spelling corrected.
print('Categorical features are :', cat_features)

Clategorical features are : ['location']


In [10]:
# Re-check dtypes and non-null counts after converting 'bhk' to numeric.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3411 entries, 0 to 3410
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   bhk       3409 non-null   float64
 1   price     3411 non-null   float64
 2   size      3411 non-null   int64  
 3   location  3411 non-null   object 
dtypes: float64(2), int64(1), object(1)
memory usage: 106.7+ KB


In [11]:
# Numerical features are every column whose dtype is not object ('O').
num_features = [column for column in data.columns if data[column].dtype != 'O']
print('Numerical features are :', num_features)

# Visual separator, then the number of numerical columns found.
print('---------' * 12)
print('length of Numerical features are :', len(num_features))

Numerical features are : ['bhk', 'price', 'size']
------------------------------------------------------------------------------------------------------------
length of Numerical features are : 3


In [12]:
# Count missing values per column again; coerced 'bhk' entries now show up as NaN.
data.isnull().sum()

bhk         2
price       0
size        0
location    0
dtype: int64

In [13]:
# Impute missing 'bhk' values with the most frequent 'bhk' value (the mode).
bhk_mode = data['bhk'].mode()[0]
data['bhk'] = data['bhk'].fillna(bhk_mode)

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,bhk,price,size
count,3411.0,3411.0,3411.0
mean,2.733216,65.935561,1230.828496
std,0.704661,55.323471,558.039952
min,1.0,1.0,43.0
25%,2.0,46.6,944.0
50%,3.0,57.8,1150.0
75%,3.0,72.905,1369.5
max,9.0,2000.0,7800.0


In [15]:
# List the distinct values of the 'location' column.
data['location'].unique()

array(['CHRISTIAN BASTI', 'LAL GANESH', 'BORAGAON', 'BAGHARBARI',
       'BELTOLA', 'AHOM GAON', 'BAMUNIMAIDAM', 'KAHILIPARA', 'SIX MILE',
       'REHABARI', 'LOKHRA', 'JYOTIKUCHI', 'SARANIA HILLS', 'GANESHGURI',
       'SACHAL PATH VIP ROAD BYLANE NUMBER 1', 'CHANDMARI', 'NOONMATI',
       'ULUBARI', 'DISPUR', 'SARUMOTORIA', 'SOUTH SARANIA ROAD',
       'DOWNTOWN', 'JATIA', 'ATHGAON', 'DHARAPUR', 'RADHA NAGAR',
       'BAMUNIMAIDAN', 'GHORAMARA', 'ZOO TINIALI', 'BORBARI',
       'KAHILIPARA ROAD', 'ZOO ROAD', 'AZARA', 'PATOR KUCHI', 'BASISTHA',
       'KALA PAHAR', 'LALMATI', 'RUKMINI GAON', 'MATHGHARIA', 'KHANAPARA',
       'BHANGAGARH', 'GEETANAGAR', 'SARUSAJAI', 'BHETAPARA', 'HATIGAON',
       'MALIGAON', 'LACHIT NAGAR', 'HENGRABARI', 'NAYANPUR',
       'NARENGI TINALI', 'SATGAON', 'JALUKBARI', 'BARSAPARA',
       'PANJABARI ROAD', 'ADABARI', 'GS ROAD', 'VIP ROAD', 'GARCHUK',
       'JAYANAGAR', 'KALYANI SAGAR PATH', 'DIGHALIPUKHURI', 'KERAKUCHI',
       'SAWKUCHI', 'ABC GALI', 'TA

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,bhk,price,size
count,3411.0,3411.0,3411.0
mean,2.733216,65.935561,1230.828496
std,0.704661,55.323471,558.039952
min,1.0,1.0,43.0
25%,2.0,46.6,944.0
50%,3.0,57.8,1150.0
75%,3.0,72.905,1369.5
max,9.0,2000.0,7800.0


In [17]:
# Confirm dtypes and that every column is fully populated after imputation.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3411 entries, 0 to 3410
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   bhk       3411 non-null   float64
 1   price     3411 non-null   float64
 2   size      3411 non-null   int64  
 3   location  3411 non-null   object 
dtypes: float64(2), int64(1), object(1)
memory usage: 106.7+ KB


In [18]:
# Number of rows per location (value_counts returns them sorted by frequency, descending).
location_counts=data['location'].value_counts()

In [19]:
# Locations that appear 10 times or fewer (the threshold is inclusive despite the name).
location_count_less_10 = location_counts.loc[location_counts <= 10]

In [20]:
# Collapse every rare location (one listed in location_count_less_10) into a single
# 'other' bucket. Membership is tested explicitly against the Series *index* (the location
# names) — which is what `x in series` checks implicitly — and isin()/mask() do the
# replacement in one vectorised pass instead of a per-row Python lambda.
is_rare_location = data['location'].isin(location_count_less_10.index)
data['location'] = data['location'].mask(is_rare_location, 'other')

In [21]:
# One-hot encode 'location'; drop='first' omits the first category's column.
from sklearn.preprocessing import OneHotEncoder

onehot_encoder_location = OneHotEncoder(drop='first')
location_encoded = onehot_encoder_location.fit_transform(data[['location']]).toarray()

# Carry over data's index so the column-wise concat with `data` aligns row-for-row
# even when the frame's index is not a plain 0..n-1 range (e.g. after filtering rows).
location_encoded_df = pd.DataFrame(
    location_encoded,
    columns=onehot_encoder_location.get_feature_names_out(['location']),
    index=data.index,
)
location_encoded_df.head()


Unnamed: 0,location_ADABARI,location_AHOM GAON,location_AZARA,location_BAGHARBARI,location_BAMUNIMAIDAM,location_BARSAPARA,location_BASISTHA,location_BELTOLA,location_BETKUCHI,location_BHANGAGARH,...,location_SATGAON,location_SIX MILE,location_SURVEY,location_TARUN NAGAR,location_ULUBARI,location_UZAN BAZAR,location_VIP ROAD,location_ZOO ROAD,location_ZOO TINIALI,location_other
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Swap the raw 'location' text column for its one-hot encoded columns.
features_without_location = data.drop(columns='location')
data = pd.concat([features_without_location, location_encoded_df], axis=1)
data.head()

Unnamed: 0,bhk,price,size,location_ADABARI,location_AHOM GAON,location_AZARA,location_BAGHARBARI,location_BAMUNIMAIDAM,location_BARSAPARA,location_BASISTHA,...,location_SATGAON,location_SIX MILE,location_SURVEY,location_TARUN NAGAR,location_ULUBARI,location_UZAN BAZAR,location_VIP ROAD,location_ZOO ROAD,location_ZOO TINIALI,location_other
0,3.0,45.0,1500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,78.79,1751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,31.5,750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2.0,50.0,1100,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,65.2,1630,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Split the data into features and target
X=data.drop('price',axis=1)
y=data['price']

In [24]:
## Split the data into training and testing sets (80% / 20%, fixed seed)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler()
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

In [26]:
# Persist the fitted encoder and scaler so they can be reloaded later.
import pickle

for artifact_path, artifact in (
    ('onehot_encoder_location.pkl', onehot_encoder_location),
    ('scaler.pkl', scaler),
):
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)

### Model Training

In [27]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [28]:
## Create a function to evaluate a model
def model_evaluate(true, predicted):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)`` — mean absolute error, root mean squared error
        and the R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # MSE is computed once and only used to derive RMSE (it is not returned).
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)

    return mae, rmse, r2_square
    

In [29]:
# Begin model training: fit each candidate regressor on the training split and report
# train/test metrics. The estimators that use randomness get a fixed random_state so
# that re-running the notebook reproduces the same scores.
models = {
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'LinearRegression': LinearRegression(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'SVM': SVR(),
    'Grdiant Boost': GradientBoostingRegressor(random_state=42),
}

# Iterate name/estimator pairs directly instead of indexing into rebuilt lists.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions on both splits
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate train and test datasets
    model_train_mae, model_train_rmse, model_train_r2 = model_evaluate(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = model_evaluate(y_test, y_test_pred)

    print(model_name)
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('-----'*10)

    print('Model performance for Test set')
    print("- Root Mean Squared Error:", model_test_rmse)
    print("- Mean Absolute Error:", model_test_mae)
    print("- R2 Score: ", model_test_r2)

    print('='*35)
    print('\n')

    

RandomForestRegressor
Model performance for Training set
- Root Mean Squared Error: 17.7299
- Mean Absolute Error: 4.8853
- R2 Score: 0.9044
--------------------------------------------------
Model performance for Test set
- Root Mean Squared Error: 31.47873784726795
- Mean Absolute Error: 10.540906189087352
- R2 Score:  0.5371398108616912


LinearRegression
Model performance for Training set
- Root Mean Squared Error: 44.5489
- Mean Absolute Error: 13.9512
- R2 Score: 0.3963
--------------------------------------------------
Model performance for Test set
- Root Mean Squared Error: 27.314795092058752
- Mean Absolute Error: 14.394104566144918
- R2 Score:  0.651493324977683


KNeighborsRegressor
Model performance for Training set
- Root Mean Squared Error: 41.3007
- Mean Absolute Error: 10.2545
- R2 Score: 0.4811
--------------------------------------------------
Model performance for Test set
- Root Mean Squared Error: 35.11701345614492
- Mean Absolute Error: 13.37541727672035
- R2 Sco

In [30]:
# Pairwise correlation matrix over every column of the encoded frame.
data.corr()

Unnamed: 0,bhk,price,size,location_ADABARI,location_AHOM GAON,location_AZARA,location_BAGHARBARI,location_BAMUNIMAIDAM,location_BARSAPARA,location_BASISTHA,...,location_SATGAON,location_SIX MILE,location_SURVEY,location_TARUN NAGAR,location_ULUBARI,location_UZAN BAZAR,location_VIP ROAD,location_ZOO ROAD,location_ZOO TINIALI,location_other
bhk,1.000000,0.430774,0.627246,-0.029853,-0.020604,-0.011637,-0.003742,0.049417,-0.029815,-0.015158,...,-0.000680,-0.010988,-0.015158,0.013023,-0.016459,0.003339,0.021161,0.020422,0.049825,0.024403
price,0.430774,1.000000,0.644310,-0.012907,-0.032629,-0.042524,-0.016804,0.004454,-0.028081,-0.014591,...,-0.020675,0.010487,-0.008272,0.034728,0.009566,0.019295,-0.002628,0.084101,0.056343,0.063904
size,0.627246,0.644310,1.000000,-0.030379,-0.014475,-0.027277,-0.021514,0.036932,-0.034195,-0.011326,...,-0.039220,0.029442,-0.034161,0.066050,0.005114,0.006593,-0.002111,0.023183,0.059610,0.032107
location_ADABARI,-0.029853,-0.012907,-0.030379,1.000000,-0.007627,-0.008973,-0.006506,-0.006975,-0.006001,-0.004143,...,-0.005868,-0.015730,-0.004143,-0.006259,-0.007308,-0.006745,-0.008326,-0.007199,-0.008132,-0.023556
location_AHOM GAON,-0.020604,-0.032629,-0.014475,-0.007627,1.000000,-0.012902,-0.009354,-0.010029,-0.008628,-0.005956,...,-0.008437,-0.022616,-0.005956,-0.008998,-0.010508,-0.009697,-0.011971,-0.010350,-0.011692,-0.033868
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
location_UZAN BAZAR,0.003339,0.019295,0.006593,-0.006745,-0.009697,-0.011408,-0.008271,-0.008868,-0.007630,-0.005267,...,-0.007461,-0.019998,-0.005267,-0.007957,-0.009292,1.000000,-0.010586,-0.009152,-0.010339,-0.029948
location_VIP ROAD,0.021161,-0.002628,-0.002111,-0.008326,-0.011971,-0.014084,-0.010211,-0.010948,-0.009419,-0.006502,...,-0.009210,-0.024688,-0.006502,-0.009823,-0.011470,-0.010586,1.000000,-0.011299,-0.012764,-0.036972
location_ZOO ROAD,0.020422,0.084101,0.023183,-0.007199,-0.010350,-0.012177,-0.008829,-0.009466,-0.008144,-0.005622,...,-0.007963,-0.021346,-0.005622,-0.008493,-0.009917,-0.009152,-0.011299,1.000000,-0.011036,-0.031966
location_ZOO TINIALI,0.049825,0.056343,0.059610,-0.008132,-0.011692,-0.013756,-0.009973,-0.010693,-0.009200,-0.006351,...,-0.008996,-0.024113,-0.006351,-0.009594,-0.011203,-0.010339,-0.012764,-0.011036,1.000000,-0.036111


In [31]:
# Correlation between the number of bedrooms ('bhk') and 'price'.
corr_bhk = data['bhk'].corr(data['price'])
# The label previously said "size" although the value printed is the bhk/price correlation.
print('Correlation between bhk and price of the house:', corr_bhk)

Correlation between size and price of the house: 0.430773956873426


In [32]:
# Correlation between the 'size' and 'price' columns.
size_column, price_column = data['size'], data['price']
corr_size = size_column.corr(price_column)
print('Correlation between size and price of the house:', corr_size)


Correlation between size and price of the house: 0.6443095538368909


In [33]:
# Fit a standalone LinearRegression on the scaled training split; the trailing
# fit() call is the cell's last expression, so the fitted estimator is displayed.
from sklearn.linear_model import LinearRegression
regression=LinearRegression()
regression.fit(X_train,y_train)

In [34]:
# Predict prices for the held-out test features; the bare name displays the array.
y_pred=regression.predict(X_test)
y_pred

array([124.69887969,  69.50924517,  69.79352965,  37.75413383,
        46.3098264 ,  52.05120886,  93.06008385,  60.53465581,
        48.32991043,  35.98632186,  87.61363603,  80.69999756,
       128.82978131,  39.50320657,  58.83734002,  75.90196028,
       125.89163806,  39.05743299,  50.75673319,  65.99472878,
        73.2365815 ,  55.11978102, 253.33036371, 210.10816219,
        65.88988204,  78.42010727,  48.99269888,  44.53756311,
        78.27701108,  71.13261647,  32.60579234,  62.26260087,
        62.42663626,  61.04861722,  58.5312889 ,  68.15783515,
        70.5037578 ,  35.19870685, 103.132562  ,  44.27609887,
        38.89903128,  43.33682851,  66.23099567,  80.11058029,
       142.44074599,  75.96890358,  43.09006191,  86.32975037,
        58.57908972,  71.22217869,  63.89771121,  84.79675527,
       100.48718511, 117.80853907,  65.01803026,  62.36078035,
       125.04472991,  63.27783972,  96.35348362,  84.87134596,
        27.73437616,  83.67303624,  84.375876  ,  41.88

In [35]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Print each test-set metric for the linear regression predictions, one per line.
for metric_label, metric_fn in (
    ("R2_score is :", r2_score),
    ('MAE :', mean_absolute_error),
    ('MSE:', mean_squared_error),
):
    print(metric_label, metric_fn(y_test, y_pred))

R2_score is : 0.651493324977683
MAE : 14.394104566144918
MSE: 746.0980309211569


In [36]:
# Persist the fitted linear regression model to disk.
with open('regression_model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(regression))