In [1]:
from pycaret.regression import interpret_model
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import time

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

import joblib

# Loading Dataset

In [2]:
# Load the raw listings; the relative path is resolved against the kernel's working directory.
csv_path = 'housePrice.csv'
df = pd.read_csv(csv_path)
# Preview the first five rows to confirm the file parsed as expected.
df.head(5)

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000.0,61666.67
1,60,1,True,True,True,Shahran,1850000000.0,61666.67
2,79,2,True,True,True,Pardis,550000000.0,18333.33
3,95,2,True,True,True,Shahrake Qods,902500000.0,30083.33
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,233333.33


In [3]:
# Show five random rows; a fixed seed keeps the displayed sample stable across
# Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
178,44,1,True,True,False,Andisheh,350000000.0,11666.67
1897,152,3,True,True,True,Saadat Abad,9800000000.0,326666.67
2091,75,2,True,False,True,Heshmatieh,3000000000.0,100000.0
2805,140,3,True,True,True,Mirdamad,8000000000.0,266666.67
1420,51,1,True,True,True,Tenant,1380000000.0,46000.0


In [4]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

(3479, 8)

In [5]:
# Confirm that read_csv returned a DataFrame.
type(df)

pandas.core.frame.DataFrame

In [6]:
# Drop the USD price column and keep only "Price".
# errors="ignore" makes the cell safe to re-run: without it, a second execution
# raises KeyError because the column is already gone from the rebound `df`.
df = df.drop(columns="Price(USD)", errors="ignore")

In [10]:
# Inspect column dtypes and non-null counts before any type conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3479 entries, 0 to 3478
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Area       3479 non-null   object 
 1   Room       3479 non-null   int64  
 2   Parking    3479 non-null   bool   
 3   Warehouse  3479 non-null   bool   
 4   Elevator   3479 non-null   bool   
 5   Address    3456 non-null   object 
 6   Price      3479 non-null   float64
dtypes: bool(3), float64(1), int64(1), object(2)
memory usage: 119.0+ KB


In [11]:
# List the remaining column labels.
df.columns

Index(['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'Address', 'Price'], dtype='object')

In [12]:
# Summary statistics; with default arguments only the numeric columns are described.
df.describe()

Unnamed: 0,Room,Price
count,3479.0,3479.0
mean,2.08,5359022710.58
std,0.76,8099934524.33
min,0.0,3600000.0
25%,2.0,1418250000.0
50%,2.0,2900000000.0
75%,2.0,6000000000.0
max,5.0,92400000000.0


In [13]:
# Area is loaded as text; strip the commas (presumably thousands separators —
# TODO confirm against the raw file) and convert to a numeric dtype.
# The vectorised .str accessor replaces a per-row re.sub, and astype(str) lets
# the cell tolerate non-string values, so re-running it after Area is already
# numeric no longer raises TypeError.
area_text = df["Area"].astype(str).str.replace(",", "", regex=False)
# Anything that still fails to parse becomes NaN instead of raising.
df["Area"] = pd.to_numeric(area_text, errors="coerce")

In [14]:
# Re-check dtypes to verify that Area is now numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3479 entries, 0 to 3478
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Area       3479 non-null   int64  
 1   Room       3479 non-null   int64  
 2   Parking    3479 non-null   bool   
 3   Warehouse  3479 non-null   bool   
 4   Elevator   3479 non-null   bool   
 5   Address    3456 non-null   object 
 6   Price      3479 non-null   float64
dtypes: bool(3), float64(1), int64(2), object(1)
memory usage: 119.0+ KB


In [16]:
# Number of missing values in each column.
df.isna().sum(axis=0)

Area          0
Room          0
Parking       0
Warehouse     0
Elevator      0
Address      23
Price         0
dtype: int64

In [17]:
# Remove every row that contains at least one missing value.
# Rebinding instead of inplace=True avoids mutating the frame earlier cells
# displayed and keeps the step chainable.
df = df.dropna()

In [19]:
# Dimensions after removing rows with missing values.
df.shape

(3456, 7)

In [20]:
# Share of listings with / without parking, expressed in percent.
parking_share = df["Parking"].value_counts(normalize=True)
parking_share.mul(100)

Parking
True    84.75
False   15.25
Name: proportion, dtype: float64

In [22]:
# Contingency table: listing counts by parking availability (rows) and room count (columns).
pd.crosstab(index=df["Parking"], columns=df["Room"])

Room,0,1,2,3,4,5
Parking,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,8,243,244,26,1,5
True,2,426,1699,705,69,28
