# ML Hackathon - Bangalore House Price Prediction
## Goal :
The objective of this hackathon is to showcase your ML1 learning, particularly feature engineering skills, by leveraging primary and secondary datasets to develop a regression model for predicting house prices in Bengaluru. This predictive model will help users make purchasing/renting decisions by predicting fair housing prices.

### Metric to measure :
The measure of accuracy will be RMSE (root mean square error). The predicted Price for each house in the test dataset will be compared with the actual Price to calculate the RMSE value of the entire prediction. The lower the RMSE value, the better the model will be.

In [None]:
!pip install pyforest

Collecting pyforest
  Downloading pyforest-1.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyforest
  Building wheel for pyforest (setup.py) ... [?25l[?25hdone
  Created wheel for pyforest: filename=pyforest-1.1.2-py2.py3-none-any.whl size=15900 sha256=24fc7261845a1fdce639708160b44049d18ccc64259088dabd551279ef18bf7b
  Stored in directory: /root/.cache/pip/wheels/c5/ca/73/5cdc3d087111bfbdef90be5457aa03c00c0e32241b2752445c
Successfully built pyforest
Installing collected packages: pyforest
Successfully installed pyforest-1.1.2


In [None]:
# import all the libraries
import pyforest
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pickle
import re
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Location of the hackathon training set (Colab form field).
FILE_PATH = "https://raw.githubusercontent.com/HarpyTech/DSAI-MTech/main/ML-1/data-sets/hackathon/train.csv" #@param {type:"string"}

# NOTE(review): `pd` is never imported explicitly in this notebook; it is
# presumably provided by pyforest's lazy imports - confirm before running
# outside that environment.
house_prices = pd.read_csv(FILE_PATH)

# Preview the first rows of the raw data.
house_prices.head()

<IPython.core.display.Javascript object>

Unnamed: 0,ID,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [None]:
class DropInSignificantColumns(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed list of columns from a DataFrame.

    Parameters
    ----------
    insignificant_columns : list of str
        Labels of the columns removed by ``transform``.
    """

    def __init__(self, insignificant_columns):
        self.insignificant_columns = insignificant_columns

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame without the configured columns.

        The input frame is left untouched: an in-place drop would silently
        change the caller's data and make a second call fail because the
        columns are already gone.

        Raises
        ------
        KeyError
            If any configured column is missing from ``X``.
        """
        return X.drop(columns=self.insignificant_columns)

In [None]:
class TotalSqFtTransformer(BaseEstimator, TransformerMixin):
    """Parse the free-text ``total_sqft`` column into a numeric area.

    Every entry is converted to square feet by ``convert_to_sqft`` and then
    multiplied by 0.09290304, so the resulting ``total_sqft`` column holds
    square metres even though it keeps its original name.
    """

    # Square feet -> square metres.
    SQFT_TO_SQM = 0.09290304

    # Multiplier that turns a quantity in the named unit into square feet.
    # Insertion order is the order in which the unit names are searched.
    UNIT_TO_SQFT = {
        'Sq. Meter': 10.7639,
        'Sq. Yards': 9,
        'Acres': 43560,
        'Cents': 435.6,
        'Guntha': 1089,
        'Grounds': 2400,
    }

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``total_sqft`` column is numeric.

        Working on a copy keeps the caller's raw text values intact, so the
        same frame can safely be transformed more than once.
        """
        X = X.copy()
        X['total_sqft'] = (
            X['total_sqft'].apply(self.convert_to_sqft) * self.SQFT_TO_SQM
        )
        return X

    @staticmethod
    def convert_to_sqft(x):
        """Convert one raw ``total_sqft`` entry to square feet.

        Accepted forms:

        * a number or numeric string (``"1056"``) -> that value;
        * a range ``"low - high"`` -> the midpoint of the two bounds;
        * a number followed by one of the units in ``UNIT_TO_SQFT``
          (``"34.46Sq. Meter"``) -> the number scaled to square feet.

        Anything else, including missing values, yields ``nan`` instead of
        raising, so a single malformed cell cannot abort the whole column.
        """
        nan = float('nan')

        # Already-numeric or missing cells: pass numbers through.
        if not isinstance(x, str):
            try:
                return float(x)
            except (TypeError, ValueError):
                return nan

        try:
            return float(x)
        except ValueError:
            pass

        bounds = x.split('-')
        if len(bounds) == 2:
            try:
                return (float(bounds[0]) + float(bounds[1])) / 2
            except ValueError:
                # A range whose bounds are not plain numbers is ambiguous.
                return nan

        number = re.search(r"\d+\.\d+|\d+", x)
        if number is None:
            return nan
        for unit, factor in TotalSqFtTransformer.UNIT_TO_SQFT.items():
            if unit in x:
                return float(number.group()) * factor
        return nan

In [None]:
class SizeTransformer(BaseEstimator, TransformerMixin):
    """Replace the text ``size`` column (e.g. ``"2 BHK"``) by its leading integer."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``size`` column holds the parsed counts.

        The input frame is not modified. With an in-place update, running the
        transformer twice on the same frame would feed the already-parsed
        numbers back into ``safe_extract_int`` and turn the whole column into
        missing values.
        """
        X = X.copy()
        X['size'] = X['size'].apply(self.safe_extract_int)
        return X

    @staticmethod
    def safe_extract_int(x):
        """Return the integer before the first space of ``x``.

        Returns ``None`` when ``x`` is not a string or its first token is not
        an integer (for example a missing value).
        """
        try:
            return int(x.split(' ')[0])
        except (ValueError, IndexError, AttributeError):
            return None

In [None]:
class OutlierRemover(BaseEstimator, TransformerMixin):
    """Row filter that applies the 1.5 * IQR rule to three numeric columns.

    ``total_sqft``, ``size`` and ``balcony`` are filtered one after another,
    each on the rows that survived the previous filter. Because rows are
    removed, the result can be shorter than the input.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to rows inside the IQR fences of each column."""
        for feature in ('total_sqft', 'size', 'balcony'):
            X = self.remove_outliers_quartile(X, feature)
        return X

    @staticmethod
    def remove_outliers_quartile(X, column):
        """Keep the rows of ``X`` whose ``column`` lies within the IQR fences.

        The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (inclusive),
        with the quartiles taken from ``X[column]`` itself. Rows whose value
        is missing fail the comparison and are therefore dropped as well.
        """
        values = X[column]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile
        inside_fences = values.between(
            first_quartile - 1.5 * spread,
            third_quartile + 1.5 * spread,
        )
        return X[inside_fences]

In [None]:
class NaNRemover(BaseEstimator, TransformerMixin):
    """Drop the rows that have no ``size`` or no ``bath`` value.

    Because rows are removed, the result can be shorter than the input; any
    target vector has to be realigned by the caller.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame without rows missing ``size`` or ``bath``.

        The input frame is left untouched so the caller's data is not
        silently shortened.
        """
        return X.dropna(subset=['size', 'bath'])

In [None]:
# Separate the predictors from the target: X is every column except `price`,
# Y is the `price` column itself. Both keep the row index of `house_prices`.
X = house_prices.drop(columns=['price'])
Y = house_prices['price']

In [None]:
insignificant_columns = ['ID', 'availability', 'society', 'location']

# Row-level cleaning is done here, before the train/test split, because every
# row removed from X has to be removed from Y as well.
X = NaNRemover().transform(X)
# The transformer must be *applied*; assigning the bare object to X is what
# produced "TypeError: ... object is not subscriptable".
X = DropInSignificantColumns(insignificant_columns).transform(X)

# The IQR filter compares numbers, but `total_sqft` and `size` are still text
# at this point. Evaluate the filter on a parsed copy and keep only the
# surviving row labels, so X itself still carries the raw values.
parsed = SizeTransformer().transform(TotalSqFtTransformer().transform(X.copy()))
X = X.loc[OutlierRemover().transform(parsed).index]

# Keep the target aligned with the rows that are left.
Y = Y.loc[X.index]

X.describe()

<bound method NDFrame.head of           ID             area_type   availability  \
0          0  Super built-up  Area         19-Dec   
1          1            Plot  Area  Ready To Move   
2          2        Built-up  Area  Ready To Move   
3          3  Super built-up  Area  Ready To Move   
4          4  Super built-up  Area  Ready To Move   
...      ...                   ...            ...   
10651  10651            Plot  Area  Ready To Move   
10652  10652  Super built-up  Area  Ready To Move   
10653  10653  Super built-up  Area  Ready To Move   
10654  10654  Super built-up  Area  Ready To Move   
10655  10655  Super built-up  Area  Ready To Move   

                             location       size  society total_sqft  bath  \
0            Electronic City Phase II      2 BHK  Coomee        1056   2.0   
1                    Chikka Tirupathi  4 Bedroom  Theanmp       2600   5.0   
2                         Uttarahalli      3 BHK      NaN       1440   2.0   
3                  Li

TypeError: 'DropInSignificantColumns' object is not subscriptable

In [None]:
# Hold out 15% of the rows for evaluation; the fixed random_state makes the
# split reproducible. X and Y must have the same number of rows here.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=42)

In [None]:


numerical_features = ['total_sqft', 'bath', 'balcony', 'size']
categorical_features = ['area_type']

# Numeric columns: fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: one-hot encode; categories unseen during fit are
# encoded as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Only the listed features are passed on; every other column is dropped by the
# ColumnTransformer default, so no separate column-removal step is needed.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Steps that remove rows (NaN removal, outlier removal) must NOT be part of
# the pipeline: a pipeline step can only change X, so y keeps its original
# length and fitting fails with "inconsistent numbers of samples". Such
# filtering belongs before the train/test split. The steps kept here preserve
# the row count, which also lets the fitted pipeline score raw, unfiltered
# data.
model_pipeline = Pipeline(steps=[
    ('total_sqft_transformer', TotalSqFtTransformer()),
    ('size_transformer', SizeTransformer()),
    ('preprocessor', preprocessor),
    # Fixed seed so repeated runs give the same forest, matching the seeded split.
    ('model', RandomForestRegressor(random_state=42))
])

model_pipeline


In [None]:
from sklearn.metrics import mean_squared_error
# Fit the whole pipeline on the training split.
model_pipeline.fit(X_train, y_train)

# Score on the held-out split; RMSE is the competition metric, computed as
# the square root of the mean squared error.
y_pred = model_pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)


<bound method NDFrame.head of           ID             area_type   availability               location  \
10137  10137            Plot  Area  Ready To Move     Venkateswara Nagar   
8412    8412  Super built-up  Area  Ready To Move          BTM 2nd Stage   
5483    5483        Built-up  Area  Ready To Move                 Hebbal   
2492    2492  Super built-up  Area         18-Aug               Sarjapur   
3355    3355  Super built-up  Area  Ready To Move           Babusapalaya   
...      ...                   ...            ...                    ...   
5734    5734  Super built-up  Area  Ready To Move  Raja Rajeshwari Nagar   
5191    5191  Super built-up  Area         21-May                 Kogilu   
5390    5390  Super built-up  Area  Ready To Move       Garudachar Palya   
860      860            Plot  Area  Ready To Move  Raja Rajeshwari Nagar   
7270    7270  Super built-up  Area  Ready To Move  Silver Springs Layout   

            size  society total_sqft  bath  balcony  
101

ValueError: Found input variables with inconsistent numbers of samples: [7605, 9057]

In [None]:
class DataPreprocessor:
    """Step-by-step cleaning of a house-price DataFrame.

    The current state is kept in ``self.data``. Each step replaces
    ``self.data`` with a new frame, so the frame handed to the constructor is
    never modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data; expected to contain the columns referenced by the steps
        that are called.
    """

    def __init__(self, data):
        self.data = data
        self.insignificant_columns = ['ID', 'availability', 'society', 'location']

    def transform(self):
        """Run every cleaning step in order and return the cleaned frame."""
        self.remove_nan()
        self.drop_inconsitant_columns()
        self.transform_total_sqft()
        self.transform_size()
        self.remove_outliers()

        return self.data

    def remove_nan(self):
        """Drop the rows with a missing ``location``, ``size`` or ``bath``."""
        self.data = self.data.dropna(subset=['location', 'size', 'bath'])

    def drop_inconsitant_columns(self):
        """Drop the columns listed in ``self.insignificant_columns``."""
        self.data = self.data.drop(columns=self.insignificant_columns)

    def transform_total_sqft(self):
        """Parse ``total_sqft`` into a number via ``TotalSqFtTransformer``."""
        self.data = TotalSqFtTransformer().transform(self.data)

    def transform_size(self):
        """Parse ``size`` into a number via ``SizeTransformer``."""
        self.data = SizeTransformer().transform(self.data)

    def remove_outliers(self):
        """Filter rows via ``OutlierRemover``."""
        self.data = OutlierRemover().transform(self.data)

    def show_nan(self):
        """Return a per-column summary of missing values, worst column first.

        The result has one row per column of ``self.data`` with:

        * ``Column`` - the column label;
        * ``Rows with Data`` - despite its label, the number of *missing*
          values in the column (label kept for backward compatibility);
        * ``Total Rows`` - the number of rows in ``self.data``;
        * ``Null Values`` - the missing share formatted as a percentage string.

        Rows are ordered by the numeric missing count. Ordering by the
        percentage string would compare text, placing e.g. ``"8.33%"`` above
        ``"41.67%"``.
        """
        total_rows = len(self.data)
        null_counts = self.data.isnull().sum()

        null_values = []
        for column, count in null_counts.items():
            # An empty frame has no missing values; avoid dividing by zero.
            share = (count / total_rows) * 100 if total_rows else 0.0
            null_values.append(
                [column, count, total_rows, f"{np.round(share, 2)}%"]
            )

        null_data = pd.DataFrame(
            null_values,
            columns=['Column', "Rows with Data", "Total Rows", "Null Values"],
        )

        # Stable sort keeps the original column order among ties.
        return null_data.sort_values(
            by="Rows with Data", ascending=False, kind="stable"
        )


In [None]:
data_processor = DataPreprocessor(house_prices.copy())



# data_processor.data.head()

In [None]:
data_processor.show_nan()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Column,Rows with Data,Total Rows,Null Values
5,society,4428,10656,41.55%
8,balcony,504,10656,4.73%
7,bath,65,10656,0.61%
4,size,14,10656,0.13%
3,location,1,10656,0.01%
0,ID,0,10656,0.0%
1,area_type,0,10656,0.0%
2,availability,0,10656,0.0%
6,total_sqft,0,10656,0.0%
9,price,0,10656,0.0%


In [None]:
data_processor.remove_nan()

data_processor.show_nan()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Column,Rows with Data,Total Rows,Null Values
5,society,4425,10590,41.78%
8,balcony,439,10590,4.15%
0,ID,0,10590,0.0%
1,area_type,0,10590,0.0%
2,availability,0,10590,0.0%
3,location,0,10590,0.0%
4,size,0,10590,0.0%
6,total_sqft,0,10590,0.0%
7,bath,0,10590,0.0%
9,price,0,10590,0.0%


In [None]:
data_processor.drop_inconsitant_columns()

data_processor.show_nan()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Column,Rows with Data,Total Rows,Null Values
4,balcony,439,10590,4.15%
0,area_type,0,10590,0.0%
1,size,0,10590,0.0%
2,total_sqft,0,10590,0.0%
3,bath,0,10590,0.0%
5,price,0,10590,0.0%


In [None]:
data_processor.transform_total_sqft()

data_processor.show_nan()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Column,Rows with Data,Total Rows,Null Values
4,balcony,439,10590,4.15%
2,total_sqft,1,10590,0.01%
0,area_type,0,10590,0.0%
1,size,0,10590,0.0%
3,bath,0,10590,0.0%
5,price,0,10590,0.0%


In [None]:
data_processor.transform_size()

data_processor.show_nan()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Column,Rows with Data,Total Rows,Null Values
4,balcony,439,10590,4.15%
2,total_sqft,1,10590,0.01%
0,area_type,0,10590,0.0%
1,size,0,10590,0.0%
3,bath,0,10590,0.0%
5,price,0,10590,0.0%


In [None]:
data_processor.remove_outliers()

data_processor.show_nan()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Column,Rows with Data,Total Rows,Null Values
0,area_type,0,8962,0.0%
1,size,0,8962,0.0%
2,total_sqft,0,8962,0.0%
3,bath,0,8962,0.0%
4,balcony,0,8962,0.0%
5,price,0,8962,0.0%


In [None]:
data_processor.data.describe()

Unnamed: 0,size,total_sqft,bath,balcony,price
count,8962.0,8962.0,8962.0,8962.0,8962.0
mean,2.454363,121.447588,2.327605,1.552109,80.487554
std,0.694152,37.247709,0.729911,0.788983,60.344548
min,1.0,25.641239,1.0,0.0,8.0
25%,2.0,99.870768,2.0,1.0,47.0
50%,2.0,114.270739,2.0,2.0,65.0
75%,3.0,141.189395,3.0,2.0,94.9975
max,4.0,235.973722,7.0,3.0,1063.0


In [None]:
# Location of the hackathon test set (Colab form field); unlike the training
# file it has no `price` column.
TEST_FILE_PATH = "https://raw.githubusercontent.com/HarpyTech/DSAI-MTech/main/ML-1/data-sets/hackathon/test.csv" #@param {type:"string"}

test_data = pd.read_csv(TEST_FILE_PATH)

# Preview the first rows of the raw test data.
test_data.head()

<IPython.core.display.Javascript object>

Unnamed: 0,ID,area_type,availability,location,size,society,total_sqft,bath,balcony
0,0,Super built-up Area,Ready To Move,Chamrajpet,2 BHK,,650,1.0,1.0
1,1,Super built-up Area,Ready To Move,7th Phase JP Nagar,3 BHK,SrncyRe,1370,2.0,1.0
2,2,Super built-up Area,Ready To Move,Whitefield,3 BHK,AjhalNa,1725,3.0,2.0
3,3,Built-up Area,Ready To Move,Jalahalli,2 BHK,,1000,2.0,0.0
4,4,Plot Area,Ready To Move,TC Palaya,1 Bedroom,,1350,1.0,0.0


In [None]:
test_preprocessor = DataPreprocessor(test_data.copy())

test_preprocessor.show_nan()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Column,Rows with Data,Total Rows,Null Values
5,society,1074,2664,40.32%
8,balcony,105,2664,3.94%
7,bath,8,2664,0.3%
4,size,2,2664,0.08%
0,ID,0,2664,0.0%
1,area_type,0,2664,0.0%
2,availability,0,2664,0.0%
3,location,0,2664,0.0%
6,total_sqft,0,2664,0.0%


In [None]:
class TransformCatToNum(BaseEstimator, TransformerMixin):
    """Convenience transformer that parses both text-encoded numeric columns.

    Applies ``TotalSqFtTransformer`` followed by ``SizeTransformer``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` after parsing ``total_sqft`` and then ``size``."""
        with_numeric_area = TotalSqFtTransformer().transform(X)
        return SizeTransformer().transform(with_numeric_area)

In [None]:
# Apply only the column-level steps to the test set. The row-removing steps
# (remove_nan, remove_outliers) are deliberately not called here, so every
# test row stays available for prediction.
test_preprocessor.transform_total_sqft()
test_preprocessor.transform_size()
test_preprocessor.drop_inconsitant_columns()

# Remaining gaps in the test set after parsing.
test_preprocessor.show_nan()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Column,Rows with Data,Total Rows,Null Values
4,balcony,105,2664,3.94%
3,bath,8,2664,0.3%
1,size,2,2664,0.08%
0,area_type,0,2664,0.0%
2,total_sqft,0,2664,0.0%


In [None]:
data_processor.show_nan()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Column,Rows with Data,Total Rows,Null Values
0,area_type,0,8962,0.0%
1,size,0,8962,0.0%
2,total_sqft,0,8962,0.0%
3,bath,0,8962,0.0%
4,balcony,0,8962,0.0%
5,price,0,8962,0.0%


In [None]:
# prompt: split the data set inot test train using the data_preprocessor.data and target feature is price

# Split the cleaned training data (data_processor.data) into features and the
# `price` target, then hold out 15% of the rows for evaluation with a fixed
# seed.
X = data_processor.data.drop(columns=['price'])
Y = data_processor.data['price']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=42)


In [None]:
# prompt: create model pipeline for the OLS model along with data preprocessor steps and fit the model

from sklearn.linear_model import LinearRegression
# Ordinary least squares on top of the shared `preprocessor` (the
# ColumnTransformer defined in an earlier cell). The input is expected to be
# already cleaned and parsed, since no parsing step is included here.
ols_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

ols_pipeline


In [None]:
ols_pipeline.fit(X_train,y_train)

In [None]:
ols_pipeline.score(X_test,y_test)

0.4949041626748434

In [None]:
# RMSE of the OLS pipeline on the held-out split. `mean_squared_error` is
# imported in an earlier cell.
y_pred = ols_pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

<IPython.core.display.Javascript object>

RMSE: 40.31217109345641


In [None]:
y_pred

array([  9.80291378,  53.65372053,  67.55846576, ..., 180.68871705,
        93.84847374,  48.28140827])

In [None]:
test_data.head()

Unnamed: 0,ID,area_type,availability,location,size,society,total_sqft,bath,balcony
0,0,Super built-up Area,Ready To Move,Chamrajpet,2 BHK,,650,1.0,1.0
1,1,Super built-up Area,Ready To Move,7th Phase JP Nagar,3 BHK,SrncyRe,1370,2.0,1.0
2,2,Super built-up Area,Ready To Move,Whitefield,3 BHK,AjhalNa,1725,3.0,2.0
3,3,Built-up Area,Ready To Move,Jalahalli,2 BHK,,1000,2.0,0.0
4,4,Plot Area,Ready To Move,TC Palaya,1 Bedroom,,1350,1.0,0.0


In [None]:
test_preprocessor.data.head()

Unnamed: 0,area_type,size,total_sqft,bath,balcony
0,Super built-up Area,2.0,60.386976,1.0,1.0
1,Super built-up Area,3.0,127.277165,2.0,1.0
2,Super built-up Area,3.0,160.257744,3.0,2.0
3,Built-up Area,2.0,92.90304,2.0,0.0
4,Plot Area,1.0,125.419104,1.0,0.0


In [None]:
test_data['ID'].shape


(2664,)

In [None]:
test_preprocessor.data.shape

(2664, 5)

In [None]:
# Start the submission frame from the IDs of the raw test set.
predicted_data_frame = pd.DataFrame({'ID': test_data['ID']})

predicted_data_frame.head()

<IPython.core.display.Javascript object>

Unnamed: 0,ID
0,0
1,1
2,2
3,3
4,4


In [None]:
# Predict a price for every preprocessed test row and attach it to the IDs.
# The prediction array is assigned by position, so test_preprocessor.data must
# have the same row count and order as test_data.
# NOTE(review): the linear model's output is unbounded; the describe() output
# below shows a negative minimum and a very large maximum - consider clipping
# before submission.
predicted_data_frame['price'] = ols_pipeline.predict(test_preprocessor.data)

In [None]:
predicted_data_frame.head()

Unnamed: 0,ID,price
0,0,5.497817
1,1,74.21723
2,2,113.518069
3,3,53.727925
4,4,130.937226


In [None]:
predicted_data_frame.describe()

Unnamed: 0,ID,price
count,2664.0,2664.0
mean,1331.5,116.686179
std,769.174883,454.007977
min,0.0,-16.663748
25%,665.75,58.229436
50%,1331.5,77.819864
75%,1997.25,124.349343
max,2663.0,22663.227901


In [None]:
predicted_data_frame.to_csv('predicted_data.csv', index=False)

In [None]:
ols_pipeline.fit(X,Y)

In [None]:
ols_pipeline.score(X,Y)

0.46707096138486126