<h2><b>Import needed library

In [17]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
import pickle

In [2]:
df = pd.read_csv("bengaluru.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


<h2><b> Data Cleaning

In [3]:
#remove unused column
df.drop(["availability", "society", "area_type"], axis="columns", inplace=True)

In [4]:
#look unique values each columns
for col in df:
    print(f"{col} : {df[col].unique()}")

location : ['Electronic City Phase II' 'Chikka Tirupathi' 'Uttarahalli' ...
 '12th cross srinivas nagar banshankari 3rd stage' 'Havanur extension'
 'Abshot Layout']
size : ['2 BHK' '4 Bedroom' '3 BHK' '4 BHK' '6 Bedroom' '3 Bedroom' '1 BHK'
 '1 RK' '1 Bedroom' '8 Bedroom' '2 Bedroom' '7 Bedroom' '5 BHK' '7 BHK'
 '6 BHK' '5 Bedroom' '11 BHK' '9 BHK' nan '9 Bedroom' '27 BHK'
 '10 Bedroom' '11 Bedroom' '10 BHK' '19 BHK' '16 BHK' '43 Bedroom'
 '14 BHK' '8 BHK' '12 Bedroom' '13 BHK' '18 Bedroom']
total_sqft : ['1056' '2600' '1440' ... '1133 - 1384' '774' '4689']
bath : [ 2.  5.  3.  4.  6.  1.  9. nan  8.  7. 11. 10. 14. 27. 12. 16. 40. 15.
 13. 18.]
balcony : [ 1.  3. nan  2.  0.]
price : [ 39.07 120.    62.   ...  40.14 231.   488.  ]


In [5]:
#check for null values
df.isnull().sum()

#drop null value
df = df.dropna()
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [6]:
#remove text from size column
df["bhk"] = df["size"].apply(lambda x: int(x.split(' ')[0]))
df

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.00,4
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.00,3
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.00,3
4,Kothanur,2 BHK,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715,3.0,3.0,112.00,3
13315,Whitefield,5 Bedroom,3453,4.0,0.0,231.00,5
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,1.0,60.00,2
13318,Padmanabhanagar,4 BHK,4689,4.0,1.0,488.00,4


In [7]:
#range value detect
df.total_sqft.unique()

#function for filterized range value
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

#range value
df[df["total_sqft"].apply(is_float) == False]

#Range to average value
def converter(x):
    token = x.split("-")
    if len(token) == 2:
        x = (float(token[0]) + float(token[1]))/2
    try:
        return float(x)
    except:
        return None

#convert range to 1 value
df["sqft"] = df.total_sqft.apply(converter)

In [8]:
#make other categories for minority location dataset
lst = df.groupby("location")["location"].agg("count")
lstless = lst[lst<50]

#apply other function
df.location = df.location.apply(lambda x: "other" if x in lstless else x)
len(df.location.unique())

51

<h2><b> Remove Outliers value

In [9]:
#make price_per_sqft column for house
df["price_per_sqft"] = df.price*100000/df.sqft
df.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk,sqft,price_per_sqft
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2,1056.0,3699.810606
1,other,4 Bedroom,2600,5.0,3.0,120.0,4,2600.0,4615.384615
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,3,1440.0,4305.555556
3,other,3 BHK,1521,3.0,1.0,95.0,3,1521.0,6245.890861
4,Kothanur,2 BHK,1200,2.0,1.0,51.0,2,1200.0,4250.0


In [10]:
#remove total bedroom unmatch with total sqft (300 per bedroom)
df = df[df.sqft/df.bhk>300]

In [11]:
#too cheap/expensive price_per_sqft based on location (extreme value will remove to reduce bias)
df.price_per_sqft.describe()

#function to remove outliers based on location (std deviation)
def outlierRem(df):
    df_out = pd.DataFrame()
    for key, sub in df.groupby("location"):
        mean = np.mean(sub.price_per_sqft)
        std = np.std(sub.price_per_sqft)
        reducer = sub[(sub.price_per_sqft>(mean-std)) & (sub.price_per_sqft<=(mean+std))]
        df_out = pd.concat([df_out, reducer], ignore_index=True)
    return df_out

df = outlierRem(df)
df.shape

(10201, 9)

In [12]:
#remove 2 BHK house that have higher price than 3 BHK on same location
def bhkoutliers(df):
    exc = np.array([])
    for location, location_df in df.groupby("location"):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby("bhk"):
            bhk_stats[bhk] = {
                "mean" : np.mean(bhk_df.price_per_sqft),
                "std" : np.std(bhk_df.price_per_sqft),
                "count" : bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby("bhk"):
            stats = bhk_stats.get(bhk-1)
            if stats and stats["count"]>5:
                exc = np.append(exc, bhk_df[bhk_df.price_per_sqft<(stats["mean"])].index.values)

    return df.drop(exc, axis="index")

df = bhkoutliers(df)
df.shape

(6139, 9)

In [13]:
#remove house that have bathroom more than bedroom
df.bath.unique()
df = df[df.bath<df.bhk+2]
df.shape

(6087, 9)

<h2><b> Data preparation

In [14]:
#remove unused column
df.head()
df.drop(["balcony", "total_sqft", "size", "price_per_sqft", "location"], axis="columns", inplace=True)
df

Unnamed: 0,bath,price,bhk,sqft
0,2.0,72.0,2,1080.0
1,2.0,93.0,2,1270.0
2,2.0,100.0,3,1420.0
3,3.0,150.0,3,1850.0
4,2.0,94.0,2,1245.0
...,...,...,...,...
10195,2.0,60.0,2,1015.0
10197,3.0,134.0,3,1805.0
10198,3.0,112.0,3,1715.0
10199,4.0,488.0,4,4689.0


In [15]:
x = df.drop("price", axis='columns')
y = df.price

from sklearn.model_selection import train_test_split
xtr, xte, ytr, yte = train_test_split(x, y, test_size=.3, random_state=10)

<h2><b> Make linear regression model

In [16]:
def find_best(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                
            }

        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best(x,y)

NameError: name 'ShuffleSplit' is not defined

In [None]:
linear = LinearRegression()
linear.fit(xtr, ytr)
linear.score(xte, yte)

df.head()

Unnamed: 0,bath,price,bhk,sqft
0,2.0,72.0,2,1080.0
1,2.0,93.0,2,1270.0
2,2.0,100.0,3,1420.0
3,3.0,150.0,3,1850.0
4,2.0,94.0,2,1245.0


<h2><b> Pickling model

In [None]:
pickle.dump(linear, open("regression.pkl", "wb"))