# Data Cleaning

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import matplotlib
matplotlib.rcParams["figure.figsize"] = (10, 20)

In [None]:
df = pd.read_csv("./datasets/bengaluru_house_prices.csv")
df

In [None]:
df2 = df.drop(["area_type", "society", "balcony", "availability"], axis=1)
df2

In [None]:
df.shape

In [None]:
df2

In [None]:
df2.isnull().sum()

In [None]:
df3 = df2.dropna()
df3.isnull().sum()

In [None]:
df3["size"].unique()

In [None]:
df3["bhk"] = df3["size"].apply(lambda x: int(x.split(" ")[0]))

In [None]:
df3

In [None]:
df3.total_sqft.unique()

In [None]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True


In [None]:
df3[~df3["total_sqft"].apply(is_float)]

In [None]:
def convert_sqrt_to_float(x):
    tokens = x.split("-")
    if len(tokens) == 2:
        return (float(tokens[0]) + float(tokens[1])) / 2

    try:
        return float(x)
    except:
        return None
    


In [None]:
convert_sqrt_to_float("266")


In [None]:
df4 = df3.copy()
df4["total_sqft"] = df4["total_sqft"].apply(convert_sqrt_to_float)
df4.head()

In [None]:
df4.loc[30]

# Feature Engineering


In [None]:
df5 = df4.copy()

In [None]:
df5["sprice_per_sqft"] = df5["price"] / df5["total_sqft"] * 10000
df5

In [None]:
df5.location = df5.location.apply(lambda x: x.strip())

location_stats = df5.groupby("location")["location"].agg("count").sort_values(ascending=False)
location_stats.head()

In [None]:
location_less_than_10 = location_stats[location_stats <= 10]
location_less_than_10

In [None]:
len(df5.location.unique())

In [None]:
df5.location = df5.location.apply(lambda x: "other" if x in location_less_than_10 else x)
len(df5.location.unique())

# Outlier Removal

In [None]:
df5

In [None]:
df5[df5.total_sqft / df5.bhk < 300].head()

In [None]:
df5.shape

In [None]:
# remove rows have (total_sqft / bhk) less than 300
df6 = df5[~(df5.total_sqft / df5.bhk < 300)]
df6.shape

In [None]:
def remove_pps_outlier(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby("location"):
        mean = np.mean(df.sprice_per_sqft)
        std= np.std(df.sprice_per_sqft)
        reduced_df = subdf[(subdf.sprice_per_sqft > (mean - std)) & (subdf.sprice_per_sqft <= (mean + std))]
        df_out = pd.concat([df_out, reduced_df], ignore_index=True)
    return df_out

df7 = remove_pps_outlier(df6)
df7.shape

In [None]:
df7

In [None]:
def remove_bhk_outlier(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby("location"):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby("bhk"):
            bhk_stats[bhk] = {
                "mean": np.mean(bhk_df.sprice_per_sqft),
                "std": np.std(bhk_df.sprice_per_sqft),
                "count": bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby("bhk"):
            stats = bhk_stats.get(bhk-1)
            if stats and stats["count"] > 5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.sprice_per_sqft< (stats["mean"])].index.values)
    return df.drop(exclude_indices, axis="index")

df8 = remove_bhk_outlier(df7)
df8.shape

# Build Model

In [None]:
df9 = df8[df8.bath < df8.bhk + 2]
df9

In [None]:
df10 = df9.drop(["size", "sprice_per_sqft"], axis="columns")
df10

In [None]:
dummies_location = pd.get_dummies(df10.location)
dummies_location.head(3)

In [None]:
df11 = pd.concat([df10.drop(columns=["location"], axis="columns"), dummies_location.drop(columns=["other"], axis="columns")], axis="columns")
df11

In [None]:
df11

In [None]:
X = df11.drop("price", axis="columns")
X

In [None]:
y = df11["price"]
y

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

model = LinearRegression(n_jobs=-2)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8336554371344911

**Use cross validation for evaluate the model**

In [None]:
from sklearn.model_selection import ShuffleSplit, cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
cross_val_score(LinearRegression(), X, y, cv=cv)

**Use `GridSearchCV` to find the best model**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                "n_jobs": [-2, -1, 1]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['squared_error','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

**Export the tested model to a pickle file**

In [108]:
import pickle
with open("./Bengaluru.pickle", "wb") as f:
    pickle.dump(model, f)

**Export location and column information to a file that will be useful later on in our prediction application**

In [109]:
import json
columns = {
    "data_columns": [col.lower() for col in X.columns]
}
with open("./columns.json", "w") as f:
    f.write(json.dumps(columns))