In [None]:
# Import the libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20, 10)

### Load Dataset

In [None]:
# Load the data from csv to dataframe
df = pd.read_csv("Bengaluru_House_Data.csv")
df.head()

In [None]:
# Size of the dataset
df.shape

In [None]:
# Number of listings in each area_type category
df.groupby('area_type')['area_type'].count()

### Data Cleaning

In [None]:
# Discard the columns that are not expected to help with price prediction
df2 = df.drop(columns=['area_type', 'society', 'balcony', 'availability'])
df2.head()

In [None]:
# Returns total number of NA for that column
df2.isnull().sum()

In [None]:
# Drop NA
df3 = df2.dropna()
df3

In [None]:
df3["size"].unique()

In [None]:
# Derive the bedroom count from the leading number of the "size" text (e.g. "2 BHK" -> 2).
# assign() builds a new frame instead of writing a column into the dropna() result,
# which is the pattern that can trigger pandas' SettingWithCopyWarning.
df3 = df3.assign(bhk=df3["size"].apply(lambda x: int(x.split(" ")[0])))
# Only the last expression of a cell is rendered, so show the suspicious rows explicitly.
display(df3[df3.bhk > 20])
df3.total_sqft.unique()

In [None]:
def is_float(x):
    """Return True when ``x`` can be converted with ``float()``, otherwise False.

    Only the conversion failures raised by ``float()`` itself are treated as
    "not a float": ValueError (unparsable text), TypeError (unsupported type
    such as None) and OverflowError (integer too large). Anything else, e.g.
    KeyboardInterrupt, is allowed to propagate instead of being swallowed.
    """
    try:
        float(x)
    except (ValueError, TypeError, OverflowError):
        return False
    return True

In [None]:
df3[~df3["total_sqft"].apply(is_float)]

In [None]:
def convert_sqft_to_num(x):
    """Convert a total_sqft entry to a float.

    A value made of two numbers separated by a single "-" (a range such as
    "1000 - 2000") is converted to the average of the two numbers. Any other
    value is passed to ``float()``.

    Returns:
        The converted float, or None when the value cannot be interpreted as a
        number or a numeric range. The function never raises for bad input.
    """
    tokens = str(x).split("-")
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            # Not a numeric range (e.g. "-5", "1e-3" or "abc-def"):
            # fall through and try the value as a plain number.
            pass
    try:
        return float(x)
    except (ValueError, TypeError):
        return None

In [None]:
# Build a new frame (df3 is left untouched) whose total_sqft column is converted to numbers
df4 = df3.assign(total_sqft=df3['total_sqft'].apply(convert_sqft_to_num))

In [None]:
df4.head()

### Feature Engineering & Dimensionality Reduction

In [None]:
# Feature engineering: deriving a new feature from the available columns.
# price is multiplied by 100000 before it is divided by the area.
df5 = df4.assign(price_per_sqft=df4["price"] * 100000 / df4["total_sqft"])
df5.head()

In [None]:
# High-dimensionality problem: there are a great many distinct locations.
# Locations that have fewer than 10 listings will be grouped under "other".
df5["location"].unique().size

In [None]:
# Strip leading/trailing whitespace from every location name
df5["location"] = df5["location"].map(str.strip)
# Number of listings per location, most frequent first
location_stats = df5.groupby("location")["location"].count().sort_values(ascending=False)
location_stats

In [None]:
len(location_stats[location_stats<10])

In [None]:
location_stats_less_than_10 = location_stats[location_stats<10]
location_stats_less_than_10

In [None]:
# Dimensionality reduction: collapse every rare location into a single "other" bucket.
# Membership is tested against the index of location_stats_less_than_10 (the location names).
df5["location"] = df5["location"].where(
    ~df5["location"].isin(location_stats_less_than_10.index), "other"
)
len(df5["location"].unique())

### Outlier Detection & Outlier Remover

In [None]:
# We can use standard deviation or domain knowledge to remove outliers.
# For example, domain knowledge tells us that 40 bedrooms in 2000 sq ft is not plausible.
# We assume that one bedroom needs at least 300 sq ft and use that as the threshold.
# So here we divide total_sqft by the number of bedrooms (bhk) and drop the rows
# whose area per bedroom is below the threshold.
# NOTE: rows with a missing total_sqft are kept by this filter, because NaN < 300
# evaluates to False and the mask is negated.
df6 = df5[~(df5["total_sqft"]/df5.bhk<300)]
df6.shape

In [None]:
# Get the statistics for this column, remove the extreme cases.
# Around 68% data lies between mean and one standard deviation
df6["price_per_sqft"].describe()

In [None]:
# We want to find mean standard deviation per location and filter out datapoints which are beyond one standard deviation.
# Remove price per square outliers
def remove_pps_outliers(df):
    """Drop price_per_sqft outliers separately for every location.

    For each group of ``df.groupby('location')`` the mean ``m`` and standard
    deviation ``st`` of ``price_per_sqft`` are computed, and only the rows with
    ``m - st < price_per_sqft <= m + st`` are kept (lower bound exclusive,
    upper bound inclusive).

    Args:
        df: DataFrame with at least ``location`` and ``price_per_sqft`` columns.

    Returns:
        A new DataFrame with the kept rows of all locations and a fresh
        0..n-1 index. An input without any group yields an empty frame.
    """
    kept_parts = []
    for _, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        within_one_std = (subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))
        kept_parts.append(subdf[within_one_std])
    if not kept_parts:
        # pd.concat rejects an empty list; return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a frame inside the loop (which copies every time).
    return pd.concat(kept_parts, ignore_index=True)
df7 = remove_pps_outliers(df6)
df7

In [None]:
# Now we want to check whether, within one location and for a comparable area,
# 3-bedroom apartments are priced higher than 2-bedroom apartments.
# A scatter plot shows how many cases violate that expectation.
def plot_scatter_chart(df, location):
    """Scatter price_per_sqft against total_sqft for the 2 and 3 BHK rows of one location.

    Args:
        df: DataFrame with ``location``, ``bhk``, ``total_sqft`` and
            ``price_per_sqft`` columns.
        location: Location name to plot; it is also used as the chart title.
    """
    bhk2 = df[(df.location == location) & (df.bhk == 2)]
    bhk3 = df[(df.location == location) & (df.bhk == 3)]
    # Size only this figure instead of overwriting the notebook-wide rcParams default.
    plt.figure(figsize=(15, 10))
    plt.scatter(bhk2.total_sqft, bhk2.price_per_sqft, color='blue', label='2 BHK', s=50)
    plt.scatter(bhk3.total_sqft, bhk3.price_per_sqft, marker='+', color='green', label='3 BHK', s=50)
    plt.xlabel("Total square feet area")
    # The y values are price_per_sqft, not the raw price column.
    plt.ylabel("Price per square foot")
    plt.title(location)
    plt.legend()

plot_scatter_chart(df7, "Rajaji Nagar")

###### Remove properties where, in the same location, the price of a 3-bedroom apartment is lower than that of a 2-bedroom apartment. For example, we can remove those 2 BHK apartments whose price_per_sqft is less than the mean price_per_sqft of the 1 BHK apartments.
For eg, 
<br>{<br>
&emsp;'1': {<br>
&emsp;&emsp;'mean': 4000,<br>
&emsp;&emsp;'std': 2000,<br>
&emsp;&emsp;'count': 34<br>
&emsp;},<br>
&emsp;'2': {<br>
&emsp;&emsp;'mean': 4300,<br>
&emsp;&emsp;'std': 2300,<br>
&emsp;&emsp;'count': 22<br>
&emsp;},<br>
}<br>

In [None]:
def remove_bhk_outliers(df):
    """Drop rows that are cheaper per sqft than the next-smaller bhk group of their location.

    Within every location, a row with ``bhk`` bedrooms is removed when its
    ``price_per_sqft`` is below the mean ``price_per_sqft`` of the
    ``bhk - 1`` group of the same location, provided that smaller group has
    more than 5 rows.

    Args:
        df: DataFrame with ``location``, ``bhk`` and ``price_per_sqft`` columns.

    Returns:
        A new DataFrame without the excluded rows; the remaining rows keep
        their original index labels.
    """
    # Collected as a plain list so the labels keep their original type
    # (appending to an empty float array would coerce them to float64).
    exclude_labels = []
    for _, location_df in df.groupby("location"):
        # Materialize the groups once; they are needed for the stats and for the filter pass.
        bhk_groups = list(location_df.groupby("bhk"))
        bhk_stats = {
            bhk: {
                "mean": np.mean(bhk_df.price_per_sqft),
                "count": bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            if stats and stats["count"] > 5:
                exclude_labels.extend(bhk_df[bhk_df.price_per_sqft < stats["mean"]].index)
    return df.drop(index=exclude_labels)
df8 = remove_bhk_outliers(df7)
df8.shape

In [None]:
# Here, the outliers are removed
plot_scatter_chart(df8, "Rajaji Nagar")

In [None]:
# Distribution of price per square foot: how many listings fall into each price_per_sqft bin
# Histogram
plt.hist(df8.price_per_sqft, rwidth=0.8)
plt.xlabel("Price per square feet")
plt.ylabel("Count")

In [None]:
df8.bath.unique()

In [None]:
# Do we have any criterion for removing listings based on the number of bathrooms?
# For example, a 2 BHK apartment would be expected to have about 2 bathrooms; 6 bathrooms in a 2 BHK apartment would be odd.
# Start by listing the properties that have more than 10 bathrooms.
df8[df8.bath>10]

In [None]:
plt.hist(df8.bath, rwidth=0.8)
plt.xlabel("Number of bathrooms")
plt.ylabel("Count")

In [None]:
# Let's assume the following criterion: whenever a listing has more bathrooms
# than its number of bedrooms + 2, we consider it an outlier.
# Bathroom outlier = No. of bathrooms > No. of bedrooms + 2
# Keep everything that is NOT an outlier, i.e. bath <= bhk + 2 (the boundary case is allowed).
df9 = df8[df8.bath <= df8.bhk + 2]
df9.shape

In [None]:
# Remove the features that are no longer needed:
#   - price_per_sqft: it was only used for outlier detection, the model does not need it
#   - size: redundant now that the dataframe has the bhk column
df10 = df9.drop(columns=['size', 'price_per_sqft'])
df10.head()

The output of df10.head() shows that the location attribute holds text data. A machine learning model cannot work with text directly, so we have to convert it into numbers. For that, we will use one-hot encoding, which pandas provides through its get_dummies method (the resulting columns are also called dummy variables).

#### One-Hot Encoding

In [None]:
# Creates one column per location; in each row only the column of that row's
# location is set, and the columns of all other locations are unset.
dummies = pd.get_dummies(df10["location"])

In [None]:
# Append the dummy columns to the feature frame.
# To avoid the dummy variable trap, keep one dummy column fewer than there are categories;
# here the 'other' column is the one that is left out.
df11 = pd.concat([df10, dummies.drop(columns='other')], axis=1)
df11

In [None]:
# Now we can drop location column
df12 = df11.drop('location', axis="columns")
df12.head(3)

In [None]:
# Shape of df12
df12.shape

In [None]:
# Drop the dependent variable price, so x can contains 
# only independent variables and we can use price in model
X = df12.drop("price", axis="columns")
X.head()

In [None]:
# Dependent variable
y = df12.price
y.head()

### Model Creation

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression

# Create the linear regression model (a regressor, despite the "clf" suffix of its name)
# and train it on the training split; fit() hands back the fitted estimator.
lr_clf = LinearRegression().fit(X_train, y_train)

# Score of the trained model on the held-out test split (how good our model is)
lr_clf.score(X_test, y_test)

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Shuffle split: Randomize the samples, Each fold have equal distribution
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# Majority time we're getting more than 80% score using cross_val_score for our example.
cross_val_score(LinearRegression(), X, y, cv=cv)

In [None]:
# As data scientists, we have to check which model gives the best score.
# Different algorithms, and different hyperparameters of the same algorithm, can score differently on the same data.
# For that, we use GridSearchCV, which cross-validates a model over a grid of parameters and reports the best score and parameters.
from sklearn.model_selection import GridSearchCV

# Other than Linear Regression, we also want to try the Lasso and DecisionTree regressors
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X, y):
    """Grid-search several regressors and report the best cross-validated score of each.

    Every candidate (linear regression, lasso, decision tree) is tuned with
    GridSearchCV over its own small parameter grid, using the same 5-split
    ShuffleSplit (20% test size, fixed seed) for all of them.

    Args:
        X: Feature matrix.
        y: Target values.

    Returns:
        A DataFrame with one row per candidate and the columns ``model``,
        ``best_score`` and ``best_params``.
    """
    algos = {
        "linear_regression": {
            "model": LinearRegression(),
            # The former `normalize` option was removed from LinearRegression in
            # scikit-learn 1.2, so a grid over it fails; tune fit_intercept instead.
            "params": {
                "fit_intercept": [True, False]
            }
        },
        "lasso": {
            # random_state makes the selection="random" candidate reproducible.
            "model": Lasso(random_state=0),
            "params": {
                "alpha": [1, 2],
                "selection": ["random", "cyclic"]
            }
        },
        "decision_tree": {
            # random_state makes the splitter="random" candidate reproducible.
            "model": DecisionTreeRegressor(random_state=0),
            "params": {
                # "mse" was renamed to "squared_error" (the old name was removed in scikit-learn 1.2).
                "criterion": ["squared_error", "friedman_mse"],
                "splitter": ["best", "random"]
            }
        }
    }

    # One shared splitter so that every candidate is evaluated on the same splits.
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

    scores = []
    for algo_name, config in algos.items():
        gs = GridSearchCV(config["model"], config["params"], cv=cv, return_train_score=False)
        gs.fit(X, y)
        scores.append({
            "model": algo_name,
            "best_score": gs.best_score_,
            "best_params": gs.best_params_
        })

    return pd.DataFrame(scores, columns=["model", "best_score", "best_params"])

# As a result, the function returns that linear regression model returns the best score
find_best_model_using_gridsearchcv(X, y)

In [None]:
# We will go with the linear regression only to predict the price
def predict_price(location, sqft, bath, bhk):
    """Predict the price of a home with the trained ``lr_clf`` model.

    A single feature row is built in the column layout of the global ``X``:
    positions 0, 1 and 2 receive ``sqft``, ``bath`` and ``bhk``, and the column
    named like ``location`` is set to 1.
    NOTE(review): this assumes the first three columns of X are the area,
    bathroom and bedroom features, in that order - confirm against X.columns.

    Args:
        location: Location name. When X has no column with this name, all
            location columns stay 0 instead of raising an IndexError.
        sqft: Value written to the first feature column.
        bath: Value written to the second feature column.
        bhk: Value written to the third feature column.

    Returns:
        The first (only) element of ``lr_clf.predict`` for that row.
    """
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk

    # np.where yields an empty array for an unknown location, so test its size
    # rather than indexing into it blindly.
    matches = np.where(X.columns == location)[0]
    if matches.size > 0:
        x[matches[0]] = 1

    # Pass a frame that carries X's column names, matching the named columns of
    # the training data.
    return lr_clf.predict(pd.DataFrame([x], columns=X.columns))[0]

In [None]:
# Let's test our function for 1st Phase JP Nagar with square foot 1000 contains 2 bathroom with 2 bhk
predict_price("1st Phase JP Nagar", 1000, 2, 2)

In [None]:
# Let's predict for high price locations
predict_price("Indira Nagar", 1000, 2, 2)

### Export Model to Pickle File

In [None]:
import pickle
# Serialize the trained lr_clf model into a file that is opened in binary write mode.
with open("banglore_home_prices_model.pickle", "wb") as f:
    # Pass your model and file object in the pickle dump function
    pickle.dump(lr_clf, f)

In [None]:
# Besides the model itself, the column information is needed as well:
# store the lower-cased names of the feature columns of X.
import json
columns = {"data_columns": [name.lower() for name in X.columns]}
with open("columns.json", "w") as f:
    json.dump(columns, f)