In [None]:
# Import the libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
# Default size for every figure drawn in this notebook: (20, 10), i.e. (width, height) in inches.
matplotlib.rcParams["figure.figsize"] = (20, 10)

### Load Dataset

In [None]:
# Read "Bengaluru_House_Data.csv" (path relative to the notebook's working
# directory) into a dataframe and preview its first rows.
df = pd.read_csv("Bengaluru_House_Data.csv")
df.head()

In [None]:
# Dimensions of the raw dataframe as (number of rows, number of columns)
df.shape

In [None]:
# Number of rows in each area_type category
df.groupby("area_type")["area_type"].count()

### Data Cleaning

In [None]:
# Discard the columns this analysis does not use as price predictors,
# then preview the reduced dataframe.
df2 = df.drop(columns=["area_type", "society", "balcony", "availability"])
df2.head()

In [None]:
# Count of missing (NA) values in each column of df2
df2.isna().sum()

In [None]:
# Drop every row that has a missing value in any of the remaining columns,
# then display the result.
df3 = df2.dropna()
df3

In [None]:
# List the distinct values found in the `size` column.
df3["size"].unique()

In [None]:
# Derive the bedroom count `bhk` from the leading integer of `size`
# (the text before the first space). `assign` returns a new dataframe, so the
# column is not written into the object returned by `dropna()`, which is the
# pattern that triggers pandas' SettingWithCopyWarning.
df3 = df3.assign(bhk=df3["size"].apply(lambda x: int(x.split(" ")[0])))

# Only the last expression of a cell is rendered automatically, so the outlier
# check (rows reporting more than 20 bedrooms) is displayed explicitly.
display(df3[df3.bhk > 20])

# Distinct raw values of `total_sqft`.
df3.total_sqft.unique()

In [None]:
def is_float(x):
    """Return True if ``float(x)`` succeeds, otherwise False.

    Only genuine conversion failures (``TypeError``, ``ValueError``,
    ``OverflowError``) are treated as "not a float". Any other exception,
    including ``KeyboardInterrupt`` and ``SystemExit``, propagates to the
    caller instead of being silently reported as False.

    Parameters
    ----------
    x : object
        Value to test, typically a string taken from a dataframe column.

    Returns
    -------
    bool
        True when ``x`` can be converted with ``float()``, False otherwise.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
# Show the rows whose `total_sqft` value is rejected by is_float().
df3[~df3["total_sqft"].apply(is_float)]

In [None]:
def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Accepted forms:

    * a plain number, e.g. ``"1200"`` -> ``1200.0``
    * a range ``"low - high"``, e.g. ``"2100 - 2850"`` -> ``2475.0``
      (the midpoint of the two bounds)

    Anything else (text with units, malformed ranges, non-string values
    that ``float()`` cannot convert) yields ``None`` rather than raising.

    Parameters
    ----------
    x : object
        Raw value, typically a string.

    Returns
    -------
    float or None
        The numeric value, or ``None`` when the entry cannot be interpreted.
    """
    # Try the plain-number form first so that values containing a "-" that are
    # still valid floats (negative numbers, exponents such as "1e-3") are not
    # mistaken for a range.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # Only strings can hold the "low - high" range form.
    if not isinstance(x, str):
        return None

    tokens = x.split("-")
    if len(tokens) != 2:
        return None

    try:
        return (float(tokens[0]) + float(tokens[1])) / 2
    except ValueError:
        # One of the bounds is empty or not numeric (e.g. "5-", "a - b").
        return None

In [None]:
# Build df4 as a copy of df3 whose `total_sqft` column has been passed
# through convert_sqft_to_num; df3 itself is left untouched.
df4 = df3.assign(total_sqft=df3["total_sqft"].apply(convert_sqft_to_num))

In [None]:
# Preview the first rows of df4.
df4.head()

### Feature Engineering & Dimensionality Reduction

In [None]:
# Feature engineering: derive a new feature from the columns already available.
# price_per_sqft = price * 100000 / total_sqft
# NOTE(review): the 100000 factor presumably converts `price` from lakhs to
# rupees - confirm against the dataset's units.
df5 = df4.assign(price_per_sqft=df4["price"] * 100000 / df4["total_sqft"])
df5.head()

In [None]:
# High-dimensionality problem: `location` has a very large number of distinct values.
# Locations that occur fewer than 10 times will be grouped under the label "other".
# Number of distinct locations before that grouping:
len(df5["location"].unique())

In [None]:
# Trim leading/trailing whitespace from every location label
df5["location"] = df5["location"].map(str.strip)

# Number of rows per location, most frequent location first
location_stats = (
    df5.groupby("location")["location"]
    .count()
    .sort_values(ascending=False)
)
location_stats

In [None]:
# How many locations have fewer than 10 rows
int((location_stats < 10).sum())

In [None]:
# Keep only the locations whose row count is below 10
location_stats_less_than_10 = location_stats.loc[location_stats.lt(10)]
location_stats_less_than_10

In [None]:
# Dimensionality reduction: every location that appears as a label in
# location_stats_less_than_10 is replaced by the single category "other";
# all remaining locations keep their own name.
df5["location"] = df5["location"].mask(
    df5["location"].isin(location_stats_less_than_10.index),
    "other",
)
# Number of distinct locations after the grouping
len(df5["location"].unique())