# Libraries that will be used

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
 

pd.set_option('display.max_columns', None)


# Overview of the data

In [2]:
data = pd.read_csv('/.csv')
data

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/bengaluru-house-price-data/Bengaluru_House_Data.csv'

In [None]:
data.isna().sum()

# Cleaning the data

In [None]:
data.shape

In [None]:
# let's see how many area we have and the freq of each one of them
data.groupby('area_type')['area_type'].agg('count')

In [None]:
# dropping the columns that will not be used 
data = data.drop(['area_type','society','availability', 'balcony'],axis=1)
data

In [None]:
# how many nan values are there
data.isna().sum()

In [None]:
# dropping rows that contain NaN values; reassigning (instead of inplace=True)
# keeps the cell idempotent and avoids mutating a frame displayed earlier
data = data.dropna()


In [None]:
data.isna().sum()

In [None]:
data.shape

In [None]:
# how many unique sizes are there
data['size'].unique()

In [None]:
# creating a new integer column 'bhk' from the leading number of the 'size' text
# (the part before the first space is parsed with int())

data['bhk'] = data['size'].apply(lambda x : int(x.split(' ')[0]) )


In [None]:
data

In [None]:

data[data['bhk'] > 20]

In [None]:
data.total_sqft.unique()

In [None]:
def is_it_float(x):
    """Return True if ``x`` can be converted with ``float()``, else False.

    Only conversion failures are treated as "not a float"; unrelated
    exceptions such as KeyboardInterrupt are allowed to propagate instead of
    being silently swallowed by a bare ``except``.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
data[~data.total_sqft.apply(is_it_float)].head()

In [None]:

def convert_sqft_to_num(x):
    """Convert a total_sqft entry to a float.

    Accepted forms:
      * a plain number (``'1200'``, ``1200``, ``'1e-5'``) -> that number as float
      * a range ``'low - high'`` -> the average of the two bounds

    Anything else (e.g. ``'34.46Sq. Meter'``, ``'abc - def'``, ``None``)
    returns ``None`` rather than raising.
    """
    # Try the plain-number form first so values containing '-' that are
    # already valid floats (negative numbers, exponents) are not mis-split.
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        pass

    if not isinstance(x, str):
        return None

    tokens = x.split('-')
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            # looked like a range but the bounds are not numeric
            return None
    return None


In [None]:
# testing the function
convert_sqft_to_num('2100 - 2850')

In [None]:
data['total_sqft'] = data['total_sqft'].apply(convert_sqft_to_num)


In [None]:
data.head()

# Feature Engineering

In [None]:
data['price_per_sqft'] = data['price']*100000/data['total_sqft']
data

In [None]:
len(data.location.unique())
## too many locations

In [None]:
# how many data available per location
data.location = data.location.apply(lambda x : x.strip())
stats_of_location = data.groupby('location')['location'].agg('count')
stats_of_location.sort_values(ascending=False)

In [None]:
# locations with 10 or fewer rows are relabeled 'other' further down;
# here we count how many such locations there are
len(stats_of_location[stats_of_location<=10])

In [None]:
# stats_location_less_than_10
stats_of_location_less_than_10 = stats_of_location[stats_of_location<=10]
stats_of_location_less_than_10.sort_values(ascending=False)

In [None]:
# `x in <Series>` tests membership against the Series index (here the location
# names produced by the groupby), not against its count values
data.location = data.location.apply(lambda x: 'other' if x in stats_of_location_less_than_10 else x)

In [None]:
len(data.location.unique())

In [None]:
data

# Outlier detection and removal 

In [None]:
data[data.total_sqft/data.bhk < 300]


In [None]:
data = data[~(data.total_sqft/data.bhk < 300)]
data.shape

In [None]:
data.describe()

In [None]:
# removing outliers above 1 std for each location
def remove_outliers(data):
    """Keep, per location, only rows whose price_per_sqft lies within one std of the mean.

    For each ``location`` group the mean and the population standard
    deviation (ddof=0, matching ``np.std``) of ``price_per_sqft`` are
    computed, and a row is kept when
    ``mean - std < price_per_sqft <= mean + std``.
    Rows with a NaN ``price_per_sqft`` fail both comparisons and are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The kept rows, ordered group by group, with a fresh 0..n-1 index.
        An empty input yields an empty frame with the same columns.
    """
    kept_groups = []
    for _, subdf in data.groupby('location'):
        ppsf = subdf['price_per_sqft']
        mean = ppsf.mean()
        std = ppsf.std(ddof=0)
        within_one_std = (ppsf > (mean - std)) & (ppsf <= (mean + std))
        kept_groups.append(subdf[within_one_std])

    if not kept_groups:
        # pd.concat([]) raises, so return an empty frame with the input's columns
        return data.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)

data = remove_outliers(data)
print(data.shape)


In [None]:
data = data[data.bath < data.bhk+2]
data.shape

In [None]:
 data = data.drop(['size','price_per_sqft'],axis='columns')

In [None]:
data

# Modeling time

In [None]:
dummies = pd.get_dummies(data.location)
dummies.head()

In [None]:
# concatenate the location dummies with the data, leaving out the 'other' dummy column
# (presumably so 'other' acts as the baseline category - confirm this is the intent)
data = pd.concat([data,dummies.drop('other',axis='columns')],axis='columns')
data


In [None]:
data = data.drop('location',axis='columns')


In [None]:
# Only the last expression of a cell is displayed, so print the shape first
# and leave head() as the final expression.
print(data.shape)
data.head()

In [None]:
X = data.drop('price',axis='columns')
X.head()

In [None]:
y = data.price
y.head()

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [None]:

lrg = LinearRegression()
lrg.fit(X_train,y_train)
y_pred = lrg.predict(X_test)

In [None]:
# RMSE for Linear Regression
np.sqrt(mean_squared_error(y_test,y_pred))

In [None]:
model_xgb = XGBRegressor(n_estimators=500)
model_xgb.fit(X_train,y_train)

In [None]:
y_pred = model_xgb.predict(X_test)
# RMSE
np.sqrt(mean_squared_error(y_test,y_pred))

In [None]:
# Hyperparameter search for XGBRegressor.
# GridSearchCV and XGBRegressor are already imported in the first cell,
# so they are not re-imported here.

# Define the XGBRegressor model
model = XGBRegressor()

# Define the hyperparameter grid to search over
param_grid = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300]
}

# Perform the grid search with 5-fold cross-validation.
# Candidates are ranked by (negated) RMSE so the search optimizes the same
# metric the models above are evaluated with, instead of the default R^2.
grid_search = GridSearchCV(
    model, param_grid, cv=5, scoring='neg_root_mean_squared_error',
    verbose=2, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and their cross-validated RMSE
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validated RMSE:", -grid_search.best_score_)


In [None]:
def predict_the_price(location,sqft,bath,bhk):
    """Predict the price of a single property with the fitted linear model.

    Parameters
    ----------
    location : str
        Location name. If it matches one of the location dummy columns of
        ``X`` that column is set to 1; otherwise (e.g. 'other' or an unseen
        location) all location dummies stay 0.
    sqft : float
        Value for the ``total_sqft`` feature.
    bath : float
        Value for the ``bath`` feature.
    bhk : float
        Value for the ``bhk`` feature.

    Returns
    -------
    The first (only) element of the model's prediction for this row.

    Notes
    -----
    Uses the notebook-level ``X`` (feature frame) and ``lrg`` (the fitted
    LinearRegression). The original code referenced ``lr_clf``, which is
    never defined in this notebook; ``lrg`` is assumed to be the intended
    model - confirm.
    """
    numeric_features = ('total_sqft', 'bath', 'bhk')

    # One all-zero row carrying the training column names, so the model
    # receives the same feature layout it was fitted on.
    row = pd.DataFrame(np.zeros((1, len(X.columns))), columns=X.columns)
    row.loc[0, 'total_sqft'] = sqft
    row.loc[0, 'bath'] = bath
    row.loc[0, 'bhk'] = bhk

    # Unknown locations are left as all-zero dummies instead of raising.
    if location in row.columns and location not in numeric_features:
        row.loc[0, location] = 1

    return lrg.predict(row)[0]