In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's theme to all subsequent figures; color_codes=True additionally remaps
# matplotlib's single-letter color shorthands ("b", "g", "r", ...) to the seaborn palette.
sns.set_theme(color_codes=True)

In [None]:
# Load the house-rent CSV from the attached Kaggle input into `df`, the frame every
# later cell works on, and preview its first rows.
df = pd.read_csv('/kaggle/input/house-rent-prediction-dataset/House_Rent_Dataset.csv')
df.head()

# Data Preprocessing

In [None]:
# The "Posted On" column is not wanted for this analysis: rebind `df` to a copy
# without it, then preview the result.
df = df.drop(columns=["Posted On"])
df.head()

In [None]:
# Count the distinct values in every object-dtype column (one number per column)
# to see how many levels each of them has.
df.select_dtypes(include='object').nunique()

In [None]:
# 'Floor' and 'Area Locality' are discarded because of their large number of unique values.
high_cardinality_cols = ['Floor', 'Area Locality']
df = df.drop(columns=high_cardinality_cols)
df.head()

# Exploratory Data Analysis (EDA)

In [None]:
# Categorical features to compare; each one gets its own panel showing mean Rent per level.
category = ['Area Type', 'City', 'Furnishing Status', 'Tenant Preferred', 'Point of Contact']

# 2x3 grid of panels, flattened so that panels can be paired with categories by position.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
axs = axs.ravel()

# Draw one bar chart (mean Rent per level) for each category.
for ax, var in zip(axs, category):
    sns.barplot(x=var, y='Rent', data=df, ax=ax, estimator=np.mean)
    # tick_params rotates the existing tick labels without re-setting their text.
    ax.tick_params(axis='x', labelrotation=90)

# Remove every panel that received no chart. The unused panels are derived from
# len(category) instead of a hard-coded index, and they are removed *before*
# tight_layout so the layout is computed for the panels that are actually shown.
for unused_ax in axs[len(category):]:
    fig.delaxes(unused_ax)

# Fit the remaining panels (including the rotated labels) into the figure.
fig.tight_layout()

plt.show()

In [None]:
# Maximum number of individual slices per pie; any further levels are merged into "Other".
max_categories = 5
category = ['Area Type', 'City', 'Furnishing Status', 'Tenant Preferred', 'Point of Contact']

# 2x3 grid of panels, one pie chart per category.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(15, 15))
panels = axs.ravel()

# zip() stops at the shorter sequence, so a category is only drawn while a free panel
# exists. This replaces the previous guard `if 1 < len(axs.flat)`, which compared the
# constant 1 (a typo for the loop index) and was therefore always true.
for ax, var in zip(panels, category):
    # Number of rows per level; value_counts() returns them most frequent first.
    cat_counts = df[var].value_counts()

    # Keep the `max_categories` most frequent levels and lump the rest into "Other".
    # .iloc makes the positional slicing explicit regardless of the index labels.
    if len(cat_counts) > max_categories:
        top_cat_counts = cat_counts.iloc[:max_categories]
        others_cat_counts = pd.Series(cat_counts.iloc[max_categories:].sum(), index=['Other'])
        cat_counts = pd.concat([top_cat_counts, others_cat_counts])

    # Pie chart with percentage labels, starting at 90 degrees.
    ax.pie(cat_counts, labels=cat_counts.index, autopct='%1.1f%%', startangle=90)
    ax.set_title(f'{var} Distribution')

# Remove every panel that received no chart (derived from len(category) rather than
# the hard-coded position [1, 2]).
for unused_ax in panels[len(category):]:
    fig.delaxes(unused_ax)

# Fit the remaining panels into the figure.
fig.tight_layout()

plt.show()

In [None]:
# Numeric features whose distribution is shown, one box plot per panel.
num_vars = ['Size', 'BHK', 'Bathroom']

# One row of three panels, flattened to a 1-D sequence of axes.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(20, 10))
axs = axs.ravel()

# Pair every feature with its own panel.
for ax, var in zip(axs, num_vars):
    sns.boxplot(data=df, x=var, ax=ax)

fig.tight_layout()
plt.show()

In [None]:
# Numeric features whose distribution is shown, one violin plot per panel.
num_vars = ['Size', 'BHK', 'Bathroom']

# One row of three panels, flattened to a 1-D sequence of axes.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(20, 10))
axs = axs.ravel()

# Pair every feature with its own panel.
for ax, var in zip(axs, num_vars):
    sns.violinplot(data=df, x=var, ax=ax)

fig.tight_layout()
plt.show()

# Data Preprocessing: Missing-Value Check

In [None]:
# Percentage of missing values per column (null count * 100 / number of rows).
check_miss = df.isna().sum().mul(100).div(len(df))

# Show only the columns that actually have missing values, worst first.
check_miss.loc[check_miss.gt(0)].sort_values(ascending=False)

In [None]:
# Current dimensions of `df` as (number of rows, number of columns).
df.shape

# Label Encoding for Each Object-Dtype Column

In [None]:
# Print the distinct values of every object-dtype column, one line per column.
for col, values in df.select_dtypes(include=['object']).items():
    print(f"{col}: {values.unique()}")

In [None]:
from sklearn import preprocessing

# Replace every object-dtype column of `df` with integer codes, using a fresh
# LabelEncoder per column.
object_columns = df.select_dtypes(include=['object']).columns
for col in object_columns:
    label_encoder = preprocessing.LabelEncoder()

    # Learn the column's labels and map the column to its codes in a single step.
    df[col] = label_encoder.fit_transform(df[col])

    # Show the codes that now appear in the column.
    print(f"{col}: {df[col].unique()}")

In [None]:
# Pairwise correlation between all columns of `df`, drawn as an annotated heatmap.
corr_matrix = df.corr()
plt.figure(figsize=(20, 16))
sns.heatmap(data=corr_matrix, annot=True, fmt=".2g")

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Target (y) is the 'Rent' column; every other column is used as a feature (X).
y = df['Rent']
X = df.drop(columns=['Rent'])

# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Remove Outliers from Training Data Using Z-score

In [None]:
from scipy import stats

# Features that are screened for outliers.
select_cols = ['Size', 'BHK', 'Bathroom']

# A value counts as an outlier when its absolute z-score exceeds this threshold.
threshold = 3

# Absolute z-score of every selected feature, computed on the training rows only.
z_scores = np.abs(stats.zscore(X_train[select_cols]))

# Row positions holding at least one outlying value (a position appears once per
# offending column, so it may repeat).
outlier_indices, _ = np.where(z_scores > threshold)

# Drop the corresponding rows from the training features and the training target.
# NOTE: X_train / y_train are overwritten, so re-running this cell recomputes the
# z-scores on the already trimmed data.
X_train = X_train.drop(index=X_train.index[outlier_indices])
y_train = y_train.drop(index=y_train.index[outlier_indices])

# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_california_housing  # NOTE(review): not used in any visible cell

# Base estimator; the hyperparameters under test are supplied by the grid below.
dtree = DecisionTreeRegressor()

# Hyperparameter values to try (every combination is evaluated).
param_grid = {
    'max_depth': [2, 4, 6, 8],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3, 4],
    # None means "consider all features at each split". It replaces the former 'auto',
    # which had the same meaning for regressors but was removed in scikit-learn 1.3,
    # so every candidate using it fails to fit on current versions.
    'max_features': [None, 'sqrt', 'log2'],
    'random_state': [0, 42]
}

# 5-fold cross-validated search, scored by negative mean squared error.
grid_search = GridSearchCV(dtree, param_grid, cv=5, scoring='neg_mean_squared_error')

# Run the search on the training data.
grid_search.fit(X_train, y_train)

# Print the best hyperparameter combination found.
print(grid_search.best_params_)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fixed hyperparameters for the final decision tree.
# NOTE(review): presumably copied from the grid-search output above — confirm.
dtree_params = {
    'random_state': 0,
    'max_depth': 8,
    'max_features': 'log2',
    'min_samples_leaf': 4,
    'min_samples_split': 2,
}
dtree = DecisionTreeRegressor(**dtree_params)

# Train on the training split.
dtree.fit(X_train, y_train)

In [None]:
from sklearn import metrics
from sklearn.metrics import mean_absolute_percentage_error
import math

# Predict on the held-out test split with the fitted decision tree.
y_pred = dtree.predict(X_test)

# Error metrics of the predictions against the true test targets.
mae = metrics.mean_absolute_error(y_test, y_pred)              # Mean Absolute Error
mape = 100 * mean_absolute_percentage_error(y_test, y_pred)    # Mean Absolute Percentage Error, in percent
mse = metrics.mean_squared_error(y_test, y_pred)               # Mean Squared Error
rmse = math.sqrt(mse)                                          # Root Mean Squared Error
r2 = metrics.r2_score(y_test, y_pred)                          # R^2 score

# Report every metric on its own line.
print(
    f"MAE: {mae:.4f}",
    f"MAPE: {mape:.2f}%",
    f"MSE: {mse:.4f}",
    f"R^2: {r2:.4f}",
    f"RMSE: {rmse:.4f}",
    sep="\n",
)

In [None]:
# Pair every training feature with the importance the fitted decision tree assigned to it.
lap_df = pd.DataFrame({
    "Feature Name": X_train.columns,
    "Importance": dtree.feature_importances_,
})

# Most important features first.
f1 = lap_df.sort_values(by='Importance', ascending=False)
print(f1)

# Horizontal bar chart of (at most) the ten most important features.
f12 = f1.iloc[:10]
plt.figure(figsize=(10, 8))
importance_ax = sns.barplot(data=f12, x="Importance", y="Feature Name")
importance_ax.set_xlabel("Importance", fontsize=16)
importance_ax.set_ylabel("Feature Name", fontsize=16)
importance_ax.set_title("Feature Importance in Decision Tree")
plt.show()

In [None]:
!pip install shap

from shap import TreeExplainer
import shap

explainer = TreeExplainer(dtree)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)

In [None]:
# Build an explainer for the decision tree through the generic shap.Explainer entry
# point, passing X_test as its second argument.
# NOTE(review): presumably X_test serves as the background data here — confirm.
# This rebinds `explainer` and `shap_values` from the previous cell.
explainer = shap.Explainer(dtree, X_test)
shap_values = explainer(X_test)
# Waterfall plot of the explanation for the first test row only (index 0).
shap.plots.waterfall(shap_values[0])

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Base estimator; the hyperparameters under test are supplied by the grid below.
rf = RandomForestRegressor()

# Hyperparameter values to try (every combination is evaluated).
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # None means "consider all features at each split". It replaces the former 'auto',
    # which had the same meaning for regressors but was removed in scikit-learn 1.3,
    # so every candidate using it fails to fit on current versions.
    'max_features': [None, 'sqrt'],
    'random_state': [0, 42]
}

# 5-fold cross-validated search, scored by R^2.
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='r2')

# Run the search on the training data.
grid_search.fit(X_train, y_train)

# Print the best hyperparameter combination found.
print(grid_search.best_params_)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fixed hyperparameters for the final random forest.
# NOTE(review): presumably copied from the grid-search output above — confirm.
rf_params = {
    'random_state': 42,
    'max_depth': 9,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
}
rf = RandomForestRegressor(**rf_params)

# Train on the training split.
rf.fit(X_train, y_train)

In [None]:
from sklearn import metrics
from sklearn.metrics import mean_absolute_percentage_error
import math

# Predict on the held-out test split with the fitted random forest.
y_pred = rf.predict(X_test)

# Error metrics of the predictions against the true test targets.
mae = metrics.mean_absolute_error(y_test, y_pred)              # Mean Absolute Error
mape = 100 * mean_absolute_percentage_error(y_test, y_pred)    # Mean Absolute Percentage Error, in percent
mse = metrics.mean_squared_error(y_test, y_pred)               # Mean Squared Error
rmse = math.sqrt(mse)                                          # Root Mean Squared Error
r2 = metrics.r2_score(y_test, y_pred)                          # R^2 score

# Report every metric on its own line.
print(
    f"MAE: {mae:.4f}",
    f"MAPE: {mape:.2f}%",
    f"MSE: {mse:.4f}",
    f"R^2: {r2:.4f}",
    f"RMSE: {rmse:.4f}",
    sep="\n",
)

In [None]:
# Pair every training feature with the importance the fitted random forest assigned to it.
imp_df = pd.DataFrame({
  "Feature Name": X_train.columns,
  "Importance": rf.feature_importances_
})

# Sort features by importance in descending order.
# This must sort `imp_df` (built just above from the random forest). The previous code
# sorted `lap_df`, the decision-tree table from an earlier cell, so the chart titled
# "Random Forest" actually displayed the decision tree's importances.
f1 = imp_df.sort_values(by='Importance', ascending=False)
print(f1)

# Horizontal bar chart of (at most) the ten most important features.
f12 = f1.head(10)
plt.figure(figsize=(10,8))

sns.barplot(x="Importance", y="Feature Name", data=f12)
plt.xlabel("Importance", fontsize =16)
plt.ylabel("Feature Name", fontsize =16)
plt.title("Feature Importance in Random Forest Regressor")
plt.show()

In [None]:
from shap import TreeExplainer

# Compute SHAP values of the fitted random forest for every test row and draw the
# summary plot of those values against the test features.
explainer = TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)
# The `shap` module itself is not imported in this cell; it has to be in the kernel
# already from an earlier `import shap`.
shap.summary_plot(shap_values, X_test)

In [None]:
# Build an explainer for the random forest through the generic shap.Explainer entry
# point, passing X_test as its second argument.
# NOTE(review): presumably X_test serves as the background data here — confirm.
# This rebinds `explainer` and `shap_values` from the previous cell.
explainer = shap.Explainer(rf, X_test)
shap_values = explainer(X_test)
# Waterfall plot of the explanation for the first test row only (index 0).
shap.plots.waterfall(shap_values[0])