In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso

In [None]:
df = pd.read_csv("data.csv")

assert df.isna().sum().sum() == 0, "Dataset contains null values"

# Columns removed before modelling:
#   'Floor'         - way too many unique values
#   'Area Locality' - 2235 unique values in 4747 records
#   'Posted On', 'Tenant Preferred', 'Bathroom' - dropped as well
df = df.drop(
    columns=["Floor", "Area Locality", "Posted On", "Tenant Preferred", "Bathroom"]
)

# Converting INR to EUR
exchange_rate = 0.011
df["Rent"] = df["Rent"].mul(exchange_rate)

# Nominal features with only 3-6 unique values are stored as categories
df = df.astype(
    {
        "Area Type": "category",
        "City": "category",
        "Point of Contact": "category",
    }
)

# There is an inherent order between the furnishing levels, so the category is
# ordered explicitly (lowest to highest)
furnishing_dtype = pd.CategoricalDtype(
    categories=["Unfurnished", "Semi-Furnished", "Furnished"],
    ordered=True,
)
df["Furnishing Status"] = df["Furnishing Status"].astype(furnishing_dtype)

# Quick look at the raw relationship before any outlier handling
sns.scatterplot(data=df, x="Size", y="Rent")
print(f'Mean rent with outliers is {df["Rent"].mean():.2f} EUR')
df.head()

In [None]:
# Multiplier applied to the IQR when building the outlier fences
outlier_threshold = 1.5


def iqr_outlier_mask(values, k):
    """Flag values lying outside the IQR fences of a Series.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to inspect.
    k : float
        Multiplier of the inter-quartile range used for both fences.

    Returns
    -------
    pandas.Series
        Boolean mask, True where a value is below ``Q1 - k * IQR`` or above
        ``Q3 + k * IQR``.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return (values < q1 - k * iqr) | (values > q3 + k * iqr)


# Both masks are computed on the same, unfiltered frame so that the quartiles
# of one column are not affected by rows removed because of the other column
outliers_indices_rent = iqr_outlier_mask(df["Rent"], outlier_threshold)
outliers_indices_size = iqr_outlier_mask(df["Size"], outlier_threshold)

# Remove outliers from both 'Rent' and 'Size'
df = df[~(outliers_indices_rent | outliers_indices_size)]

# Unit added so the message matches the "with outliers" message printed earlier
print(f'Mean rent without outliers is {df["Rent"].mean():.2f} EUR')
sns.scatterplot(data=df, x="Size", y="Rent")

In [None]:
def size_vs_rent_per_feature(feature, data=None):
    """Show how Rent relates to Size and to one categorical feature.

    Draws two panels side by side: a Size vs. Rent scatter plot coloured by
    ``feature``, and a bar plot of Rent per level of ``feature``.

    Parameters
    ----------
    feature : str
        Column name used as hue in the scatter plot and as x-axis of the bar
        plot.
    data : pandas.DataFrame, optional
        Frame holding the "Size", "Rent" and ``feature`` columns. When omitted
        the notebook-level ``df`` is used, so one-argument calls keep working.
    """
    if data is None:
        # Fall back to the notebook-level frame (looked up at call time)
        data = df

    fig, (ax_scatter, ax_bar) = plt.subplots(1, 2, figsize=(14, 6))

    # Scatter plot: Size vs. Rent colored by feature
    sns.scatterplot(
        x="Size", y="Rent", data=data, hue=feature, alpha=0.5, ax=ax_scatter
    )
    ax_scatter.set_title(f"Scatter Plot: Size vs. Rent by {feature}")

    # Bar plot: Rent per feature
    sns.barplot(x=feature, y="Rent", data=data, errorbar=None, ax=ax_bar)
    ax_bar.set_title(f"Bar Plot: Rent per {feature}")

    fig.tight_layout()  # Ensures proper spacing between subplots
    plt.show()

In [None]:
# Exploratory plots are switched off; flip the condition to render them
if False:
    for feature_name in ("Area Type", "City", "Furnishing Status", "Point of Contact"):
        size_vs_rent_per_feature(feature_name)

In [None]:
def RMSE(y_pred, y_test):
    """Print the root mean squared error between predictions and targets.

    Both arguments are passed positionally, in this order, to
    ``mean_squared_error``; the square root of that value is printed with two
    decimals and labelled as EUR. Nothing is returned.
    """
    mse = mean_squared_error(y_pred, y_test)
    root_mse = mse ** 0.5
    print(f"Root mean squared error {root_mse:.2f} EUR")

In [None]:
# Target is kept as a one-column DataFrame; every other column is a feature
y = df[["Rent"]]
X = df.drop(columns="Rent")

# One hot encoding for categorical features, dropping first because of dummy variable multicollinearity
X = pd.get_dummies(
    X,
    columns=["Area Type", "City", "Point of Contact"],
    drop_first=True,
)

# The ordered furnishing category is replaced by its integer codes
X = X.assign(**{"Furnishing Status": X["Furnishing Status"].cat.codes})

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=123,
)

In [None]:
# Baseline: ordinary least squares on all encoded features
linregress_model = LinearRegression().fit(X_train, y_train)

y_pred = linregress_model.predict(X_test)
RMSE(y_pred, y_test)

In [None]:
# Seeded so that the search results and the final forest are reproducible when
# the notebook is re-run from a fresh kernel
rf = RandomForestRegressor(random_state=1)

# Candidate hyper-parameters sampled by the randomized search
param_grid_rf = {
    "max_samples": [0.75],
    "max_depth": range(3, 10),
    "max_features": [3, 5, 7, 9],
    "n_estimators": range(100, 2001, 100),
    "min_samples_leaf": [2, 4, 6, 8, 10],
    "min_samples_split": [2, 4, 5],
}

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid_rf,
    cv=3,
    n_jobs=-1,
    n_iter=20,
    verbose=False,
    random_state=1,
)

# y_train is a one-column DataFrame; the forest expects a 1-D target
random_search.fit(X_train, y_train.to_numpy().ravel())

# With the default refit=True the search has already retrained the best
# configuration on the whole training set, so no extra fit call is needed
rf_model = random_search.best_estimator_

y_pred = rf_model.predict(X_test)

RMSE(y_pred, y_test)

In [None]:
# Gradient-boosted trees tuned with the same randomized-search procedure
xgb = XGBRegressor()

param_grid_xgb = dict(
    learning_rate=[0.02, 0.1, 0.2],
    n_estimators=range(100, 2001, 100),
    max_depth=range(3, 10),
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

random_search = RandomizedSearchCV(
    xgb,
    param_grid_xgb,
    n_iter=25,
    cv=3,
    n_jobs=-1,
    verbose=False,
    random_state=1,
)

# y_train is a one-column DataFrame; pass it as a 1-D array
random_search.fit(X_train, y_train.to_numpy().ravel())

# Best configuration found by the search
xgb_model = random_search.best_estimator_

y_pred = xgb_model.predict(X_test)
RMSE(y_pred, y_test)

In [None]:
# Combine the three base models; a Lasso learns how to weight their predictions
stacking_model = StackingRegressor(
    estimators=[
        ("linear", linregress_model),
        ("xgb", xgb_model),
        ("rf", rf_model),
    ],
    final_estimator=Lasso(),
)

# Train the ensemble on the training split (target passed as a 1-D array)
stacking_model.fit(X_train, y_train.to_numpy().ravel())

# Evaluate the ensemble on the held-out split
y_pred = stacking_model.predict(X_test)
RMSE(y_pred, y_test)

In [None]:
# Mean of the target after outlier removal, for comparison with the RMSE values
# above; unit spelled "EUR" to match the other messages in this notebook
f"Mean rent is {y['Rent'].mean():.2f} EUR"