In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

In [None]:
df = pd.read_csv("data.csv")

assert df.isna().sum().sum() == 0, "Dataset contains null values"

# Dropping 'Floor' as there are way too many unique values, and
# 'Area Locality' as there are 2235 unique values in 4747 records
df = df.drop(columns=["Floor", "Area Locality"])

# Converting INR to EUR
exchange_rate = 0.011
df["Rent"] = df["Rent"] * exchange_rate

# All data comes from year 2022 within a span of 4 months, so only the month
# and the day of week are derived from the posting date before it is dropped
df["Posted On"] = pd.to_datetime(df["Posted On"])
df["Month"] = df["Posted On"].dt.month
df["Day of Week"] = df["Posted On"].dt.day_of_week
df = df.drop(columns=["Posted On"])

# Converting to categories as these features have 3-6 unique values
for column in ["Area Type", "City", "Tenant Preferred", "Point of Contact"]:
    df[column] = df[column].astype("category")

# There is an inherent order for the furnishing levels, so they become an
# ordered categorical. pd.Categorical maps any value that is not listed in
# `categories` to NaN without complaint, hence the explicit check first.
furnishing_levels = ["Unfurnished", "Semi-Furnished", "Furnished"]
unexpected_levels = set(df["Furnishing Status"]) - set(furnishing_levels)
assert not unexpected_levels, (
    f"Unexpected 'Furnishing Status' values: {unexpected_levels}"
)
df["Furnishing Status"] = pd.Categorical(
    df["Furnishing Status"],
    categories=furnishing_levels,
    ordered=True,
)


sns.scatterplot(data=df, x="Size", y="Rent")
df.head()

In [None]:
# Rows whose absolute z-score exceeds this cut-off are treated as outliers
outlier_threshold = 3.5

# z-score of 'Size': distance from the column mean in units of its standard deviation
z_scores_size = df["Size"].sub(df["Size"].mean()).div(df["Size"].std())

# Boolean mask, True for every row flagged as an outlier
outliers_indices_size = z_scores_size.abs().gt(outlier_threshold)

# Keep only the rows that are not flagged (gets rid of ~ 60 outliers).
# NOTE: mean and std are taken from the current `df`, so re-running this cell
# filters the already-filtered frame again.
df = df.loc[~outliers_indices_size]

In [None]:
def size_vs_rent_per_feature(feature, data=None):
    """Plot rent against size and against one feature, side by side.

    Left panel: scatter plot of "Size" vs. "Rent", colored by ``feature``.
    Right panel: bar plot of "Rent" per value of ``feature``, without error bars.

    Parameters
    ----------
    feature : str
        Column used as hue in the scatter plot and as x-axis in the bar plot.
    data : pandas.DataFrame, optional
        Frame providing the "Size", "Rent" and ``feature`` columns. When
        omitted, the notebook-level ``df`` is used (looked up at call time).
    """
    frame = df if data is None else data

    # Explicit axes instead of the pyplot state machine, so each plot is
    # bound to its own panel.
    fig, (scatter_ax, bar_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # Scatter plot: Size vs. Rent colored by feature
    sns.scatterplot(
        x="Size", y="Rent", data=frame, hue=feature, alpha=0.5, ax=scatter_ax
    )
    scatter_ax.set_title(f"Scatter Plot: Size vs. Rent by {feature}")

    # Bar plot: Rent per feature
    sns.barplot(x=feature, y="Rent", data=frame, errorbar=None, ax=bar_ax)
    bar_ax.set_title(f"Bar Plot: Rent per {feature}")

    fig.tight_layout()  # Ensures proper spacing between subplots
    plt.show()

In [None]:
# size_vs_rent_per_feature("BHK")
# size_vs_rent_per_feature("Area Type")
# size_vs_rent_per_feature("City")
# size_vs_rent_per_feature("Furnishing Status")
# size_vs_rent_per_feature("Tenant Preferred")
# size_vs_rent_per_feature("Bathroom")
# size_vs_rent_per_feature("Point of Contact")
# size_vs_rent_per_feature("Month")
# size_vs_rent_per_feature("Day of Week")

In [None]:
# Target: kept as a one-column frame holding 'Rent'
y = df[["Rent"]]

# Features: everything except 'Rent'.
# - Nominal columns are one-hot encoded; the first level of each is dropped
#   because of dummy variable multicollinearity.
# - 'Furnishing Status' stays a single column and is replaced by its integer
#   category codes.
X = pd.get_dummies(
    df.drop(columns=["Rent"]),
    columns=["Area Type", "City", "Point of Contact", "Tenant Preferred"],
    drop_first=True,
).assign(
    **{"Furnishing Status": lambda frame: frame["Furnishing Status"].cat.codes}
)


def coef_per_feature(model, X):
    """Print the fitted coefficient of each feature, one ``name: value`` line each.

    Parameters
    ----------
    model : object with a ``coef_`` attribute
        ``coef_`` may be 1-D (one value per feature) or 2-D (one row per
        target); for a 2-D ``coef_`` only the first row is reported.
    X : object with a ``columns`` attribute
        Feature names are read from ``X.columns`` and paired positionally
        with the coefficients.

    When there is exactly one coefficient, a short notice is printed instead
    of the coefficient line.
    """
    # atleast_2d makes a 1-D coef_ behave like the 2-D case; indexing a 1-D
    # array with [0] would yield a scalar and len() on it would raise.
    coefficients = np.atleast_2d(model.coef_)[0]

    if len(coefficients) == 1:
        print("Model has only one feature")
        return

    for feature, coef in zip(X.columns, coefficients):
        print(f"{feature}: {coef:.4f}")

In [None]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=123, test_size=0.25
)

# fit() returns the estimator itself, so no reassignment is needed
linregress_model = LinearRegression()
linregress_model.fit(X_train, y_train)
y_pred = linregress_model.predict(X_test)

# Root mean squared error on the held-out rows; mean_squared_error takes the
# true values first, then the predictions
rmse = mean_squared_error(y_test, y_pred) ** 0.5
rmae = rmse  # earlier name for the same value, kept for code that still refers to it
print(f"Root mean squared error {rmse:.2f} EUR")

# Mean rent over all rows (train and test), as a scale reference for the error
average_rent = y["Rent"].mean()
print(f"Average Rent: {average_rent:.2f} EUR")

coef_per_feature(linregress_model, X)

In [None]:
# Same split parameters as the linear-regression cell (random_state=123,
# test_size=0.25), repeated here so this cell does not rely on that cell's
# variables
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=123, test_size=0.25
)

svr_model = SVR()
# y_train is a one-column frame; SVR.fit documents a 1-D target, so the 'Rent'
# column is passed as a Series to avoid scikit-learn's column-vector warning
svr_model.fit(X_train, y_train["Rent"])