In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

In [None]:
# Load the raw listings.
df = pd.read_csv("data.csv")

# Everything below assumes a complete dataset, so fail fast otherwise.
assert df.isna().sum().sum() == 0, "Dataset contains null values"

# 'Floor' has way too many unique values, and 'Area Locality' has
# 2235 unique values in 4747 records, so both are dropped.
df = df.drop(columns=["Floor", "Area Locality"])

# Converting INR to EUR
exchange_rate = 0.011
df["Rent"] = df["Rent"].mul(exchange_rate)

# These features have 3-6 unique values; each is collapsed to a binary
# "<value>" / "Not <value>" label and stored as a category.
df["Area Type"] = (
    df["Area Type"]
    .where(df["Area Type"] == "Carpet Area", "Not Carpet Area")
    .astype("category")
)
df["City"] = (
    df["City"].where(df["City"] == "Mumbai", "Not Mumbai").astype("category")
)

# There is an inherent order between the furnishing levels, so this one is
# an ordered categorical (lowest level first).
df["Furnishing Status"] = df["Furnishing Status"].astype(
    pd.CategoricalDtype(
        categories=["Unfurnished", "Semi-Furnished", "Furnished"],
        ordered=True,
    )
)

# Same binary collapse as above, but left as plain strings.
df["Point of Contact"] = df["Point of Contact"].where(
    df["Point of Contact"] == "Contact Agent", "Not Contact Agent"
)

# Columns not used in the rest of the analysis.
df = df.drop(columns=["Posted On", "Tenant Preferred", "BHK", "Bathroom"])

# Quick look at the target against the only remaining numeric feature.
sns.scatterplot(data=df, x="Size", y="Rent")
df.head()

In [None]:
# Method #1 (active): z-score filter on 'Size', gets rid of ~35 outliers.
# A row is an outlier when its 'Size' lies more than `outlier_threshold`
# standard deviations away from the mean.
outlier_threshold = 4

z_scores_size = df["Size"].sub(df["Size"].mean()).div(df["Size"].std())
outliers_indices_size = z_scores_size.abs().gt(outlier_threshold)


# # Method #2 (disabled): IQR filter, gets rid of ~200 outliers
# # Calculate IQR
# Q1 = df["Size"].quantile(0.25)
# Q3 = df["Size"].quantile(0.75)
# IQR = Q3 - Q1

# # Define outlier threshold based on IQR
# outlier_threshold = 1.5  # You can adjust this threshold as needed

# # Identify outliers using IQR
# outliers_indices_size = (df["Size"] < Q1 - outlier_threshold * IQR) | (
#     df["Size"] > Q3 + outlier_threshold * IQR
# )

# Keep only the rows that were not flagged as 'Size' outliers.
df = df.loc[~outliers_indices_size]


# Same scatter plot as before, now without the outliers.
sns.scatterplot(data=df, x="Size", y="Rent")

In [None]:
def size_vs_rent_per_feature(feature, data=None):
    """Plot how `feature` relates to rent, as two side-by-side panels.

    The left panel is a Size-vs-Rent scatter plot coloured by `feature`; the
    right panel is a bar plot of Rent per value of `feature` (no error bars).

    Parameters
    ----------
    feature : str
        Name of the column used for the hue (left) and the x axis (right).
    data : pandas.DataFrame, optional
        Frame holding the "Size", "Rent" and `feature` columns. When omitted,
        the notebook-level `df` is used, looked up at call time.

    Returns
    -------
    None
        The figure is shown with `plt.show()`.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook's global frame.
        data = df

    # Explicit figure/axes handles instead of the implicit pyplot "current axes".
    fig, (scatter_ax, bar_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # Scatter plot: Size vs. Rent colored by feature
    sns.scatterplot(
        x="Size", y="Rent", data=data, hue=feature, alpha=0.5, ax=scatter_ax
    )
    scatter_ax.set_title(f"Scatter Plot: Size vs. Rent by {feature}")

    # Bar plot: Rent per feature
    sns.barplot(x=feature, y="Rent", data=data, errorbar=None, ax=bar_ax)
    bar_ax.set_title(f"Bar Plot: Rent per {feature}")

    fig.tight_layout()  # Ensures proper spacing between subplots
    plt.show()

In [None]:
# One scatter/bar figure per categorical feature, in this order.
for categorical_feature in (
    "Area Type",
    "City",
    "Furnishing Status",
    "Point of Contact",
):
    size_vs_rent_per_feature(categorical_feature)

In [None]:
# Target is kept as a one-column DataFrame (not a Series).
y = df[["Rent"]]

# Features: everything except the target. The nominal columns are one-hot
# encoded, dropping the first level of each to avoid dummy-variable
# multicollinearity.
X = pd.get_dummies(
    df.drop(columns=["Rent"]),
    columns=["Area Type", "City", "Point of Contact"],
    drop_first=True,
)

# The ordered categorical is replaced by its integer category codes.
X = X.assign(**{"Furnishing Status": X["Furnishing Status"].cat.codes})

# 75% / 25% train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123
)


def coef_per_feature(model, X):
    """Print one "<feature>: <coefficient>" line per column of `X`.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a `coef_` array. Both a 1-D `coef_` and a
        2-D `coef_` are accepted; for a 2-D array only the first row is
        reported.
    X : object
        Anything with a `.columns` iterable of feature names, in the same
        order as the coefficients.

    Returns
    -------
    None
        Output goes to stdout. When there is a single coefficient, only a
        short notice is printed.
    """
    # Normalise to 2-D so that a 1-D `coef_` does not break the `[0]` lookup
    # (indexing a 1-D array yields a scalar, and `len()` of a scalar raises).
    coefficients = np.atleast_2d(model.coef_)[0]

    if len(coefficients) == 1:
        print("Model has only one feature")
        return

    for feature, coef in zip(X.columns, coefficients):
        print(f"{feature}: {coef:.4f}")


def RMSE(y_pred, y_test):
    """Print the root mean squared error between `y_pred` and `y_test`.

    The value is the square root of `mean_squared_error(y_pred, y_test)`,
    printed with two decimals and labelled as EUR. Nothing is returned.
    """
    # The arguments are passed as (y_pred, y_test); the squared error is
    # symmetric, so the order does not affect the result.
    mse = mean_squared_error(y_pred, y_test)
    rmse = mse ** 0.5
    print(f"Root mean squared error {rmse:.2f} EUR")


def print_avg_rent(y_train):
    """Print the mean of the "Rent" column of `y_train`.

    The mean is printed with two decimals and labelled as EUR.
    Nothing is returned.
    """
    print(f"Average Rent: {y_train['Rent'].mean():.2f} EUR")

In [None]:
# Baseline model: linear regression fitted on the training split.
linregress_model = LinearRegression().fit(X_train, y_train)

# Error on the held-out split.
y_pred = linregress_model.predict(X_test)
RMSE(y_pred, y_test)

# Mean training rent, printed next to the RMSE to give it a scale.
print_avg_rent(y_train)

# Fitted coefficient of every feature column.
coef_per_feature(linregress_model, X)

In [None]:
# Support vector regressor with default settings; the search below varies
# its hyperparameters.
svr = SVR()

# Candidate values the randomized search samples from.
param_grid = dict(
    C=[0.01, 1, 100],
    gamma=[0.01, 1, 10],
    kernel=["linear", "rbf", "poly", "sigmoid"],
    epsilon=[0, 0.1, 0.5, 1, 2],
)

# 10 sampled parameter combinations, 3-fold cross-validation, 5 parallel
# jobs, fixed seed so the sampled combinations are reproducible.
random_search = RandomizedSearchCV(
    estimator=svr,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    n_jobs=5,
    random_state=123,
    verbose=3,
)


# The search itself and its evaluation are currently disabled:

# random_search.fit(X_train, np.array(y_train).flatten())

# # Print the best hyperparameters
# print(f"Best Hyperparameters: {random_search.best_params_}")


# # Get the best SVR model
# svr_model = random_search.best_estimator_

# # Make predictions with the best model
# y_pred = svr_model.predict(X_test)


# print_avg_rent(y_train)
# RMSE(y_pred, y_test)