# OLS Regression on Rental Data and Geospatial Data

In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures, StandardScaler


In [61]:
# Load the prepared dataset from a CSV file (path is relative to the
# notebook's working directory) into a Pandas DataFrame
data = pd.read_csv('4_Final_Data.csv')

# Display the DataFrame; pandas renders a truncated preview (first and last
# rows, with the middle rows and columns elided) rather than every row
data


Unnamed: 0,Latitude,Longitude,Monthly rent,Aconto,Square meters,Rooms,Furnished,Shareable,Pets allowed,Elevator,...,Floor_-1 to 0,Floor_1-3,Floor_3-8,Floor_9 or above,Distance to Transport Station (km),Distance to Beach (km),Distance to School (km),Distance to Restaurant (km),Distance to Hospital (km),Distance to Mall (km)
0,55.393856,10.394185,5400.0,800.0,62.0,2.0,0.0,0.0,0.0,1.0,...,0,1,0,0,0.227327,10.937984,0.347042,0.114848,1.856833,0.972492
1,56.300211,10.477949,7100.0,700.0,100.0,4.0,0.0,0.0,1.0,0.0,...,0,1,0,0,6.490199,5.187801,0.198759,0.081711,22.428327,20.441833
2,57.057390,9.936155,5800.0,300.0,64.0,2.0,0.0,0.0,0.0,0.0,...,0,1,0,0,2.057982,46.063274,0.894998,1.091345,1.726659,1.252278
3,55.721333,12.556324,8200.0,500.0,43.0,1.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0.684702,2.087369,0.121870,0.253656,0.873095,1.532133
4,56.199538,10.198303,8195.0,600.0,60.0,3.0,0.0,1.0,1.0,1.0,...,0,1,0,0,1.569096,12.856874,0.929082,0.441412,1.803299,0.677731
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9962,57.053293,9.911053,7600.0,850.0,73.0,2.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0.158809,45.438610,0.556684,0.239033,0.530896,0.832760
9963,55.485419,9.500225,8250.0,1000.0,84.0,3.0,0.0,0.0,1.0,1.0,...,0,1,0,0,1.342171,1.674266,1.353033,0.310371,1.846679,4.283248
9964,56.449611,9.404525,9500.0,1550.0,130.0,4.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0.466744,33.988380,0.126922,0.030089,0.458502,0.100810
9965,55.307771,11.543354,7150.0,500.0,87.0,3.0,0.0,0.0,0.0,0.0,...,0,1,0,0,12.496495,10.451021,2.480094,8.383154,15.404188,16.237640


# Without Geospatial Data

In [62]:
# Baseline specification (no geospatial information).
# The regressors are every column of `data` except:
#   * the target ('Monthly rent'),
#   * the raw coordinates ('Longitude', 'Latitude'),
#   * 'PostalCode_1000-1999' and 'Floor_-1 to 0' (presumably the reference
#     categories of their dummy groups -- confirm against the encoding step),
#   * all six 'Distance to ... (km)' features.
X = data.drop(
    columns=[
        'Monthly rent',
        'Longitude',
        'Latitude',
        'PostalCode_1000-1999',
        'Floor_-1 to 0',
        'Distance to Transport Station (km)',
        'Distance to Beach (km)',
        'Distance to School (km)',
        'Distance to Restaurant (km)',
        'Distance to Hospital (km)',
        'Distance to Mall (km)',
    ]
)

# Target variable: the monthly rent.
y = data['Monthly rent']

# sm.OLS does not include an intercept on its own, so add a constant column.
X = sm.add_constant(X)

# Specify the model, then estimate it with HC3 (heteroscedasticity-robust)
# standard errors.
model = sm.OLS(endog=y, exog=X)
results = model.fit(cov_type='HC3')

# Show the regression table.
print(results.summary())

# In-sample predictions: the model is evaluated on the same rows it was
# fitted on, so the RMSE below is a training error, not a hold-out error.
y_pred = results.predict(X)
residuals = y - y_pred

# Root mean squared error of the in-sample residuals.
RMSE = np.sqrt(residuals.pow(2).mean())
print('Root Mean Squared Error (RMSE):', RMSE)



                            OLS Regression Results                            
Dep. Variable:           Monthly rent   R-squared:                       0.752
Model:                            OLS   Adj. R-squared:                  0.752
Method:                 Least Squares   F-statistic:                     737.7
Date:                Wed, 16 Aug 2023   Prob (F-statistic):               0.00
Time:                        14:57:21   Log-Likelihood:                -89813.
No. Observations:                9967   AIC:                         1.797e+05
Df Residuals:                    9944   BIC:                         1.798e+05
Df Model:                          22                                         
Covariance Type:                  HC3                                         
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 1.253e+04 

# With Geospatial Data

In [63]:
# Extended specification (geospatial distances included).
# The regressors are every column of `data` except:
#   * the target ('Monthly rent'),
#   * the raw coordinates ('Longitude', 'Latitude'),
#   * 'PostalCode_1000-1999' and 'Floor_-1 to 0' (presumably the reference
#     categories of their dummy groups -- confirm against the encoding step).
# The 'Distance to ... (km)' columns are kept as regressors here.
X = data.drop(
    columns=[
        'Monthly rent',
        'Longitude',
        'Latitude',
        'PostalCode_1000-1999',
        'Floor_-1 to 0',
    ]
)

# Target variable: the monthly rent.
y = data['Monthly rent']

# sm.OLS does not include an intercept on its own, so add a constant column.
X = sm.add_constant(X)

# Specify the model, then estimate it with HC3 (heteroscedasticity-robust)
# standard errors.
model = sm.OLS(endog=y, exog=X)
results = model.fit(cov_type='HC3')

# Show the regression table.
print(results.summary())

# In-sample predictions: the model is evaluated on the same rows it was
# fitted on, so the RMSE below is a training error, not a hold-out error.
y_pred = results.predict(X)
residuals = y - y_pred

# Root mean squared error of the in-sample residuals.
RMSE = np.sqrt(residuals.pow(2).mean())
print('Root Mean Squared Error (RMSE):', RMSE)



                            OLS Regression Results                            
Dep. Variable:           Monthly rent   R-squared:                       0.778
Model:                            OLS   Adj. R-squared:                  0.778
Method:                 Least Squares   F-statistic:                     639.6
Date:                Wed, 16 Aug 2023   Prob (F-statistic):               0.00
Time:                        14:57:21   Log-Likelihood:                -89261.
No. Observations:                9967   AIC:                         1.786e+05
Df Residuals:                    9938   BIC:                         1.788e+05
Df Model:                          28                                         
Covariance Type:                  HC3                                         
                                         coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
cons