In [141]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_excel("data.xlsx")

In [5]:
df_copy = df.copy()

In [76]:
df = df_copy.copy()
df.head()

Unnamed: 0,id,category,title,body,amenities,bathrooms,bedrooms,currency,fee,has_photo,...,price_display,price_type,square_feet,address,cityname,state,latitude,longitude,source,time
0,5668640009,housing/rent/apartment,One BR 507 & 509 Esplanade,"This unit is located at 507 & 509 Esplanade, R...",,1.0,1,USD,No,Thumbnail,...,2195,Monthly,542,507 509 Esplanade,Redondo Beach,CA,33.852,-118.3759,RentLingo,1577360000.0
1,5668639818,housing/rent/apartment,Three BR 146 Lochview Drive,"This unit is located at 146 Lochview Drive, Ne...",,1.5,3,USD,No,Thumbnail,...,1250,Monthly,1500,146 Lochview Dr,Newport News,VA,37.0867,-76.4941,RentLingo,1577360000.0
2,5668639686,housing/rent/apartment,Three BR 3101 Morningside Drive,This unit is located at 3101 Morningside Drive...,,2.0,3,USD,No,Thumbnail,...,1395,Monthly,1650,3101 Morningside Dr,Raleigh,NC,35.823,-78.6438,RentLingo,1577360000.0
3,5668639659,housing/rent/apartment,Two BR 209 Aegean Way,"This unit is located at 209 Aegean Way, Vacavi...",,1.0,2,USD,No,Thumbnail,...,1600,Monthly,820,209 Aegean Way,Vacaville,CA,38.3622,-121.9712,RentLingo,1577360000.0
4,5668639374,housing/rent/apartment,One BR 4805 Marquette NE,"This unit is located at 4805 Marquette NE, Alb...",,1.0,1,USD,No,Thumbnail,...,975,Monthly,624,4805 Marquette NE,Albuquerque,NM,35.1038,-106.611,RentLingo,1577360000.0


In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99826 entries, 0 to 99825
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             99826 non-null  object 
 1   category       99826 non-null  object 
 2   title          99826 non-null  object 
 3   body           99812 non-null  object 
 4   amenities      83749 non-null  object 
 5   bathrooms      99760 non-null  object 
 6   bedrooms       99699 non-null  object 
 7   currency       99822 non-null  object 
 8   fee            99823 non-null  object 
 9   has_photo      99823 non-null  object 
 10  pets_allowed   39192 non-null  object 
 11  price          99821 non-null  float64
 12  price_display  99820 non-null  object 
 13  price_type     99823 non-null  object 
 14  square_feet    99823 non-null  object 
 15  address        7946 non-null   object 
 16  cityname       99521 non-null  object 
 17  state          99521 non-null  object 
 18  latitu

In [78]:
df = df.drop(columns=["id","category","title","body","time","address","currency","price_display"])

In [79]:
df.columns

Index(['amenities', 'bathrooms', 'bedrooms', 'fee', 'has_photo',
       'pets_allowed', 'price', 'price_type', 'square_feet', 'cityname',
       'state', 'latitude', 'longitude', 'source'],
      dtype='object')

In [80]:
df['amenities'] =  df['amenities'].fillna("No amenities")
test = set()
for value in df['amenities'].unique():
    temp = value.split(",")
    for var in temp:
        test.add(var)

for column in test:
    df[column] = df['amenities'].apply(lambda x: 1 if column in x else 0)

df = df.drop(columns=['amenities','No','USD'])


In [81]:
df['pets_allowed'] = df['pets_allowed'].fillna("No")
df['pets_allowed'].isna().sum()

0

In [82]:
df = df.dropna()

In [83]:
df = df[df['bathrooms']!="Thumbnail"]
df = df[df['bedrooms']!="Thumbnail"]
df = df[df['fee'].isin(["Yes", "No"])]
df['has_photo'] = df['has_photo'].replace({"Thumbnail":"No"})
df['pets_allowed'] = df['pets_allowed'].apply(lambda x: 1 if "Cats" in x or "Dogs" in x else 0)

In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99335 entries, 0 to 99825
Data columns (total 41 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bathrooms           99335 non-null  object 
 1   bedrooms            99335 non-null  object 
 2   fee                 99335 non-null  object 
 3   has_photo           99335 non-null  object 
 4   pets_allowed        99335 non-null  int64  
 5   price               99335 non-null  float64
 6   price_type          99335 non-null  object 
 7   square_feet         99335 non-null  object 
 8   cityname            99335 non-null  object 
 9   state               99335 non-null  object 
 10  latitude            99335 non-null  float64
 11  longitude           99335 non-null  float64
 12  source              99335 non-null  object 
 13  Garbage Disposal    99335 non-null  int64  
 14  Gated               99335 non-null  int64  
 15  Clubhouse           99335 non-null  int64  
 16  Parking  

In [85]:
for column in df.columns: 
    if column in ['bathrooms','bedrooms', 'price','latitude','longitude', 'square_feet']:
        df[column] = df[column].astype('float')
    else:
        df[column] = df[column].astype('object')

In [86]:
Regression_comparision = {}

In [142]:
# Splite the model into training, test and validation set
X = df.drop(columns=['price','cityname','state']).copy()
X = pd.get_dummies(X, columns=['fee','has_photo','pets_allowed','price_type','source'], drop_first=True)
y = df['price'].copy()

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.6)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

In [143]:
#Linear regression
lr_benchmark = LinearRegression()
lr_benchmark.fit(X_train, y_train)

y_pred = lr_benchmark.predict(X_val)

r2 = r2_score(y_val, y_pred)
MSE = mean_squared_error(y_val, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_val, y_pred)
MAPE = mean_absolute_percentage_error(y_val, y_pred)

Regression_comparision['LinearRegression_benchmark'] = {'r2': r2,
                                                        'MSE': MSE,
                                                        'RMSE': RMSE,
                                                        'MAE': MAE,
                                                        'MAPE': MAPE}

In [144]:
#Ridge
ridge_benchmark = Ridge()
ridge_benchmark.fit(X_train, y_train)

y_pred = ridge_benchmark.predict(X_val)

r2 = r2_score(y_val, y_pred)
MSE = mean_squared_error(y_val, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_val, y_pred)
MAPE = mean_absolute_percentage_error(y_val, y_pred)

Regression_comparision['Ridge_benchmark'] = {'r2': r2,
                                                        'MSE': MSE,
                                                        'RMSE': RMSE,
                                                        'MAE': MAE,
                                                        'MAPE': MAPE}

In [145]:
#Lasso
lasso_benchmark = Lasso()
lasso_benchmark.fit(X_train, y_train)

y_pred = lasso_benchmark.predict(X_val)

r2 = r2_score(y_val, y_pred)
MSE = mean_squared_error(y_val, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_val, y_pred)
MAPE = mean_absolute_percentage_error(y_val, y_pred)

Regression_comparision['Lasso_benchmark'] = {'r2': r2,
                                                        'MSE': MSE,
                                                        'RMSE': RMSE,
                                                        'MAE': MAE,
                                                        'MAPE': MAPE}

In [146]:
#Decision Tree benchmark
dc_benchmark = DecisionTreeRegressor()
dc_benchmark.fit(X_train, y_train)

y_pred = dc_benchmark.predict(X_val)

r2 = r2_score(y_val, y_pred)
MSE = mean_squared_error(y_val, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_val, y_pred)
MAPE = mean_absolute_percentage_error(y_val, y_pred)

Regression_comparision['DecisionTree_benchmark'] = {'r2': r2,
                                                        'MSE': MSE,
                                                        'RMSE': RMSE,
                                                        'MAE': MAE,
                                                        'MAPE': MAPE}

In [147]:
#Random Forest benchmark
rf_benchmark = RandomForestRegressor()
rf_benchmark.fit(X_train, y_train)

y_pred = rf_benchmark.predict(X_val)

r2 = r2_score(y_val, y_pred)
MSE = mean_squared_error(y_val, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_val, y_pred)
MAPE = mean_absolute_percentage_error(y_val, y_pred)

Regression_comparision['RandomForest_benchmark'] = {'r2': r2,
                                                        'MSE': MSE,
                                                        'RMSE': RMSE,
                                                        'MAE': MAE,
                                                        'MAPE': MAPE}

In [148]:
#MLP benchmark
mlp_benchmark = MLPRegressor(max_iter=500)
mlp_benchmark.fit(X_train, y_train)

y_pred = mlp_benchmark.predict(X_val)

r2 = r2_score(y_val, y_pred)
MSE = mean_squared_error(y_val, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_val, y_pred)
MAPE = mean_absolute_percentage_error(y_val, y_pred)

Regression_comparision['MultilayerPerceptron_benchmark'] = {'r2': r2,
                                                        'MSE': MSE,
                                                        'RMSE': RMSE,
                                                        'MAE': MAE,
                                                        'MAPE': MAPE}

In [149]:
results = pd.DataFrame(Regression_comparision)
results

Unnamed: 0,LinearRegression_benchmark,Ridge_benchmark,Lasso_benchmark,DecisionTree_benchmark,RandomForest_benchmark,MultilayerPerceptron_benchmark,LinearRegression_clustering,Ridge_clustering,Lasso_clustering,DecisionTree_clustering,RandomForest_clustering,MultilayerPerceptron_clustering,LinearRegression_clustering_euclidean,Ridge_clustering_euclidean,Lasso_clustering_euclidean,DecisionTree_clustering_euclidean,RandomForest_clustering_euclidean,MultilayerPerceptron_clustering_euclidean
r2,0.272298,0.273425,0.269756,0.560691,0.774582,0.438261,0.32007,0.320621,0.315914,0.378042,0.663129,0.353126,0.309342,0.309789,0.304916,0.328865,0.673813,0.341421
MSE,611601.970202,610654.798077,613737.677054,369220.354193,189453.80568,472117.338757,497939.840024,497536.364796,500983.419425,455484.058289,246704.216207,473731.358975,505796.014059,505468.714473,509037.745133,491498.761319,238879.921925,482303.176056
RMSE,782.049851,781.444047,783.414116,607.63505,435.262916,687.107953,705.648524,705.362577,707.801822,674.895591,496.693282,688.281453,711.193373,710.96323,713.468812,701.069726,488.753437,694.480508
MAE,479.783682,479.687497,480.879968,243.740963,185.904901,405.997591,453.511727,453.457559,454.578687,310.996897,248.433204,427.663503,453.537389,453.454367,454.440761,302.716933,240.632644,422.368063
MAPE,0.339201,0.339148,0.340692,0.155795,0.121324,0.291068,0.320819,0.320814,0.321925,0.195453,0.159832,0.295321,0.319143,0.31911,0.320065,0.189354,0.154833,0.284259
