In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import plotly.express as px
from sklearn.decomposition import PCA

In [24]:
# Load the cleaned amenities table and preview its first rows.
amenities_csv = '../airbnb_amenities_clean.csv'
amenities = pd.read_csv(amenities_csv)
amenities.head()

Unnamed: 0,id,Wireless_Internet,Kitchen,Heating,Essentials,Smoke_detector,Air_conditioning,TV,Shampoo,Hangers,...,First_aid_kit,Cable_TV,Free_parking_on_premises,24_hour_check_in,Lock_on_bedroom_door,Buzzer_wireless_intercom,Safety_card,Self_Check_In,Elevator,Pets_allowed
0,6901257,True,True,True,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,6304928,True,True,True,True,True,True,False,True,True,...,False,False,False,False,False,False,False,False,False,False
2,7919400,True,True,True,True,True,True,True,True,True,...,False,True,False,False,False,True,False,False,False,False
3,13418779,True,True,True,True,True,False,True,False,False,...,True,True,False,False,False,True,False,False,False,False
4,3808709,True,True,True,True,True,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False


In [25]:
# Load the cleaned listings table and preview its first rows.
airbnb_csv = '../airbnb_clean.csv'
airbnb = pd.read_csv(airbnb_csv)
airbnb.head()

Unnamed: 0,id,log_price,property_type,room_type,accommodates,bathrooms,cleaning_fee,city,host_has_profile_pic,host_identity_verified,host_since,instant_bookable,latitude,longitude,number_of_reviews,review_scores_rating,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,3,1.0,True,NYC,True,True,2012-03-26,False,40.696524,-73.991617,2,100.0,11201,1,1
1,6304928,5.129899,Apartment,Entire home/apt,7,1.0,True,NYC,True,False,2017-06-19,True,40.766115,-73.98904,6,93.0,10019,3,3
2,7919400,4.976734,Apartment,Entire home/apt,5,1.0,True,NYC,True,True,2016-10-25,True,40.80811,-73.943756,10,92.0,10027,1,3
3,13418779,6.620073,House,Entire home/apt,4,1.0,True,SF,True,True,2015-04-19,False,37.772004,-122.431619,0,94.066676,94117,2,2
4,3808709,4.744932,Apartment,Entire home/apt,2,1.0,True,DC,True,True,2015-03-01,True,38.925627,-77.034596,4,40.0,20009,0,1


In [87]:
# Join listings with their amenity flags on the shared `id`, index the result
# by `id`, and drop the descriptive columns listed below.
dropped_columns = ["property_type", "city", "room_type", "host_since"]
df = (
    airbnb.merge(amenities, on='id')
    .set_index('id')
    .drop(columns=dropped_columns)
)
df.head()

Unnamed: 0_level_0,log_price,accommodates,bathrooms,cleaning_fee,host_has_profile_pic,host_identity_verified,instant_bookable,latitude,longitude,number_of_reviews,...,First_aid_kit,Cable_TV,Free_parking_on_premises,24_hour_check_in,Lock_on_bedroom_door,Buzzer_wireless_intercom,Safety_card,Self_Check_In,Elevator,Pets_allowed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6901257,5.010635,3,1.0,True,True,True,False,40.696524,-73.991617,2,...,False,False,False,False,False,False,False,False,False,False
6304928,5.129899,7,1.0,True,True,False,True,40.766115,-73.98904,6,...,False,False,False,False,False,False,False,False,False,False
7919400,4.976734,5,1.0,True,True,True,True,40.80811,-73.943756,10,...,False,True,False,False,False,True,False,False,False,False
13418779,6.620073,4,1.0,True,True,True,False,37.772004,-122.431619,0,...,True,True,False,False,False,True,False,False,False,False
3808709,4.744932,2,1.0,True,True,True,True,38.925627,-77.034596,4,...,False,False,False,False,False,False,False,False,False,False


In [93]:
def make_regression(df: pd.DataFrame, target: str, regressor: str='random forest',
                    tolerances=(0.05, 0.1, 0.15, 0.2)):
  """
  Fits an ensemble regressor on an 80/20 train/test split and reports how
  close the test-set predictions are to the true target values.

  Prints the R2 score and the mean squared error on the test split, then,
  for every relative tolerance, the fraction of test predictions that fall
  strictly inside ``true value +/- tolerance * |true value|``.

  Args:
    df: The dataframe to use for regression. Every column except ``target``
      is used as a feature.
    target: The target column to use for regression.
    regressor: The type of regression to use. One of 'random forest',
      'adaboost', 'gradient boosting', 'extra trees', 'bagging', 'voting'
      or 'stacking'.
    tolerances: Relative tolerances to report (0.05 means +/-5%).
  Returns:
    A dict mapping each tolerance label (e.g. '5.0%') to the fraction of
    test predictions within that tolerance, formatted as a string with
    five decimals.
  Raises:
    ValueError: If ``regressor`` is not one of the supported names.
  """
  # Factories are lazy: the scikit-learn names inside each lambda are only
  # looked up when the selected factory is called (after the import below),
  # and only the requested estimator is ever constructed.
  factories = {
    'random forest': lambda: RandomForestRegressor(random_state=0, n_jobs=-1),
    'adaboost': lambda: AdaBoostRegressor(random_state=0),
    'gradient boosting': lambda: GradientBoostingRegressor(random_state=0),
    'extra trees': lambda: ExtraTreesRegressor(random_state=0, n_jobs=-1),
    'bagging': lambda: BaggingRegressor(random_state=0, n_jobs=-1),
    'voting': lambda: VotingRegressor(estimators=[
        ('gb', GradientBoostingRegressor(random_state=0)),
        ('rf', RandomForestRegressor(random_state=0, n_jobs=-1)),
        ('bag', BaggingRegressor(random_state=0, n_jobs=-1)),
      ]),
    'stacking': lambda: StackingRegressor(
        estimators=[
          ('rf', RandomForestRegressor(random_state=0, n_jobs=-1)),
          ('bag', BaggingRegressor(random_state=0, n_jobs=-1)),
        ],
        final_estimator=GradientBoostingRegressor(random_state=0)
      ),
  }

  # Fail fast: reject an unknown name before importing scikit-learn or
  # splitting the data, so a typo is reported immediately.
  if not isinstance(regressor, str) or regressor not in factories:
    raise ValueError('Invalid regression type: {}'.format(regressor))

  from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor, VotingRegressor, StackingRegressor
  from sklearn.model_selection import train_test_split
  from sklearn.metrics import mean_squared_error, r2_score

  X = df.drop(columns=[target])
  y = df[target]

  Xtrain, Xtest, ytrain, ytest = train_test_split(
        X, y, test_size=0.2, random_state=0)

  # Keep the `regressor` argument as the name; the estimator gets its own variable.
  model = factories[regressor]()
  model.fit(Xtrain, ytrain)
  ypred = model.predict(Xtest)

  # r2_score is the coefficient of determination, not a classification
  # accuracy; it can be negative when the model is worse than the mean.
  print("R2 score:", r2_score(ytest, ypred))
  print("MSE:", mean_squared_error(ytest, ypred))

  # Compare positionally against a plain array rather than a pandas Series.
  y_true = np.asarray(ytest)
  n_test = ypred.shape[0]

  result = {}
  for perc in tolerances:
      edge_low = y_true * (1 - perc)
      edge_high = y_true * (1 + perc)
      # For negative targets the two edges swap order; sorting them keeps the
      # band valid so such rows can still be counted as hits.
      lower = np.minimum(edge_low, edge_high)
      upper = np.maximum(edge_low, edge_high)
      count = np.count_nonzero((lower < ypred) & (ypred < upper))
      label = f"{100 * perc}%"
      fraction = f"{count / n_test:.5f}"
      result[label] = fraction
      print(label, fraction)
  return result
  

In [89]:
# Restrict to listings with at least `min_rev` reviews.
min_rev = 10
has_min_reviews = df["number_of_reviews"].ge(min_rev)
df_filtered = df.loc[has_min_reviews]

# Evaluate each single-model regressor and collect its tolerance hit rates.
base_regressors = ['random forest', 'adaboost', 'gradient boosting', 'extra trees', 'bagging']
results = []
for name in base_regressors:
    print(f"{name} with {min_rev} min reviews",
          f"({df_filtered.shape[0]} instances)")
    scores = make_regression(df_filtered, "review_scores_rating", regressor=name)
    scores['regressor'] = name
    results.append(scores)
    print()
      


random forest with 10 min reviews (29993 instances)
Accuracy: 0.20467246622665825
MSE: 17.703625691284604
5.0% 0.79830
10.0% 0.95666
15.0% 0.98700
20.0% 0.99433

adaboost with 10 min reviews (29993 instances)
Accuracy: -0.12302483315692236
MSE: 24.998017098567477
5.0% 0.61910
10.0% 0.96166
15.0% 0.98716
20.0% 0.99583

gradient boosting with 10 min reviews (29993 instances)
Accuracy: 0.17821475676169396
MSE: 18.29256215472405
5.0% 0.80130
10.0% 0.95333
15.0% 0.98400
20.0% 0.99367

extra trees with 10 min reviews (29993 instances)
Accuracy: 0.09461791004379583
MSE: 20.153389575401697
5.0% 0.76446
10.0% 0.94766
15.0% 0.98333
20.0% 0.99350

bagging with 10 min reviews (29993 instances)
Accuracy: 0.12712411857657102
MSE: 19.429816300153945
5.0% 0.77180
10.0% 0.95166
15.0% 0.98533
20.0% 0.99383



In [90]:

# One row per regressor, one column per tolerance label.
table = pd.DataFrame(results).set_index('regressor')
table
      


Unnamed: 0_level_0,5.0%,10.0%,15.0%,20.0%
regressor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
random forest,0.7983,0.95666,0.987,0.99433
adaboost,0.6191,0.96166,0.98716,0.99583
gradient boosting,0.8013,0.95333,0.984,0.99367
extra trees,0.76446,0.94766,0.98333,0.9935
bagging,0.7718,0.95166,0.98533,0.99383


In [95]:
ensemble_regressors = ['voting', 'stacking']

# Drop rows left behind by a previous run of this cell, so re-running it does
# not append duplicate 'voting'/'stacking' entries to `results`.
results = [row for row in results if row['regressor'] not in ensemble_regressors]

for reg in ensemble_regressors:
    print(f"{reg} with {min_rev} min reviews",
          f"({df_filtered.shape[0]} instances)")
    result = make_regression(df_filtered, "review_scores_rating", regressor=reg)
    result['regressor'] = reg
    results.append(result)
    print()


voting with 10 min reviews (29993 instances)
Accuracy: 0.20341566154039947
MSE: 17.731601586482167
5.0% 0.80147
10.0% 0.95666
15.0% 0.98616
20.0% 0.99383

stacking with 10 min reviews (29993 instances)
Accuracy: 0.20511794707067876
MSE: 17.69370949225933
5.0% 0.80863
10.0% 0.95583
15.0% 0.98583
20.0% 0.99417



In [96]:
# Summary of every entry currently in `results`, indexed by regressor name.
table_ensembled = pd.DataFrame(results).set_index('regressor')
table_ensembled

Unnamed: 0_level_0,5.0%,10.0%,15.0%,20.0%
regressor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
random forest,0.7983,0.95666,0.987,0.99433
adaboost,0.6191,0.96166,0.98716,0.99583
gradient boosting,0.8013,0.95333,0.984,0.99367
extra trees,0.76446,0.94766,0.98333,0.9935
bagging,0.7718,0.95166,0.98533,0.99383
voting,0.80147,0.95666,0.98616,0.99383
stacking,0.80863,0.95583,0.98583,0.99417
