# Predictive Analytics Final Project

# Setup

In [34]:
# Common imports
import numpy as np
import pandas as pd

# Seed NumPy's global random generator. The scikit-learn calls below are not given
# a random_state of their own, so they draw from this global generator; results are
# therefore only repeatable when the notebook is run top to bottom from a fresh kernel.
np.random.seed(142)


# Get the data

In [37]:
# Load the Airbnb listings; the "price" column is the regression target predicted below.

airbnb = pd.read_csv("airbnb.csv")
airbnb.head()

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
0,0,0,Roslindale,42.282619,-71.133068,House,Entire home/apt,4,1.5,2.0,...,1,0,2,0,0,,moderate,250,1,gte_226
1,0,1,Roslindale,42.286241,-71.134374,Apartment,Private room,2,1.0,1.0,...,0,0,2,36,804,94.0,moderate,65,0,lte_$75
2,1,1,Roslindale,42.292438,-71.135765,Apartment,Private room,2,1.0,1.0,...,1,20,3,41,2574,98.0,moderate,65,0,lte_$75
3,0,0,Roslindale,42.281106,-71.121021,House,Private room,4,1.0,1.0,...,2,25,1,1,0,100.0,moderate,75,0,lte_$75
4,1,1,Roslindale,42.284512,-71.136258,House,Private room,2,1.5,1.0,...,1,0,2,29,380,99.0,flexible,79,0,btw_$75-$150


In [39]:
# (rows, columns) of the raw data set.
airbnb.shape

(3555, 23)

# Split the data into train and test

In [42]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the listings for testing.
# random_state is pinned so the split no longer depends on the state of NumPy's global
# generator (i.e. on which cells happened to run before this one). 142 is the same value
# passed to np.random.seed in the setup cell.
train_set, test_set = train_test_split(airbnb, test_size=0.3, random_state=142)

## Check the missing values

In [46]:
# Number of missing values per column in the training split.
train_set.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          3
room_type                              0
accommodates                           0
bathrooms                              8
bedrooms                               8
beds                                   5
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 569
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

In [48]:
# Number of missing values per column in the test split.
test_set.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          0
room_type                              0
accommodates                           0
bathrooms                              6
bedrooms                               2
beds                                   4
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 231
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

# Data Prep

In [51]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [151]:
# price_gte_150 and price_category appear to be binned versions of price (compare the
# values in the preview above), so they are removed to keep the target out of the features.
train = train_set.drop(columns=['price_gte_150', 'price_category'])
test = test_set.drop(columns=['price_gte_150', 'price_category'])

## Separate the target variable (we don't want to transform it)

In [57]:
# Targets: selecting with a one-element list keeps each target a single-column DataFrame.
train_y = train.loc[:, ['price']]
test_y = test.loc[:, ['price']]

# Features: every column except the target.
train_inputs = train.drop(columns=['price'])
test_inputs = test.drop(columns=['price'])

##  Identify the numerical and categorical columns

In [61]:
# Features handled by the numeric transformer.
numeric_columns = [
    'latitude',
    'longitude',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'Number of amenities',
    'guests_included',
    'price_per_extra_person',
    'minimum_nights',
    'number_of_reviews',
    'number_days_btw_first_last_review',
    'review_scores_rating',
]

# 0/1 host flags, handled by the binary transformer.
binary_columns = [
    'host_is_superhost',
    'host_identity_verified',
]

# Text-valued features, handled by the categorical transformer.
categorical_columns = [
    'neighbourhood_cleansed',
    'property_type',
    'room_type',
    'bed_type',
    'cancellation_policy',
]

In [63]:
# Display the binary feature names as a sanity check.
binary_columns

['host_is_superhost', 'host_identity_verified']

In [65]:
# Display the numeric feature names as a sanity check.
numeric_columns

['latitude',
 'longitude',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'Number of amenities',
 'guests_included',
 'price_per_extra_person',
 'minimum_nights',
 'number_of_reviews',
 'number_days_btw_first_last_review',
 'review_scores_rating']

In [67]:
# Display the categorical feature names as a sanity check.
categorical_columns

['neighbourhood_cleansed',
 'property_type',
 'room_type',
 'bed_type',
 'cancellation_policy']

# Pipeline

In [70]:
# Numeric features: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [72]:
# Categorical features: label missing entries 'unknown', then one-hot encode.
# handle_unknown='ignore' means a category not seen during fit does not raise at
# transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [74]:
# Binary features: only fill missing values with the most frequent value; no scaling
# or encoding is applied.
binary_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

In [76]:
# Route each column group through its own transformer; any column not listed in one
# of the three groups is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns),
    ],
    remainder='drop',
)


# Transform: fit_transform() for TRAIN

In [79]:
# Fit the preprocessor on the training inputs only (imputation values, scaling
# statistics, one-hot categories) and transform them in the same step.
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[-0.57516407, -0.16927383, -1.15578785, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.1958557 ,  0.18733179, -0.58455222, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.42207037,  0.69457073, -0.01331659, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.04321532, -0.1499537 , -1.15578785, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.7776294 , -1.47612124, -0.58455222, ...,  0.        ,
         1.        ,  0.        ],
       [-1.41278267, -0.78143866, -0.58455222, ...,  0.        ,
         0.        ,  1.        ]])

In [81]:
# (rows, columns) of the transformed training matrix.
train_x.shape

(2488, 66)

# Transform: transform() for TEST

In [84]:
# Apply the already-fitted preprocessor to the test inputs. transform() (not
# fit_transform()) is used so nothing is re-learned from the test data.
test_x = preprocessor.transform(test_inputs)

test_x

array([[ 1.39510971,  1.60175586,  1.70039032, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.77451956, -1.74858009,  1.70039032, ...,  0.        ,
         0.        ,  1.        ],
       [-0.05462059,  0.13283793,  0.55791905, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.60448836,  0.73614916,  1.12915468, ...,  0.        ,
         0.        ,  0.        ],
       [-0.99210521, -0.23519825, -1.15578785, ...,  0.        ,
         1.        ,  1.        ],
       [-1.44872042, -0.7116745 , -0.58455222, ...,  0.        ,
         0.        ,  1.        ]])

In [86]:
# (rows, columns) of the transformed test matrix; the column count should match train_x.
test_x.shape

(1067, 66)

# Calculating the baseline

In [89]:
from sklearn.dummy import DummyRegressor

# Baseline model: ignores the features and always predicts the mean training price.
dummy_regr = DummyRegressor(strategy="mean")

dummy_regr.fit(X=train_x, y=train_y)

In [91]:
from sklearn.metrics import mean_squared_error

In [93]:
# Baseline RMSE on the training set: error of always predicting the mean price.
dummy_train_pred = dummy_regr.predict(X=train_x)

baseline_train_mse = mean_squared_error(y_true=train_y, y_pred=dummy_train_pred)
baseline_train_rmse = np.sqrt(baseline_train_mse)

print('Baseline Train RMSE: {}'.format(baseline_train_rmse))

Baseline Train RMSE: 102.49847834318642


In [95]:
# Baseline RMSE on the test set: every other model should beat this number.
dummy_test_pred = dummy_regr.predict(X=test_x)

baseline_test_mse = mean_squared_error(y_true=test_y, y_pred=dummy_test_pred)
baseline_test_rmse = np.sqrt(baseline_test_mse)

print('Baseline Test RMSE: {}'.format(baseline_test_rmse))

Baseline Test RMSE: 105.36001777194679


# Linear Regression

In [98]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regression with default settings.
linear_regr = LinearRegression()

# Train on the preprocessed training features.
linear_regr.fit(X=train_x, y=train_y)


In [100]:
# Linear Regression RMSE on the training set.
linear_train_pred = linear_regr.predict(X=train_x)

# MSE first, then its square root to get the error back in price units.
linear_train_mse = mean_squared_error(y_true=train_y, y_pred=linear_train_pred)
linear_train_rmse = np.sqrt(linear_train_mse)

print('Linear Regression Train RMSE: {}'.format(linear_train_rmse))

Linear Regression Train RMSE: 62.35154838482041


In [102]:
# Linear Regression RMSE on the held-out test set.
linear_test_pred = linear_regr.predict(X=test_x)

# MSE first, then its square root to get the error back in price units.
linear_test_mse = mean_squared_error(y_true=test_y, y_pred=linear_test_pred)
linear_test_rmse = np.sqrt(linear_test_mse)

print('Linear Regression Test RMSE: {}'.format(linear_test_rmse))

Linear Regression Test RMSE: 65.19962860102886


# SVM Model

In [109]:
from sklearn.svm import SVR

# Support vector regression with a linear kernel and default hyperparameters.
svm_reg = SVR(kernel="linear")

# train_y is a single-column DataFrame; SVR expects a 1-D target and otherwise emits
# the column_or_1d warning. np.ravel flattens it without changing the values.
svm_reg.fit(train_x, np.ravel(train_y))

  y = column_or_1d(y, warn=True)


In [111]:
# SVM RMSE on the training set.
svm_train_pred = svm_reg.predict(X=train_x)

train_mse = mean_squared_error(y_true=train_y, y_pred=svm_train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 65.69219348792122


In [113]:
# SVM RMSE on the held-out test set.
svm_test_pred = svm_reg.predict(X=test_x)

test_mse = mean_squared_error(y_true=test_y, y_pred=svm_test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))

Test RMSE: 68.4896006729622


# Decision Tree Model

In [116]:
from sklearn.tree import DecisionTreeRegressor

# Single regression tree limited to a depth of 10.
# NOTE(review): no random_state is passed, so any randomness in tree building comes
# from NumPy's global generator - confirm the scores are stable when this cell is
# re-run on its own.
tree_reg = DecisionTreeRegressor(max_depth=10)

tree_reg.fit(X=train_x, y=train_y)

In [118]:
# Decision Tree RMSE on the training set.
train_pred = tree_reg.predict(X=train_x)

train_mse = mean_squared_error(y_true=train_y, y_pred=train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 43.671685159574764


In [120]:
# Decision Tree RMSE on the held-out test set.
test_pred = tree_reg.predict(X=test_x)

test_mse = mean_squared_error(y_true=test_y, y_pred=test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))

Test RMSE: 77.12837549352737


# Decision Tree Model 2

In [122]:
# More heavily regularised tree: the depth limit is lowered from 10 to 5 and every
# leaf must contain at least 10 training samples.
tree_reg2 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=10)

tree_reg2.fit(X=train_x, y=train_y)

In [124]:
# Decision Tree 2 RMSE on the training set.
train_pred = tree_reg2.predict(X=train_x)

train_mse = mean_squared_error(y_true=train_y, y_pred=train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 65.29145296563588


In [126]:
# Decision Tree 2 RMSE on the held-out test set.
test_pred = tree_reg2.predict(X=test_x)

test_mse = mean_squared_error(y_true=test_y, y_pred=test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))

Test RMSE: 74.02132430589675


# Random Forest

In [132]:
from sklearn.ensemble import RandomForestRegressor

# 500 trees, each limited to a depth of 10, trained on all available cores (n_jobs=-1).
# NOTE(review): no random_state is passed, so the forest depends on the state of NumPy's
# global generator when this cell runs; pinning one would change the scores recorded below.
rnd_reg = RandomForestRegressor(n_estimators=500, max_depth=10, n_jobs=-1)

# train_y is a single-column DataFrame; the forest expects a 1-D target and otherwise
# emits a column-vector warning. np.ravel flattens it without changing the values.
rnd_reg.fit(train_x, np.ravel(train_y))

  return fit_method(estimator, *args, **kwargs)


In [135]:
# Random Forest RMSE on the training set.
train_pred = rnd_reg.predict(X=train_x)

train_mse = mean_squared_error(y_true=train_y, y_pred=train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 40.495861863086645


In [137]:
# Random Forest RMSE on the held-out test set.
test_pred = rnd_reg.predict(X=test_x)

test_mse = mean_squared_error(y_true=test_y, y_pred=test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))

Test RMSE: 62.05750335725834


# Results:

In [141]:
import pandas as pd
from sklearn.metrics import mean_squared_error

# Fitted estimators from the sections above, keyed by the label shown in the summary.
models = {
    'Dummy Regressor': dummy_regr,
    'Linear Regression': linear_regr,
    'SVM Regression': svm_reg,
    'Decision Tree': tree_reg,
    'Decision Tree 2': tree_reg2,
    'Random Forest': rnd_reg
}

# Column-oriented accumulator for the summary table.
results = {
    'Model': [],
    'Train RMSE': [],
    'Test RMSE': []
}

for model_name, model in models.items():
    # Predictions on train and test sets
    train_pred = model.predict(train_x)
    test_pred = model.predict(test_x)

    # RMSE = sqrt(MSE). This is the same formula the per-model cells use, and it avoids
    # the `squared=False` keyword, which scikit-learn deprecated in 1.4 and scheduled
    # for removal in 1.6.
    train_rmse = np.sqrt(mean_squared_error(train_y, train_pred))
    test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))

    # Append results
    results['Model'].append(model_name)
    results['Train RMSE'].append(train_rmse)
    results['Test RMSE'].append(test_rmse)

# One row per model, in the order the models were defined above.
results_df = pd.DataFrame(results)
print("Model Performance Summary:")
print(results_df)




Model Performance Summary:
               Model  Train RMSE   Test RMSE
0    Dummy Regressor  102.498478  105.360018
1  Linear Regression   62.351548   65.199629
2     SVM Regression   65.692193   68.489601
3      Decision Tree   43.671685   77.128375
4    Decision Tree 2   65.291453   74.021324
5      Random Forest   40.495862   62.057503




# Analysis:

#### Dummy Regressor: This model, which predicts the mean price for all instances, has the highest RMSE for both training (102.5) and test (105.4) data. It serves as a baseline, so all other models should ideally perform better than this.

#### Linear Regression: This model improves significantly over the Dummy Regressor with a Train RMSE of 62.4 and Test RMSE of 65.2. It indicates that Linear Regression is capturing patterns in the data fairly well.

#### SVM Regression: The SVM model has similar RMSE values to Linear Regression, with a slightly higher Test RMSE (68.5). This suggests it's performing comparably but has a bit more error when predicting unseen data.

#### Decision Tree: This model achieves the second-lowest Train RMSE (43.7, behind only Random Forest's 40.5), meaning it fits the training data very closely. However, its Test RMSE (77.1) is the highest of all non-baseline models, indicating overfitting—it performs well on training data but not as well on new data.

#### Decision Tree 2: This version of the Decision Tree has a slightly better balance, with Train RMSE at 65.3 and Test RMSE at 74.0. It’s less overfitted than the first Decision Tree but still not ideal.

#### Random Forest: This model achieves the best results overall, with the lowest Test RMSE (62.1) and the lowest Train RMSE (40.5). The gap between the two shows that it still overfits to some degree, but averaging many trees lets it capture complex patterns while generalizing better than a single Decision Tree.


# Conclusion:

#### Among all models, Random Forest achieves the lowest test error, making it the most effective model here for predicting prices on unseen data, even though its training error is noticeably lower than its test error. Linear Regression is a close second and has the smallest gap between training and test performance. The single Decision Trees either fit too closely to the training data or didn’t generalize as well on the test set.