In [39]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning models for regression from sklearn
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, 
                              AdaBoostRegressor)
import xgboost as xgb

# Model selection and evaluation metrics for regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score)

# Machine Learning models for classification from sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, 
                              AdaBoostClassifier)
from xgboost import XGBClassifier

# Model selection and evaluation metrics for classification
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix)
from sklearn.model_selection import GridSearchCV

# Preprocessing libraries
from sklearn.preprocessing import LabelEncoder


In [3]:
#Importing dataset 
df = pd.read_csv("/Users/neeraj/Documents/Portfolio_project/Airbnb-Price-Predictor/data/processed/1.Listing_details_1.csv")
df = df[df['price'].notna()]

We are using these parameters for modeling because they directly capture the key aspects of a listing that impact pricing:

1. Location-based features : Location is a primary driver of price, reflecting proximity to popular areas and local demand.
2. Property characteristics : These features define the type, size, and capacity of the listing, which naturally influence the price.
3. Review metrics : These aggregate ratings and review numbers capture the overall quality of the listing, including host performance (like responsiveness), which we assume is already reflected in the ratings.

We are excluding factors like host details and individual review parameters since they are already encapsulated in the review scores and broader ratings, thus avoiding redundant information. Occupancy is not considered, as we're focusing on factors more directly tied to pricing decisions rather than seasonal or demand-based variables.

In [4]:
model_data = df[['neighbourhood_group_cleansed', 'property_type', 'room_type', 'accommodates', 'bathrooms_text', 'beds', 'number_of_reviews', 'review_scores_rating', 'reviews_per_month','price']]

In [7]:
# Filter out rows where 'review_scores_rating' is NaN (i.e., no rating)
model_data_n = model_data[pd.notna(model_data['review_scores_rating'])].copy()  # Use .copy() to avoid SettingWithCopyWarning

# Replace blank values in 'beds' with values from 'accommodates'
# Use .loc to avoid SettingWithCopyWarning
model_data_n.loc[model_data_n['beds'] == '', 'beds'] = np.nan  # Convert blank strings to NaN
model_data_n.loc[:, 'beds'] = model_data_n['beds'].fillna(model_data_n['accommodates'])  # Fill NaNs in 'beds' with values from 'accommodates'

Let's extract the main features from these columns, examples exact bathroom needs to be extractged from the bathroom columns

In [8]:
# Convert 'price' column to numeric after removing non-numeric characters
model_data_n['price'] = pd.to_numeric(model_data_n['price'].str.replace('[^0-9.]', '', regex=True))

# Converting price column to float
for lab, row in model_data_n.iterrows():
    model_data_n.at[lab, 'price'] = float(row['price'])

In [None]:
# Converting pricing to bins

bin_edges = [0, 50, 100, 150, 200, 500, 1000, float('inf')] 
bin_labels = ['0-50', '51-100', '101-150', '151-200', '201-500', '501-1000', '1000+']

# Create the 'price_category' column using pd.cut()
model_data_n['price_category'] = pd.cut(model_data_n['price'], bins=bin_edges, labels=bin_labels, right=False)

# Get unique price categories
unique_price_categories = model_data_n['price_category'].unique()
price_category_counts = model_data_n['price_category'].value_counts().sort_index()

# Print the unique price categories and their counts
print("Unique Price Categories:")
print(unique_price_categories)

print("\nCount of Properties in Each Price Category:")
print(price_category_counts)

In [10]:
# List of property types to check for
property_types = ['rental unit', 'serviced apartment', 'condo', 'hotel', 'home', 'boutique hotel',
                  'townhouse', 'aparthotel', 'hostel', 'bungalow', 'bed and breakfast', 'guesthouse', 'villa']

# Initialize the 'other' column in model_data_n
model_data_n['other'] = 0

# Iterate over the list of property types
for prop_type in property_types:
    # Create a new column for each property type
    model_data_n[prop_type] = model_data_n['property_type'].str.lower().str.contains(prop_type.lower()).astype(int)

# Calculate the sum of the property type columns
model_data_n['sum_of_columns'] = model_data_n[property_types].sum(axis=1)

# Mark the 'other' column based on the sum of columns
model_data_n.loc[model_data_n['sum_of_columns'] == 0, 'other'] = 1


In [None]:
import re

# Convert column to string type
model_data_n['bathrooms_text'] = model_data_n['bathrooms_text'].astype(str)

# Initialize new columns
model_data_n['num_bathrooms'] = 0
model_data_n['Shared_bath'] = 0
model_data_n['Half_bath'] = 0

# Iterate over the Series
for lab, row in model_data_n['bathrooms_text'].items():
    # Use regular expression to extract the number of bathrooms from the string
    match = re.search(r'(\d+(\.\d+)?)', row)
    if match:
        model_data_n.loc[lab, 'num_bathrooms'] = float(match.group(1))
    else:
        model_data_n.loc[lab, 'num_bathrooms'] = 1

    # Determine if the bathroom is shared
    model_data_n.loc[lab, 'Shared_bath'] = 0 if 'shared' in row.lower() else 1

    # Determine if there is a half-bath
    model_data_n.loc[lab, 'Half_bath'] = 0 if 'half' in row.lower() else 1


In [None]:
model_data_n.head()

In [14]:
# Perform one-hot encoding for 'neighbourhood_group_cleansed' and 'room_type'
model_data_n_encoded = pd.get_dummies(model_data_n, columns=['neighbourhood_group_cleansed', 'room_type'], drop_first=True)


In [None]:
# Drop the specified columns from the DataFrame
columns_to_drop = ['property_type', 'bathrooms_text', 'sum_of_columns', 'other']
model_data_n_encoded = model_data_n_encoded.drop(columns=columns_to_drop)

In [None]:
model_data_n_encoded.head()

In [None]:

label_encoder = LabelEncoder()
model_data_n_encoded['price_category_encoded'] = label_encoder.fit_transform(model_data_n_encoded['price_category'])
print(model_data_n_encoded[['price_category', 'price_category_encoded']].head())

In [26]:
# Drop the specified columns from the DataFrame
columns_to_drop = ['price_category']
model_data_n_encoded = model_data_n_encoded.drop(columns=columns_to_drop)

In [30]:
model_data_n_encoded = model_data_n_encoded.astype(int)
model_data_n_encoded.head()

Unnamed: 0,accommodates,beds,number_of_reviews,review_scores_rating,reviews_per_month,price,rental unit,serviced apartment,condo,hotel,...,Shared_bath,Half_bath,neighbourhood_group_cleansed_East Region,neighbourhood_group_cleansed_North Region,neighbourhood_group_cleansed_North-East Region,neighbourhood_group_cleansed_West Region,room_type_Hotel room,room_type_Private room,room_type_Shared room,price_category_encoded
0,3,3,19,4,0,150,0,0,0,0,...,1,1,1,0,0,0,0,1,0,3
1,1,1,24,4,0,80,0,0,0,0,...,0,0,1,0,0,0,0,1,0,6
2,2,2,46,4,0,80,0,0,0,0,...,0,0,1,0,0,0,0,1,0,6
3,1,1,20,4,0,64,1,0,0,0,...,0,1,0,0,0,0,0,1,0,6
4,1,1,16,4,0,78,1,0,0,0,...,0,1,0,0,0,0,0,1,0,6


In [31]:
X = model_data_n_encoded.drop(['price', 'price_category_encoded'], axis=1)
y = model_data_n_encoded['price_category_encoded']

In [35]:
# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

# --- Model Initialization ---
models = {
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifier': RandomForestClassifier(),
    'Gradient Boosting Classifier': GradientBoostingClassifier(),
    'AdaBoost Classifier': AdaBoostClassifier(),
    'XGBoost Classifier': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}


In [37]:
# --- Training and Evaluation ---
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")

Decision Tree Classifier Accuracy: 0.5686
Random Forest Classifier Accuracy: 0.6359
Gradient Boosting Classifier Accuracy: 0.6190
AdaBoost Classifier Accuracy: 0.5070




XGBoost Classifier Accuracy: 0.6162


In [40]:
#Random forest is giving the best output, we will do the grid search to fine tune the hyperparameters

# Define the Random Forest model
model1 = RandomForestClassifier(random_state=21)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider at every split
    'max_depth': [None, 10, 20, 30],  # Maximum number of levels in tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model1, param_grid=param_grid,
                           scoring='accuracy', cv=5, n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-validation Score:", grid_search.best_score_)

# Use the best estimator to make predictions on the test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Evaluate the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Best Model Accuracy: {accuracy_best:.4f}")


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_

540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
154 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/neeraj/Documents/.conda/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/neeraj/Documents/.conda/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Users/neeraj/Documents/.conda/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/neeraj/Documents/.conda/lib/python3.12/site-packages/sklearn/utils/_param_validation.py

In [41]:
from joblib import dump

# Save the best model to a file
model_filename = 'best_random_forest_model.joblib'
dump(best_model, model_filename)
print(f'Model saved to {model_filename}')

Model saved to best_random_forest_model.joblib
