In [2]:
# Importing necessary libraries
import re

import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning models from sklearn
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
import xgboost as xgb

# Model selection and evaluation metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# Load the processed listing details and keep only the rows that have a price.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
listings_csv = "/Users/neeraj/Documents/Portfolio_project/Airbnb-Price-Predictor/data/processed/1.Listing_details_1.csv"
df = pd.read_csv(listings_csv).dropna(subset=['price'])

In [4]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3329 entries, 0 to 3456
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            3329 non-null   float64
 1   listing_url                                   3329 non-null   object 
 2   scrape_id                                     3329 non-null   int64  
 3   last_scraped                                  3329 non-null   object 
 4   source                                        3329 non-null   object 
 5   name                                          3329 non-null   object 
 6   description                                   0 non-null      float64
 7   neighborhood_overview                         2164 non-null   object 
 8   picture_url                                   3329 non-null   object 
 9   host_id                                       3329 non-null   int64 

Remove the irrelevant columns used for scraping and retain only the relevant ones in the dataset. This will help avoid unnecessary processing.

In [5]:

# Columns removed before analysis (scrape bookkeeping, URLs, coordinates and
# other fields this notebook does not analyse). Each label is listed once.
columns_to_drop = [
    "listing_url",
    "scrape_id",
    "last_scraped",
    "source",
    "description",
    "picture_url",
    "host_url",
    "host_thumbnail_url",
    "host_picture_url",
    "longitude",
    "latitude",
    "amenities",
    "calendar_last_scraped",
    "bathrooms",
    "bedrooms",
    "calendar_updated",
    "license",
    "host_about",
]

# drop() returns a new frame, so the raw `df` keeps every original column.
bnb_listing = df.drop(columns=columns_to_drop)

Grouping columns into relevant groups to make it easier to process the data.

In [6]:
# Thematic groups of column names, used to reason about the data one topic at a time.

# The overview column is spelt 'neighborhood_overview' in the dataset (see the df.info() output).
property_unique = ['id', 'name', 'neighborhood_overview']

neighbourhood = ['neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed']

# 'host_about' is deliberately absent: it is dropped when bnb_listing is built,
# so selecting it from bnb_listing would raise a KeyError.
host = [
    "host_id", "host_name", "host_since", "host_location",
    "host_response_time", "host_response_rate", "host_acceptance_rate",
    "host_is_superhost", "host_neighbourhood", "host_listings_count",
    "host_total_listings_count", "host_verifications",
    "host_has_profile_pic", "host_identity_verified",
]

property_details = ["property_type", "room_type", "accommodates", "bathrooms_text", "beds"]

price = ["price"]

future_bookings = [
    'minimum_nights', 'maximum_nights',
    'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    'has_availability',
    'availability_30', 'availability_60', 'availability_90', 'availability_365',
]

reviews = [
    "number_of_reviews", "number_of_reviews_ltm", "number_of_reviews_l30d",
    "first_review", "last_review",
    "review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness",
    "review_scores_checkin", "review_scores_communication", "review_scores_location",
    "review_scores_value", "reviews_per_month",
]

others = [
    "instant_bookable",
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
]

Trying to understand the importance of each variable, including unique and null values, as well as their relationships with other important variables

In [7]:
# Count the distinct listing ids to check whether any listing appears more than once.
listing_ids = bnb_listing['id']
unique_property_count = listing_ids.nunique()
print("Number of unique listings based on the 'id' column:", unique_property_count)

Number of unique listings based on the 'id' column: 3329


In [None]:
# Count the distinct hosts behind the listings.
host_ids = bnb_listing['host_id']
unique_host_count = host_ids.nunique()
print("Number of unique host based on the 'host_id' column:", unique_host_count)

In [None]:
# Listings per host: a Series indexed by host_id whose values are listing counts.
listings_by_host = bnb_listing.groupby('host_id')
total_listing = listings_by_host['id'].count()

host_axis = total_listing.index
count_axis = total_listing.values
sns.relplot(x=host_axis, y=count_axis, kind='scatter')

plt.title('Number of Listings per Host')
plt.xlabel('Host ID')
plt.ylabel('Count of Listings')
plt.xticks(rotation=90)  # vertical tick labels so long host ids do not overlap
plt.show()


So all listings are unique: the number of distinct ids (3329) equals the number of rows.

In [None]:
# Scatter of price against overall rating, restricted to listings with at least one review.
reviewed = bnb_listing[bnb_listing["number_of_reviews"] > 0]

# `price` is only converted to a number in a later cell; parse it locally so the
# x-axis is a real numeric scale rather than one category per distinct price string.
# The cast to str keeps this working if the column has already been converted.
reviewed_price = pd.to_numeric(
    reviewed['price'].astype(str).str.replace('[^0-9.]', '', regex=True),
    errors='coerce',
)

sns.relplot(x=reviewed_price, y=reviewed['review_scores_rating'], kind='scatter')
plt.xlabel('price')
plt.ylabel('review_scores_rating')
plt.xticks(rotation=90)
plt.show()


Need to bucketize price to get better clarity; also, rating does not appear to be directly related to price, as can be seen from the data points above.

In [None]:
# Show the distinct host locations, then rank the locations by number of listings.
host_locations = bnb_listing['host_location']
print(host_locations.unique())

by_country = bnb_listing.groupby('host_location')['id'].count()
ranked_locations = by_country.sort_values(ascending=False)
print(ranked_locations)

People from outside Singapore are also running Airbnb listings in Singapore. Top of the list are hosts from India, Indonesia and France.

Ignoring host_about for now; later we could run NLP on it to see whether anything hosts mention there correlates with price.

In [None]:
# host_response_time is categorical and will need one-hot encoding before modelling.
# It may influence ratings, so plot rating against response time first.
response_times = bnb_listing['host_response_time']
print(response_times.unique())

# Fastest to slowest, so the x-axis reads in a meaningful order.
Category_order = [
    'within an hour',
    'within a few hours',
    'within a day',
    'a few days or more',
]
sns.catplot(
    data=bnb_listing,
    x='host_response_time',
    y='review_scores_rating',
    hue='host_response_time',
    order=Category_order,
)
plt.xticks(rotation=45)
plt.show()

Response time does have an impact on the overall rating, which might in turn influence future bookings. We will explore this data set further.

In [None]:
# Average overall rating for each host response rate.
# host_response_rate will also need encoding before it can be used as a model feature.
print(bnb_listing['host_response_rate'].unique())

# Parse the rate into a number so the groups sort numerically instead of as text.
# NOTE(review): assumes values look like '95%' — confirm against the unique values printed above.
response_rate_pct = pd.to_numeric(
    bnb_listing['host_response_rate'].astype(str).str.rstrip('%'),
    errors='coerce',
)
avg_reponse_rating = bnb_listing.groupby(response_rate_pct)['review_scores_rating'].mean()
print(avg_reponse_rating)

# Plot the whole series: y must be the array of mean ratings, not a single element.
rate_plot = sns.relplot(x=avg_reponse_rating.index, y=avg_reponse_rating.values, kind='line')
rate_plot.set_axis_labels('host_response_rate (%)', 'mean review_scores_rating')

# Zoom in on the upper part of the rating scale.
rate_plot.ax.set_ylim(4.5, None)

In [None]:
# Peek at the first few host acceptance rate values.
bnb_listing['host_acceptance_rate'].head()

In [None]:
# Number of listings for each host_is_superhost value.
bnb_listing.groupby('host_is_superhost')['id'].count()

Let's try to understand the correlation between price and property features, we have these as property features -  
 21  property_type   22  room_type  23  accommodates   25  bathrooms_text 27  beds 28  price

In [18]:
# Convert 'price' to a number by stripping every character that is not a digit or a dot.
# Casting to str first makes the cell safe to re-run: once the column is numeric,
# the .str accessor would otherwise raise.
bnb_listing['price'] = pd.to_numeric(
    bnb_listing['price'].astype(str).str.replace('[^0-9.]', '', regex=True)
)

In [19]:
# Ensure the price column is stored as float, in one vectorised cast
# instead of a Python-level loop over every row.
bnb_listing['price'] = bnb_listing['price'].astype(float)


In [None]:
# Number of listings per property type, most common first.
bnb_listing['property_type'].unique()  # evaluated but not shown: only a cell's last expression is echoed

by_property_type = bnb_listing.groupby('property_type')
total_property = by_property_type['property_type'].count()
ranked_property = total_property.sort_values(ascending=False)
print(ranked_property)


In [None]:
# Mean price and mean overall rating for each property type.
total_property_price = (
    bnb_listing
    .groupby('property_type')[['price', 'review_scores_rating']]
    .mean()
)

by_price_desc = total_property_price.sort_values(by='price', ascending=False)
print(by_price_desc)

# One point per property type: average price against average rating.
sns.relplot(data=total_property_price, x='price', y='review_scores_rating')

There is no clear correlation between price and rating; a higher price doesn't mean a higher rating.

In [None]:
# Number of listings for each room type.
bnb_listing['room_type'].unique()  # evaluated but not shown: only a cell's last expression is echoed

room_type_summary = bnb_listing.groupby('room_type')['room_type'].agg(room_count='count')
print(room_type_summary)

In [None]:
# Count listings for every property_type / room_type combination.
# The previous call used 'property_type' as both the row index and the aggregated
# value and was flagged as erroring; counting the always-present 'id' column gives
# the intended table. Combinations that never occur are shown as 0.
bnb_listing.pivot_table(
    values='id',
    index='property_type',
    columns='room_type',
    aggfunc='count',
    fill_value=0,
)

In [None]:
sns.set_style('darkgrid')

# Listing count and average price for each guest capacity, most expensive first.
accomodates_count = (
    bnb_listing
    .groupby('accommodates')['price']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'number_of_property', 'mean': 'avg_price'})
    .sort_values('avg_price', ascending=False)
)

# Keep only capacities backed by more than 30 listings so that a handful of
# unusual properties does not dominate the price trend.
accomodates_count_30 = accomodates_count[accomodates_count['number_of_property'] > 30]

# marker='o' draws a point at every capacity. The earlier palette argument is
# omitted because it has no effect without a hue variable.
sns.relplot(x=accomodates_count.index, y='avg_price', data=accomodates_count, kind='line', marker='o')
sns.relplot(x=accomodates_count_30.index, y='avg_price', data=accomodates_count_30, kind='line', marker='o')

The average price increases as the number of people a property can accommodate increases. It peaks at 7 and then decreases, which might be because listings for 8 or more guests are mostly shared rooms, more like dorms.

In [None]:
# Distinct free-text bathroom descriptions present in the data.
bnb_listing['bathrooms_text'].unique()

In [None]:
# Number of listings for each bed count.
bnb_listing['beds'].unique()  # evaluated but not shown: only a cell's last expression is echoed

bed_groups = bnb_listing.groupby('beds')['beds'].count()
print(bed_groups)

In [None]:
# 'host_about' is dropped when bnb_listing is built, so read it from the raw frame,
# which still carries every original column.
df['host_about'].head()

In [None]:
# Property features plus the target, as selected for this correlation check.
bnb = bnb_listing[['property_type', 'room_type', 'accommodates', 'bathrooms_text', 'beds','price']]

# Correlate the selected property features (not the raw `df`), restricted to the
# numeric columns because correlation is undefined for text columns.
correlation_matrix = bnb.select_dtypes(include='number').corr()

# Create the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()


Let's try to understand the neighbourhood's impact on the price of the rooms

 17  neighbourhood                                 
 18  neighbourhood_cleansed                        
 19  neighbourhood_group_cleansed                

In [None]:
# Number of listings with no value in the free-text 'neighbourhood' column.
bnb_listing['neighbourhood'].isna().sum()

As a high percentage of this column's values are missing, we will drop this column for model development.

In [None]:
# Upper edge of every price bucket; the last bucket is open-ended.
bin_edges = [0, 50, 100, 150, 200, 500, 1000, float('inf')]

# One label per bucket. The labels are written as inclusive upper bounds
# ('0-50', '51-100', ...), so the bins below must be closed on the right.
bin_labels = ['0-50', '51-100', '101-150', '151-200', '201-500', '501-1000', '1000+']

# right=True puts a price of exactly 50 into '0-50' (and 100 into '51-100', ...),
# matching the labels; include_lowest=True keeps a price of 0 in the first bucket.
bnb_listing['price_category'] = pd.cut(
    bnb_listing['price'],
    bins=bin_edges,
    labels=bin_labels,
    right=True,
    include_lowest=True,
)

bnb_listing['price_category'].unique()

In [None]:
bnb_listing['neighbourhood_cleansed']
bnb_listing.groupby('neighbourhood_cleansed')['neighbourhood_cleansed'].count()
bnb_listing.pivot_table(values='price', 
                        index='neighbourhood_cleansed',
                        columns='accommodates', 
                        aggfunc=lambda x: np.round(np.mean(x)),
                        fill_value=0)

In [None]:
# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(pivot_table_rounded, cmap='coolwarm', annot=True, fmt='d', linewidths=0.5)
plt.title('Average Price by Neighbourhood and # of Accomodates')
plt.xlabel('Accomdates')
plt.ylabel('Neighbourhood')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Distinct values of the coarser neighbourhood grouping column.
bnb_listing['neighbourhood_group_cleansed'].unique()

We will develop three models; the first models price based on the property features and reviews.


1. Model price based on the property features first and also the reviews provided
DATA CLEANING

In [33]:
# Candidate features for the first model: location, property details and review statistics.
feature_columns = [
    'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'property_type',
    'room_type', 'accommodates', 'bathrooms_text', 'beds',
    'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'reviews_per_month',
]

# Take an explicit copy: later cells add and overwrite columns on x1_model, and
# doing that on a slice of bnb_listing triggers SettingWithCopyWarning.
x1_model = bnb_listing[feature_columns].copy()

# Target variable.
y = bnb_listing['price']

In [None]:
# Sanity check: confirm the column selection produced a DataFrame.
print(type(x1_model))

In [None]:
# Number of listings for each distinct bathroom description.
x1_model['bathrooms_text'].unique()  # evaluated but not shown: only a cell's last expression is echoed

by_bath_text = x1_model.groupby('bathrooms_text')
baths = by_bath_text['bathrooms_text'].count()
print(baths)

From this we can do feature engineering, and identify the shared bath and number of bath, that we can impute the missing value or nan values.
- Half bath or full bath can be one columns
- Number of baths can  be another
- Private or shared can be another

Our assumption is that where only "bath" is written, those are private full baths.

In [None]:
# Derive three numeric features from the free-text bathroom description:
#   num_bathrooms - number of bathrooms mentioned in the text
#   Shared_bath   - 1 when the text says the bathroom is shared, else 0
#   Half_bath     - 1 when the text mentions a half bath, else 0

# Cast to str so missing values become the literal 'nan' and the string methods
# below never meet a float.
bath_text = x1_model['bathrooms_text'].astype(str)
x1_model['bathrooms_text'] = bath_text

# Match case-insensitively so that e.g. 'Shared' and 'shared' are treated alike.
bath_text_lower = bath_text.str.lower()
mentions_half = bath_text_lower.str.contains('half', regex=False)
mentions_shared = bath_text_lower.str.contains('shared', regex=False)

# First integer or decimal number in the text, if any.
bath_count = pd.to_numeric(
    bath_text.str.extract(r'(\d+(?:\.\d+)?)', expand=False),
    errors='coerce',
)

# Text without a number: count a half bath as 0.5, anything else (including
# missing descriptions) as one bathroom.
# NOTE(review): presumably covers values such as 'Half-bath' — confirm against the
# distinct bathrooms_text values.
default_count = mentions_half.map({True: 0.5, False: 1.0})
x1_model['num_bathrooms'] = bath_count.fillna(default_count)

# The flags are 1 when the property named by the column is present.
x1_model['Shared_bath'] = mentions_shared.astype(int)
x1_model['Half_bath'] = mentions_half.astype(int)



In [None]:
# Number of listings for each bathroom count, as a flat two-column frame for plotting.
bath_details = (
    x1_model
    .groupby('num_bathrooms')
    .agg(numbers_bath=('num_bathrooms', 'count'))
    .reset_index()
)

sns.barplot(data=bath_details, x='num_bathrooms', y='numbers_bath')
plt.title('Distribution of Properties by Number of Bathrooms')
plt.xlabel('Number of Bathrooms')
plt.ylabel('Count')
plt.show()


In most of the properties only one bathroom is there

In [None]:
# Scatter of bed count against bathroom count; semi-transparent points reveal overlap.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x1_model['beds'], x1_model['num_bathrooms'], alpha=0.5)
ax.set_title('Relationship between Beds and Bathrooms')
ax.set_xlabel('Number of Beds')
ax.set_ylabel('Number of Bathrooms')
plt.show()


In [None]:
# Correlation between the number of beds and the number of bathrooms.
beds_baths_corr = x1_model[['beds', 'num_bathrooms']].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(beds_baths_corr, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap between Beds and Bathrooms')
plt.show()


In [42]:
# Replacing null beds with median values
for lab, row in x1_model.iterrows():
    if pd.isnull(row['beds']):
        x1_model.at[lab, 'beds'] = x1_model['beds'].mode()[0]


In [None]:
# Listings that were never rated have no review scores. We assume a rating starts
# at a neutral 3 and moves up or down with guest experience, so missing scores are
# filled with 3.
columns_to_fill = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                   'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
                   'review_scores_value', 'reviews_per_month']

# reviews_per_month is a rate, not a rating: a missing value is filled with 0
# rather than with the neutral score.
fill_values = {column: 3 for column in columns_to_fill}
fill_values['reviews_per_month'] = 0

x1_model[columns_to_fill] = x1_model[columns_to_fill].fillna(value=fill_values)


In [44]:
# Keep only listings that have at least one review. Copy the result because later
# cells add columns to it, and assigning into a filtered slice triggers
# SettingWithCopyWarning.
x1_model_1500 = x1_model[x1_model['number_of_reviews'] != 0].copy()

In [None]:
# Remaining missing values per column in the reviewed-listings subset.
x1_model_1500.isna().sum()

In [None]:
# Distinct property types among the reviewed listings.
x1_model_1500['property_type'].unique()

In [None]:
# Distinct room types in the full feature frame.
x1_model['room_type'].unique()

In [None]:
# Listings per (property_type, room_type) pair, most common pairs first.
room_new_type = (
    x1_model
    .groupby(['property_type', 'room_type'])['room_type']
    .agg(count='count')
)
ranked_pairs = room_new_type.sort_values(by='count', ascending=False)
print(ranked_pairs)


In [56]:
# Property categories to look for inside the free-text property_type column.
property_types = ['rental unit', 'serviced apartment', 'condo', 'hotel', 'home', 'boutique hotel',
                  'townhouse', 'aparthotel', 'hostel', 'bungalow', 'bed and breakfast', 'guesthouse', 'villa']

# Lower-case once so the substring matching is case-insensitive.
property_text = x1_model_1500['property_type'].astype(str).str.lower()

# Give every listing at most one category. Names are tried from longest to shortest
# so that 'boutique hotel' and 'aparthotel' listings are not also flagged as 'hotel'.
unassigned = pd.Series(True, index=x1_model_1500.index)
type_flags = {}
for prop_type in sorted(property_types, key=len, reverse=True):
    matched = property_text.str.contains(prop_type, regex=False) & unassigned
    type_flags[prop_type] = matched.astype(int).to_numpy()
    unassigned &= ~matched

# Add the indicator columns in the original list order. Listings that match none of
# the categories end up with 0 in every one of these columns.
for prop_type in property_types:
    x1_model_1500[prop_type] = type_flags[prop_type]



In [None]:
# Number of property-type indicator columns set for each listing.
sum_of_columns = x1_model_1500[property_types].sum(axis=1)

# 'other' is 1 exactly when no listed property type matched. It is written to
# x1_model_1500 (the frame that holds the indicator columns) for every row, so
# the column never contains NaN or negative values.
x1_model_1500['other'] = (sum_of_columns == 0).astype(int)

In [None]:
# Show the raw property type next to its indicator columns for a quick visual check.
display_columns = ['property_type', *property_types, 'other']
x1_model_1500[display_columns].head()

One-hot encoding the categorical columns

In [None]:
# Distinct neighbourhoods among the reviewed listings (these become dummy columns).
x1_model_1500['neighbourhood_cleansed'].unique()

In [53]:
# One-hot encode the categorical location and room-type columns as 0/1 integers.
categorical_columns = ['neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'room_type']
categorical_frame = x1_model_1500[categorical_columns]
x1_model_1500_dummy = pd.get_dummies(categorical_frame).astype(int)

In [None]:
# Numeric and engineered feature columns taken directly from x1_model_1500.
x1_col1 = [
    'accommodates', 'beds',
    'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'reviews_per_month',
    'num_bathrooms', 'Shared_bath', 'Half_bath',
    'rental unit', 'serviced apartment', 'condo', 'hotel', 'home',
    'boutique hotel', 'townhouse', 'aparthotel', 'hostel', 'bungalow',
    'bed and breakfast', 'guesthouse', 'villa', 'other',
]

# Every one-hot column produced for the categorical features.
x2_col2 = x1_model_1500_dummy.columns

# Place the two feature sets side by side to form the final design matrix.
feature_parts = [x1_model_1500[x1_col1], x1_model_1500_dummy[x2_col2]]
x1_final_1500 = pd.concat(feature_parts, axis=1)


In [None]:
# Target for the reviewed listings. Read it from bnb_listing, where price has been
# converted to a number; the raw `df` still holds the original price text.
y_final_1500 = bnb_listing.loc[x1_model_1500.index, 'price']

In [None]:
# Target prices for the listings that have at least one review.
has_reviews = x1_model['number_of_reviews'] != 0
listing_prices = bnb_listing['price']
y_final_1500 = listing_prices[has_reviews]
y_final_1500.head()

In [None]:
# Features and target must have the same number of rows before splitting.
for frame_name, frame in (("x1_final_1500", x1_final_1500), ("y_final_1500", y_final_1500)):
    print(f"Shape of {frame_name}:", frame.shape)

In [None]:
# Split into train and test sets: 20% of the rows are held out for testing, and the
# fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(x1_final_1500, y_final_1500, test_size=0.2, random_state=21)

In [None]:
# Tree-based regressors to compare. Every model is seeded (with the same value as
# the train/test split) so the reported errors are reproducible between runs.
models = {
    'Decision Tree Regression': DecisionTreeRegressor(random_state=21),
    'Random Forest Regression': RandomForestRegressor(random_state=21),
    'Gradient Boosting Regression': GradientBoostingRegressor(random_state=21),
    'AdaBoost Regression': AdaBoostRegressor(random_state=21),
    'xgboost': xgb.XGBRegressor(random_state=21)
}

In [None]:
# Make sure the target is numeric. y_final_1500 is a Series, so it is converted
# directly rather than through a 'price' key; casting to str first keeps this safe
# whether the values are still text or already numbers.
# NOTE(review): this cell sits after the train/test split, so y_train / y_test only
# pick up a change made here if the split cell is re-run.
y_final_1500 = pd.to_numeric(
    y_final_1500.astype(str).str.replace('[^0-9.]', '', regex=True)
)

In [None]:
# Training-set error: fit every model and score it on the data it was fitted on.
# This measures how closely each model can fit the data, not how well it generalises.
# (mean_absolute_error is already imported in the notebook's import cell.)
model_mae = {}

for label, regressor in models.items():
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_train)
    mae = mean_absolute_error(y_train, predictions)
    model_mae[label] = mae

# Print model MAE
for model, mae in model_mae.items():
    print(f"{model}: MAE={mae}")


Decision tree is coming out to be the best model, at least on the training data set. Let's test the models on the test data set.

In [None]:
# Test-set error: fit every model on the training split and score it on the
# held-out split. Iterates over `models`, the dictionary of regressors defined above.
# (mean_absolute_error is already imported in the notebook's import cell.)
model_mae = {}

for label, regressor in models.items():
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    model_mae[label] = mae

# Print model MAE
for model, mae in model_mae.items():
    print(f"{model}: MAE={mae}")


Let's run decision tree independently and try to understand which factor are important

In [None]:
# Fit a stand-alone decision tree so its predictions and feature importances can be
# inspected. It is seeded with the same value as the train/test split so the
# metrics below are reproducible.
DT_model = DecisionTreeRegressor(random_state=21)
DT_model.fit(X_train, y_train)
pred = DT_model.predict(X_test)

# Error metrics on the held-out test split.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
r_squared = r2_score(y_test, pred)
mae = mean_absolute_error(y_test, pred)

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r_squared)
print("Mean Absolute Error (MAE):", mae)


In [None]:
# Pair every actual test price with the decision tree's prediction for it.
# `pred` holds the stand-alone decision tree's test predictions; `predictions` is
# left over from the model-comparison loop and belongs to whichever model ran last.
results_df = pd.DataFrame({
    'y_test': y_test.values.flatten(),  # Convert y_test to a 1D array
    'predictions': pred
})

# Write the DataFrame to a CSV file
results_df.to_csv('predictions.csv', index=False)


In [None]:
# Importance the fitted decision tree assigns to each training column.
importances = DT_model.feature_importances_

# One row per feature, most important first.
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)


In [None]:
# Ten most important features according to the decision tree.
feature_importance_df.head(10)

Explore the sample size to understand whether we have enough samples to train the model

In [None]:
# Preview the reviewed-listings feature frame.
x1_model_1500.head()