In [1]:
## Airbnb Bookings Data Analysis
#Exploring listing and bookings data to uncover factors that drive cancellations, revenue potential, and guest behavior.

## 1. Objective
#Initial EDA and visualizations to identify the drivers of cancellations, revenue, and guest behavior.

## 2. Data Overview
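#This data is publicly available on kaggle.com (file Airbnb_Open_Data.csv) and contains 102,599 listing records with 26 columns covering host details, location, room type, price and service fee, reviews, and availability.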

## 3. Cleaning & Preparation
#Missing data, formatting, value sanity checks.

## 4. Key Questions Explored
# What listing features correlate with high occupancy?
# Does cancellation policy impact pricing or reviews?
# How does location affect guest behavior?

## 5. Visual Insights
#Charts, grouped metrics, comparisons.

## 6. Observations & Takeaways
#Summary of business-relevant insights.


In [2]:
# importing necessary libraries
import pandas as pd
import numpy as np
import time
import glob
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from scipy import stats
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, silhouette_samples

# dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
#import umap

# clustering
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.mixture import GaussianMixture

In [3]:
## Part 1: Importing, Cleaning & EDA

### Importing

In [4]:
# Original df.
# The CSV location can be overridden with the AIRBNB_DATA_PATH environment variable so the
# notebook can run on another machine; when it is unset, the author's original local path is used.
import os
from pathlib import Path

DATA_PATH = Path(os.environ.get(
    'AIRBNB_DATA_PATH',
    r'C:\Users\riesn\Documents\vacation-rental-bookings-analysis\data\Airbnb_Open_Data.csv',
))
# low_memory=False makes pandas read the file in one pass rather than in chunks.
df = pd.read_csv(DATA_PATH, delimiter=',', encoding='utf-8', low_memory=False)


In [5]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102599 entries, 0 to 102598
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              102599 non-null  int64  
 1   NAME                            102349 non-null  object 
 2   host id                         102599 non-null  int64  
 3   host_identity_verified          102310 non-null  object 
 4   host name                       102193 non-null  object 
 5   neighbourhood group             102570 non-null  object 
 6   neighbourhood                   102583 non-null  object 
 7   lat                             102591 non-null  float64
 8   long                            102591 non-null  float64
 9   country                         102067 non-null  object 
 10  country code                    102468 non-null  object 
 11  instant_bookable                102494 non-null  object 
 12  cancellation_pol

In [6]:
# Remove pandas' display limits on columns and rows (None = unlimited) for the rest of the
# session, then preview the first five rows of the raw data.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df.head()

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,country,country code,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules,license
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,United States,US,False,strict,Private room,2020.0,$966,$193,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...,
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,United States,US,False,moderate,Entire home/apt,2007.0,$142,$28,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...,
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,,Elise,Manhattan,Harlem,40.80902,-73.9419,United States,US,True,flexible,Private room,2005.0,$620,$124,3.0,0.0,,,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and...",
3,1002755,,85098326012,unconfirmed,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,United States,US,True,moderate,Entire home/apt,2005.0,$368,$74,30.0,270.0,7/5/2019,4.64,4.0,1.0,322.0,,
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,United States,US,False,moderate,Entire home/apt,2009.0,$204,$41,10.0,9.0,11/19/2018,0.1,3.0,1.0,289.0,"Please no smoking in the house, porch or on th...",


In [7]:
# Cleaning price and service fee columns
def clean_currency(x):
    """Convert a currency-formatted value such as '$1,234' to a float.

    Parameters
    ----------
    x : scalar
        Raw cell value: a string like '$966', a plain number, or a missing value.

    Returns
    -------
    float
        The numeric amount, or NaN when ``x`` is missing or cannot be parsed.
    """
    if pd.isnull(x):
        return np.nan
    # Strip the dollar sign and thousands separators before parsing.
    text = str(x).replace('$', '').replace(',', '').strip()
    try:
        return float(text)
    except ValueError:
        # Unparseable text is treated as missing (not logged) so that a single
        # malformed cell does not abort cleaning of the whole column.
        return np.nan

# Convert the currency-formatted columns to floats, skipping any column that is absent
for col in ('price', 'service fee'):
    if col not in df.columns:
        continue
    cleaned_values = df[col].apply(clean_currency)
    df[col] = cleaned_values

# A missing reviews-per-month value is replaced with 0 (i.e., no reviews per month)
if 'reviews per month' in df.columns:
    reviews_filled = df['reviews per month'].fillna(0)
    df['reviews per month'] = reviews_filled

# Report the dtypes of the three cleaned columns
print('Data types after cleaning:')
print(df[['price', 'service fee', 'reviews per month']].dtypes)

# Optional: Drop or fill other missing values as needed
# For this analysis we proceed without dropping rows, but note that this may be adjusted in future analysis.

Data types after cleaning:
price                float64
service fee          float64
reviews per month    float64
dtype: object
