# Airbnb: New York Data Analysis

This analysis is for potential investors who are considering investing in short-term rental properties in New York City, New York, United States.


### Data Loading And Overview

In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

##Data loading and overview
calendar = pd.read_csv("calendar.csv")
listings = pd.read_csv("listings.csv")
reviews = pd.read_csv("reviews.csv")

#view first 5 rows
calendar.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,2595,2023-06-05,t,$240.00,$240.00,30.0,1125.0
1,2595,2023-06-06,t,$240.00,$240.00,30.0,1125.0
2,2595,2023-06-07,t,$240.00,$240.00,30.0,1125.0
3,2595,2023-06-08,t,$240.00,$240.00,30.0,1125.0
4,2595,2023-06-09,t,$240.00,$240.00,30.0,1125.0


In [2]:
#view first 5 rows
listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2595,https://www.airbnb.com/rooms/2595,20230605012709,2023-06-05,city scrape,Rental unit in New York · ★4.68 · Studio · 1 b...,"Beautiful, spacious skylit studio in the heart...",Centrally located in the heart of Manhattan ju...,https://a0.muscache.com/pictures/miso/Hosting-...,2845,...,4.8,4.81,4.4,,f,3,3,0,0,0.3
1,5121,https://www.airbnb.com/rooms/5121,20230605012709,2023-06-05,previous scrape,Rental unit in Brooklyn · ★4.52 · 1 bedroom · ...,One room available for rent in a 2 bedroom apt...,,https://a0.muscache.com/pictures/2090980c-b68e...,7356,...,4.91,4.47,4.52,,f,2,0,2,0,0.29
2,14991,https://www.airbnb.com/rooms/14991,20230605012709,2023-06-05,city scrape,Rental unit in New York · ★4.93 · 1 bedroom · ...,Room for rent in my Manhattan apartment. The a...,,https://a0.muscache.com/pictures/56919050/b99e...,59023,...,4.92,4.92,4.85,,f,1,0,1,0,0.14
3,5136,https://www.airbnb.com/rooms/5136,20230605012709,2023-06-05,city scrape,Rental unit in Brooklyn · ★5.0 · 2 bedrooms · ...,We welcome you to stay in our lovely 2 br dupl...,,https://a0.muscache.com/pictures/miso/Hosting-...,7378,...,5.0,4.67,5.0,,f,1,1,0,0,0.03
4,15341,https://www.airbnb.com/rooms/15341,20230605012709,2023-06-05,city scrape,Condo in New York · ★4.56 · 1 bedroom · 2 beds...,Greetings! <br /><br />Come relax here after y...,This 1-bedroom apartment is conveniently locat...,https://a0.muscache.com/pictures/16faf1ad-cb1f...,60049,...,4.8,4.71,4.34,,f,1,1,0,0,0.3


In [3]:
#view first 5 rows
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2595,17857,2009-11-21,50679,Jean,Notre séjour de trois nuits.\r<br/>Nous avons ...
1,2595,19176,2009-12-05,53267,Cate,Great experience.
2,2595,19760,2009-12-10,38960,Anita,I've stayed with my friend at the Midtown Cast...
3,2595,34320,2010-04-09,71130,Kai-Uwe,"We've been staying here for about 9 nights, en..."
4,2595,46312,2010-05-25,117113,Alicia,We had a wonderful stay at Jennifer's charming...


In [4]:
#look at the dataframe properties
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15898711 entries, 0 to 15898710
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   listing_id      int64  
 1   date            object 
 2   available       object 
 3   price           object 
 4   adjusted_price  object 
 5   minimum_nights  float64
 6   maximum_nights  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 849.1+ MB


In [5]:
#look at the dataframe properties
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43566 entries, 0 to 43565
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            43566 non-null  int64  
 1   listing_url                                   43566 non-null  object 
 2   scrape_id                                     43566 non-null  int64  
 3   last_scraped                                  43566 non-null  object 
 4   source                                        43566 non-null  object 
 5   name                                          43566 non-null  object 
 6   description                                   42866 non-null  object 
 7   neighborhood_overview                         25201 non-null  object 
 8   picture_url                                   43566 non-null  object 
 9   host_id                                       43566 non-null 

In [6]:
#look at the dataframe properties
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1148772 entries, 0 to 1148771
Data columns (total 6 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   listing_id     1148772 non-null  int64 
 1   id             1148772 non-null  int64 
 2   date           1148772 non-null  object
 3   reviewer_id    1148772 non-null  int64 
 4   reviewer_name  1148769 non-null  object
 5   comments       1148534 non-null  object
dtypes: int64(3), object(3)
memory usage: 52.6+ MB


### Identifying Missing Values, Data Cleaning and Converting Data Types

In [7]:
#check for missing data/null values
calendar.isnull().sum()

listing_id          0
date                0
available           0
price               0
adjusted_price      0
minimum_nights    375
maximum_nights    375
dtype: int64

In [8]:
#Check For duplicated rows
calendar.duplicated().sum()

0

In [9]:
# Removing '$' signs and ',' from 'price' and 'adjusted_price' columns
calendar['price'] = calendar['price'].str.replace('$', '').str.replace(',', '')
calendar['adjusted_price'] = calendar['adjusted_price'].str.replace('$', '').str.replace(',', '')

# Converting 'date' column to datetime data type
calendar['date'] = pd.to_datetime(calendar['date'])

# Converting 'price' and 'adjusted_price' columns to numeric data type
numeric_columns = ['price', 'adjusted_price']
calendar[numeric_columns] = calendar[numeric_columns].apply(pd.to_numeric, errors='coerce')

# The updated DataFrame with new data types
calendar


Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,2595,2023-06-05,t,240.0,240.0,30.0,1125.0
1,2595,2023-06-06,t,240.0,240.0,30.0,1125.0
2,2595,2023-06-07,t,240.0,240.0,30.0,1125.0
3,2595,2023-06-08,t,240.0,240.0,30.0,1125.0
4,2595,2023-06-09,t,240.0,240.0,30.0,1125.0
...,...,...,...,...,...,...,...
15898706,905109354552786380,2024-05-30,t,763.0,763.0,1.0,999.0
15898707,905109354552786380,2024-05-31,f,999.0,999.0,1.0,1.0
15898708,905109354552786380,2024-06-01,f,999.0,999.0,1.0,1.0
15898709,905109354552786380,2024-06-02,f,999.0,999.0,1.0,1.0


In [10]:
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15898711 entries, 0 to 15898710
Data columns (total 7 columns):
 #   Column          Dtype         
---  ------          -----         
 0   listing_id      int64         
 1   date            datetime64[ns]
 2   available       object        
 3   price           float64       
 4   adjusted_price  float64       
 5   minimum_nights  float64       
 6   maximum_nights  float64       
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 849.1+ MB


In [11]:
#Descriptive statistics
calendar.describe(include='all')

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
count,15898710.0,15898711,15898711,15898710.0,15898710.0,15898340.0,15898340.0
unique,,366,2,,,,
top,,2024-05-26 00:00:00,f,,,,
freq,,43559,9924527,,,,
first,,2023-06-05 00:00:00,,,,,
last,,2024-06-04 00:00:00,,,,,
mean,2.58276e+17,,,224.9185,224.4988,23.42719,1263115.0
std,3.605985e+17,,,559.4554,558.6297,47.8968,52032870.0
min,2595.0,,,0.0,0.0,1.0,1.0
25%,20213660.0,,,84.0,84.0,2.0,360.0
