In [None]:
#Data preparation

#Import the libraries used and read the Boston Airbnb listings data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#Read the Boston Airbnb listings data
df = pd.read_csv('./data/boston_airbnb/listings.csv')

#Remove columns where more than 50% of the values are missing
null_share = df.isnull().mean()
missing_cols = set(null_share.index[null_share > 0.5])
df = df.drop(columns=list(missing_cols))

#Strip the "$" sign and "," thousands separators from the price column, then convert to numeric.
#The raw-string pattern avoids the invalid "\$" escape sequence, and removing "," stops values
#that contain a thousands separator from being silently coerced to NaN by to_numeric.
#Anything that still cannot be parsed becomes NaN (errors='coerce').
df['price'] = df['price'].replace({r'[$,]': ''}, regex=True)
df['price'] = pd.to_numeric(df['price'], errors='coerce')

#Drop rows with null prices. dropna returns a new DataFrame rather than modifying df,
#so the result has to be assigned back for the rows to actually be removed.
df = df.dropna(subset=['price'], axis=0)

#Keep only the rows priced at $400 or less
is_within_cap = df['price'].le(400)
df = df[is_within_cap]

#Show the shape (rows, columns) and the column names of the data that is left
print(df.shape, df.columns, sep='\n')

In [None]:
#Count the listings per host name and draw a bar chart of the first 20 entries
#(value_counts orders hosts from most to fewest listings)
vals = df['host_name'].value_counts()
top_hosts = vals.iloc[:20]
top_hosts.plot.bar(title="Number of listings per host", xlabel="Host name", ylabel="Number of listings");

In [None]:
#Display the number of listings per host (the value counts of host_name stored in vals)
print(vals)

In [None]:
#Histogram of the number of listings per host, with bin edges at every integer from 1 to 10
vals.hist(bins=list(range(1, 11)));

In [None]:
#Keep only the listings that belong to hosts with more than one listing
df_1 = df.groupby('host_name').filter(lambda listings: len(listings) > 1)

#Per host: number of listings (as "count"), mean listing price and mean review score,
#sorted in descending order by the number of listings
df_1 = (
    df_1.groupby('host_name')
    .agg(
        count=('id', 'count'),
        price=('price', 'mean'),
        review_scores_rating=('review_scores_rating', 'mean'),
    )
    .sort_values(by='count', ascending=False)
)

#Display the data
print(df_1)

In [None]:
#Histogram of the mean review score across hosts
mean_scores = df_1['review_scores_rating']
mean_scores.hist();

In [None]:
#Order hosts from lowest to highest mean review score, then bar-plot the first 20 of them
df_1 = df_1.sort_values(by="review_scores_rating")
lowest_scores = df_1['review_scores_rating'].iloc[:20]
lowest_scores.plot.bar(title="Mean review score of listings per host", xlabel="Host name", ylabel="Review score");

In [None]:
#Display the first 10 rows of the per-host data
#(ascending review-score order when the preceding sort cell has just been run)
print(df_1.iloc[:10])

In [None]:
#Histogram of the mean listing price across hosts
mean_prices = df_1['price']
mean_prices.hist();

In [None]:
#Order hosts from highest to lowest mean price, then bar-plot the first 20 of them
df_1 = df_1.sort_values(by="price", ascending=False)
highest_prices = df_1['price'].iloc[:20]
highest_prices.plot.bar(title="Mean price of listings per host", xlabel="Host name", ylabel="Price");

In [None]:
#Sort the data by mean price per host in descending order and display the first 20 rows
df_1 = df_1.sort_values(by="price", ascending=False)
print(df_1.iloc[:20])

In [None]:
#Pair plot of the mean price and the mean review score per host
sns.set_theme(style="ticks")
pair_cols = ['price', 'review_scores_rating']
sns.pairplot(df_1[pair_cols]);


In [None]:
#Display the descriptive statistics (the output of DataFrame.describe) for the per-host
#number of listings, mean price and mean review score
df_1.describe()