### Assignment 2

**Analysis of Airbnb listings in Mallorca**

Course: Data Analysis 2

Created by: Marcell Magda

Date: -

## Import Libraries

In [30]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
from pathlib import Path
import sys
from patsy import dmatrices
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import partial_dependence
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

from collections import defaultdict
import ast  # To safely evaluate string literals as lists

## 1. Import data
IMPORTANT: Using listings from September as those have ammenities, the later ones don't.

In [31]:
# DATA IMPORT - FROM GITHUB
data = pd.read_csv('listings.csv')

In [32]:
data.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,69998,https://www.airbnb.com/rooms/69998,20230911050559,2023-09-12,previous scrape,Loft in Palma de Mallorca · ★4.94 · 1 bedroom ...,Perfect place to escape for a few days of peac...,( soho-chic bohemian neighborhood of the city ...,https://a0.muscache.com/pictures/6d4007c1-f223...,353156,...,4.96,4.89,4.8,,t,2,2,0,0,1.81
1,548218,https://www.airbnb.com/rooms/548218,20230911050559,2023-09-11,city scrape,Rental unit in Palma de Mallorca · ★4.73 · 2 b...,Este establecimiento de Turismo Interior está ...,,https://a0.muscache.com/pictures/miso/Hosting-...,2694897,...,4.92,4.9,4.71,TI/90,f,5,5,0,0,3.6
2,106833,https://www.airbnb.com/rooms/106833,20230911050559,2023-09-11,city scrape,Villa in Sant Llorenç des Cardassar · ★4.94 · ...,<b>The space</b><br />This is a restored old b...,,https://a0.muscache.com/pictures/miso/Hosting-...,551974,...,5.0,4.71,4.76,ET/1961,f,1,1,0,0,0.12
3,553166,https://www.airbnb.com/rooms/553166,20230911050559,2023-09-12,previous scrape,Cabin in Fornalutx · ★4.95 · 2 bedrooms · 2 be...,License nº ( 505/ 2015 ) / ET )<br />Prices ...,As you can read by other guests this is a uniq...,https://a0.muscache.com/pictures/6766065/d0ace...,2718219,...,4.97,4.91,4.82,505/2015/ET,f,1,1,0,0,1.03
4,159218,https://www.airbnb.com/rooms/159218,20230911050559,2023-09-11,city scrape,Cottage in Selva · ★4.91 · 1 bedroom · 2 beds ...,We are strong supporters of sustainable touris...,,https://a0.muscache.com/pictures/22458820/5aad...,763897,...,4.94,4.8,4.75,ET-3025 Conselleria de Turismo de Baleares,f,1,1,0,0,2.34


In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18832 entries, 0 to 18831
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            18832 non-null  int64  
 1   listing_url                                   18832 non-null  object 
 2   scrape_id                                     18832 non-null  int64  
 3   last_scraped                                  18832 non-null  object 
 4   source                                        18832 non-null  object 
 5   name                                          18832 non-null  object 
 6   description                                   18660 non-null  object 
 7   neighborhood_overview                         7011 non-null   object 
 8   picture_url                                   18832 non-null  object 
 9   host_id                                       18832 non-null 

## 2. Prepare Data

### 2.1 Extract Relevant Amenities

In [38]:
# Check structure of amenities values
data.amenities.value_counts()

amenities
["Hot water", "Iron", "Shampoo", "Wifi", "Carbon monoxide alarm", "Fire extinguisher", "First aid kit", "Air conditioning", "Extra pillows and blankets", "Paid parking on premises", "Hair dryer", "TV with standard cable", "Smoke alarm", "Self check-in", "Essentials", "Hangers", "Heating", "Pets allowed", "Bed linens", "Dedicated workspace", "Building staff", "Luggage dropoff allowed"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [35]:
## EXTRACT NUMBERS OF DIFFERENT AMENITIES

# Initialize a defaultdict to count amenities
amenities_count = defaultdict(int)

# Sample data - replace with your actual DataFrame column
amenities_series = data['amenities']

for amenities_str in amenities_series:
    # Convert the string representation of a list to an actual list
    # using ast.literal_eval for safely evaluating the string
    try:
        amenities_list = ast.literal_eval(amenities_str)
    except ValueError:
        # Skip rows where conversion fails
        continue
    
    # Update the count for each amenity
    for amenity in amenities_list:
        amenities_count[amenity] += 1

# Convert the defaultdict to a regular dict for easier handling
amenities_count = dict(amenities_count)

# Sort the amenities by count, descending
sorted_amenities = sorted(amenities_count.items(), key=lambda x: x[1], reverse=True)

# Display the sorted list of amenities and their counts
for amenity, count in sorted_amenities:
    print(f"{amenity}: {count}")

Kitchen: 17673
Wifi: 17570
Essentials: 15591
Hair dryer: 15408
Dishes and silverware: 14532
Iron: 14262
Refrigerator: 13809
Microwave: 13623
Bed linens: 13604
Washer: 12839
TV: 12063
Coffee maker: 11895
Oven: 11863
Hangers: 11802
Cooking basics: 11638
Dishwasher: 11580
Air conditioning: 11218
Hot water: 10469
BBQ grill: 10268
Free parking on premises: 9955
High chair: 9879
Heating: 9667
Private entrance: 9321
Crib: 9285
Freezer: 8824
Stove: 8266
First aid kit: 8090
Free street parking: 7712
Toaster: 7611
Hot water kettle: 7266
Fire extinguisher: 7119
Patio or balcony: 7058
Bathtub: 6952
Backyard: 6928
Outdoor furniture: 6634
Pool: 6542
Dedicated workspace: 6519
Self check-in: 6360
Outdoor dining area: 5862
Dining table: 5544
Long term stays allowed: 5519
Extra pillows and blankets: 5224
Private patio or balcony: 5170
Lockbox: 5130
Wine glasses: 4872
Drying rack for clothing: 4759
Room-darkening shades: 4738
Smoke alarm: 4700
Indoor fireplace: 4592
Mountain view: 4239
Shampoo: 4227
Gard

### 2.2 Create New Dataframe For Relevant Variables

In [49]:
import pandas as pd

def clean_airbnb_data(df):
    """
    Clean and preprocess the Airbnb dataset.
    Filter accommodations, clean price, filters property types, selected amenities and creates dummies.
    """
    # Keep listings accommodating 2 to 6 guests
    filtered_df = df[(df['accommodates'] >= 2) & (df['accommodates'] <= 6)]
    
    
    # Select relevant columns for modeling
    relevant_columns = [
        'accommodates', 'beds', 'review_scores_rating', 'host_is_superhost',
        'latitude', 'longitude', 'host_since', 'number_of_reviews',
        'availability_365', 'minimum_nights', 'maximum_nights',
        'property_type', 'room_type', 'price', 'amenities'
    ]
    processed_df = filtered_df[relevant_columns].copy()
    
    # List of selected amenities
    selected_amenities = [
        "BBQ grill", "Free parking on premises", "High chair", "Heating", "Private entrance", "Crib", "Freezer", 
        "Stove", "First aid kit", "Free street parking", "Toaster", "Hot water kettle", "Fire extinguisher", 
        "Patio or balcony", "Bathtub", "Backyard", "Outdoor furniture", "Pool", "Dedicated workspace", 
        "Self check-in", "Outdoor dining area", "Dining table", "Long term stays allowed", 
        "Extra pillows and blankets", "Private patio or balcony", "Lockbox", "Wine glasses", 
        "Drying rack for clothing", "Room-darkening shades", "Smoke alarm", "Indoor fireplace", "Mountain view", 
        "Shampoo", "Garden view", "Smoking allowed"
    ]

    # Create dummy variables for each selected amenity
    for amenity in selected_amenities:
        column_name = f"amenity_{amenity.replace(' ', '_').lower()}"
        processed_df[column_name] = processed_df['amenities'].str.contains(amenity, case=False, na=False).astype(int)
    
    # Price column cleanup
    processed_df['price'] = processed_df['price'].replace({'\$': '', ',': ''}, regex=True).astype(float)
    processed_df.dropna(subset=['price'], inplace=True)
    processed_df = processed_df[processed_df['price'] > 0]
    
    # Filter out rare property types
    property_counts = processed_df['property_type'].value_counts()
    common_properties = property_counts[property_counts >= 100].index
    processed_df = processed_df[processed_df['property_type'].isin(common_properties)]
    
    # Generate dummy variables
    processed_df = pd.get_dummies(processed_df, columns=['property_type', 'room_type'], prefix=['d_type', 'd_room'])
    
    # Rename columns to lower case and replace spaces with underscores
    processed_df.columns = processed_df.columns.str.lower().str.replace(' ', '_')
    
    # Rename specific columns
    rename_map = {
        'review_scores_rating': 'n_review_scores_rating',
        'host_since': 'date_host_start',
        'minimum_nights': 'n_minimum_nights',
        'maximum_nights': 'n_maximum_nights',
        'accommodates': 'n_accommodates',
        'beds': 'n_beds',
        'availability_365': 'n_availability_365',
        'number_of_reviews': 'n_number_of_reviews',
    }
    processed_df.rename(columns=rename_map, inplace=True)
    
    # Data type conversions
    processed_df['date_host_start'] = pd.to_datetime(processed_df['date_host_start'])
    processed_df['n_beds'] = processed_df['n_beds'].fillna(0).astype(int)
    processed_df['host_is_superhost'] = processed_df['host_is_superhost'].map({'t': 1, 'f': 0}).fillna(0).astype(int)
    processed_df['n_review_scores_rating'] = processed_df['n_review_scores_rating'].fillna(0).astype(int)
    
    # Ensure dummy variables are integers
    dummy_cols = processed_df.filter(regex='^d_').columns
    processed_df[dummy_cols] = processed_df[dummy_cols].fillna(0).astype(int)
    
    return processed_df

# Assuming 'data' is your DataFrame loaded previously
cleaned_data = clean_airbnb_data(data)


In [50]:
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12226 entries, 1 to 18830
Data columns (total 63 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   n_accommodates                      12226 non-null  int64         
 1   n_beds                              12226 non-null  int32         
 2   n_review_scores_rating              12226 non-null  int32         
 3   host_is_superhost                   12226 non-null  int32         
 4   latitude                            12226 non-null  float64       
 5   longitude                           12226 non-null  float64       
 6   date_host_start                     12225 non-null  datetime64[ns]
 7   n_number_of_reviews                 12226 non-null  int64         
 8   n_availability_365                  12226 non-null  int64         
 9   n_minimum_nights                    12226 non-null  int64         
 10  n_maximum_nights           

In [51]:
cleaned_data.isna().sum().sum()

1

In [53]:
cleaned_data.shape

(12226, 63)