<a href="https://www.kaggle.com/code/nilsklute/kaggle-airbnb-preprocessing?scriptVersionId=216845580" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/berlin-amsterdam/raw_data/amsterdam/calendar.csv
/kaggle/input/berlin-amsterdam/raw_data/amsterdam/listings.csv
/kaggle/input/berlin-amsterdam/raw_data/amsterdam/neighbourhoods.geojson
/kaggle/input/berlin-amsterdam/raw_data/amsterdam/reviews.csv
/kaggle/input/berlin-amsterdam/raw_data/amsterdam/neighbourhoods.csv
/kaggle/input/berlin-amsterdam/raw_data/amsterdam/summary_information/listings.csv
/kaggle/input/berlin-amsterdam/raw_data/amsterdam/summary_information/reviews.csv
/kaggle/input/berlin-amsterdam/raw_data/berlin/calendar.csv
/kaggle/input/berlin-amsterdam/raw_data/berlin/listings.csv
/kaggle/input/berlin-amsterdam/raw_data/berlin/neighbourhoods.geojson
/kaggle/input/berlin-amsterdam/raw_data/berlin/reviews.csv
/kaggle/input/berlin-amsterdam/raw_data/berlin/neighbourhoods.csv
/kaggle/input/berlin-amsterdam/raw_data/berlin/summary_information/listings.csv
/kaggle/input/berlin-amsterdam/raw_data/berlin/summary_information/reviews.csv


In [2]:
import numpy as np
import pandas as pd
import os
from copy import deepcopy
import torch 
import transformers as tf
from torch.utils.data import DataLoader
import json
import sklearn
from tqdm import tqdm
import requests
from PIL import Image
from io import BytesIO
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.models import ResNet50_Weights

DEBUG_MODE = True # determines if preprocessing is in DEBUG_MODE (no processing of file --> execution of main-function)

In [3]:
class InsideAirbnbDataset:
    def __init__(
            self,
            raw_data_dir: str = "C:/Users/nilsk/Dokumente/Machine Learning (MSc.)/1. Semester/Data Literacy/DataLit-InsideAirbnb/data/raw_data",
            process_all_cities: bool = True,
            cities_to_process: list   = ["berlin"]):
        
        self.process_all_cities = process_all_cities
        self.cities_to_process = cities_to_process

        self.raw_data_dir = raw_data_dir

        # read in raw data from raw data directory in repository
        self.raw_data_dict = self._read_data_from_files()

        # integrate the reviews from reviews df into the listings df for each city in the raw_data_dict
        self._integrate_reviews_into_listings()

        # aggregate all listings dfs from each city and store in one all_cities_listings df
        self.all_cities_listings = self._aggregate_regional_listings_into_one_df()
        
    
    def _read_data_from_files(self):
        print(f"reading in data from {self.raw_data_dir}")
        cities_in_raw_data_dir = os.listdir(self.raw_data_dir)

        if not self.process_all_cities and not set(self.cities_to_process).issubset(cities_in_raw_data_dir):
            raise ValueError("not all requested citys are in directory")
        
        raw_data_dict = {}

        if self.process_all_cities:
            self.cities_to_process = cities_in_raw_data_dir
        
        for city in self.cities_to_process:
            print(f"collecting data for city: {city}")
            raw_data_dict[city] = {}
            city_dir = self.raw_data_dir + '/' + city
            file_names = [f for f in os.listdir(city_dir) if os.path.isfile(os.path.join(city_dir, f))]

            for file_name in file_names:
                if file_name.endswith('.csv') or file_name.endswith('.geojson') or file_name.endswith('.csv.gz'):
                    file_path = os.path.join(city_dir, file_name)
            
                    # Read the file into a DataFrame
                    if file_name.endswith('.geojson'):
                        df = pd.read_json(file_path)  # Adjust based on the specific geojson handling
                    else:
                        file_name_core = file_name.split(sep=".")[0]

                        if file_name_core == "reviews":
                            index_col = 1
                        else:
                            index_col = 0
                            
                        df = pd.read_csv(file_path, index_col=index_col)

                    raw_data_dict[city][file_name] = df

        print(f"collecting data process done")

        return raw_data_dict

    def _integrate_reviews_into_listings(self):
        print(f"initializing reviews collection process and integration into city listings")
        cities = self.raw_data_dict.keys()

        for city in cities:
            print(f"current city: {city}")
            city_listings = self.raw_data_dict[city]["listings.csv"]
            city_reviews = self.raw_data_dict[city]["reviews.csv"]       
            city_calendar = self.raw_data_dict[city]["calendar.csv"] 

            city_listings_indices = city_listings.index.to_list()
            city_listings["comments"] = [[] for _ in range(len(city_listings))]

            for index in city_listings_indices:
                city_index_reviews = city_reviews[city_reviews["listing_id"] == index]
                comments = city_index_reviews["comments"].to_list()

                comments_with_newline = []
                for comment in comments:
                    if type(comment) is float: #if it is nan, as nan are float values
                        comment = ""
                    comment_transformed = comment.replace('<br/>', '\n').replace('\r', '')
                    comments_with_newline.append(comment_transformed)

                city_listings.at[index, 'comments'] = comments_with_newline
        
        print("integration of reviews into cites listings done")

    def _aggregate_regional_listings_into_one_df(self):
        print("initializing aggregation of regional listings into one dataframe")
        cities = self.raw_data_dict.keys()
        all_cities_listings = []

        for city in cities:
            city_listings = self.raw_data_dict[city]["listings.csv"]
            city_listings.insert(0, 'region', city)
            all_cities_listings.append(city_listings)

        all_cities_listings = pd.concat(all_cities_listings, ignore_index=True)
        print("aggregation done")
        return all_cities_listings

    def add_nlp_embedding(self, 
                          nlp_col_names = ['name', 'description', 'neighborhood_overview', 'host_about', 'amenities','comments'], 
                          batch_size = 32):
        print("initializing NLP embedding process")
        print(f"batch size: {batch_size}") 
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model_name = 'distilbert-base-multilingual-cased'
        tokenizer = tf.AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
        model = tf.AutoModel.from_pretrained(model_name).to(device)
        print(f"embeddings are computed using transformer model: {model_name} from hugging face")
        
        for nlp_col_name in nlp_col_names:
            print(f"current nlp column: {nlp_col_name}")

            nlp_col = self.all_cities_listings[nlp_col_name]
            nlp_col_list = []

            # convert nlp columns to a list 
            if nlp_col_name in ['name', 'description', 'neighborhood_overview', 'host_about', 'comments']:
                nlp_col_list = nlp_col.fillna(value="").to_list()
            elif nlp_col_name == "amenities":
                for amenities_raw_entry in nlp_col:
                    amenities_collection = json.loads(amenities_raw_entry) # amenities_raw_entry is in json string format
                    nlp_col_list.append(amenities_collection)
            else:
                raise ValueError(f"no procedure found for converting {nlp_col_name} to list")
            

            nlp_col_list_embedded = []

            pooling_approach = ['amenities', 'comments']
            # for each entry in nlp column, single embeddings are inferred for amenity_items / single reviews --> then mean pooling
            if nlp_col_name in pooling_approach:
                for i, entry in enumerate(tqdm(nlp_col_list)):
                    if entry == []:
                        entry = np.asarray([" "])
                        
                    dataloader = DataLoader(entry, batch_size=batch_size)
                    entry_items_embeddings_list = []
                    
                    for batch in dataloader:
                        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
                        with torch.no_grad():
                            outputs = model(**inputs)
                        embeddings = outputs.last_hidden_state[:, 0, :]
                        embeddings = embeddings.squeeze(0).cpu().numpy()
                        entry_items_embeddings_list.append(embeddings)
                    
                    embeddings_array = np.vstack(entry_items_embeddings_list)
                    mean_pooled_embedding = np.mean(embeddings_array, axis=0)
                    nlp_col_list_embedded.append(mean_pooled_embedding)
                    
            # embeddings are inferred directly for the entries of all other nlp columns
            else:
                dataloader = DataLoader(nlp_col_list, batch_size=batch_size)
                for batch in tqdm(dataloader):
                    inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
                    with torch.no_grad():
                        outputs = model(**inputs)
                    embeddings = outputs.last_hidden_state[:, 0, :]
                    embeddings = embeddings.squeeze(0).cpu().numpy()
                    nlp_col_list_embedded += list(embeddings)
            
            nlp_col_embedded_name = nlp_col_name + '_emb'
            self.all_cities_listings[nlp_col_embedded_name] = nlp_col_list_embedded
        
        print("nlp embedding done")
    
    def dimensionality_reduction(self, 
                                 col_names = [
                                    'name_emb', 
                                    'description_emb', 
                                    'neighborhood_overview_emb', 
                                    'host_about_emb', 
                                    'amenities_emb',
                                    'comments_emb'
                                 ],
                                keep_variance = 0.95):
        
        print("initializing dimensionality reduction")
            
        for col_name in col_names:
            print(f"current embeddings: {col_name}")
            col = self.all_cities_listings[col_name]
            col_array = np.asarray([np.asarray(entry) for entry in col])

            pca = sklearn.decomposition.PCA(n_components = keep_variance, svd_solver='full')
            pca.fit(col_array)
            dim_red_col_array = pca.transform(col_array)
            print(f"used {pca.n_components_ } components for dim reduction to explain {keep_variance*100}% of the data")
            
            dim_red_col_name = col_name + '_dim_red'
            self.all_cities_listings[dim_red_col_name] = list(dim_red_col_array)
        print("dimensionality reduction done")

    def add_image_embedding(self, 
                            image_url_col_names = ['host_picture_url','picture_url'], 
                            batch_size = 32, 
                            embedd_n_images = -1):
        
        print("initializing image embedding process")
        
        for image_url_col_name in image_url_col_names:
            print(f"downloading images from web for column '{image_url_col_name}'")
            
            image_url_col = self.all_cities_listings[image_url_col_name]
            image_list = []
            no_access_indices = []
            image_size = (256,256)
            
            for i, image_url in enumerate(tqdm(image_url_col)):
                if embedd_n_images >= 0 and i == embedd_n_images:
                    break
                response = requests.get(image_url)
                
                # code for successful request is 200
                if response.status_code == 200:
                    image = Image.open(BytesIO(response.content)).resize(image_size)
                    if image.mode != "RGB":
                        image = image.convert('RGB')
                    image_list.append(image)
                else:
                    no_access_indices.append(i)
                    image_list.append(Image.new("RGB", image_size))
                    #response.raise_for_status()
    
            print(f"pictures from rows {no_access_indices} could not be accessed")
            print("transform images and construct dataloader")
    
            normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
            
            image_transform = transforms.Compose([
                                            transforms.Resize(256),
                                            transforms.CenterCrop(224),
                                            transforms.ToTensor(),
                                            normalize
                                            ])
            tensor_image_list = [image_transform(image) for image in image_list]
    
            data_loader = DataLoader(tensor_image_list, batch_size=batch_size)
            
            resnet = models.resnet50(weights=ResNet50_Weights.DEFAULT)
            modules = list(resnet.children())[:-1]  # remove the FC layer
            resnet_feature_extractor = torch.nn.Sequential(*modules)
            resnet_feature_extractor.eval()
            
            print("embedding image data using ResNet50")
            feature_embeddings_list = []
       
            for batch in tqdm(data_loader):
                with torch.no_grad():
                    feature_embeddings = resnet_feature_extractor(batch)
                feature_embeddings = feature_embeddings.view(feature_embeddings.size(0), -1).numpy()
    
                feature_embeddings_list += list(feature_embeddings)
    
            col_name_core = image_url_col_name.split('_')[:-1]
            image_col_embedded_name = '_'.join(col_name_core + ['emb'])
            
            # only important if embedd_n_images not -1 --> not all images get embedded
            feature_embeddings_list_n = len(feature_embeddings_list)
            all_listings_n = len(self.all_cities_listings)
            diff = all_listings_n - feature_embeddings_list_n
            for _ in range (diff):
                feature_embeddings_list.append([]) 
                
    
            valid_feature_embeddings_list = deepcopy(feature_embeddings_list)[:feature_embeddings_list_n]
            for index in no_access_indices[::-1]:
                del valid_feature_embeddings_list[index]
    
            valid_feature_embeddings_array = np.asarray(valid_feature_embeddings_list)
            mean_embedding = np.mean(valid_feature_embeddings_array, axis=0)
            print(f"mean_embedding: {mean_embedding}")
            
            for no_access_index in no_access_indices:
                feature_embeddings_list[no_access_index] = mean_embedding
    
            self.all_cities_listings[image_col_embedded_name] = feature_embeddings_list
            
        print("image embedding done")
    
    def save_all_cities_listings_to_file(self, 
                                         file_name, 
                                         saving_dir =  'C:/Users/nilsk/Dokumente/Machine Learning (MSc.)/1. Semester/Data Literacy/DataLit-InsideAirbnb/data/preprocessed_data'):
        
        self.saving_dir = saving_dir
        file_path = saving_dir + '/' + file_name
        self.all_cities_listings.to_csv(file_path)
        print(f"all cities listings saved to path: {file_path}")

In [4]:
data_set = InsideAirbnbDataset(raw_data_dir="/kaggle/input/berlin-amsterdam/raw_data")

reading in data from /kaggle/input/berlin-amsterdam/raw_data
collecting data for city: amsterdam
collecting data for city: berlin
collecting data process done
initializing reviews collection process and integration into city listings
current city: amsterdam
current city: berlin
integration of reviews into cites listings done
initializing aggregation of regional listings into one dataframe
aggregation done


In [31]:
#data_set.save_all_cities_listings_to_file('ignore_all_listings.csv', saving_dir="/kaggle/working")
print(data_set.all_cities_listings['accommodates'])
print(data_set.all_cities_listings.columns)

0        3
1        2
2        2
3        3
4        6
        ..
23671    2
23672    2
23673    2
23674    3
23675    1
Name: accommodates, Length: 23676, dtype: int64
Index(['region', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimu

In [28]:
data_set.add_nlp_embedding()

initializing NLP embedding process
batch size: 32
embeddings are computed using transformer model: distilbert-base-multilingual-cased from hugging face
current nlp column: name


100%|██████████| 740/740 [00:09<00:00, 77.29it/s]


current nlp column: description


100%|██████████| 740/740 [01:04<00:00, 11.53it/s]


current nlp column: neighborhood_overview


100%|██████████| 740/740 [01:37<00:00,  7.58it/s]


current nlp column: host_about


100%|██████████| 740/740 [01:26<00:00,  8.52it/s]


current nlp column: amenities


100%|██████████| 23676/23676 [04:03<00:00, 97.18it/s] 


current nlp column: comments


  8%|▊         | 1795/23676 [04:08<50:32,  7.22it/s]  


KeyboardInterrupt: 

In [29]:
data_set.dimensionality_reduction(col_names = ['name_emb'])

initializing dimensionality reduction
current embeddings: name_emb


AttributeError: module 'sklearn' has no attribute 'decomposition'

In [12]:
data_set.add_image_embedding(embedd_n_images = 64)

initializing image embedding process
downloading images from web for column 'host_picture_url'


  0%|          | 64/23676 [00:04<26:51, 14.65it/s]


pictures from rows [] could not be accessed
transform images and construct dataloader
embedding image data using ResNet50


100%|██████████| 2/2 [00:08<00:00,  4.00s/it]


mean_embedding: [0.06219788 0.02726397 0.10110724 ... 0.03680211 0.03957219 0.06989616]
downloading images from web for column 'picture_url'


  0%|          | 38/23676 [00:10<1:49:59,  3.58it/s]


KeyboardInterrupt: 

In [None]:
numerical_columns = ['host_since', 'host_response_rate', 'host_acceptance_rate', 'host_listings_count', 'host_total_listings_count', 'latitude', 'longitude', 
                     'bathrooms', 'bedrooms', 'beds',
                     'accommodates', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights','maximum_minimum_nights', 'minimum_maximum_nights',
                     'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_30', 'availability_60', 'availability_90',
                     'availability_365', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review', 'last_review', 'review_scores_rating', 
                     'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 
                     'reviews_per_month', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 
                     'calculated_host_listings_count_shared_rooms']
categorical_columns = ['region', 'host_location', 'host_response_time', 'host_is_superhost', 'host_neighbourhood', 'host_has_profile_pic', 'host_identity_verified', 
                       'neighbourhood_group_cleansed', 'neighbourhood_group_cleansed', 'property_type', 'room_type', 'has_availability', 'instant_bookable'] #make list with unique values of each column here
natural_language_columns = ['name', 'description', 'neighborhood_overview', 'host_about', 'amenities', 'comments']
image_weblinks_columns = ['picture_url', 'host_picture_url']
meta_data_columns = ['listing_url', 'scrape_id', 'last_scraped', 'source',  'host_id', 'host_url', 'host_name', 'host_thumbnail_url', 'host_verifications', 'neighbourhood', 'calendar_last_scraped', 'license']
nan_columns = ['calendar_updated']

# not shure: host_name, difference between 'host_listings_count', 'host_total_listings_count', host_verifications
#how to encode?: host_since as calendar information, host_neigbourhood , 'latitude' and 'longitude'; 'license' as has_license (boolean)?
# even include? 'neighbourhood' if we have 'region' as part of df but 'neighbourhood_group_cleansed', 'neighbourhood_group_cleansed' are more exact; 'bathrooms_text' if bathroom is the same


#which category?: 'bathrooms'
all_listings = data_set.all_cities_listings
#print(all_listings.columns)

categorical_uniques_n = {cat_col: len(all_listings[cat_col].unique()) for cat_col in categorical_columns}
categorical_uniques = {cat_col: all_listings[cat_col].unique() for cat_col in categorical_columns}

numerical_uniques_n = {num_col: len(all_listings[num_col].unique()) for num_col in numerical_columns}


print(numerical_uniques_n)
#print(numerical_uniques_n['host_has_profile_pic'])