In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/berlin-amsterdam/raw_data/amsterdam/calendar.csv
/kaggle/input/berlin-amsterdam/raw_data/amsterdam/listings.csv
/kaggle/input/berlin-amsterdam/raw_data/amsterdam/neighbourhoods.geojson
/kaggle/input/berlin-amsterdam/raw_data/amsterdam/reviews.csv
/kaggle/input/berlin-amsterdam/raw_data/amsterdam/neighbourhoods.csv
/kaggle/input/berlin-amsterdam/raw_data/amsterdam/summary_information/listings.csv
/kaggle/input/berlin-amsterdam/raw_data/amsterdam/summary_information/reviews.csv
/kaggle/input/berlin-amsterdam/raw_data/berlin/calendar.csv
/kaggle/input/berlin-amsterdam/raw_data/berlin/listings.csv
/kaggle/input/berlin-amsterdam/raw_data/berlin/neighbourhoods.geojson
/kaggle/input/berlin-amsterdam/raw_data/berlin/reviews.csv
/kaggle/input/berlin-amsterdam/raw_data/berlin/neighbourhoods.csv
/kaggle/input/berlin-amsterdam/raw_data/berlin/summary_information/listings.csv
/kaggle/input/berlin-amsterdam/raw_data/berlin/summary_information/reviews.csv


In [3]:
import numpy as np
import pandas as pd
import os
from copy import deepcopy
import torch 
import transformers as tf
from torch.utils.data import DataLoader

PATH_TO_REPO = "/kaggle/input"
RAW_DATA_DIR = PATH_TO_REPO + '/berlin-amsterdam/raw_data'
SAVING_DIR = '/kaggle/working'
PROCESS_ALL_CITIES = True
CITY_LIST   = ["berlin"] #list cities which should be processed if not PROCESS_ALL_CITIES
DEBUG_MODE = True # determines if preprocessing is in DEBUG_MODE (no processing of file --> execution of main-function)

In [4]:
def collecting_data_from_source(city_list):
    """ 
    Converts raw data in RAW_DATA_DIR to proper CSV file format for cities specified in CITY_LIST (see above for global settings).
    Converted files are saved in SAVING_DIR.
    
    """
    print("initializing preprocessing")
    cities_in_raw_data_dir = os.listdir(RAW_DATA_DIR)

    if not PROCESS_ALL_CITIES and not set(CITY_LIST).issubset(cities_in_raw_data_dir):
        raise ValueError("not all requested citys are in directory")
    
    data_dict = {}

    if PROCESS_ALL_CITIES:
        CITY_LIST = cities_in_raw_data_dir
    
    for city in CITY_LIST:
        data_dict[city] = {}
        city_dir = RAW_DATA_DIR + '/' + city
        FILE_NAMES = [f for f in os.listdir(city_dir) if os.path.isfile(os.path.join(city_dir, f))]

        for file_name in FILE_NAMES:
            if file_name.endswith('.csv') or file_name.endswith('.geojson') or file_name.endswith('.csv.gz'):
                file_path = os.path.join(city_dir, file_name)
        
                # Read the file into a DataFrame
                if file_name.endswith('.geojson'):
                    df = pd.read_json(file_path)  # Adjust based on the specific geojson handling
                else:
                    file_name_core = file_name.split(sep=".")[0]

                    if file_name_core == "reviews":
                        index_col = 1
                    else:
                        index_col = 0
                        
                    df = pd.read_csv(file_path, index_col=index_col)

                #basename = file_name.split(sep=".")[0]
                
                data_dict[city][file_name] = df
    print(f"collected data from {RAW_DATA_DIR} and stored in data dictionary")            
    
    return data_dict

                
    

In [5]:
data_dict = collecting_data_from_source(CITY_LIST)

initializing preprocessing
collected data from /kaggle/input/berlin-amsterdam/raw_data and stored in data dictionary


In [30]:
brln_listings = data_dict["berlin"]["listings.csv"]
amstrdm_listings = data_dict["amsterdam"]["listings.csv"]
brln_reviews = data_dict["berlin"]["reviews.csv"]

brln_list_n = len(brln_listings)
brln_rev_n = len(brln_reviews)

brln_list_index = set(brln_listings.index.to_list())
brln_rev_index = set(brln_reviews.index.to_list())

amstrdm_list_index = set(amstrdm_listings.index.to_list())


""" print(len(brln_rev_index))
print(brln_rev_n)
print(brln_reviews["listing_id"])
data_dict["berlin"].keys()

print(len(brln_listings.index.to_list()))
print(len(brln_list_index)) 

print(sorted(brln_listings.index.to_list()) == brln_listings.index.to_list())

print(len(brln_list_index.intersection(amstrdm_list_index)))"""
"""cur_column = brln_listings.columns[73]
print(brln_listings.iloc[100][cur_column])
print(cur_column)
print(brln_listings.columns)
print(len(brln_listings["license"].unique()))"""


numerical_columns = ['host_since', 'host_response_rate', 'host_acceptance_rate', 'host_listings_count', 'host_total_listings_count', 'latitude', 'longitude', 
                     'bathrooms', 'bedrooms', 'beds'
                     'accommodates', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights','maximum_minimum_nights', 'minimum_maximum_nights',
                     'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_30', 'availability_60', 'availability_90',
                     'availability_365', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review', 'last_review', 'review_scores_rating', 
                     'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 
                     'reviews_per_month''calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 
                     'calculated_host_listings_count_shared_rooms']
categorical_columns = ['host_location', 'host_response_time', 'host_is_superhost', 'host_neighbourhood', 'host_has_profile_pic', 'host_identity_verified', 
                       'neighbourhood_group_cleansed', 'neighbourhood_group_cleansed', 'property_type', 'room_type', 'has_availability', 'instant_bookable'] #make list with unique values of each column here
natural_language_columns = ['name', 'description', 'neighborhood_overview', 'host_about', 'amenities']
image_weblinks_columns = ['picture_url', 'host_picture_url']
meta_data_columns = ['listing_url', 'scrape_id', 'last_scraped', 'source',  'host_id', 'host_url', 'host_name', 'host_thumbnail_url', 'host_verifications', 'neighbourhood', 'calendar_last_scraped', 'license']
nan_columns = ['calendar_updated']

# not shure: host_name, difference between 'host_listings_count', 'host_total_listings_count', host_verifications
#how to encode?: host_since as calendar information, host_neigbourhood , 'latitude' and 'longitude'; 'license' as has_license (boolean)?
# even include? 'neighbourhood' if we have 'region' as part of df but 'neighbourhood_group_cleansed', 'neighbourhood_group_cleansed' are more exact; 'bathrooms_text' if bathroom is the same


#which category?: 'bathrooms'

categorical_uniques_n = {cat_col: len(brln_listings[cat_col].unique()) for cat_col in categorical_columns}
categorical_uniques = {cat_col: brln_listings[cat_col].unique() for cat_col in categorical_columns}

numerical_uniques_n = {cat_col: len(brln_listings[cat_col].unique()) for cat_col in categorical_columns}


print(categorical_uniques_n)
#print(categorical_uniques['beds'])


{'host_location': 478, 'host_response_time': 5, 'host_is_superhost': 3, 'host_neighbourhood': 161, 'host_has_profile_pic': 3, 'host_identity_verified': 3, 'neighbourhood_group_cleansed': 12, 'property_type': 63, 'room_type': 4, 'has_availability': 3, 'instant_bookable': 2}


In [28]:
def integrate_reviews_and_aggregate_regions(data_dict):
    cities = data_dict.keys()
    cities_listings_with_region = []

    data_dict = deepcopy(data_dict)

    for city in cities:
        print(f"collecting reviews for city: {city}")
        city_listings = data_dict[city]["listings.csv"]
        city_reviews = data_dict[city]["reviews.csv"]       
        city_calendar = data_dict[city]["calendar.csv"] 

        city_listings_indices = city_listings.index.to_list()
        city_listings["comments"] = [[] for _ in range(len(city_listings))]

        for index in city_listings_indices:
            city_index_reviews = city_reviews[city_reviews["listing_id"] == index]
            comments = city_index_reviews["comments"].to_list()

            if comments == []:
                comments = [" "]

            comments_with_newline = []
            for comment in comments:
                if type(comment) is float:
                    comment = " "
                comment_transformed = comment.replace('<br/>', '\n').replace('\r', '')
                comments_with_newline.append(comment_transformed)

            city_listings.at[index, 'comments'] = comments_with_newline
        
        city_listings.insert(0, 'region', city)
        cities_listings_with_region.append(city_listings)

    print(f"integrate all city dataframes into one")
    cities_listings_with_region = pd.concat(cities_listings_with_region, ignore_index=True)

    return cities_listings_with_region
    

In [29]:
cities_listings_with_region = integrate_reviews_and_aggregate_regions(data_dict)

collecting reviews for city: amsterdam
collecting reviews for city: berlin
integrate all city dataframes into one


In [39]:
numerical_columns = ['host_since', 'host_response_rate', 'host_acceptance_rate', 'host_listings_count', 'host_total_listings_count', 'latitude', 'longitude', 
                     'bathrooms', 'bedrooms', 'beds',
                     'accommodates', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights','maximum_minimum_nights', 'minimum_maximum_nights',
                     'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_30', 'availability_60', 'availability_90',
                     'availability_365', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review', 'last_review', 'review_scores_rating', 
                     'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 
                     'reviews_per_month', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 
                     'calculated_host_listings_count_shared_rooms']
categorical_columns = ['region', 'host_location', 'host_response_time', 'host_is_superhost', 'host_neighbourhood', 'host_has_profile_pic', 'host_identity_verified', 
                       'neighbourhood_group_cleansed', 'neighbourhood_group_cleansed', 'property_type', 'room_type', 'has_availability', 'instant_bookable'] #make list with unique values of each column here
natural_language_columns = ['name', 'description', 'neighborhood_overview', 'host_about', 'amenities']
image_weblinks_columns = ['picture_url', 'host_picture_url']
meta_data_columns = ['listing_url', 'scrape_id', 'last_scraped', 'source',  'host_id', 'host_url', 'host_name', 'host_thumbnail_url', 'host_verifications', 'neighbourhood', 'calendar_last_scraped', 'license']
nan_columns = ['calendar_updated']

# not shure: host_name, difference between 'host_listings_count', 'host_total_listings_count', host_verifications
#how to encode?: host_since as calendar information, host_neigbourhood , 'latitude' and 'longitude'; 'license' as has_license (boolean)?
# even include? 'neighbourhood' if we have 'region' as part of df but 'neighbourhood_group_cleansed', 'neighbourhood_group_cleansed' are more exact; 'bathrooms_text' if bathroom is the same


#which category?: 'bathrooms'

#print(cities_listings_with_region.columns)
categorical_uniques_n = {cat_col: len(cities_listings_with_region[cat_col].unique()) for cat_col in categorical_columns}
categorical_uniques = {cat_col: cities_listings_with_region[cat_col].unique() for cat_col in categorical_columns}

numerical_uniques_n = {num_col: len(cities_listings_with_region[num_col].unique()) for num_col in numerical_columns}


print(numerical_uniques_n)
#print(numerical_uniques_n['host_has_profile_pic'])

{'host_since': 4510, 'host_response_rate': 75, 'host_acceptance_rate': 102, 'host_listings_count': 80, 'host_total_listings_count': 90, 'latitude': 17223, 'longitude': 19193, 'bathrooms': 20, 'bedrooms': 17, 'beds': 27, 'accommodates': 16, 'price': 734, 'minimum_nights': 114, 'maximum_nights': 220, 'minimum_minimum_nights': 119, 'maximum_minimum_nights': 122, 'minimum_maximum_nights': 209, 'maximum_maximum_nights': 209, 'minimum_nights_avg_ntm': 403, 'maximum_nights_avg_ntm': 718, 'availability_30': 31, 'availability_60': 61, 'availability_90': 91, 'availability_365': 366, 'number_of_reviews': 622, 'number_of_reviews_ltm': 175, 'number_of_reviews_l30d': 40, 'first_review': 3780, 'last_review': 2307, 'review_scores_rating': 141, 'review_scores_accuracy': 135, 'review_scores_cleanliness': 173, 'review_scores_checkin': 128, 'review_scores_communication': 136, 'review_scores_location': 138, 'review_scores_value': 159, 'reviews_per_month': 824, 'calculated_host_listings_count': 37, 'calcula

In [10]:
def add_comments_embedding(cities_listings_with_region):
    comments_list = cities_listings_with_region["comments"].to_list()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    model_name = "bert-base-uncased"
    tokenizer = tf.AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
    model = tf.AutoModel.from_pretrained(model_name).to(device)
    
    comments_list_embedded = []
    batch_size = 32
    
    for i, comments in enumerate(comments_list):
        if i > 3:
            break
        dataloader = DataLoader(comments, batch_size=batch_size)
        embeddings_list = []
        
        for batch in dataloader:
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings = embeddings.squeeze(0).cpu().numpy()
            embeddings_list.append(embeddings)
        
        embeddings_array = np.vstack(embeddings_list)
        mean_pooled_embedding = np.mean(embeddings_array, axis=0)
        print(len(mean_pooled_embedding))
        comments_list_embedded.append(mean_pooled_embedding)
        

    cities_listings_with_region["comments_emb"] = comments_list_embedded

    return cities_listings_with_region


In [11]:
cities_listings_with_region = add_comments_embedding(cities_listings_with_region)

cities_listings_with_region.to_csv("cities_listings_with_region.csv")

768
768


KeyboardInterrupt: 