In [1]:
import numpy as np
import pandas as pd
import os
from copy import deepcopy

# Root of the local repository checkout. The default is the original author's
# machine-specific path; set the DATALIT_REPO_PATH environment variable to run
# the notebook from a different checkout without editing this cell.
PATH_TO_REPO = os.environ.get(
    "DATALIT_REPO_PATH",
    "C:/Users/nilsk/Dokumente/Machine Learning (MSc.)/1. Semester/Data Literacy/DataLit-InsideAirbnb",
)
# Directory holding the raw input data (listed by collecting_data_from_source).
RAW_DATA_DIR = PATH_TO_REPO + '/data/raw_data'
# Target directory for preprocessed data.
SAVING_DIR = PATH_TO_REPO + '/data/preprocessed_data'
# When True, every city found in RAW_DATA_DIR is processed.
PROCESS_ALL_CITIES = True
# Cities to process when PROCESS_ALL_CITIES is False.
CITY_LIST = ["berlin"]
# Debug switch. NOTE(review): not read by any of the cells visible here; per the
# original comment it decides whether the main function is executed - confirm.
DEBUG_MODE = True

In [2]:
def collecting_data_from_source(city_list):
    """
    Load the raw data files of the requested cities into pandas DataFrames.

    Every sub-directory of RAW_DATA_DIR is treated as one city. For each
    selected city, all files ending in '.csv', '.csv.gz' or '.geojson' are
    read. Nothing is written to disk; the loaded frames are only returned.

    Parameters
    ----------
    city_list : iterable of str
        Names of the city directories to load. Ignored when the global
        PROCESS_ALL_CITIES is True, in which case every city directory found
        in RAW_DATA_DIR is loaded.

    Returns
    -------
    dict
        Nested mapping ``data_dict[city][file_name] -> DataFrame`` where
        ``file_name`` is the full file name including its extension.

    Raises
    ------
    ValueError
        If PROCESS_ALL_CITIES is False and a requested city has no directory
        in RAW_DATA_DIR.
    """
    print("initializing preprocessing")

    # Only directories are cities; stray files in RAW_DATA_DIR would otherwise
    # make the os.listdir call on the city directory fail.
    cities_in_raw_data_dir = [
        entry for entry in os.listdir(RAW_DATA_DIR)
        if os.path.isdir(os.path.join(RAW_DATA_DIR, entry))
    ]

    # Work on a local name: assigning to CITY_LIST inside the function would
    # turn it into a local variable and raise UnboundLocalError on first read.
    if PROCESS_ALL_CITIES:
        cities_to_process = cities_in_raw_data_dir
    else:
        cities_to_process = list(city_list)
        if not set(cities_to_process).issubset(cities_in_raw_data_dir):
            raise ValueError("not all requested citys are in directory")

    data_dict = {}

    for city in cities_to_process:
        data_dict[city] = {}
        city_dir = os.path.join(RAW_DATA_DIR, city)

        for file_name in os.listdir(city_dir):
            file_path = os.path.join(city_dir, file_name)

            if not os.path.isfile(file_path):
                continue
            if not file_name.endswith(('.csv', '.geojson', '.csv.gz')):
                continue

            if file_name.endswith('.geojson'):
                # NOTE(review): geojson is parsed as plain JSON; adjust if the
                # geometry needs dedicated handling.
                df = pd.read_json(file_path)
            else:
                # Reviews files use their second column as index, all other
                # CSV files their first one (presumably the first reviews
                # column is the listing id, which is not unique - confirm).
                is_reviews_file = file_name.split(sep=".")[0] == "reviews"
                df = pd.read_csv(file_path, index_col=1 if is_reviews_file else 0)

            data_dict[city][file_name] = df

    print("preprocessing done")
    return data_dict

In [3]:
# Load the raw files into a nested dict: data_dict[city][file_name] -> DataFrame.
# With PROCESS_ALL_CITIES = True every city found in RAW_DATA_DIR is loaded.
data_dict = collecting_data_from_source(CITY_LIST)

initializing preprocessing


  mask |= (ar1 == a)


In [4]:
# Sanity checks on the loaded frames. This cell needs both "berlin" and
# "amsterdam" to be present in data_dict.
brln_listings = data_dict["berlin"]["listings.csv"]
amstrdm_listings = data_dict["amsterdam"]["listings.csv"]
brln_reviews = data_dict["berlin"]["reviews.csv"]

# Row counts of the Berlin frames.
brln_list_n = len(brln_listings)
brln_rev_n = len(brln_reviews)

# Index values as sets, for overlap checks.
brln_list_index = set(brln_listings.index.to_list())
brln_rev_index = set(brln_reviews.index.to_list())
amstrdm_list_index = set(amstrdm_listings.index.to_list())

# Is the Berlin listings index already in ascending order?
print(sorted(brln_listings.index.to_list()) == brln_listings.index.to_list())

# Number of index values shared by the Berlin and Amsterdam listings.
print(len(brln_list_index.intersection(amstrdm_list_index)))


False
0


In [19]:
def _clean_review_comment(comment):
    """Return a review text with '<br/>' turned into newlines and '\\r' removed.

    Anything that is not a string (missing comments are read as NaN floats)
    becomes an empty string.
    """
    if not isinstance(comment, str):
        return ""
    return comment.replace('<br/>', '\n').replace('\r', '')


def integrate_reviews_and_aggregate(data_dict):
    """
    Attach review comments to each listing and stack all cities into one frame.

    For every city, the listings frame gets
      * a 'comments' column holding the list of cleaned review texts whose
        'listing_id' equals the listing's index value (an empty list when the
        listing has no reviews), and
      * a leading 'region' column holding the city name.

    The frames stored in ``data_dict`` are not modified.

    Parameters
    ----------
    data_dict : dict
        Nested mapping ``data_dict[city][file_name] -> DataFrame``. Each city
        must provide "listings.csv" and "reviews.csv"; the reviews frame must
        have the columns 'listing_id' and 'comments'.

    Returns
    -------
    pandas.DataFrame
        All city listings concatenated in the iteration order of
        ``data_dict``, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If a city lacks "listings.csv" or "reviews.csv", or the reviews frame
        lacks a required column.
    ValueError
        If ``data_dict`` is empty (nothing to concatenate) or a listings frame
        already has a 'region' column.
    """
    cities_listings_with_region = []

    for city, city_files in data_dict.items():
        print(f"collecting reviews for city: {city}")
        # Copy only the frame that is modified instead of deep-copying every
        # file of every city.
        city_listings = city_files["listings.csv"].copy()
        city_reviews = city_files["reviews.csv"]

        # Group all reviews once instead of filtering the whole reviews frame
        # for every single listing. Row order inside each group is preserved.
        cleaned_comments = city_reviews["comments"].map(_clean_review_comment)
        comments_by_listing = (
            cleaned_comments.groupby(city_reviews["listing_id"]).agg(list).to_dict()
        )

        city_listings["comments"] = [
            comments_by_listing.get(listing_id, [])
            for listing_id in city_listings.index
        ]

        city_listings.insert(0, 'region', city)
        cities_listings_with_region.append(city_listings)

    print("integrate all city dataframes into one")
    # NOTE(review): ignore_index=True replaces the per-city index with 0..n-1,
    # so the index values used above to match 'listing_id' are not part of the
    # result - confirm this is intended.
    cities_listings_with_region = pd.concat(cities_listings_with_region, ignore_index=True)

    return cities_listings_with_region
    

In [20]:
# Build one combined listings frame across all loaded cities, with a 'region'
# column and a per-listing 'comments' list.
cities_listings_with_region = integrate_reviews_and_aggregate(data_dict)

collecting reviews for city: amsterdam
collecting reviews for city: berlin
integrate all city dataframes into one


In [21]:
# Show the review comments of the first listing in the combined frame.
# (The previously used name `comments_list_transformed` is not defined by any
# cell above, so it failed on a fresh kernel.)
print(cities_listings_with_region["comments"].iloc[0])

['Appartement très agréable, seul hic, il est un peu excentré.', 'Great apartment! It was very clean and comfortable, in a residential area with easy tram access to the city center, and a very hospitable host. Jeroen gave us great restaurant recommendations in the area in his guidebook, and was easy to reach. I would definitely recommend staying here when visiting Amsterdam! ', '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nApartment very nice and warm, like his host, who provided us with a very good weekend', 'Week perfect , good location , thanks :)', "Communication with Jeroen was excellent, and the flat was exactly as described. It's a nice and quiet neighbourhood, with easy access to public transportation, and there are also some nice nearby restaurants. I would definitely stay again in the future. ", "We were left stranded by another host at the last minute and this kind man came to our rescue. He arranged for us to check in within an hour of requesting to book. The hos