In [87]:
import numpy as np
import pandas as pd
import os
from copy import deepcopy

# Root of the local repository checkout. Set the DATALIT_REPO_PATH environment
# variable to override it; the fallback is the original machine-specific path.
PATH_TO_REPO = os.environ.get(
    "DATALIT_REPO_PATH",
    "C:/Users/nilsk/Dokumente/Machine Learning (MSc.)/1. Semester/Data Literacy/DataLit-InsideAirbnb",
)
RAW_DATA_DIR = PATH_TO_REPO + '/data/raw_data'  # one sub-directory per city is expected here
SAVING_DIR = PATH_TO_REPO + '/data/preprocessed_data'  # target directory for preprocessed data
PROCESS_ALL_CITIES = True  # if True, every city found in RAW_DATA_DIR is processed
CITY_LIST = ["berlin"]  # cities to process when PROCESS_ALL_CITIES is False
DEBUG_MODE = True  # determines if preprocessing is in DEBUG_MODE (no processing of file --> execution of main-function)

In [27]:
def collecting_data_from_source(city_list):
    """
    Load the raw data files of the requested cities into DataFrames.

    Every ``.csv``, ``.csv.gz`` and ``.geojson`` file located directly inside
    ``RAW_DATA_DIR/<city>`` is read. Nothing is written to disk; the loaded
    frames are only returned.

    Parameters
    ----------
    city_list : iterable of str
        Names of the city directories to load. Ignored when the global
        PROCESS_ALL_CITIES is true, in which case every sub-directory of
        RAW_DATA_DIR is loaded.

    Returns
    -------
    dict
        Nested mapping ``{city: {file_name: DataFrame}}``. The inner key is
        the full file name including its extension (e.g. ``"listings.csv"``).

    Raises
    ------
    ValueError
        If PROCESS_ALL_CITIES is false and a requested city has no directory
        in RAW_DATA_DIR.
    """
    print("initializing preprocessing")

    # Only directories count as cities; stray files in RAW_DATA_DIR are skipped.
    cities_in_raw_data_dir = [
        entry for entry in os.listdir(RAW_DATA_DIR)
        if os.path.isdir(os.path.join(RAW_DATA_DIR, entry))
    ]

    if PROCESS_ALL_CITIES:
        cities = cities_in_raw_data_dir
    else:
        # Use the argument rather than rebinding the global CITY_LIST, which
        # would turn it into an unbound local inside this function.
        cities = list(city_list)
        if not set(cities).issubset(cities_in_raw_data_dir):
            raise ValueError("not all requested citys are in directory")

    data_dict = {}

    for city in cities:
        data_dict[city] = {}
        city_dir = os.path.join(RAW_DATA_DIR, city)
        file_names = [f for f in os.listdir(city_dir) if os.path.isfile(os.path.join(city_dir, f))]

        for file_name in file_names:
            if not file_name.endswith(('.csv', '.geojson', '.csv.gz')):
                continue

            file_path = os.path.join(city_dir, file_name)

            if file_name.endswith('.geojson'):
                df = pd.read_json(file_path)  # Adjust based on the specific geojson handling
            else:
                # Files named "reviews.*" are indexed by their second column,
                # every other CSV by its first column.
                file_name_core = file_name.split(sep=".")[0]
                index_col = 1 if file_name_core == "reviews" else 0
                df = pd.read_csv(file_path, index_col=index_col)

            data_dict[city][file_name] = df

    print("preprocessing done")
    return data_dict

In [28]:
# Load the raw files; the result is a nested dict keyed by city, then by file name.
data_dict = collecting_data_from_source(CITY_LIST)

initializing preprocessing


  mask |= (ar1 == a)


In [84]:
# Frames inspected in this cell.
brln_listings = data_dict["berlin"]["listings.csv"]
amstrdm_listings = data_dict["amsterdam"]["listings.csv"]
brln_reviews = data_dict["berlin"]["reviews.csv"]

# Row counts of the Berlin listings and reviews frames.
brln_list_n = len(brln_listings)
brln_rev_n = len(brln_reviews)

# Unique index values of each frame.
brln_list_index = set(brln_listings.index)
brln_rev_index = set(brln_reviews.index)
amstrdm_list_index = set(amstrdm_listings.index)

# Is the Berlin listings index already in ascending order?
print(brln_listings.index.is_monotonic_increasing)

# Number of index values that Berlin and Amsterdam listings have in common.
print(len(brln_list_index & amstrdm_list_index))


False
0


In [91]:
def integrating_into_one_df(data_dict):
    """
    Combine the listings of all cities into one DataFrame with their reviews.

    For each city, the ``"listings.csv"`` frame is copied, a ``comments``
    column is added that holds the list of review comments whose
    ``listing_id`` equals the listing's index value (an empty list when there
    are none), and a ``region`` column containing the city name is inserted as
    the first column. The per-city frames are then concatenated.

    Parameters
    ----------
    data_dict : dict
        Nested mapping ``{city: {file_name: DataFrame}}``. Each city needs the
        keys ``"listings.csv"`` and ``"reviews.csv"``; the reviews frame needs
        the columns ``listing_id`` and ``comments``. The input frames are not
        modified.

    Returns
    -------
    pandas.DataFrame
        All listings of all cities with a fresh 0..n-1 index.
    """
    cities_listings_with_region = []

    for city, city_files in data_dict.items():
        print(f"collecting reviews for city: {city}")
        # Copy only the frame that is modified instead of deep-copying everything.
        city_listings = city_files["listings.csv"].copy()
        city_reviews = city_files["reviews.csv"]

        # Group the comments once per city rather than filtering the whole
        # reviews frame for every single listing.
        comments_by_listing = (
            city_reviews.groupby("listing_id")["comments"].agg(list).to_dict()
        )
        city_listings["comments"] = [
            comments_by_listing.get(listing_id, [])
            for listing_id in city_listings.index
        ]

        city_listings.insert(0, 'region', city)
        cities_listings_with_region.append(city_listings)

    print("integrate all city dataframes into one")
    # NOTE(review): ignore_index=True discards the listings index that was used
    # above to match reviews; reset the index first if those ids are needed later.
    return pd.concat(cities_listings_with_region, ignore_index=True)

    
    


        
        


        
             

In [92]:
# Build one listings frame across all cities, with a 'region' column and per-listing 'comments'.
cities_listings_with_region = integrating_into_one_df(data_dict)

collecting reviews for city: amsterdam
collecting reviews for city: berlin
integrate all city dataframes into one


In [94]:
# Show the combined listings frame as the cell output.
cities_listings_with_region

Unnamed: 0,region,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,comments
0,amsterdam,https://www.airbnb.com/rooms/6624170,20240905174946,2024-09-05,previous scrape,"Warm, cozy sunlighted downtown appt",2 room appt. 1.8 km from central station with ...,,https://a0.muscache.com/pictures/df91da10-f7d4...,34670170,...,4.56,4.65,0363 3CF3 3233 3B24 6BEC,f,1,1,0,0,0.38,"[Appartement très agréable, seul hic, il est u..."
1,amsterdam,https://www.airbnb.com/rooms/8837071,20240905174946,2024-09-05,previous scrape,Cozy apartment in city center,Located in Amsterdam's sweet spot. A stone's t...,see the guide,https://a0.muscache.com/pictures/5fee12d4-61d0...,13034277,...,,,0363 0C20 1768 FAAA 3556,f,1,1,0,0,,[]
2,amsterdam,https://www.airbnb.com/rooms/716107,20240905174946,2024-09-05,previous scrape,Loft style home nearby city centre,,,https://a0.muscache.com/pictures/9927048/b367a...,3689867,...,4.66,4.61,0363 0014 1E57 F097 81A5,f,1,1,0,0,1.21,[Marijke is a very nice and helpful hostess ! ...
3,amsterdam,https://www.airbnb.com/rooms/664538756986273255,20240905174946,2024-09-06,previous scrape,Geweldige duurzame eco woonark op unieke plek!,This unique eco houseboat is located in the mo...,,https://a0.muscache.com/pictures/miso/Hosting-...,7647768,...,5.00,5.00,0363 468D A0A1 6595 5E39,f,1,1,0,0,0.04,[If you are at all interested in sustainable d...
4,amsterdam,https://www.airbnb.com/rooms/8191077,20240905174946,2024-09-05,previous scrape,Old bar apartment,This just renovated apartment for 6 persons is...,Our apartment is situated in the centre of Ams...,https://a0.muscache.com/pictures/miso/Hosting-...,43205475,...,4.83,4.45,Exempt,f,1,0,1,0,2.41,"[Week end très agréable, l'apartement est dans..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23671,berlin,https://www.airbnb.com/rooms/1242619875270165606,20240918135550,2024-09-18,city scrape,2 Room Apartment in Central Berlin Greenest Area,**Bright 2-Room Apartment in Berlin’s Green Oa...,,https://a0.muscache.com/pictures/miso/Hosting-...,295111,...,,,,f,3,3,0,0,,[]
23672,berlin,https://www.airbnb.com/rooms/1242745030641013657,20240918135550,2024-09-18,city scrape,Cutest apartment at Maybachufer,"Sit back and relax in this quiet, stylish space.",,https://a0.muscache.com/pictures/hosting/Hosti...,5367476,...,,,,f,1,0,1,0,,[]
23673,berlin,https://www.airbnb.com/rooms/1242797685006849808,20240918135550,2024-09-18,city scrape,Beautiful Modern flat at Center,Discover the charm of Authentic Beautiful Stud...,,https://a0.muscache.com/pictures/hosting/Hosti...,554070013,...,5.00,5.00,First name and Last name: Oleksandra Mykhailen...,t,1,1,0,0,1.00,"[great apartment, very nice host!]"
23674,berlin,https://www.airbnb.com/rooms/1242815787146152998,20240918135550,2024-09-18,city scrape,Helles cozy Studioapartment,"Our cozy, quiet studio apartment in Hessenwink...",,https://a0.muscache.com/pictures/miso/Hosting-...,43543355,...,,,09/Z/AZ/005093-24,f,1,1,0,0,,[]
