# 1. Data load

Load the [listings.csv](data/listings.md) file and parse the column types

## 1.1. Init

### 1.1.1. Importing libraries

In [1]:
import json
import warnings
from pathlib import Path

import pandas as pd

%run ./scripts/pd_display.py

### 1.1.2. Configuring libraries

In [2]:
warnings.filterwarnings("ignore") # So it does not return a new error message with the kernel-id for every run at `pd.read_csv` (git)

## 1.2. Defining functions

In [3]:
def tf_to_bool(v):
    """Translate the CSV flag 't' / 'f' into True / False.

    Any other value (for example an empty field) yields None.
    """
    if v == 't':
        return True
    if v == 'f':
        return False
    return None

## 1.3. Loading data

### 1.3.1. Read from CSV

In [4]:
# Column -> dtype mapping handed to `pd.read_csv`, grouped by target dtype.
dtype = {
    **dict.fromkeys(
        [
            'listing_id',
            'host_id',
            'host_total_listings_count',
            'accommodates',
            'bedrooms',
            'price',
            'minimum_nights',
            'maximum_nights',
            'review_scores_rating',
            'review_scores_accuracy',
            'review_scores_cleanliness',
            'review_scores_checkin',
            'review_scores_communication',
            'review_scores_location',
            'review_scores_value',
        ],
        'Int64',
    ),
    **dict.fromkeys(
        ['name', 'host_since', 'host_location'],
        'string',
    ),
    **dict.fromkeys(
        [
            'host_response_time',
            'neighbourhood',
            'district',
            'city',
            'property_type',
            'room_type',
        ],
        'category',
    ),
    **dict.fromkeys(
        ['host_response_rate', 'host_acceptance_rate', 'latitude', 'longitude'],
        'float',
    ),
}

In [5]:
# Per-column parsers for `pd.read_csv`: the t/f flag columns go through
# `tf_to_bool`, and `amenities` is decoded from its JSON text.
converters = dict.fromkeys(
    ['instant_bookable', 'host_identity_verified', 'host_has_profile_pic', 'host_is_superhost'],
    tf_to_bool,
)
converters['amenities'] = json.loads

In [6]:
# `low_memory=False` makes pandas parse the file in one pass. With the default
# (chunked) parsing this call emitted:
#
#   DtypeWarning: Columns (8,10,11) have mixed types. Specify dtype option on
#   import or set low_memory=False.
#
# NOTE(review): columns 8, 10 and 11 look like the converter-backed t/f flag
# columns; their values still need the boolean cast applied after loading.
df = pd.read_csv(
    'data/listings.csv',
    encoding="raw_unicode_escape",  # unicode_escape DOES NOT WORK, because of the json.loads
    dtype=dtype,
    converters=converters,
    low_memory=False,
)

### 1.3.2. Boolean type fixes

Fix the mixed types reported by the `DtypeWarning` for columns `(8,10,11)` by casting those columns to a boolean dtype.

In [7]:
# Cast to pandas' nullable "boolean" dtype rather than plain `bool`:
# `astype(bool)` converts the None that `tf_to_bool` returns for missing values
# into False, silently recording "unknown" as "no". With "boolean" those
# entries become <NA>, matching the nullable Int64 / string dtypes used on load.
for c in ["host_has_profile_pic", "host_identity_verified", "host_is_superhost"]:
    df[c] = df[c].astype("boolean")

### 1.3.3. `host_since` to `datetime`

In [8]:
# Parse the `host_since` strings (YYYY-MM-DD) into datetime values.
df = df.assign(
    host_since=lambda frame: pd.to_datetime(frame["host_since"], format='%Y-%m-%d')
)

## 1.4. Store

In [9]:
# Create the output folder first so this cell also succeeds when ./pickles
# does not exist yet (e.g. on a fresh checkout); `to_pickle` does not create it.
pickle_path = Path("./pickles/001.dataframe.data-load.pkl")
pickle_path.parent.mkdir(parents=True, exist_ok=True)
df.to_pickle(pickle_path)