# Imports

In [1]:
import pandas as pd

In [2]:
# Load the raw calendar CSV into a DataFrame.
calendar = pd.read_csv(filepath_or_buffer='og_data/calendar2024.csv')

In [3]:
# Print a summary of the calendar frame: row count, column dtypes and memory usage.
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7631731 entries, 0 to 7631730
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   listing_id      int64  
 1   date            object 
 2   available       object 
 3   price           object 
 4   adjusted_price  float64
 5   minimum_nights  float64
 6   maximum_nights  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 407.6+ MB


# Calendar

## Fix the price column if it's formatted as "$1,865.00"

In [9]:
# Remove "$" and "," from the price strings and convert the column to float.
# The dtype guard makes the cell safe to re-run: once the column is numeric the
# `.str` accessor would raise AttributeError, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(calendar['price']):
    calendar['price'] = (
        calendar['price']
        .str.replace("[$,]", "", regex=True)
        .astype(float)
    )

## Drop columns with all NaN values

In [10]:
# Keep only the columns that hold at least one non-missing value.
calendar = calendar.dropna(axis='columns', how='all')

In [11]:
# Sort by id and date so each listing's rows are contiguous and in date order.
# ignore_index=True renumbers the rows 0..n-1 in that sorted order; index-based
# operations such as interpolate(method='nearest') measure distance on the
# numerical index values, so the index must follow the sorted order rather than
# the original file order.
calendar = calendar.sort_values(by=['listing_id', 'date'], ignore_index=True)

# Define a function to interpolate within each group
def interpolate_nights(group):
    """Fill missing night limits for the rows of a single listing.

    Each missing ``minimum_nights`` / ``maximum_nights`` value is replaced by
    the value of the nearest row (by position within ``group``) that has one.
    Ties go to the earlier row. Gaps before the first or after the last known
    value are filled with that first/last known value.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one listing, already in the desired (date) order. Must contain
        the columns ``minimum_nights`` and ``maximum_nights``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the two columns filled. A column that has no
        known value at all is left untouched (all NaN). The input frame is
        never modified.

    Notes
    -----
    ``method='nearest'`` is delegated by pandas to SciPy, so SciPy must be
    installed.
    """
    # Work on a copy: the frame handed in by groupby.apply may be a slice of
    # the parent frame and must not be mutated.
    filled = group.copy()

    for column in ('minimum_nights', 'maximum_nights'):
        values = filled[column]
        known = int(values.notna().sum())

        # Nothing to fill, or nothing to fill from.
        if known == len(values) or known == 0:
            continue

        # Interpolate on a 0..n-1 index so "nearest" means nearest row of this
        # listing, independent of whatever index labels the caller kept.
        by_position = values.reset_index(drop=True)
        if known >= 2:
            by_position = by_position.interpolate(
                method='nearest', limit_direction='both'
            )

        # 'nearest' does not extrapolate: values outside the first/last known
        # row are still NaN here, so extend the closest known value outward.
        by_position = by_position.ffill().bfill()

        # Assign positionally to avoid any index re-alignment.
        filled[column] = by_position.to_numpy()

    return filled

# Apply interpolation group-wise by id.
# Only the two night-limit columns are passed to the helper and the result is
# written back by index. group_keys=False keeps the original row index, so
# 'listing_id' stays an ordinary column instead of also becoming an index
# level (which makes later sorts/groupbys on 'listing_id' ambiguous), and the
# grouping column itself is never handed to apply (deprecated in recent pandas).
night_columns = ['minimum_nights', 'maximum_nights']
calendar[night_columns] = (
    calendar.groupby('listing_id', group_keys=False)[night_columns]
    .apply(interpolate_nights)
)

# Cast both night-limit columns to integers: anything still missing becomes 0,
# and values are rounded first so float noise cannot truncate to the wrong integer.
for nights_column in ('minimum_nights', 'maximum_nights'):
    as_float = calendar[nights_column].fillna(0)
    calendar[nights_column] = as_float.round().astype(int)

### Wondering how interpolate works?
<details>
<summary>CLICK HERE FOR EXAMPLE</summary>

### Original Data

| listing_id | date       | minimum_nights | maximum_nights |
|------------|------------|----------------|----------------|
| 1          | 2024-01-01 | 2              | 5              |
| 1          | 2024-01-02 | NaN            | NaN            |
| 1          | 2024-01-03 | NaN            | NaN            |
| 1          | 2024-01-04 | 4              | 8              |

---

### After Interpolation

- Interpolation is applied using the `nearest` method:
  - For `minimum_nights`:
    - `2024-01-02` is filled with `2` (nearest value from `2024-01-01`).
    - `2024-01-03` is filled with `4` (nearest value from `2024-01-04`).
  - For `maximum_nights`:
    - `2024-01-02` is filled with `5` (nearest value from `2024-01-01`).
    - `2024-01-03` is filled with `8` (nearest value from `2024-01-04`).

| listing_id | date       | minimum_nights | maximum_nights |
|------------|------------|----------------|----------------|
| 1          | 2024-01-01 | 2              | 5              |
| 1          | 2024-01-02 | 2              | 5              |
| 1          | 2024-01-03 | 4              | 8              |
| 1          | 2024-01-04 | 4              | 8              |

---
</details>

---------

# Exports

In [6]:
# calendar.to_csv(
#     "cleaned_data/cleaned_calendar.csv",
#     index=False,
#     encoding="utf-8"
# )