imports

In [None]:
! pip install bs4

In [2]:
import pandas as pd
import numpy as np
import re
import html
from bs4 import BeautifulSoup

In [5]:
# Read the raw listings export from disk
csv_path = '../data/original/all_us_listings.csv'
df = pd.read_csv(csv_path)

# -------- STEP 1: Basic Info --------
# Print two per-column summaries: the number of null entries and the inferred dtype.
for heading, summary in (
    ("🔍 Missing values per column:", df.isnull().sum()),
    ("\n📊 Data types:", df.dtypes),
):
    print(heading)
    print(summary)



🔍 Missing values per column:
id                            0
description                2832
neighborhood_overview    101641
bedrooms                  22500
price                         0
available_from                0
state                         0
picture_url                   1
dtype: int64

📊 Data types:
id                       float64
description               object
neighborhood_overview     object
bedrooms                 float64
price                      int64
available_from            object
state                     object
picture_url               object
dtype: object


### Data preprocessing

In [6]:
# -------- STEP 2: Data Cleaning --------

# 1. Drop listings with missing critical fields.
#    .copy() yields an independent frame so the column assignments below never
#    write into a view of the raw data (avoids SettingWithCopyWarning).
critical_fields = ['id', 'description', 'price', 'bedrooms', 'available_from', 'state', 'picture_url']
df = df.dropna(subset=critical_fields).copy()

# 2. Bedrooms: coerce to numeric and keep only listings with 1-10 bedrooms.
#    Values that cannot be parsed become NaN and are removed by the range filter.
df['bedrooms'] = pd.to_numeric(df['bedrooms'], errors='coerce')
df = df[df['bedrooms'].between(1, 10)].copy()

# 3. Replace 'price' with a synthetic monthly rent sampled from a fixed set.
#    A seeded generator makes the sample identical on every Restart & Run All.
RANDOM_SEED = 42
monthly_rent_options = [800, 1200, 1600, 2000, 2400, 3000]
rng = np.random.default_rng(RANDOM_SEED)
df['price'] = rng.choice(monthly_rent_options, size=len(df))

# 4. Normalize 'available_from' (uppercase, surrounding whitespace trimmed)
df['available_from'] = df['available_from'].astype(str).str.upper().str.strip()

# 5. Standardize 'state' values (uppercase, surrounding whitespace trimmed)
df['state'] = df['state'].astype(str).str.upper().str.strip()


In [7]:
# -------- STEP 3: Final Checks --------
# Print the (rows, columns) shape of df as it stands after the cleaning cell.
print("\n✅ Cleaned shape:", df.shape)


✅ Cleaned shape: (243305, 8)


In [8]:
# Treat missing text as empty and trim surrounding whitespace before combining.
description = df['description'].fillna('').str.strip()
overview = df['neighborhood_overview'].fillna('').str.strip()

# Append the " neighborhood " marker only when there is an overview to introduce;
# otherwise listings without one would end in a dangling "neighborhood" token
# that adds noise and inflates any later text-length filter.
neighborhood_suffix = overview.map(lambda text: f" neighborhood {text}" if text else "")

# Swap the two source columns for the combined 'soft_attributes' column.
# drop() + assign() build a new frame instead of writing into a filtered slice.
df = (
    df.drop(columns=['description', 'neighborhood_overview'])
      .assign(soft_attributes=description + neighborhood_suffix)
)

# Optional: Show a sample
print("\n📝 Example 'soft_attributes':")
print(df['soft_attributes'].head(3))



📝 Example 'soft_attributes':
2     30 night minimum stay<br /><br /><b>The space<...
10    Nestled on a hillside overlooking Kalapaki Bay...
73    We are opening this room to host women who are...
Name: soft_attributes, dtype: object


In [9]:
# Count null entries per column of the current df.
print(df.isnull().sum())

id                 0
bedrooms           0
price              0
available_from     0
state              0
picture_url        0
soft_attributes    0
dtype: int64


Data Cleaning & Preprocessing 

In [10]:
def clean_airbnb_listings(
    df: pd.DataFrame,
    bedroom_range: tuple = (1, 10),
    price_range: tuple = (500, 10000),
    min_text_length: int = 30,
) -> pd.DataFrame:
    """Clean structured fields and free text of a listings frame.

    Steps, in order:
      1. Drop rows with a null in any of: id, soft_attributes, price, bedrooms,
         available_from, state.
      2. Coerce price and bedrooms to numeric and keep rows whose values fall
         inside ``price_range`` / ``bedroom_range`` (both bounds inclusive).
      3. Upper-case and trim ``available_from`` and ``state``.
      4. Drop duplicate ``id`` rows, keeping the first occurrence.
      5. Strip HTML from ``soft_attributes`` and collapse whitespace.
      6. Keep rows whose cleaned text is longer than ``min_text_length`` characters.

    Args:
        df: Listings frame containing at least the columns named in step 1.
            It is not modified.
        bedroom_range: Inclusive (low, high) bounds for ``bedrooms``.
        price_range: Inclusive (low, high) bounds for ``price``.
        min_text_length: Rows whose cleaned text has this many characters or
            fewer are dropped.

    Returns:
        A new DataFrame with the same columns as ``df``, holding the rows that
        survive every step. Row labels of surviving rows are preserved.

    Raises:
        KeyError: If a required column is missing (raised by ``DataFrame.dropna``).
    """
    # -----------------------------
    # a. Structured Data Cleaning
    # -----------------------------

    critical_fields = ['id', 'soft_attributes', 'price', 'bedrooms', 'available_from', 'state']

    # .copy() so the assignments below write to an independent frame rather than
    # to a slice of the caller's data (avoids SettingWithCopyWarning).
    cleaned = df.dropna(subset=critical_fields).copy()

    # Unparseable values become NaN; between() is False for NaN, so they are dropped.
    cleaned['price'] = pd.to_numeric(cleaned['price'], errors='coerce')
    cleaned['bedrooms'] = pd.to_numeric(cleaned['bedrooms'], downcast='integer', errors='coerce')

    in_range = (
        cleaned['bedrooms'].between(*bedroom_range)
        & cleaned['price'].between(*price_range)
    )
    cleaned = cleaned.loc[in_range].copy()

    # Normalize availability and state labels (uppercase, trimmed)
    cleaned['available_from'] = cleaned['available_from'].astype(str).str.upper().str.strip()
    cleaned['state'] = cleaned['state'].astype(str).str.upper().str.strip()

    # -----------------------------
    # b. Deduplication
    # -----------------------------

    # Done before text cleaning so HTML parsing is not spent on rows that are
    # about to be discarded; the surviving rows are the same either way.
    cleaned = cleaned.drop_duplicates(subset='id')

    # -----------------------------
    # c. Unstructured Text Preprocessing
    # -----------------------------

    def clean_text(text):
        """Return `text` as plain, single-spaced text with HTML removed."""
        if pd.isna(text):
            return ''
        # Unescape HTML entities (also turns escaped markup into real tags)
        text = html.unescape(str(text))
        # Remove HTML tags. The separator keeps words on either side of a tag
        # apart, e.g. "stay<br /><b>The space" -> "stay The space", not "stayThe space".
        text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
        # Round-trip through UTF-8, dropping code points that cannot be encoded
        # (e.g. lone surrogates)
        text = text.encode("utf-8", "ignore").decode("utf-8")
        # Collapse every whitespace run into a single space
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    cleaned = cleaned.assign(soft_attributes=cleaned['soft_attributes'].map(clean_text))

    # -----------------------------
    # d. Filtering
    # -----------------------------

    # Drop listings whose cleaned text is too short to be informative
    return cleaned[cleaned['soft_attributes'].str.len() > min_text_length]


In [11]:
# Run the cleaning function and rebind df to the frame it returns.
df = clean_airbnb_listings(df)

In [12]:
# Write the cleaned listings to CSV; index=False leaves the DataFrame index out of the file.
df.to_csv('../data/processed/cleaned_airbnb_listings.csv', index=False)

In [13]:
# Read the processed CSV back from disk and display its column labels.
df = pd.read_csv('../data/processed/cleaned_airbnb_listings.csv')
df.columns

Index(['id', 'bedrooms', 'price', 'available_from', 'state', 'picture_url',
       'soft_attributes'],
      dtype='object')