In [2]:
import requests
import pandas as pd
from datetime import datetime

In [4]:
def download_airbnb_data(city="bristol", quarters=None):
    """Download Inside Airbnb listings, reviews and neighbourhood GeoJSON.

    For every requested quarter the listings and reviews CSVs are read straight
    from data.insideairbnb.com, tagged with a ``quarter`` column, combined,
    de-duplicated on ``id`` (keeping the row from the latest quarter) and
    written to ``{city}_airbnb_listings.csv`` / ``{city}_airbnb_reviews.csv``
    in the current working directory. The neighbourhood GeoJSON of the first
    quarter in the list is saved as ``{city}_neighbourhoods.geojson``.

    Args:
        city: City slug used in the download URL and in the output file names.
            The URL path is fixed to ``united-kingdom/england``, so only
            cities published under that path will resolve.
        quarters: Iterable of snapshot dates (``"YYYY-MM-DD"`` strings). The
            first entry is treated as the latest one for the GeoJSON download.
            Defaults to four snapshots from 2023-09 to 2024-06.

    Returns:
        dict with keys ``'listings'`` and ``'reviews'``. Each value is the
        combined DataFrame, or an empty list when nothing could be downloaded
        for that file type.

    Download and parsing errors are reported with ``print`` and do not raise;
    the affected file is simply skipped (best-effort behaviour).
    """
    if quarters is None:
        quarters = [
            "2024-06-26",  # Latest quarter
            "2024-03-28",  # Q1 2024
            "2023-12-25",  # Q4 2023
            "2023-09-22"   # Q3 2023
        ]
    else:
        # Materialise once so generators work and indexing below is safe.
        quarters = list(quarters)

    all_data = {
        'listings': [],
        'reviews': []
    }

    # Honour the ``city`` argument instead of a hard-coded city name.
    site_root = f"http://data.insideairbnb.com/united-kingdom/england/{city}"

    # Download quarterly data (listings and reviews)
    for quarter in quarters:
        print(f"\nDownloading data for {quarter}...")
        base_url = f"{site_root}/{quarter}/data"

        # Define files to download
        files = {
            'listings': f"{base_url}/listings.csv",
            'reviews': f"{base_url}/reviews.csv"
        }

        # Download each file type
        for file_type, url in files.items():
            try:
                print(f"Downloading {file_type}...")
                df = pd.read_csv(url)
                df['quarter'] = quarter

                # Special processing for listings file
                if file_type == 'listings':
                    price = df['price']
                    # Only strip currency formatting from text columns; a price
                    # column that was parsed as numeric has no ``.str`` accessor.
                    if not pd.api.types.is_numeric_dtype(price):
                        # regex=False: '$' must be matched literally, not as
                        # the end-of-string anchor.
                        price = (
                            price.str.replace('$', '', regex=False)
                                 .str.replace(',', '', regex=False)
                        )
                    df['price'] = price.astype(float)
                    df['last_review'] = pd.to_datetime(df['last_review'])

                all_data[file_type].append(df)
                print(f"Successfully downloaded {file_type}")

            except Exception as e:
                # Best-effort: report and carry on with the remaining files.
                print(f"Error downloading {file_type}: {str(e)}")

    # Download GeoJSON (only need the latest one)
    if quarters:
        latest_quarter = quarters[0]
        geojson_url = f"{site_root}/{latest_quarter}/visualisations/neighbourhoods.geojson"
        try:
            print("\nDownloading GeoJSON file...")
            # A timeout keeps an unresponsive server from hanging the cell.
            response = requests.get(geojson_url, timeout=60)
            response.raise_for_status()  # Raises an HTTPError for bad responses

            # Save GeoJSON file
            with open(f'{city}_neighbourhoods.geojson', 'wb') as f:
                f.write(response.content)
            print("Successfully downloaded GeoJSON file")
        except Exception as e:
            print(f"Error downloading GeoJSON: {str(e)}")
    else:
        print("\nNo quarters requested; skipping GeoJSON download")

    # Process and save listing and review data.
    # Iterate over a snapshot because entries are reassigned inside the loop.
    for file_type, data_list in list(all_data.items()):
        if data_list:
            # Combine all quarters
            combined_df = pd.concat(data_list, ignore_index=True)

            # Remove duplicates (for listings and reviews): after sorting by
            # quarter, keep='last' retains the most recent snapshot of each id.
            combined_df = combined_df.sort_values('quarter').drop_duplicates('id', keep='last')

            # Save to CSV
            output_file = f'{city}_airbnb_{file_type}.csv'
            combined_df.to_csv(output_file, index=False)
            print(f"\nSaved {file_type} data to {output_file}")

            # Store in dictionary for return
            all_data[file_type] = combined_df
        else:
            print(f"\nNo data available for {file_type}")

    return all_data

# Run the download if this file is run directly
if __name__ == "__main__":
    data = download_airbnb_data()

    # download_airbnb_data always returns both keys; a key holds an empty list
    # (not a DataFrame) when nothing could be downloaded. Checking the value's
    # type avoids indexing a list with a column name after a failed download.

    # Print some basic statistics for the datasets
    listings_df = data.get('listings')
    if isinstance(listings_df, pd.DataFrame):
        print("\nListings Data Overview:")
        print(f"Total listings: {len(listings_df)}")
        print(f"Average price: ${listings_df['price'].mean():.2f}")
        print(f"Number of neighborhoods: {listings_df['neighbourhood'].nunique()}")

    reviews_df = data.get('reviews')
    if isinstance(reviews_df, pd.DataFrame):
        print("\nReviews Data Overview:")
        print(f"Total reviews: {len(reviews_df)}")
        print(f"Number of unique listings reviewed: {reviews_df['listing_id'].nunique()}")


Downloading data for 2024-06-26...
Downloading listings...
Successfully downloaded listings
Downloading reviews...
Successfully downloaded reviews

Downloading data for 2024-03-28...
Downloading listings...
Successfully downloaded listings
Downloading reviews...
Successfully downloaded reviews

Downloading data for 2023-12-25...
Downloading listings...
Successfully downloaded listings
Downloading reviews...
Successfully downloaded reviews

Downloading data for 2023-09-22...
Downloading listings...
Successfully downloaded listings
Downloading reviews...
Successfully downloaded reviews

Downloading GeoJSON file...
Successfully downloaded GeoJSON file

Saved listings data to bristol_airbnb_listings.csv

Saved reviews data to bristol_airbnb_reviews.csv

Listings Data Overview:
Total listings: 3505
Average price: $134.79
Number of neighborhoods: 42

Reviews Data Overview:
Total reviews: 152100
Number of unique listings reviewed: 2917


In [4]:
# Load only the first five rows: enough to inspect the column dtypes and the
# raw text of the 'name' column without reading the whole file.
df = pd.read_csv('bristol_airbnb_listings.csv', nrows=5)
print("Column types:", df.dtypes, sep="\n")
print("\nSample of 'name' column:", df['name'], sep="\n")

Column types:
id                                                int64
listing_url                                      object
scrape_id                                         int64
last_scraped                                     object
source                                           object
                                                 ...   
calculated_host_listings_count_entire_homes       int64
calculated_host_listings_count_private_rooms      int64
calculated_host_listings_count_shared_rooms       int64
reviews_per_month                               float64
quarter                                          object
Length: 76, dtype: object

Sample of 'name' column:
0    Home in Bristol · ★4.83 · 1 bedroom · 1 bed · ...
1    Bed and breakfast in City of Bristol · ★4.35 ·...
2    Bed and breakfast in Bristol · ★4.13 · 1 bedro...
3    Bed and breakfast in England · ★4.14 · 1 bedro...
4    Bed and breakfast in Bristol · ★3.98 · 1 bedro...
Name: name, dtype: object


In [5]:
# Read the combined listings CSV
df = pd.read_csv('bristol_airbnb_listings.csv')

# Clean the name column - replace special characters.
# regex is stated explicitly on every call because the default of
# Series.str.replace differs between pandas versions.
df['name'] = (
    df['name']
    .str.replace('·', '-', regex=False)      # Replace middle dot with dash
    .str.replace('★', '', regex=False)       # Remove star
    .str.replace(r' {2,}', ' ', regex=True)  # Collapse any run of spaces to one
)

# Save cleaned CSV. 'utf-8-sig' prepends a UTF-8 byte-order mark to the file
# (presumably so spreadsheet tools detect the encoding - confirm the consumer).
df.to_csv('bristol_airbnb_listings_clean.csv', index=False, encoding='utf-8-sig')