In [None]:
from google.colab import drive
import requests
import re

# Mount Google Drive at /content/drive so files can be written under my_path.
drive.mount('/content/drive')

# One Inside Airbnb listings snapshot per California market,
# described as (URL slug, snapshot date).
base_url = "https://data.insideairbnb.com/united-states/ca"
snapshots = [
    ("los-angeles", "2024-09-04"),
    ("oakland", "2024-09-21"),
    ("pacific-grove", "2024-09-30"),
    ("san-diego", "2024-09-21"),
    ("san-francisco", "2024-09-04"),
    ("san-mateo-county", "2024-09-21"),
    ("santa-clara-county", "2024-09-21"),
    ("santa-cruz-county", "2024-09-29"),
]
urls = [f"{base_url}/{slug}/{date}/data/listings.csv.gz" for slug, date in snapshots]

# Drive folder that receives the downloaded archives.
my_path = "/content/drive/My Drive/BA820/data/"



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Build a readable label for each URL: capture the path segment that sits
# between "ca/" and the YYYY-MM-DD snapshot date, then turn hyphens into spaces.
slug_pattern = re.compile(r"ca/([^/]+)/\d{4}-\d{2}-\d{2}")

city_names = []
for link in urls:
    slug = slug_pattern.search(link).group(1)
    city_names.append(slug.replace("-", " "))

city_names

['los angeles',
 'oakland',
 'pacific grove',
 'san diego',
 'san francisco',
 'san mateo county',
 'santa clara county',
 'santa cruz county']

In [None]:
# Download every snapshot into the Drive folder as "<city name>.csv.gz".
for url, city in zip(urls, city_names):
  save_path = my_path + city + ".csv.gz"

  # The context manager closes the streamed connection even if writing fails;
  # the timeout keeps a stalled server from hanging the cell indefinitely.
  with requests.get(url, stream=True, timeout=60) as response:
      if response.status_code == 200:
          with open(save_path, "wb") as file:
              # 1 MiB chunks: far fewer write calls than 1 KiB for multi-MB archives.
              for chunk in response.iter_content(chunk_size=1024 * 1024):
                  file.write(chunk)
          print(f"File downloaded to: {save_path}")
      else:
          print("Download failed, status code:", response.status_code)

File downloaded to: /content/drive/My Drive/BA820/data/los angeles.csv.gz
File downloaded to: /content/drive/My Drive/BA820/data/oakland.csv.gz
File downloaded to: /content/drive/My Drive/BA820/data/pacific grove.csv.gz
File downloaded to: /content/drive/My Drive/BA820/data/san diego.csv.gz
File downloaded to: /content/drive/My Drive/BA820/data/san francisco.csv.gz
File downloaded to: /content/drive/My Drive/BA820/data/san mateo county.csv.gz
File downloaded to: /content/drive/My Drive/BA820/data/santa clara county.csv.gz
File downloaded to: /content/drive/My Drive/BA820/data/santa cruz county.csv.gz


In [None]:
import os
import pandas as pd
import gzip


folder_path = "/content/drive/My Drive/BA820/data/"
output_file = "/content/drive/My Drive/BA820/data/merged_listings.csv"

# Collect every per-city archive in the folder. os.listdir returns entries in
# arbitrary order, so sort them to keep the merged row order reproducible.
gz_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv.gz"))
if not gz_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No .csv.gz files found in {folder_path}")


df_list = []

for file in gz_files:
    file_path = os.path.join(folder_path, file)
    print(f"Reading: {file_path}")

    # pandas infers gzip compression from the ".gz" suffix and decodes as
    # UTF-8 by default, so no manual gzip.open wrapper is needed.
    df = pd.read_csv(file_path)
    df_list.append(df)

# Stack all city frames into one table with a fresh 0..n-1 index.
merged_df = pd.concat(df_list, ignore_index=True)

# Write the combined table as a plain CSV (without the index column).
merged_df.to_csv(output_file, index=False)
print(f"Done and save the file to: {output_file}")

Reading: /content/drive/My Drive/BA820/data/los angeles.csv.gz
Reading: /content/drive/My Drive/BA820/data/oakland.csv.gz
Reading: /content/drive/My Drive/BA820/data/pacific grove.csv.gz
Reading: /content/drive/My Drive/BA820/data/san diego.csv.gz
Reading: /content/drive/My Drive/BA820/data/san francisco.csv.gz
Reading: /content/drive/My Drive/BA820/data/san mateo county.csv.gz
Reading: /content/drive/My Drive/BA820/data/santa clara county.csv.gz
Reading: /content/drive/My Drive/BA820/data/santa cruz county.csv.gz
Done and save the file to: /content/drive/My Drive/BA820/data/merged_listings.csv


In [None]:
# Reload the merged listings from Drive.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the recorded output of this cell looks like a pandas warning
# echo, presumably DtypeWarning -- confirm it disappears with this option.
merged_df = pd.read_csv("/content/drive/My Drive/BA820/data/merged_listings.csv", low_memory=False)

  merged_df = pd.read_csv("/content/drive/My Drive/BA820/data/merged_listings.csv")


In [None]:
# Preview the first five rows of the merged listings.
merged_df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,670339032744709144,https://www.airbnb.com/rooms/670339032744709144,20240904164210,2024-09-05,city scrape,Westwood lovely three bedrooms three bathrooms,The whole group will enjoy easy access to ever...,,https://a0.muscache.com/pictures/miso/Hosting-...,4780152,...,,,,,f,61,4,57,0,
1,37014494,https://www.airbnb.com/rooms/37014494,20240904164210,2024-09-05,previous scrape,Spanish style lower duplex near Beverly Hills,,,https://a0.muscache.com/pictures/65db39d1-c503...,278288178,...,,,,,f,1,1,0,0,
2,1024835174766068422,https://www.airbnb.com/rooms/1024835174766068422,20240904164210,2024-09-05,city scrape,Charming Beverly Hills Home,"Charming, renovated home in the Beverly Hills ...",,https://a0.muscache.com/pictures/miso/Hosting-...,513813179,...,,,,,f,4,4,0,0,
3,850744632375448560,https://www.airbnb.com/rooms/850744632375448560,20240904164210,2024-09-04,city scrape,Tianpu's warm room with bathroom,Relax in this unique and serene retreat.,,https://a0.muscache.com/pictures/miso/Hosting-...,432956623,...,5.0,5.0,3.0,,f,4,1,3,0,0.06
4,953950676345326970,https://www.airbnb.com/rooms/953950676345326970,20240904164210,2024-09-05,city scrape,"Santa Monica apt, free parking, steps to the b...",Welcome to our stunning apartment nestled in t...,,https://a0.muscache.com/pictures/3853334b-4562...,528669205,...,5.0,4.73,4.77,Exempt,t,3,3,0,0,3.41


In [None]:
# Summarise the merged frame: row count, per-column non-null counts and dtypes.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80456 entries, 0 to 80455
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            80456 non-null  int64  
 1   listing_url                                   80456 non-null  object 
 2   scrape_id                                     80456 non-null  int64  
 3   last_scraped                                  80456 non-null  object 
 4   source                                        80456 non-null  object 
 5   name                                          80455 non-null  object 
 6   description                                   78262 non-null  object 
 7   neighborhood_overview                         44400 non-null  object 
 8   picture_url                                   80456 non-null  object 
 9   host_id                                       80456 non-null 