<a href="https://colab.research.google.com/github/L-4-r-s/AirBNB_Scraping/blob/main/AirBNBInfos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs

In [None]:
# Install the OpenRouteService client that the geocoding and routing helpers below rely on.
!pip install openrouteservice



# Imports

In [None]:
import os
import re

import openrouteservice
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Konstanten

In [None]:
# OpenRouteService API key (create one at https://openrouteservice.org/).
# The key is read from the ORS_API_KEY environment variable so that a real key
# never has to be stored in the notebook itself. The placeholder is only a
# fallback and is not a working key: set the variable or replace the
# placeholder before running the ORS-backed cells.
ORS_API_KEY = os.environ.get('ORS_API_KEY', 'your_service_key')

# Hilfsmethoden

In [None]:
def get_bed_nr(soup):
  """Extract the number of beds from a listing page's Open Graph title.

  Args:
    soup: Parsed page object that supports
      ``find('meta', {'property': 'og:title'})`` (e.g. a BeautifulSoup tree).

  Returns:
    The bed count as a string of digits (e.g. ``'10'``), or ``None`` when the
    ``og:title`` meta tag is missing or its content does not mention beds.
    A short notice is printed in both failure cases.
  """
  og_title_tag = soup.find('meta', {'property': 'og:title'})
  if not og_title_tag:
    print("OG Title not found.")
    return None

  og_title = og_title_tag.get('content', '')

  # Accept the singular "Bett" as well as the plural "Betten"; otherwise a
  # listing with exactly one bed is reported as having no bed information.
  beds_match = re.search(r'(\d+)\s*Bett(?:en)?\b', og_title)
  if beds_match is None:
    print("Number of beds not found.")
    return None

  return beds_match.group(1)

# Scrape the relevant details from the raw page HTML
def get_person_nr(string):
  """Extract the maximum number of guests from the page's ``overviewItems`` data.

  Only the first ``"overviewItems":[...]`` array found in ``string`` is
  inspected.

  Args:
    string: The page source as text.

  Returns:
    The guest count as a string of digits (e.g. ``'12'``), or ``None`` when no
    ``overviewItems`` array exists or it contains no guest count. A short
    notice is printed in both failure cases.
  """
  overview_items_match = re.search(r'"overviewItems":\[(.*?)\]', string)
  if overview_items_match is None:
      print("Overview items not found.")
      return None

  overview_items_str = overview_items_match.group(1)

  # Besides the plain plural ("12 Gäste"), also accept an open-ended count
  # written with a trailing plus ("16+ Gäste") and the singular ("1 Gast");
  # with a plural-only pattern such listings end up without a guest number.
  guests_match = re.search(r'(\d+)\+?\s*(?:Gäste|Gast)\b', overview_items_str)
  if guests_match is None:
      print("Number of guests not found in overviewItems.")
      return None

  return guests_match.group(1)

def get_location(string):
  """Read the listing coordinates from the embedded ``LocationSection`` data.

  Args:
    string: The page source as text.

  Returns:
    A ``(lat, lng)`` tuple of strings exactly as they appear in the page, taken
    from the first ``LocationSection`` occurrence, or ``None`` (after printing
    a notice) when no such section with coordinates is present.
  """
  coordinates = re.search(
      r'"section":\{"__typename":"LocationSection".*?"lat":([0-9.-]+),"lng":([0-9.-]+)',
      string,
  )
  if coordinates is None:
      print("Location data not found.")
      return None

  # The two capture groups are latitude and longitude, in that order.
  return coordinates.groups()

def get_activities(string):
  """Collect the leisure amenities mentioned anywhere in the page text.

  Args:
    string: The page source as text.

  Returns:
    A set with every keyword from the fixed amenity list that occurs in
    ``string`` as a whole word (the match is case-sensitive); an empty set
    when none occurs.
  """
  # Amenity keywords to look for
  keywords = (
      'Tischtennisplatte',
      'Billardtisch',
      'Whirlpool',
      'Fußballtor',
      'Volleyballnetz',
      'Badminton',
      'Basketball',
      'Sauna',
      'Tischfußball',
      'Tennisplatz',
  )

  # One alternation over all keywords, anchored on word boundaries so that a
  # keyword embedded in a longer word does not count.
  alternatives = '|'.join(re.escape(word) for word in keywords)
  pattern = r'\b(?:' + alternatives + r')\b'

  # A set comprehension removes repeated mentions of the same amenity.
  return {hit.group(0) for hit in re.finditer(pattern, string)}

In [None]:
# Determine the country the villa is located in
def get_country_from_coordinates(lat, lon):
    """Reverse-geocode a coordinate pair to a country name via OpenRouteService.

    Args:
        lat: Latitude of the point.
        lon: Longitude of the point.

    Returns:
        The ``country`` property of the first feature in the response, the
        string ``"Country not found"`` when the response has no features, or
        ``None`` (after printing the error) when the lookup raises.
    """
    client = openrouteservice.Client(key=ORS_API_KEY)

    try:
        # The point is handed over in (longitude, latitude) order; size=1
        # limits the response to a single result.
        result = client.pelias_reverse((lon, lat), size=1)
        features = result['features']
        if not features:
            return "Country not found"
        return features[0]['properties']['country']
    except Exception as e:
        print(f"Error getting country: {e}")
        return None

In [None]:
# Determine the driving time (from Marburg unless another origin is given)
def get_driving_time(lat, lon, origin=(8.775319, 50.809449)):
    """Compute the car driving time from ``origin`` to a destination via OpenRouteService.

    Args:
        lat: Latitude of the destination (number or numeric string).
        lon: Longitude of the destination (number or numeric string).
        origin: Start point as a ``(longitude, latitude)`` pair. Defaults to
            Marburg, Germany, which keeps existing two-argument calls unchanged.

    Returns:
        The duration formatted as ``"H:MM"``, or ``None`` (after printing a
        notice) when no route is returned or the request raises.
    """
    client = openrouteservice.Client(key=ORS_API_KEY)

    try:
        # Coordinates scraped from the page arrive as strings; convert them so
        # the request carries numeric values. A non-numeric value raises here
        # and is reported through the except branch below.
        destination = (float(lon), float(lat))

        routes = client.directions(
            coordinates=[origin, destination],
            profile='driving-car',
            format='geojson'
        )

        if not routes['features']:
            print("No route found for the given coordinates.")
            return None

        # Duration of the first segment of the first route, in seconds.
        duration = routes['features'][0]['properties']['segments'][0]['duration']
        hours, minutes = divmod(int(duration // 60), 60)
        return f"{hours}:{minutes:02d}"
    except Exception as e:
        print(f"Error fetching driving time: {e}")
        return None

# Hauptmethode

In [None]:
# Collect all Airbnb details for one listing
def get_infos(url):
  """Scrape one Airbnb listing and return its key facts as a one-row DataFrame.

  Args:
    url: URL of the listing page.

  Returns:
    A ``pandas.DataFrame`` with a single row and the columns ``link``,
    ``country``, ``driving_time``, ``guests``, ``beds``, ``activities``
    (comma-separated, alphabetically sorted), ``lon`` and ``lat``. Fields that
    could not be determined are ``None``.

  Raises:
    requests.RequestException: If the page cannot be fetched, including when
      the request times out.
  """
  data = {}
  # Without a timeout a stalled connection would block the notebook forever.
  response = requests.get(url, timeout=30)
  soup = BeautifulSoup(response.content, 'html.parser')
  string = str(soup)
  data['link'] = url

  # get_location returns None when the page carries no location data.
  # Unpacking that directly would raise a TypeError, so fall back to empty
  # coordinates and skip the two lookups that need them.
  location = get_location(string)
  has_location = location is not None
  lat, lon = location if has_location else (None, None)

  data['country'] = get_country_from_coordinates(lat, lon) if has_location else None
  data['lat'], data['lon'] = lat, lon
  data['beds'] = get_bed_nr(soup)
  data['guests'] = get_person_nr(string)
  data['driving_time'] = get_driving_time(lat, lon) if has_location else None
  # Store the activity set as a stable, human-readable string.
  data['activities'] = ", ".join(sorted(get_activities(string)))

  # Columns to display, in presentation order
  columns = ["link", "country", "driving_time", "guests", "beds", "activities", "lon", "lat"]
  df = pd.DataFrame([{col: data[col] for col in columns}])
  return df

# Airbnb link

In [None]:
# Scrape a single example listing and render the resulting one-row table.
url = "https://www.airbnb.de/rooms/43431749"
listing_df = get_infos(url)
display(listing_df)

Unnamed: 0,link,country,driving_time,guests,beds,activities,lon,lat
0,https://www.airbnb.de/rooms/43431749,France,4:12,12,10,"Sauna, Whirlpool",6.39121,48.35651


# List of Airbnb links

In [None]:
# Listings to compare
links = [
    "https://www.airbnb.de/rooms/49055401",
    "https://www.airbnb.de/rooms/26794850",
    "https://www.airbnb.de/rooms/46986726",
    "https://www.airbnb.de/rooms/562004607656981846",
    "https://www.airbnb.de/rooms/1029640353337109428",
    "https://www.airbnb.de/rooms/1094722094121085656",
    "https://www.airbnb.de/rooms/33180719",
    "https://www.airbnb.de/rooms/592831712248477763",
    "https://www.airbnb.de/rooms/687548706203524707",
    "https://www.airbnb.de/rooms/48528596",
    "https://www.airbnb.de/rooms/26479752",
    "https://www.airbnb.de/rooms/52914404",
    "https://www.airbnb.de/rooms/657000380175876838",
    "https://www.airbnb.de/rooms/43886371",
    "https://www.airbnb.de/rooms/841997557272321035",
    "https://www.airbnb.de/rooms/47938266",
    "https://www.airbnb.de/rooms/42778753",
    "https://www.airbnb.de/rooms/623695778226623821",
    "https://www.airbnb.de/rooms/43204750",
    "https://www.airbnb.de/rooms/22137245",
    "https://www.airbnb.de/rooms/41915628",
    "https://www.airbnb.de/rooms/810835311374702888",
    "https://www.airbnb.de/rooms/47823466",
    "https://www.airbnb.de/rooms/1317865669944768171",
    "https://www.airbnb.de/rooms/49869683",
    "https://www.airbnb.de/rooms/37440827",
    "https://www.airbnb.de/rooms/1040730572514479792",
    "https://www.airbnb.de/rooms/48476296",
    "https://www.airbnb.de/rooms/43345696",
]

dataframes = []
failed_links = []  # listings that could not be scraped, kept for a later retry

# Scrape every listing. A failure on one page is reported and recorded instead
# of aborting the loop, so the results already collected for the other
# listings are not lost.
for offer in links:
    try:
        df = get_infos(offer)
    except Exception as exc:
        failed_links.append(offer)
        print(f"Skipping {offer}: {exc}")
        continue
    dataframes.append(df)

# pd.concat raises an unhelpful error on an empty list; fail with a clear
# message when not a single listing could be processed.
if not dataframes:
    raise RuntimeError("None of the listings could be scraped.")

# Concatenate all the one-row dataframes into one table
full_df = pd.concat(dataframes, ignore_index=True)

# Sort the table by country
full_df_sorted = full_df.sort_values(by='country')

# Display the final table
display(full_df_sorted)

Number of guests not found in overviewItems.


Unnamed: 0,link,country,driving_time,guests,beds,activities,lon,lat
7,https://www.airbnb.de/rooms/592831712248477763,Croatia,10:21,14.0,9,Tischtennisplatte,15.6012,45.1298
14,https://www.airbnb.de/rooms/841997557272321035,France,7:42,15.0,10,"Tischfußball, Tischtennisplatte",4.01676,45.65788
26,https://www.airbnb.de/rooms/1040730572514479792,France,4:26,14.0,8,Billardtisch,4.8721,49.341
25,https://www.airbnb.de/rooms/37440827,France,3:04,12.0,8,"Billardtisch, Tischfußball, Tischtennisplatte",7.61371,48.85127
24,https://www.airbnb.de/rooms/49869683,France,6:10,13.0,8,Tischtennisplatte,4.32016,47.33476
23,https://www.airbnb.de/rooms/1317865669944768171,France,4:48,12.0,6,"Tischfußball, Whirlpool",6.103,47.7721
21,https://www.airbnb.de/rooms/810835311374702888,France,10:11,12.0,8,Tischtennisplatte,-1.57917,46.51435
18,https://www.airbnb.de/rooms/43204750,France,8:58,15.0,12,"Billardtisch, Tischfußball, Tischtennisplatte,...",-1.44856,48.12261
17,https://www.airbnb.de/rooms/623695778226623821,France,6:35,14.0,10,Tischtennisplatte,4.91733,46.12145
16,https://www.airbnb.de/rooms/42778753,France,10:37,12.0,7,"Billardtisch, Tischfußball, Tischtennisplatte,...",-2.99964,47.68623


# Convert to LaTeX table

In [None]:
# Work on a sorted copy so the scraped table itself stays untouched.
df = full_df.sort_values(by='country').copy()

# Split each comma-separated activities string into a list of names. The
# filter drops the empty entry that ''.split(', ') yields for listings without
# any activity; without it an empty-named column would be created and ticked
# for every row.
activity_lists = df['activities'].apply(
    lambda text: [name for name in text.split(', ') if name]
)

# Extract the unique activities across all listings
all_activities = set()
for activities in activity_lists:
    all_activities.update(activities)

# One column per activity holding a check mark or a cross. Iterating in sorted
# order keeps the column order identical between runs, and membership is
# tested against the split list rather than as a substring of the raw text.
for activity in sorted(all_activities):
    df[activity] = activity_lists.apply(
        lambda names: '$\\checkmark$' if activity in names else '$\\times$'
    )

# Turn the listing URL into a clickable link
df['link'] = df['link'].apply(lambda x: '\\href{' + x + '}{link}')

# Create the clickable Google Maps link for each row
df['location'] = (
    '\\href{https://www.google.com/maps?q='
    + df['lat'].map(str) + ',' + df['lon'].map(str)
    + '}{maps}'
)

# The raw activities text and the coordinates are now represented by the
# columns created above.
df = df.drop(columns=['activities', 'lat', 'lon'])

# Generate the LaTeX code for the table. escape=False keeps the \href and math
# markup intact, so underscores are escaped by hand afterwards.
latex_code = df.to_latex(index=False, escape=False)
latex_code = latex_code.replace("_", "\\_")

# Print the LaTeX code
print(latex_code)

\begin{tabular}{lllllllllllllll}
\toprule
link & country & driving\_time & guests & beds & Volleyballnetz & Badminton & Sauna & Billardtisch & Fußballtor & Tischtennisplatte & Tennisplatz & Tischfußball & Whirlpool & location \\
\midrule
\href{https://www.airbnb.de/rooms/592831712248477763}{link} & Croatia & 10:21 & 14 & 9 & $\times$ & $\times$ & $\times$ & $\times$ & $\times$ & $\checkmark$ & $\times$ & $\times$ & $\times$ & \href{https://www.google.com/maps?q=45.1298,15.6012}{maps} \\
\href{https://www.airbnb.de/rooms/841997557272321035}{link} & France & 7:42 & 15 & 10 & $\times$ & $\times$ & $\times$ & $\times$ & $\times$ & $\checkmark$ & $\times$ & $\checkmark$ & $\times$ & \href{https://www.google.com/maps?q=45.65788,4.01676}{maps} \\
\href{https://www.airbnb.de/rooms/1040730572514479792}{link} & France & 4:26 & 14 & 8 & $\times$ & $\times$ & $\times$ & $\checkmark$ & $\times$ & $\times$ & $\times$ & $\times$ & $\times$ & \href{https://www.google.com/maps?q=49.341,4.8721}{maps} \\