In [1]:
import bs4
import requests
from urllib.parse import urlparse, parse_qs
import pandas as pd
import re
import unicodedata

In [2]:
# Crawl the paginated search results and collect, for every listed villa, its
# display name, absolute detail-page URL, address text and a derived ID.
# The four lists are consumed by later cells and must stay index-aligned.
names = []
urls = []
addresses = []
ids = []

BASE_URL = "https://www.hellovillam.com"
REQUEST_TIMEOUT = 30  # seconds; without it requests.get can block forever

for page_number in range(1, 140):
    URL = f"{BASE_URL}/ara?page={page_number}"
    try:
        response = requests.get(URL, timeout=REQUEST_TIMEOUT)
        # Do not parse an HTTP error page as if it were a result page.
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"{URL} : URL ERROR! ({error})")
        continue
    response.encoding = "utf-8"
    html_parsed = bs4.BeautifulSoup(response.text, "html.parser")

    title_links = html_parsed.select("div.info a.villa-title")
    for villa_url in title_links:
        urls.append(BASE_URL + villa_url["href"].strip())
        villa_name = villa_url["title"].strip()
        # The ID is the lower-cased name with spaces removed.
        villa_id = villa_name.lower().replace(" ", "")
        names.append(villa_name)
        ids.append(villa_id)

    region_blocks = html_parsed.select("div.region.mb-3")
    for region_block in region_blocks:
        addresses.append(region_block.getText().strip())

    # Titles and addresses come from two independent selectors; if their
    # counts differ the lists drift apart and building the DataFrame later
    # fails (or pairs a villa with the wrong address). Surface it right away.
    if len(region_blocks) != len(title_links):
        print(
            f"{URL} : ADDRESS COUNT MISMATCH! "
            f"({len(title_links)} villas, {len(region_blocks)} addresses)"
        )

    if page_number % 10 == 0:
        print(f"{page_number} pages have been checked!")

print("\n---OPERATION HAS BEEN COMPLETED!---")
print(f"{page_number} pages have been checked!")
print(f"Total Villa Name: {len(names)}")
print(f"Total Villa URL: {len(urls)}")

10 pages have been checked!
20 pages have been checked!
30 pages have been checked!
40 pages have been checked!
50 pages have been checked!
60 pages have been checked!
70 pages have been checked!
80 pages have been checked!
90 pages have been checked!
100 pages have been checked!
110 pages have been checked!
120 pages have been checked!
130 pages have been checked!

---OPERATION HAS BEEN COMPLETED!---
139 pages have been checked!
Total Villa Name: 2085
Total Villa URL: 2085


In [3]:
# Visit every detail page in `urls` and extract capacity, photos, coordinates,
# description and price estimates. Exactly one value is appended to every
# list per URL (None when a field or the whole page is unavailable), so the
# lists stay index-aligned with the lists built from the listing pages.
photos = []
latitudes = []
longitudes = []
bedrooms = []
guests = []
bathrooms = []
average_prices = []
q1_prices = []
q2_prices = []
q3_prices = []
q4_prices = []
descriptions = []

URL_ERRORS = []
villa_number = 0

REQUEST_TIMEOUT = 30  # seconds; without it requests.get can block forever
PROPERTY_SELECTOR = (
    ".content.d-flex.align-items-center.flex-wrap "
    "div.properties.d-flex.align-items-center.justify-content-center"
    ".justify-content-md-start.col"
)
# Position of each capacity figure among the elements matched above.
BEDROOM_INDEX, GUEST_INDEX, BATHROOM_INDEX = 0, 1, 2
# Quarterly weights used to extrapolate from the scraped (Q4) price.
# NOTE(review): presumably each quarter's share of the yearly total - confirm.
Q1_SHARE, Q2_SHARE, Q3_SHARE, Q4_SHARE = 0.14, 0.32, 0.37, 0.17


def _leading_count(tag):
    """Return the whole number at the start of ``tag``'s text.

    Reads all leading digits, so "12 ..." yields 12 rather than 1.
    Raises ValueError when the text does not start with a digit.
    """
    match = re.match(r"\d+", tag.getText().strip())
    if match is None:
        raise ValueError("property text does not start with a number")
    return int(match.group())


for URL in urls:
    try:
        response = requests.get(URL, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        response.encoding = "utf-8"
        html_parsed = bs4.BeautifulSoup(response.text, "html.parser")
    except requests.RequestException:
        print(f"{URL} : URL ERROR!")
        URL_ERRORS.append(URL)
        # Record an empty row instead of re-parsing the previous villa's page,
        # which would attribute another villa's data to this URL.
        for column in (
            guests, bedrooms, bathrooms, latitudes, longitudes, photos,
            descriptions, average_prices, q1_prices, q2_prices, q3_prices,
            q4_prices,
        ):
            column.append(None)
        continue
    villa_number += 1

    property_tags = html_parsed.select(PROPERTY_SELECTOR)
    capacity = {}
    for label, position in (
        ("GUEST", GUEST_INDEX),
        ("BEDROOM", BEDROOM_INDEX),
        ("BATHROOM", BATHROOM_INDEX),
    ):
        try:
            capacity[label] = _leading_count(property_tags[position])
        except (IndexError, ValueError):
            print(f"{URL} : {label} NUMBER ERROR!")
            # None marks the value as missing; never reuse the previous
            # villa's number and never append twice.
            capacity[label] = None
    guest = capacity["GUEST"]
    bedroom = capacity["BEDROOM"]
    bathroom = capacity["BATHROOM"]

    # Gallery items without a data-src attribute are skipped.
    villa_images = [
        item["data-src"]
        for item in html_parsed.select("div.gallery-container div.item")
        if item.has_attr("data-src")
    ]

    try:
        map_tag = html_parsed.select_one("div#villa-location-maps")
        latitude = float(map_tag["data-latitude"])
        longitude = float(map_tag["data-longitude"])
    except (TypeError, KeyError, ValueError):
        # Missing map element, missing attribute, or non-numeric value.
        latitude = None
        longitude = None
        print(f"Location Error: {URL}")

    # The last matched paragraph is dropped, as in the original extraction.
    # NFKD folds compatibility characters (e.g. non-breaking spaces) but also
    # decomposes accented letters into base letter + combining mark.
    paragraphs = html_parsed.select(".row.pd-det-desc p")[:-1]
    description = unicodedata.normalize(
        "NFKD", "\n".join(p.getText().strip() for p in paragraphs)
    )

    try:
        price_elem = html_parsed.select_one(".price data").getText()
        q4_price = int(re.sub("[^0-9]", "", price_elem))  # october (Q4 price)
        # NOTE(review): floor division kept from the original; it truncates
        # the average to a whole number - confirm this rounding is intended.
        avg_price = q4_price // (Q4_SHARE * 4)
        q1_price = avg_price * 4 * Q1_SHARE
        q2_price = avg_price * 4 * Q2_SHARE
        q3_price = avg_price * 4 * Q3_SHARE
    except (AttributeError, ValueError):
        # No price element on the page, or no digits in its text.
        print(f"{URL} : PRICE ERROR!")
        avg_price = None
        q1_price = None
        q2_price = None
        q3_price = None
        q4_price = None

    guests.append(guest)
    bedrooms.append(bedroom)
    bathrooms.append(bathroom)
    latitudes.append(latitude)
    longitudes.append(longitude)
    photos.append(villa_images)
    descriptions.append(description)
    average_prices.append(avg_price)
    q1_prices.append(q1_price)
    q2_prices.append(q2_price)
    q3_prices.append(q3_price)
    q4_prices.append(q4_price)

    if villa_number % 100 == 0:
        print(f"{villa_number} villas are done!")

print("THE OPERATION IS COMPLETED!")
print(f"TOTAL VILLA NUMBER: {villa_number}")

Location Error: https://www.hellovillam.com/villa-oykum
100 villas are done!
Location Error: https://www.hellovillam.com/villa-laurel
200 villas are done!
Location Error: https://www.hellovillam.com/villa-miracle
Location Error: https://www.hellovillam.com/villa-simizar
300 villas are done!
Location Error: https://www.hellovillam.com/villa-bardakci
400 villas are done!
500 villas are done!
600 villas are done!
700 villas are done!
800 villas are done!
900 villas are done!
1000 villas are done!
1100 villas are done!
Location Error: https://www.hellovillam.com/bungalov-sara-1
1200 villas are done!
1300 villas are done!
Location Error: https://www.hellovillam.com/villa-demetra
1400 villas are done!
Location Error: https://www.hellovillam.com/villa-hello-2
1500 villas are done!
1600 villas are done!
1700 villas are done!
1800 villas are done!
1900 villas are done!
2000 villas are done!
THE OPERATION IS COMPLETED!
TOTAL VILLA NUMBER: 2085


In [4]:
# Assemble the scraped columns into three Excel workbooks, each indexed by ID:
# the full raw table, the descriptive subset and the price subset.
data_raw = {
    "ID": ids,
    "Name": names,
    "Address": addresses,
    "Guest Number": guests,
    "Bedroom Number": bedrooms,
    "Bathroom Number": bathrooms,
    "Latitude": latitudes,
    "Longitude": longitudes,
    "Description": descriptions,
    "Photo URLs": photos,
    "Average Price": average_prices,
    "Q1 Price": q1_prices,
    "Q2 Price": q2_prices,
    "Q3 Price": q3_prices,
    "Q4 Price": q4_prices,
}
df_raw = pd.DataFrame(data_raw).set_index("ID")
df_raw.to_excel("hellovillam_raw.xlsx")

# Descriptive columns, in the order they appear in the info workbook.
info_columns = (
    "ID",
    "Name",
    "Address",
    "Guest Number",
    "Bedroom Number",
    "Bathroom Number",
    "Description",
    "Photo URLs",
    "Latitude",
    "Longitude",
)
data_info = {column: data_raw[column] for column in info_columns}
df_info = pd.DataFrame(data_info).set_index("ID")
df_info.to_excel("hellovillam_info.xlsx")

# Price columns, in the order they appear in the financial workbook.
financial_columns = (
    "ID",
    "Average Price",
    "Q1 Price",
    "Q2 Price",
    "Q3 Price",
    "Q4 Price",
)
data_financial = {column: data_raw[column] for column in financial_columns}
df_financial = pd.DataFrame(data_financial).set_index("ID")
df_financial.to_excel("hellovillam_financial.xlsx")