In [1]:
import bs4
import requests
import pandas as pd
import unicodedata

In [2]:
# Collect the URL, name, derived id and address of every villa shown on the
# paginated listing pages. The four lists are parallel: index i of each list
# describes the same villa.
BASE_URL = "https://www.viravilla.com"
LISTING_URL = BASE_URL + "/kiralik-villa?s="  # page number is appended to this
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell

urls = []
names = []
addresses = []
ids = []

# One Session reuses the underlying connection across all listing requests.
with requests.Session() as session:
    for page_number in range(1, 80):
        URL = LISTING_URL + str(page_number)
        try:
            response = session.get(URL, timeout=REQUEST_TIMEOUT)
            # Fail loudly on 4xx/5xx instead of parsing an error page as a listing.
            response.raise_for_status()
        except requests.RequestException as error:
            # Best effort: report the page and move on. Nothing is appended for
            # this page, so the four lists stay aligned with each other.
            print(f"{URL} : URL ERROR! ({error})")
        else:
            response.encoding = "utf-8"
            html_parsed = bs4.BeautifulSoup(response.text, "html.parser")

            page_urls = [
                BASE_URL + link["href"].strip()
                for link in html_parsed.select(".villa-name a")
            ]
            page_names = [
                tag.getText().strip()
                for tag in html_parsed.select(".villa-name a span")
            ]
            page_addresses = [
                tag.getText().strip()
                for tag in html_parsed.select(".villa-name div span")
            ]

            # The three selectors are evaluated independently; if the page markup
            # yields different counts, every later row would be shifted.
            if not (len(page_urls) == len(page_names) == len(page_addresses)):
                print(
                    f"{URL} : ALIGNMENT WARNING! "
                    f"{len(page_urls)} urls, {len(page_names)} names, "
                    f"{len(page_addresses)} addresses"
                )

            urls.extend(page_urls)
            names.extend(page_names)
            # The id is the villa name with spaces removed, lower-cased.
            ids.extend(name.replace(" ", "").strip().lower() for name in page_names)
            addresses.extend(page_addresses)

        if page_number % 10 == 0:
            print(f"{page_number} pages have been checked!")

print("\n---OPERATION HAS BEEN COMPLETED!---")
print(f"{page_number} pages have been checked!")

10 pages have been checked!
20 pages have been checked!
30 pages have been checked!
40 pages have been checked!
50 pages have been checked!
60 pages have been checked!
70 pages have been checked!

---OPERATION HAS BEEN COMPLETED!---
79 pages have been checked!


In [3]:
# Scrape the detail page of every villa in `urls`.
# Each list below receives exactly one entry per URL (None when a field could
# not be read), so row i lines up with row i of the listing-page lists.
guests = []
bedrooms = []
rooms = []
bathrooms = []
latitudes = []
longitudes = []
descriptions = []
photos = []
average_prices = []
q1_prices = []
q2_prices = []
q3_prices = []
q4_prices = []

URL_ERRORS = []
villa_number = 0

REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell
# Order of the numeric "strong.mr-2" elements as consumed by this scraper.
NUMERIC_LABELS = ("GUEST", "BEDROOM", "ROOM", "BATHROOM")
# Weights applied to (average price * 4) to derive the four quarterly prices.
# They sum to 1.00; presumably seasonal demand shares -- TODO confirm the source.
QUARTER_SHARES = (0.14, 0.32, 0.37, 0.17)

detail_columns = (
    guests, bedrooms, rooms, bathrooms,
    descriptions, latitudes, longitudes,
    average_prices, q1_prices, q2_prices, q3_prices, q4_prices,
    photos,
)

# One Session reuses the underlying connection across all detail requests.
with requests.Session() as session:
    for URL in urls:
        try:
            response = session.get(URL, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException:
            print(f"{URL} : URL ERROR!")
            URL_ERRORS.append(URL)
            # Record an empty row and skip parsing. Falling through would parse
            # the previous villa's page and store its data under this URL.
            for column in detail_columns:
                column.append(None)
            continue

        response.encoding = "utf-8"
        html_parsed = bs4.BeautifulSoup(response.text, "html.parser")
        villa_number += 1

        # Select the numeric tags once and parse each position on its own, so a
        # single malformed value does not blank out the other three.
        numeric_tags = html_parsed.select("strong.mr-2")
        numeric_values = []
        for position, label in enumerate(NUMERIC_LABELS):
            try:
                numeric_values.append(int(numeric_tags[position].getText().strip()))
            except (IndexError, ValueError):
                print(f"{URL} : {label} NUMBER ERROR!")
                numeric_values.append(None)
        guest, bedroom, room, bathroom = numeric_values

        # `except Exception` (not a bare `except`) keeps the best-effort behaviour
        # while still letting Ctrl-C / kernel interrupt stop the scrape.
        try:
            description_unicode = html_parsed.select_one("div div div div p").getText().strip()
            # NFKD folds compatibility characters (e.g. non-breaking spaces).
            description = unicodedata.normalize("NFKD", description_unicode)
        except Exception:
            print(f"{URL} : DESCRIPTION ERROR!")
            description = None

        try:
            map_tag = html_parsed.select_one("#contactMap")
            latitude = float(map_tag["data-lat"])
            longitude = float(map_tag["data-lng"])
        except Exception:
            print(f"{URL} : LOCATION ERROR!")
            latitude = None
            longitude = None

        try:
            # Only every second price element (odd positions) is used.
            # NOTE(review): presumably the skipped ones are a different price
            # type/period -- confirm against the page markup.
            price_tags = html_parsed.select("div#fiyatlar div div div ul .price strong")[1::2]
            villa_prices = [
                int(tag.getText().replace(".", "").replace("₺", "").replace("TL", "").strip())
                for tag in price_tags
            ]
            # Raises ZeroDivisionError when no prices are listed -> PRICE ERROR.
            avg_price = sum(villa_prices) // len(villa_prices)
            q1_price, q2_price, q3_price, q4_price = (
                avg_price * 4 * share for share in QUARTER_SHARES
            )
        except Exception:
            print(f"{URL} : PRICE ERROR!")
            avg_price = None
            q1_price = None
            q2_price = None
            q3_price = None
            q4_price = None

        try:
            villa_photos = tuple(
                "https://www.viravilla.com" + villa_img["href"]
                for villa_img in html_parsed.select(".photo-gallery a")
            )
        except Exception:
            print(f"{URL} : PHOTO ERROR")
            villa_photos = None

        guests.append(guest)
        bedrooms.append(bedroom)
        rooms.append(room)
        bathrooms.append(bathroom)
        descriptions.append(description)
        latitudes.append(latitude)
        longitudes.append(longitude)
        average_prices.append(avg_price)
        q1_prices.append(q1_price)
        q2_prices.append(q2_price)
        q3_prices.append(q3_price)
        q4_prices.append(q4_price)
        photos.append(villa_photos)

        # Only reached for successfully fetched pages, so a failed request can
        # never re-print the progress line for a stale counter value.
        if villa_number % 75 == 0:
            print(f"{villa_number} villas have been completed!")

print("THE OPERATION IS COMPLETED!")
print(f"TOTAL VILLA NUMBER: {villa_number}")

https://www.viravilla.com/villa-vilanka : LOCATION ERROR!
75 villas have been completed!
https://www.viravilla.com/villa-tiny-house-2 : LOCATION ERROR!
https://www.viravilla.com/villa-meryem-ana : PRICE ERROR!
https://www.viravilla.com/villa-ozmen : PRICE ERROR!
https://www.viravilla.com/villa-seyirtepe : PRICE ERROR!
https://www.viravilla.com/villa-zeytin : PRICE ERROR!
https://www.viravilla.com/villa-dedem-b : LOCATION ERROR!
https://www.viravilla.com/villa-aybek : LOCATION ERROR!
150 villas have been completed!
https://www.viravilla.com/villa-baykan- : PRICE ERROR!
https://www.viravilla.com/villa-higul : PRICE ERROR!
https://www.viravilla.com/villa-shallwe-house : PRICE ERROR!
https://www.viravilla.com/villa-twin-1- : PRICE ERROR!
https://www.viravilla.com/villa-twin-2- : PRICE ERROR!
https://www.viravilla.com/villa-toprak : PRICE ERROR!
https://www.viravilla.com/villa-abad : PRICE ERROR!
https://www.viravilla.com/villa-ayisigi : LOCATION ERROR!
https://www.viravilla.com/villa-verde

In [4]:
# Assemble the scraped columns and export three Excel workbooks:
#   viravilla_raw.xlsx        every column
#   viravilla_info.xlsx       descriptive columns only
#   viravilla_financial.xlsx  price columns only
# All three are indexed by the villa "ID".
INFO_COLUMNS = [
    "Name",
    "Address",
    "Guest Number",
    "Bedroom Number",
    "Room Number",
    "Bathroom Number",
    "Latitude",
    "Longitude",
    "Description",
    "Photo URLs",
]
FINANCIAL_COLUMNS = [
    "Average Price",
    "Q1 Price",
    "Q2 Price",
    "Q3 Price",
    "Q4 Price",
]

data_raw = {
    "ID": ids,
    "Name": names,
    "Address": addresses,
    "Guest Number": guests,
    "Bedroom Number": bedrooms,
    "Room Number": rooms,
    # The bathroom count is scraped alongside the other room counts but was
    # previously never written to any workbook.
    "Bathroom Number": bathrooms,
    "Latitude": latitudes,
    "Longitude": longitudes,
    "Description": descriptions,
    "Photo URLs": photos,
    "Average Price": average_prices,
    "Q1 Price": q1_prices,
    "Q2 Price": q2_prices,
    "Q3 Price": q3_prices,
    "Q4 Price": q4_prices
}

# set_index returns a new frame; chaining avoids the in-place mutation.
df_raw = pd.DataFrame(data_raw).set_index("ID")
df_raw.to_excel("viravilla_raw.xlsx")

# The info and financial views are derived from data_raw, so the three
# workbooks cannot drift apart.
data_info = {column: data_raw[column] for column in ["ID", *INFO_COLUMNS]}

df_info = pd.DataFrame(data_info).set_index("ID")
df_info.to_excel("viravilla_info.xlsx")

data_financial = {column: data_raw[column] for column in ["ID", *FINANCIAL_COLUMNS]}

df_financial = pd.DataFrame(data_financial).set_index("ID")
df_financial.to_excel("viravilla_financial.xlsx")