In [1]:
import bs4
import requests
import pandas as pd
import re

In [2]:
# Walk the paginated villa listing and collect, for every villa card, its
# display name, detail-page URL, a name-derived ID and its address.
# The four lists are kept index-aligned: entry i of each list describes the
# same villa.
SITE_ROOT = "https://www.gotatil.com"
LAST_LISTING_PAGE = 71   # listing pages are requested as ?page=1 .. ?page=71
LISTING_TIMEOUT = 30     # seconds; without a timeout requests can wait forever

urls = []
names = []
addresses = []
ids = []

# A single session reuses the underlying connection across all page requests.
with requests.Session() as session:
    for page_number in range(1, LAST_LISTING_PAGE + 1):
        URL = f"{SITE_ROOT}/kiralik-villa?page={page_number}"
        try:
            response = session.get(URL, timeout=LISTING_TIMEOUT)
            # Treat HTTP error statuses as failures instead of parsing an
            # error page as if it were a listing.
            response.raise_for_status()
        except requests.RequestException as error:
            # Best effort: report the page and move on to the next one.
            print(f"{URL} : URL ERROR! ({error})")
        else:
            response.encoding = "utf-8"
            html_parsed = bs4.BeautifulSoup(response.text, "html.parser")

            title_links = html_parsed.select(".result-villa-container .title a")
            page_addresses = [
                location_tag.getText().strip()
                for location_tag in html_parsed.select(".result-villa-container .location")
            ]

            # Titles and locations come from two independent selectors. If their
            # counts differ on a page we cannot tell which address belongs to
            # which villa, so store None for this page rather than letting every
            # later address shift against its villa.
            if len(page_addresses) != len(title_links):
                print(
                    f"{URL} : ADDRESS COUNT MISMATCH! "
                    f"({len(page_addresses)} addresses for {len(title_links)} villas)"
                )
                page_addresses = [None] * len(title_links)

            for villa_info, villa_address in zip(title_links, page_addresses):
                villa_name = villa_info.getText().strip()
                villa_url = SITE_ROOT + villa_info["href"]
                # ID is the lower-cased name with spaces removed.
                villa_id = villa_name.replace(" ", "").lower()

                urls.append(villa_url)
                names.append(villa_name)
                ids.append(villa_id)
                addresses.append(villa_address)

        if page_number % 10 == 0:
            print(f"{page_number} pages have been checked!")

print("\n---OPERATION HAS BEEN COMPLETED!---")
print(f"{page_number} pages have been checked!")
print(f"Total Villa Name: {len(names)}")
print(f"Total Villa URL: {len(urls)}")

10 pages have been checked!
20 pages have been checked!
30 pages have been checked!
40 pages have been checked!
50 pages have been checked!
60 pages have been checked!
70 pages have been checked!

---OPERATION HAS BEEN COMPLETED!---
71 pages have been checked!
Total Villa Name: 1054
Total Villa URL: 1054


In [3]:
# Visit every villa detail page in `urls` and extract guest/bedroom/bathroom
# counts, map coordinates, description, photo URLs and price figures.
# Exactly one entry is appended to each result list per URL (None where a
# value could not be obtained) so the lists stay index-aligned with `urls`.
guests = []
bedrooms = []
bathrooms = []
latitudes = []
longitudes = []
descriptions = []
photos = []
average_prices = []
q1_prices = []
q2_prices = []
q3_prices = []
q4_prices = []

URL_ERRORS = []   # URLs whose page could not be fetched
villa_number = 0  # number of villa pages fetched successfully

PHOTO_BASE_URL = "https://www.gotatil.com"
DETAIL_TIMEOUT = 30    # seconds; without a timeout requests can wait forever
PROGRESS_EVERY = 75    # print a progress line every N fetched villas
# Map centre as embedded in the page's inline scripts; an optional minus sign
# is accepted so negative coordinates also match.
LOCATION_PATTERN = re.compile(r'center:\s\{lat:\s(-?[\d.]+),\s+lng:\s(-?[\d.]+)')
# Each quarter price is (average price * 4) * weight. The weights sum to 1.0;
# NOTE(review): presumably seasonal shares of a four-quarter total - confirm.
QUARTER_WEIGHTS = (0.14, 0.32, 0.37, 0.17)


def _leading_int(text):
    """Return the integer made of the leading digits of ``text``.

    Reads the whole leading number, so "12 ..." yields 12 rather than 1.
    Raises ValueError when the stripped text does not start with a digit.
    """
    digits = re.match(r"\d+", text.strip())
    if digits is None:
        raise ValueError(f"no leading number in {text!r}")
    return int(digits.group())


def _parse_villa(html_parsed, URL):
    """Extract all villa fields from a parsed detail page.

    Returns a dict keyed by field name. A field that cannot be parsed is
    reported on stdout and stored as None; parsing of the other fields
    continues.
    """
    record = {}

    # The icon section is queried once; its last div is ignored and the first
    # three numbers are taken as guest, bedroom and bathroom counts. If any of
    # the considered divs has no leading number, all three counts are treated
    # as unavailable.
    icon_tags = html_parsed.select(".row.m-0.villa-icon-section div")[:-1]
    try:
        icon_counts = [_leading_int(tag.getText()) for tag in icon_tags]
    except ValueError:
        icon_counts = []
    count_fields = (("guest", "GUEST"), ("bedroom", "BEDROOM"), ("bathroom", "BATHROOM"))
    for position, (field, label) in enumerate(count_fields):
        try:
            record[field] = icon_counts[position]
        except IndexError:
            print(f"{URL} : {label} NUMBER ERROR!")
            record[field] = None

    try:
        paragraphs = [text_elem.getText().strip() for text_elem in html_parsed.select("div.desc p")]
        record["description"] = "\n".join(paragraphs).strip()
    except Exception:
        print(f"{URL} : DESCRIPTION ERROR!")
        record["description"] = None

    try:
        match = LOCATION_PATTERN.search(str(html_parsed.find_all("script")))
        # match is None when the page has no map centre -> AttributeError.
        record["latitude"] = float(match.group(1))
        record["longitude"] = float(match.group(2))
    except (AttributeError, ValueError):
        print(f"{URL} : LOCATION ERROR!")
        record["latitude"] = None
        record["longitude"] = None

    try:
        record["photos"] = tuple(
            PHOTO_BASE_URL + villa_image["data-villa-gallery-image"]
            for villa_image in html_parsed.select(".villa-gallery-section span")
        )
    except KeyError:
        # A gallery span without the data attribute.
        print(f"{URL} : PHOTO ERROR")
        record["photos"] = None

    try:
        # Every fifth table cell starting at index 4 holds a price; dots,
        # "TL" and the lira sign are stripped before integer conversion.
        price_cells = html_parsed.select(".col-12.content table td")[4::5]
        villa_prices = [
            int(cell.getText().replace(".", "").replace("TL", "").replace("₺", "").strip())
            for cell in price_cells
        ]
        avg_price = sum(villa_prices) // len(villa_prices)
        record["avg_price"] = avg_price
        for quarter, weight in enumerate(QUARTER_WEIGHTS, start=1):
            record[f"q{quarter}_price"] = avg_price*4 * weight
    except (ValueError, ZeroDivisionError):
        # Non-numeric price text, or no price cells at all.
        print(f"{URL} : PRICE ERROR!")
        record["avg_price"] = None
        for quarter in range(1, len(QUARTER_WEIGHTS) + 1):
            record[f"q{quarter}_price"] = None

    return record


# Maps each record field to the list that collects it.
_columns = {
    "guest": guests,
    "bedroom": bedrooms,
    "bathroom": bathrooms,
    "description": descriptions,
    "latitude": latitudes,
    "longitude": longitudes,
    "avg_price": average_prices,
    "q1_price": q1_prices,
    "q2_price": q2_prices,
    "q3_price": q3_prices,
    "q4_price": q4_prices,
    "photos": photos,
}

# A single session reuses the underlying connection across all requests.
with requests.Session() as session:
    for URL in urls:
        try:
            response = session.get(URL, timeout=DETAIL_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException:
            print(f"{URL} : URL ERROR!")
            URL_ERRORS.append(URL)
            # Store an all-None row so this villa does not inherit the
            # previously parsed page's data and the lists stay aligned.
            for column in _columns.values():
                column.append(None)
            continue

        response.encoding = "utf-8"
        html_parsed = bs4.BeautifulSoup(response.text, "html.parser")
        villa_number += 1

        record = _parse_villa(html_parsed, URL)
        for field, column in _columns.items():
            column.append(record[field])

        if villa_number % PROGRESS_EVERY == 0:
            print(f"{villa_number} villas have been completed!")

print("THE OPERATION IS COMPLETED!")
print(f"TOTAL VILLA NUMBER: {villa_number}")

https://www.gotatil.com/villa-rise : LOCATION ERROR!
https://www.gotatil.com/villa-ebruli-2 : PRICE ERROR!
75 villas have been completed!
https://www.gotatil.com/villa-rubi : LOCATION ERROR!
150 villas have been completed!
https://www.gotatil.com/villa-asil-2 : LOCATION ERROR!
https://www.gotatil.com/villa-zagra-1 : LOCATION ERROR!
225 villas have been completed!
https://www.gotatil.com/villa-zaman : PRICE ERROR!
300 villas have been completed!
https://www.gotatil.com/villa-mask : GUEST NUMBER ERROR!
https://www.gotatil.com/villa-mask : BEDROOM NUMBER ERROR!
https://www.gotatil.com/villa-mask : BATHROOM NUMBER ERROR!
https://www.gotatil.com/villa-orko-11 : PRICE ERROR!
https://www.gotatil.com/villa-sedir-2 : LOCATION ERROR!
https://www.gotatil.com/villa-orko-9 : PRICE ERROR!
375 villas have been completed!
https://www.gotatil.com/villa-beste-2 : GUEST NUMBER ERROR!
https://www.gotatil.com/villa-beste-2 : BEDROOM NUMBER ERROR!
https://www.gotatil.com/villa-beste-2 : BATHROOM NUMBER ERRO

In [4]:
# Assemble the scraped lists into three tables (everything, descriptive info
# only, prices only), each indexed by villa ID, and write each to its own
# Excel workbook. All lists must have the same length for the DataFrame
# constructor to accept them.

# Descriptive columns shared by the raw and info tables.
info_columns = {
    "Name": names,
    "Address": addresses,
    "Guest Number": guests,
    "Bedroom Number": bedrooms,
    "Bathroom Number": bathrooms,
    "Latitude": latitudes,
    "Longitude": longitudes,
    "Description": descriptions,
    "Photo URLs": photos,
}

# Price columns shared by the raw and financial tables.
price_columns = {
    "Average Price": average_prices,
    "Q1 Price": q1_prices,
    "Q2 Price": q2_prices,
    "Q3 Price": q3_prices,
    "Q4 Price": q4_prices,
}

# Full table: ID, then the descriptive columns, then the price columns.
data_raw = {"ID": ids, **info_columns, **price_columns}
df_raw = pd.DataFrame(data_raw).set_index("ID")
df_raw.to_excel("gotatil_raw.xlsx")

# Descriptive table only.
data_info = {"ID": ids, **info_columns}
df_info = pd.DataFrame(data_info).set_index("ID")
df_info.to_excel("gotatil_info.xlsx")

# Price table only.
data_financial = {"ID": ids, **price_columns}
df_financial = pd.DataFrame(data_financial).set_index("ID")
df_financial.to_excel("gotatil_financial.xlsx")