In [1]:
import bs4
import requests
import pandas as pd
import re
import unicodedata

In [2]:
# Crawl the paginated rental listing and collect, for every villa card, the
# detail-page URL (href of ".img.nl a") and the address text (".loc").
# `urls` and `addresses` are index-aligned: entry i of both lists is expected
# to describe the same villa card.
FIRST_PAGE = 1
LAST_PAGE = 62  # last listing page to fetch (inclusive)
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled connection blocks forever

urls = []
addresses = []

for page_number in range(FIRST_PAGE, LAST_PAGE + 1):
    URL = f"https://www.kastavillam.com/kiralik-villa/{page_number}/"
    try:
        response = requests.get(URL, timeout=REQUEST_TIMEOUT_SECONDS)
        # Fail loudly on 4xx/5xx instead of parsing an error page as an empty listing.
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"{URL} : URL ERROR! ({error})")
    else:
        response.encoding = "utf-8"
        html_parsed = bs4.BeautifulSoup(response.text, "html.parser")

        page_urls = [url_elem["href"] for url_elem in html_parsed.select(".img.nl a")]
        page_addresses = [
            unicodedata.normalize("NFKD", loc_elem.getText()).strip()
            for loc_elem in html_parsed.select(".loc")
        ]

        # The two selectors are independent; if they disagree on this page, every
        # later address would be paired with the wrong URL, so report it right away.
        if len(page_urls) != len(page_addresses):
            print(
                f"{URL} : WARNING! {len(page_urls)} URLs but "
                f"{len(page_addresses)} addresses - lists are no longer aligned."
            )

        urls.extend(page_urls)
        addresses.extend(page_addresses)

    if page_number % 10 == 0:
        print(f"{page_number} pages have been checked!")

print("\n---OPERATION HAS BEEN COMPLETED!---")
print(f"{page_number} pages have been checked!")
print(f"Total Villa URL: {len(urls)}")

10 pages have been checked!
20 pages have been checked!
30 pages have been checked!
40 pages have been checked!
50 pages have been checked!
60 pages have been checked!

---OPERATION HAS BEEN COMPLETED!---
62 pages have been checked!
Total Villa URL: 735


In [5]:
# Visit every villa detail page in `urls` and extract one value per output
# list (name, id, capacity figures, description, coordinates, prices).
# Every list receives exactly one entry per URL - None when a field cannot be
# parsed or the page cannot be downloaded - so all lists stay index-aligned
# with `urls`.
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled connection blocks forever
# Multipliers applied to (average price * 4) to derive the Q1..Q4 prices; they sum to 1.0.
QUARTER_WEIGHTS = (0.14, 0.32, 0.37, 0.17)


def _digits_to_int(text):
    """Drop every non-digit character from `text` and parse the rest as an int.

    Raises ValueError when `text` contains no digits at all.
    """
    return int(re.sub("[^0-9]", "", text))


names = []
ids = []
guests = []
bedrooms = []
bathrooms = []
descriptions = []
latitudes = []
longitudes = []
photos = []
average_prices = []
q1_prices = []
q2_prices = []
q3_prices = []
q4_prices = []

# Every per-villa list, so a failed download can pad all of them in one go.
all_columns = (
    names, ids, guests, bedrooms, bathrooms, descriptions, latitudes,
    longitudes, photos, average_prices, q1_prices, q2_prices, q3_prices,
    q4_prices,
)

URL_ERRORS = []
villa_number = 0
for URL in urls:
    try:
        response = requests.get(URL, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        response.encoding = "utf-8"
        html_parsed = bs4.BeautifulSoup(response.text, "html.parser")
    except Exception:
        # Record the failure and emit an all-None row. Falling through would
        # re-parse the previous villa's page and duplicate its data.
        print(f"{URL} : URL ERROR!")
        URL_ERRORS.append(URL)
        for column in all_columns:
            column.append(None)
        continue
    villa_number += 1

    # `except Exception` (not a bare except) keeps the best-effort behaviour
    # while still letting Ctrl-C / kernel interrupt stop the scrape.
    try:
        name = html_parsed.select_one(".box.title").getText().strip()
        villa_id = name.lower().replace(" ", "")
    except Exception:
        print(f"{URL} : NAME ERROR!")
        name = None
        villa_id = None

    # The ".box.spec" elements are read positionally:
    # [0] bedrooms, [1] bathrooms, [2] guests.
    spec_boxes = html_parsed.select(".box.spec")

    try:
        guest = _digits_to_int(spec_boxes[2].getText())
    except Exception:
        print(f"{URL} : GUEST NUMBER ERROR!")
        guest = None

    try:
        bedroom = _digits_to_int(spec_boxes[0].getText())
    except Exception:
        print(f"{URL} : BEDROOM NUMBER ERROR!")
        bedroom = None

    try:
        bathroom = _digits_to_int(spec_boxes[1].getText())
    except Exception:
        print(f"{URL} : BATHROOM NUMBER ERROR!")
        bathroom = None

    try:
        # select() returns an empty list rather than raising, so the fallback
        # container has to be chosen explicitly when the primary one is empty.
        paragraphs = html_parsed.select("#aciklama p") or html_parsed.select("#collapseLinkExample p")
        description = "\n".join(text_elem.getText().strip() for text_elem in paragraphs).strip()
    except Exception:
        description = ""
    if not description:
        print(f"{URL} : DESCRIPTION ERROR!")
        description = None

    try:
        # Coordinates are read from the page source: map_lat = "..." / map_lon = "...".
        page_source = str(html_parsed)
        lat_match = re.search(r'map_lat\s*=\s*"([\d.-]+)"', page_source)
        lng_match = re.search(r'map_lon\s*=\s*"([\d.-]+)"', page_source)
        latitude = float(lat_match.group(1))
        longitude = float(lng_match.group(1))
    except Exception:
        print(f"{URL} : LOCATION ERROR!")
        latitude = None
        longitude = None

    # Photo scraping is not implemented; the placeholder keeps the
    # "photos" list the same length as the others.
    villa_photos = None

    try:
        villa_prices = [
            _digits_to_int(price_elem["data-day1"])
            for price_elem in html_parsed.select(".price_block")
        ]
        # An empty price list raises ZeroDivisionError here and is reported below.
        avg_price = sum(villa_prices) // len(villa_prices)
        q1_price, q2_price, q3_price, q4_price = (
            avg_price * 4 * weight for weight in QUARTER_WEIGHTS
        )
    except Exception:
        print(f"{URL} : PRICE ERROR!")
        avg_price = None
        q1_price = None
        q2_price = None
        q3_price = None
        q4_price = None

    names.append(name)
    ids.append(villa_id)
    guests.append(guest)
    bedrooms.append(bedroom)
    bathrooms.append(bathroom)
    descriptions.append(description)
    latitudes.append(latitude)
    longitudes.append(longitude)
    average_prices.append(avg_price)
    q1_prices.append(q1_price)
    q2_prices.append(q2_price)
    q3_prices.append(q3_price)
    q4_prices.append(q4_price)
    photos.append(villa_photos)

    if villa_number % 75 == 0:
        print(f"{villa_number} villas have been completed!")

print("THE OPERATION IS COMPLETED!")
print(f"TOTAL VILLA NUMBER: {villa_number}")

https://www.kastavillam.com/villa-sudem/ : GUEST NUMBER ERROR!
https://www.kastavillam.com/villa-sudem/ : BEDROOM NUMBER ERROR!
https://www.kastavillam.com/villa-sudem/ : BATHROOM NUMBER ERROR!
https://www.kastavillam.com/villa-sudem/ : PRICE ERROR!
https://www.kastavillam.com/villa-keskin-duo/ : PRICE ERROR!
75 villas have been completed!
150 villas have been completed!
225 villas have been completed!
300 villas have been completed!
https://www.kastavillam.com/villa-lumiere/ : GUEST NUMBER ERROR!
https://www.kastavillam.com/villa-lumiere/ : BEDROOM NUMBER ERROR!
https://www.kastavillam.com/villa-lumiere/ : BATHROOM NUMBER ERROR!
https://www.kastavillam.com/villa-can-bayindir/ : GUEST NUMBER ERROR!
https://www.kastavillam.com/villa-can-bayindir/ : BEDROOM NUMBER ERROR!
https://www.kastavillam.com/villa-can-bayindir/ : BATHROOM NUMBER ERROR!
375 villas have been completed!
https://www.kastavillam.com/villa-arda-saribelen/ : PRICE ERROR!
https://www.kastavillam.com/villa-sule/ : PRICE ER

In [6]:
# Export the scraped columns to three Excel workbooks, each indexed by "ID":
#   kastavillam_raw.xlsx        - every column
#   kastavillam_info.xlsx       - descriptive columns only
#   kastavillam_financial.xlsx  - price columns only
data_raw = {
    "ID": ids,
    "Name": names,
    "Address": addresses,
    "Guest Number": guests,
    "Bedroom Number": bedrooms,
    "Bathroom Number": bathrooms,
    "Latitude": latitudes,
    "Longitude": longitudes,
    "Description": descriptions,
    "Photo URLs": photos,
    "Average Price": average_prices,
    "Q1 Price": q1_prices,
    "Q2 Price": q2_prices,
    "Q3 Price": q3_prices,
    "Q4 Price": q4_prices,
}

df_raw = pd.DataFrame(data_raw).set_index("ID")
df_raw.to_excel("kastavillam_raw.xlsx")

# The two narrower workbooks reuse the same lists, picked out of data_raw by key.
info_keys = (
    "ID", "Name", "Address", "Guest Number", "Bedroom Number",
    "Bathroom Number", "Latitude", "Longitude", "Description", "Photo URLs",
)
data_info = {key: data_raw[key] for key in info_keys}

df_info = pd.DataFrame(data_info).set_index("ID")
df_info.to_excel("kastavillam_info.xlsx")

financial_keys = ("ID", "Average Price", "Q1 Price", "Q2 Price", "Q3 Price", "Q4 Price")
data_financial = {key: data_raw[key] for key in financial_keys}

df_financial = pd.DataFrame(data_financial).set_index("ID")
df_financial.to_excel("kastavillam_financial.xlsx")