In [1]:
import bs4
import requests
from urllib.parse import urlparse, parse_qs
import pandas as pd
import re
import unicodedata

In [2]:
# Crawl every listing page and collect, per villa card, its name, address,
# a derived ID and the link to its detail page.
#
# Results (module-level lists consumed by later cells):
#   names, addresses, ids - one entry per "div.tt" card
#   urls                  - one entry per villa link
LISTING_URL_TEMPLATE = "https://kalkanvilla.org/tr/kiralik-villa/{page_number}/"
FIRST_LISTING_PAGE = 1
LAST_LISTING_PAGE = 123
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks forever
PROGRESS_EVERY_PAGES = 10

urls = []
names = []
ids = []
addresses = []

pages_checked = 0
for page_number in range(FIRST_LISTING_PAGE, LAST_LISTING_PAGE + 1):
    URL = LISTING_URL_TEMPLATE.format(page_number=page_number)
    response = requests.get(URL, timeout=REQUEST_TIMEOUT)
    response.encoding = "utf-8"
    html_parsed = bs4.BeautifulSoup(response.text, "html.parser")

    # Remember the sizes before this page so the two selectors can be compared.
    names_before = len(names)
    urls_before = len(urls)

    for villa in html_parsed.find_all("div", class_="tt"):
        # The card text is split on newlines: first part is the name, second the address.
        villa_address = tuple(villa.getText().strip().split("\n"))
        villa_name = villa_address[0]
        if len(villa_address) > 1:
            address = villa_address[1]
        else:
            # Keep the row (so the lists stay the same length) instead of
            # aborting the whole crawl with an IndexError.
            print(f"{URL} : ADDRESS ERROR!")
            address = None
        names.append(villa_name)
        addresses.append(address)
        ids.append(villa_name.lower().replace(" ", ""))

    # Every fifth anchor is kept.
    # NOTE(review): presumably each listing item holds five anchors to the same villa - confirm.
    for url in html_parsed.select("div#page div.mw div.page-wrapper div.listing_1 ul li div.w a")[::5]:
        urls.append(url["href"])

    # Names and URLs come from two independent selectors; if their counts
    # differ, rows built from these lists will pair a villa with the wrong URL.
    page_cards = len(names) - names_before
    page_links = len(urls) - urls_before
    if page_cards != page_links:
        print(f"{URL} : WARNING - {page_cards} villa cards but {page_links} villa links; rows may be misaligned!")

    pages_checked += 1
    if page_number % PROGRESS_EVERY_PAGES == 0:
        print(f"{page_number} pages have been checked!")

print("\n---OPERATION HAS BEEN COMPLETED!---")
print(f"{pages_checked} pages have been checked!")
print(f"Total Villa URL: {len(urls)}")

10 pages have been checked!
20 pages have been checked!
30 pages have been checked!
40 pages have been checked!
50 pages have been checked!
60 pages have been checked!
70 pages have been checked!
80 pages have been checked!
90 pages have been checked!
100 pages have been checked!
110 pages have been checked!
120 pages have been checked!

---OPERATION HAS BEEN COMPLETED!---
123 pages have been checked!
Total Villa URL: 1469


In [3]:
# Visit every villa detail page in `urls` and extract location, photos,
# prices, capacity figures and description. Exactly one entry is appended to
# each result list per URL, so all lists stay index-aligned with `urls`.
# A field that cannot be extracted is stored as None.
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks forever
PROGRESS_EVERY_VILLAS = 80

# Shares applied to four times the average price to obtain the quarterly prices.
# NOTE(review): the shares sum to 1.0 and look like seasonal weights - confirm their source.
Q1_SHARE = 0.14
Q2_SHARE = 0.32
Q3_SHARE = 0.37
Q4_SHARE = 0.17

latitudes = []
longitudes = []
photos = []
average_prices = []
q1_prices = []
q2_prices = []
q3_prices = []
q4_prices = []
bedrooms = []
guests = []
bathrooms = []
descriptions = []

URL_ERRORS = []  # URLs whose page could not be downloaded or parsed
villa_number = 0  # number of pages fetched successfully
for URL in urls:
    # Start every iteration from a clean slate so that a failure can never
    # leak values (or the parsed page) of the previous villa into this row.
    html_parsed = None
    latitude = longitude = None
    photo = None
    avg_price = q1_price = q2_price = q3_price = q4_price = None
    guest = bedroom = bathroom = None
    description = None

    try:
        response = requests.get(URL, timeout=REQUEST_TIMEOUT)
        response.encoding = "utf-8"
        html_parsed = bs4.BeautifulSoup(response.text, "html.parser")
        villa_number += 1
    except Exception:
        print(f"{URL} : URL ERROR!")
        URL_ERRORS.append(URL)

    # Only parse when this URL's own page is available; otherwise the row
    # keeps its None placeholders.
    if html_parsed is not None:
        try:
            # Coordinates are read from the first inline script of the detail block.
            script_tag = html_parsed.select("div.detail-block-2 script")[0].string.strip()
            lat_match = re.search(r'map_lat\s*=\s*"([-+]?\d+\.\d+)"', script_tag)
            lon_match = re.search(r'map_lon\s*=\s*"([-+]?\d+\.\d+)"', script_tag)
            # Assigned together so a half-parsed pair is never stored.
            latitude, longitude = float(lat_match.group(1)), float(lon_match.group(1))
        except Exception:
            print(f"{URL} : LOCATION ERROR!")

        try:
            photo = tuple(link["href"] for link in html_parsed.select("div#detail div.wht1 div.detail-slider-1 div.fotorama a"))
        except Exception:
            print(f"{URL} PHOTO ERROR!")
            photo = None

        try:
            villa_prices = [int(villa_price["data-day1"].replace("₺", "").strip()) for villa_price in html_parsed.select("div.price_list ul li")]
            # An empty price list raises ZeroDivisionError and is reported below.
            price_mean = sum(villa_prices) // len(villa_prices)
            price_base = price_mean * 4
            avg_price = price_mean
            q1_price = price_base * Q1_SHARE
            q2_price = price_base * Q2_SHARE
            q3_price = price_base * Q3_SHARE
            q4_price = price_base * Q4_SHARE
        except Exception:
            print(f"{URL} : PRICE ERROR!")

        try:
            # The first three bold values are read as guests, bedrooms, bathrooms (in that order).
            numeric_infos = html_parsed.select(".box.spec b")
            guest, bedroom, bathroom = (
                int(numeric_infos[0].getText()),
                int(numeric_infos[1].getText()),
                int(numeric_infos[2].getText()),
            )
        except Exception:
            print(f"{URL} : NUMERIC INFO ERROR!")

        try:
            # NOTE(review): this walks the children of the FIRST matching <p> only;
            # if every paragraph is wanted, `select` would be needed - confirm intent.
            description = unicodedata.normalize("NFKD", "\n".join([desc.getText().strip() for desc in html_parsed.select_one(".detail-text p")])).strip()
        except Exception:
            print(f"{URL} : DESCRIPTION ERROR!")
            description = None

    latitudes.append(latitude)
    longitudes.append(longitude)
    photos.append(photo)
    average_prices.append(avg_price)
    q1_prices.append(q1_price)
    q2_prices.append(q2_price)
    q3_prices.append(q3_price)
    q4_prices.append(q4_price)
    guests.append(guest)
    bedrooms.append(bedroom)
    bathrooms.append(bathroom)
    descriptions.append(description)

    # Report progress only when the counter actually advanced, so a failed
    # fetch does not repeat the previous milestone message.
    if html_parsed is not None and villa_number % PROGRESS_EVERY_VILLAS == 0:
        print(f"{villa_number} villas are done!")

print("THE OPERATION IS COMPLETED!")
print(f"TOTAL VILLA NUMBER: {villa_number}")

80 villas are done!
160 villas are done!
240 villas are done!
320 villas are done!
https://kalkanvilla.org/tr/villa-porthos-saribelen/ : PRICE ERROR!
https://kalkanvilla.org/tr/villa-aramis/ : PRICE ERROR!
400 villas are done!
https://kalkanvilla.org/tr/villa-uysal-duo/ : PRICE ERROR!
480 villas are done!
https://kalkanvilla.org/tr/villa-sultan/ : PRICE ERROR!
560 villas are done!
640 villas are done!
720 villas are done!
https://kalkanvilla.org/tr/villa-eastern/ : PRICE ERROR!
https://kalkanvilla.org/tr/villa-moon-1/ : PRICE ERROR!
800 villas are done!
880 villas are done!
960 villas are done!
https://kalkanvilla.org/tr/villa-alen/ : PRICE ERROR!
1040 villas are done!
1120 villas are done!
1200 villas are done!
https://kalkanvilla.org/tr/villa-moon-2/ : PRICE ERROR!
https://kalkanvilla.org/tr/villa-capella/ : PRICE ERROR!
1280 villas are done!
1360 villas are done!
https://kalkanvilla.org/tr/gokkusagi-apartments/ : PRICE ERROR!
https://kalkanvilla.org/tr/villa-patron-duo/ : PRICE ERRO

In [4]:
# Assemble the scraped lists into three tables, each indexed by villa ID,
# and export each one to its own Excel workbook:
#   kalkanvilla_raw.xlsx       - every column
#   kalkanvilla_info.xlsx      - descriptive columns only
#   kalkanvilla_financial.xlsx - price columns only

# Master mapping of column name -> scraped values.
data_raw = {
    "ID": ids,
    "Name": names,
    "Address": addresses,
    "Guest Number": guests,
    "Bedroom Number": bedrooms,
    "Bathroom Number": bathrooms,
    "Latitude": latitudes,
    "Longitude": longitudes,
    "Description": descriptions,
    "Photo URLs": photos,
    "Average Price": average_prices,
    "Q1 Price": q1_prices,
    "Q2 Price": q2_prices,
    "Q3 Price": q3_prices,
    "Q4 Price": q4_prices,
}

# Column order of the descriptive table (coordinates go last here).
info_columns = [
    "ID",
    "Name",
    "Address",
    "Guest Number",
    "Bedroom Number",
    "Bathroom Number",
    "Description",
    "Photo URLs",
    "Latitude",
    "Longitude",
]

# Column order of the price table.
financial_columns = [
    "ID",
    "Average Price",
    "Q1 Price",
    "Q2 Price",
    "Q3 Price",
    "Q4 Price",
]

# The two narrower mappings reuse the very same lists as the master mapping.
data_info = {column: data_raw[column] for column in info_columns}
data_financial = {column: data_raw[column] for column in financial_columns}

df_raw = pd.DataFrame(data_raw).set_index("ID")
df_raw.to_excel("kalkanvilla_raw.xlsx")

df_info = pd.DataFrame(data_info).set_index("ID")
df_info.to_excel("kalkanvilla_info.xlsx")

df_financial = pd.DataFrame(data_financial).set_index("ID")
df_financial.to_excel("kalkanvilla_financial.xlsx")