In [1]:
import bs4
import requests
import pandas as pd
import unicodedata
import re

In [2]:
# Collect the name, derived ID and detail-page URL of every villa that appears
# on the paginated search listing. The three lists are kept index-aligned.
urls = []
names = []
ids = []

BASE_URL = "https://www.villapaketi.com"
LAST_PAGE = 128          # last listing page to visit (inclusive)
REQUEST_TIMEOUT = 30     # seconds; without a timeout a stalled connection blocks forever
failed_pages = []        # listing pages that could not be downloaded

for page_number in range(1, LAST_PAGE + 1):
    URL = f"{BASE_URL}/kiralik-villa?s={page_number}"
    try:
        response = requests.get(URL, timeout=REQUEST_TIMEOUT)
        # An HTTP error page would otherwise be parsed and silently yield no villas.
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"{URL} : PAGE ERROR! ({error})")
        failed_pages.append(page_number)
    else:
        response.encoding = "utf-8"
        html_parsed = bs4.BeautifulSoup(response.text, "html.parser")

        villa_names = [
            villa_name.getText().strip()
            for villa_name in html_parsed.select(".product-card-title.cat-product-card-title")
        ]
        villa_urls = [
            BASE_URL + villa_url["href"]
            for villa_url in html_parsed.select("#searchResult .position-relative a")
        ]

        # Names and URLs come from two independent selectors. If their counts
        # differ, report it and keep only the common prefix so the global lists
        # stay the same length (the DataFrame built later requires that).
        # NOTE(review): a mismatch may also mean the pairing itself is off for
        # this page -- inspect any page reported here.
        if len(villa_names) != len(villa_urls):
            print(f"{URL} : NAME/URL COUNT MISMATCH! "
                  f"({len(villa_names)} names, {len(villa_urls)} URLs)")
            pair_count = min(len(villa_names), len(villa_urls))
            villa_names = villa_names[:pair_count]
            villa_urls = villa_urls[:pair_count]

        # The ID is the villa name with spaces removed, lower-cased.
        villa_ids = [villa_id.replace(" ", "").lower().strip() for villa_id in villa_names]

        urls.extend(villa_urls)
        names.extend(villa_names)
        ids.extend(villa_ids)

    if page_number % 10 == 0:
        print(f"{page_number} pages have been checked!")

print("\n---OPERATION HAS BEEN COMPLETED!---")
print(f"{page_number} pages have been checked!")
print(f"Total Villa Name: {len(names)}")
print(f"Total Villa URL: {len(urls)}")
if failed_pages:
    print(f"Pages that could not be loaded: {failed_pages}")

10 pages have been checked!
20 pages have been checked!
30 pages have been checked!
40 pages have been checked!
50 pages have been checked!
60 pages have been checked!
70 pages have been checked!
80 pages have been checked!
90 pages have been checked!
100 pages have been checked!
110 pages have been checked!
120 pages have been checked!

---OPERATION HAS BEEN COMPLETED!---
128 pages have been checked!
Total Villa Name: 1098
Total Villa URL: 1098


In [3]:
# Visit every villa detail page in `urls` and collect one value per villa in
# each of the lists below. Every list receives exactly one entry per URL (None
# when a value could not be extracted), so they stay index-aligned with `urls`.
addresses = []
guests = []
bedrooms = []
bathrooms = []
latitudes = []
longitudes = []
descriptions = []
photos = []
average_prices = []
q1_prices = []
q2_prices = []
q3_prices = []
q4_prices = []

URL_ERRORS = []
villa_number = 0

REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled connection blocks forever
PROGRESS_STEP = 75     # print a progress line every this many downloaded villas
# Factor applied to (average price * 4) for Q1..Q4.
# NOTE(review): presumably seasonal shares of a yearly total -- confirm the source.
QUARTER_WEIGHTS = (0.14, 0.32, 0.37, 0.17)


def _leading_int(text):
    """Return the integer made of the leading digits of ``text``.

    Reads the whole leading number (e.g. "10 ..." -> 10), not just its first
    character. Raises ValueError when ``text`` does not start with a digit.
    """
    match = re.match(r"\d+", text.strip())
    if match is None:
        raise ValueError(f"no leading number in {text!r}")
    return int(match.group())


def _average_price(html_parsed):
    """Return the floor-averaged price over every second ``.tblCol.tbl-cur`` cell.

    Non-digit characters are stripped from each cell before conversion.
    Raises ValueError when there are no price cells or a cell has no digits.
    """
    price_values = [
        int(re.sub("[^0-9]", "", price_elem.getText().strip()))
        for price_elem in html_parsed.select(".tblCol.tbl-cur")[1::2]
    ]
    if not price_values:
        raise ValueError("no price cells found")
    return sum(price_values) // len(price_values)


def _try_field(URL, label, extractor):
    """Return ``extractor()``; on failure print ``"<URL> : <label>"`` and return None.

    ``Exception`` (not a bare ``except``) keeps the scrape best-effort while
    still letting KeyboardInterrupt stop a long run.
    """
    try:
        return extractor()
    except Exception:
        print(f"{URL} : {label}")
        return None


for URL in urls:
    try:
        response = requests.get(URL, timeout=REQUEST_TIMEOUT)
        response.encoding = "utf-8"
        html_parsed = bs4.BeautifulSoup(response.text, "html.parser")
        villa_number += 1
    except Exception:
        print(f"{URL} : URL ERROR!")
        URL_ERRORS.append(URL)
        # Drop any page left over from the previous iteration so this villa is
        # never filled in with another villa's data.
        html_parsed = None

    if html_parsed is None:
        address = guest = bedroom = bathroom = None
        description = None
        avg_price = None
        villa_photos = None
    else:
        # Queried once; the spans are read by position:
        # 0 = address, 1 = bedrooms, 2 = bathrooms, 3 = guests.
        icon_spans = html_parsed.select(".product-icon-item span")

        address = _try_field(
            URL, "ADDRESS ERROR!",
            lambda: icon_spans[0].getText().strip())
        guest = _try_field(
            URL, "GUEST NUMBER ERROR!",
            lambda: _leading_int(icon_spans[3].getText()))
        bedroom = _try_field(
            URL, "BEDROOM NUMBER ERROR!",
            lambda: _leading_int(icon_spans[1].getText()))
        bathroom = _try_field(
            URL, "BATHROOM NUMBER ERROR!",
            lambda: _leading_int(icon_spans[2].getText()))
        # All description paragraphs except the last one, joined by newlines
        # and NFKD-normalized.
        description = _try_field(
            URL, "DESCRIPTION ERROR!",
            lambda: unicodedata.normalize(
                "NFKD",
                "\n".join(desc.getText().strip()
                          for desc in html_parsed.select(".row.pd-det-desc p")[:-1])))
        avg_price = _try_field(
            URL, "PRICE ERROR!",
            lambda: _average_price(html_parsed))
        villa_photos = _try_field(
            URL, "PHOTO ERROR",
            lambda: tuple(villa_photo_url["href"]
                          for villa_photo_url in html_parsed.select("a.d-none")))

    # Coordinates are not extracted from the page; the columns are placeholders.
    latitude = None
    longitude = None

    if avg_price is None:
        q1_price = q2_price = q3_price = q4_price = None
    else:
        q1_price, q2_price, q3_price, q4_price = (
            avg_price * 4 * weight for weight in QUARTER_WEIGHTS
        )

    guests.append(guest)
    bedrooms.append(bedroom)
    addresses.append(address)
    bathrooms.append(bathroom)
    descriptions.append(description)
    latitudes.append(latitude)
    longitudes.append(longitude)
    average_prices.append(avg_price)
    q1_prices.append(q1_price)
    q2_prices.append(q2_price)
    q3_prices.append(q3_price)
    q4_prices.append(q4_price)
    photos.append(villa_photos)

    # Report progress only after a successful download, so a failed URL cannot
    # re-print the previous milestone (or print "0 villas").
    if html_parsed is not None and villa_number % PROGRESS_STEP == 0:
        print(f"{villa_number} villas have been completed!")

print("THE OPERATION IS COMPLETED!")
print(f"TOTAL VILLA NUMBER: {villa_number}")
if URL_ERRORS:
    print(f"URLS THAT COULD NOT BE LOADED: {len(URL_ERRORS)}")

75 villas have been completed!
150 villas have been completed!
225 villas have been completed!
300 villas have been completed!
375 villas have been completed!
https://www.villapaketi.com/villa-neco-bati : ADDRESS ERROR!
https://www.villapaketi.com/villa-neco-bati : GUEST NUMBER ERROR!
https://www.villapaketi.com/villa-neco-bati : BEDROOM NUMBER ERROR!
https://www.villapaketi.com/villa-neco-bati : BATHROOM NUMBER ERROR!
https://www.villapaketi.com/villa-neco-bati : PRICE ERROR!
https://www.villapaketi.com/villa-idil-saribelen : ADDRESS ERROR!
https://www.villapaketi.com/villa-idil-saribelen : GUEST NUMBER ERROR!
https://www.villapaketi.com/villa-idil-saribelen : BEDROOM NUMBER ERROR!
https://www.villapaketi.com/villa-idil-saribelen : BATHROOM NUMBER ERROR!
https://www.villapaketi.com/villa-idil-saribelen : PRICE ERROR!
https://www.villapaketi.com/villa-rayna : ADDRESS ERROR!
https://www.villapaketi.com/villa-rayna : GUEST NUMBER ERROR!
https://www.villapaketi.com/villa-rayna : BEDROOM N

In [6]:
# Detail-page URLs kept as a hand-maintained list of pages that reported
# errors; the cell simply echoes them, one per line.
error_list = [
    "https://www.villapaketi.com/villa-le",
    "https://www.villapaketi.com/villa-sandak-3",
    "https://www.villapaketi.com/villa-pearl-kalkan",
    "https://www.villapaketi.com/villa-atilgan-deluxe",
    "https://www.villapaketi.com/villa-sule-",
    "https://www.villapaketi.com/villa-bogazici-mini",
    "https://www.villapaketi.com/villa-yaz-1",
    "https://www.villapaketi.com/villa-rayna",
    "https://www.villapaketi.com/villa-idil-saribelen",
    "https://www.villapaketi.com/villa-neco-bati",
]

# Same output as printing each entry in a loop.
print(*error_list, sep="\n")

https://www.villapaketi.com/villa-le
https://www.villapaketi.com/villa-sandak-3
https://www.villapaketi.com/villa-pearl-kalkan
https://www.villapaketi.com/villa-atilgan-deluxe
https://www.villapaketi.com/villa-sule-
https://www.villapaketi.com/villa-bogazici-mini
https://www.villapaketi.com/villa-yaz-1
https://www.villapaketi.com/villa-rayna
https://www.villapaketi.com/villa-idil-saribelen
https://www.villapaketi.com/villa-neco-bati


In [4]:
# Build three Excel exports from the scraped lists, each indexed by "ID":
#   villapaketi_raw.xlsx        every column
#   villapaketi_info.xlsx       descriptive columns only
#   villapaketi_financial.xlsx  price columns only

INFO_COLUMNS = [
    "ID", "Name", "Address", "Guest Number", "Bedroom Number",
    "Bathroom Number", "Description", "Photo URLs", "Latitude", "Longitude",
]
FINANCIAL_COLUMNS = [
    "ID", "Average Price", "Q1 Price", "Q2 Price", "Q3 Price", "Q4 Price",
]


def _export_indexed(columns, path):
    """Turn ``columns`` into a DataFrame indexed by "ID", write it to ``path``, return it."""
    frame = pd.DataFrame(columns).set_index("ID")
    frame.to_excel(path)
    return frame


data_raw = dict([
    ("ID", ids),
    ("Name", names),
    ("Address", addresses),
    ("Guest Number", guests),
    ("Bedroom Number", bedrooms),
    ("Bathroom Number", bathrooms),
    ("Latitude", latitudes),
    ("Longitude", longitudes),
    ("Description", descriptions),
    ("Photo URLs", photos),
    ("Average Price", average_prices),
    ("Q1 Price", q1_prices),
    ("Q2 Price", q2_prices),
    ("Q3 Price", q3_prices),
    ("Q4 Price", q4_prices),
])
df_raw = _export_indexed(data_raw, "villapaketi_raw.xlsx")

# The two narrower exports reuse the very same lists, picked by column name
# in the order each workbook expects.
data_info = {column: data_raw[column] for column in INFO_COLUMNS}
df_info = _export_indexed(data_info, "villapaketi_info.xlsx")

data_financial = {column: data_raw[column] for column in FINANCIAL_COLUMNS}
df_financial = _export_indexed(data_financial, "villapaketi_financial.xlsx")