<a href="https://colab.research.google.com/github/MinaAlberDS/Codveda-Internship/blob/master/Web_scrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scraping and collecting the data

### Collecting the house links

In [None]:
# Import the needed libraries
import requests
from bs4 import BeautifulSoup
import time
import numpy as np

# Make the function which will get the houses links
def houses_links(pages, verbose, timeout=30):
    """
    Scrapes real estate listing links from realestate.gov.eg.

    Pages that cannot be fetched (network error, timeout, or a non-200
    status) are reported with a printed message and skipped, so one bad page
    does not discard the links already collected from the others.

    Args:
        pages: An integer representing the number of pages to scrape.
            Pages 1 through ``pages`` are requested; a value below 1
            requests nothing.
        verbose: A boolean indicating whether to print the progress of scraping each page.
        timeout: Seconds to wait for each page response before giving up on
            that page. Passed straight to ``requests.get``.

    Returns:
        A list of strings, where each string is a URL to a real estate listing.
    """
    site_root = "https://realestate.gov.eg"
    base_url = site_root + "/properties?page={}"  # Listing index URL template

    links = []
    for page in range(1, pages + 1):
        start_time = time.perf_counter()  # time each page individually
        page_url = base_url.format(page)

        try:
            # Without a timeout a stalled connection would block the whole run.
            response = requests.get(page_url, timeout=timeout)
        except requests.RequestException as error:
            print(f"failed to scrape this Page: {page} with error: {error}")
            continue

        if response.status_code != 200:
            print(f"failed to scrape this Page: {page} with code: {response.status_code}")
            continue

        soop = BeautifulSoup(response.content, "html.parser")
        # The first three "col-md-4" divs are skipped; presumably they are
        # page layout rather than listing cards -- verify against the site markup.
        houses = soop.find_all("div", class_="col-md-4")[3:]

        for house in houses:
            anchor = house.find("a")
            href = anchor.get("href") if anchor is not None else None
            if not href:
                # A card without a usable link is skipped instead of raising.
                continue
            links.append(site_root + href)

        elapsed = time.perf_counter() - start_time
        if verbose:
            print(f"Page: {page} links have been appended in {elapsed:.4f}s")
    return links

In [None]:
# Scrape all 56 result pages and report progress page by page
links = houses_links(pages=56, verbose=True)

Page: https://realestate.gov.eg/properties?page=1 links have been appended in 1.4963s
Page: https://realestate.gov.eg/properties?page=2 links have been appended in 1.6024s
Page: https://realestate.gov.eg/properties?page=3 links have been appended in 1.3798s
Page: https://realestate.gov.eg/properties?page=4 links have been appended in 1.3941s
Page: https://realestate.gov.eg/properties?page=5 links have been appended in 1.2867s
Page: https://realestate.gov.eg/properties?page=6 links have been appended in 1.4166s
Page: https://realestate.gov.eg/properties?page=7 links have been appended in 1.2617s
Page: https://realestate.gov.eg/properties?page=8 links have been appended in 1.2477s
Page: https://realestate.gov.eg/properties?page=9 links have been appended in 1.9099s
Page: https://realestate.gov.eg/properties?page=10 links have been appended in 1.2873s
Page: https://realestate.gov.eg/properties?page=11 links have been appended in 1.2978s
Page: https://realestate.gov.eg/properties?page=12 l

In [None]:
# Count the listing links held in `links`
len(links)

504

We have 504 houses to scrape.

In [None]:
import json
# Persist the collected listing links to disk as pretty-printed JSON
with open("Links.json", "w") as out_file:
    out_file.write(json.dumps(links, indent=4))

In [None]:
# Reload the saved listing links from disk before scraping each listing
with open("Links.json", "r") as in_file:
    links = json.loads(in_file.read())
# Preview the first five links
links[:5]

['https://realestate.gov.eg/property-details/podia/cairo/cairo/floor-ground-floor-unit-mpr-0054-bin-zayednorth-podia-cairo-cairo-egypt/E109850',
 'https://realestate.gov.eg/property-details/new-garden-city/new-administrative-capital/cairo/building-g722-floor-4th-floor-unit-g72248-new-garden-city-new-administrative-capital-cairo-cairo-egypt/E111406',
 'https://realestate.gov.eg/property-details/mazarine-apartment/north-coast/matrouh/building-mztf38-floor-1-unit-mztf3822-mazarine-apartment-north-coast-matrouh-egypt/E377760',
 'https://realestate.gov.eg/property-details/alamain-(latin-district)/north-coast/alexandria/building-a13-floor-1-unit-z05-cl12-a13-m7-s2-01-03-alamain-(latin-district)-north-coast-alexandria-egypt/E104533',
 'https://realestate.gov.eg/property-details/mazarine-apartment/north-coast/matrouh/building-mzcs22-floor-ground-floor-unit-mzcs2211-mazarine-apartment-north-coast-matrouh-egypt/E377256']

### Collecting the house data

In [56]:
def _second_paragraph_text(container):
    """Return the stripped text of the second <p> inside `container`, or None if there is none."""
    paragraphs = container.find_all("p")
    return paragraphs[1].get_text(strip=True) if len(paragraphs) > 1 else None


def _parse_house_page(house_soop):
    """Extract one listing's fields from its parsed detail page.

    Args:
        house_soop: The parsed detail page (a BeautifulSoup object).

    Returns:
        A dict with one value per output column. A field whose markup is
        missing is None; 'features' is always a list (possibly empty).
    """
    name = house_soop.find("h6", class_="propertyInfo_name__bs0i7")
    price = house_soop.find("p", class_="propertyInfo_price__8ecPp")

    spec_boxes = house_soop.find_all("div", class_="propertySpecs_spec_box__xLU5D")
    property_type = spec_boxes[0].find("span") if len(spec_boxes) > 0 else None

    # Bedrooms, bathrooms, area and build year are read from spans 1-4 of the
    # second spec box, so that box must exist and hold at least five spans.
    # With fewer spans the positions cannot be trusted and all four stay None.
    spec_spans = spec_boxes[1].find_all("span") if len(spec_boxes) > 1 else []
    if len(spec_spans) > 4:
        bedrooms, bathrooms, sqm, year_built = [
            span.text.strip() for span in spec_spans[1:5]
        ]
    else:
        bedrooms = bathrooms = sqm = year_built = None

    # Every "col-md-6" div that contains an <h2> contributes one feature.
    features = []
    for feature_div in house_soop.find_all("div", class_="col-md-6"):
        heading = feature_div.find("h2")
        if heading is not None:
            features.append(heading.get_text(strip=True))

    # City and governorate are the second <p> of the 2nd and 3rd address rows.
    city = governorate = None
    info_rows = house_soop.find_all("div", class_="propertyData_address_info_row__up8MX")
    if len(info_rows) > 2:
        city = _second_paragraph_text(info_rows[1])
        governorate = _second_paragraph_text(info_rows[2])

    address_row = house_soop.find(
        "div",
        class_="propertyData_address_info_row__up8MX propertyData_address_row__vZiYP",
    )
    address = _second_paragraph_text(address_row) if address_row is not None else None

    description_box = house_soop.find("div", class_="propertyData_property_long_description__mAJPf")
    if description_box is not None:
        # Paragraph texts are concatenated without a separator.
        description = "".join(p.get_text(strip=True) for p in description_box.find_all("p"))
    else:
        description = None

    return {
        'name': name.get_text(strip=True) if name is not None else None,
        'price': price.text.strip() if price is not None else None,
        'property_type': property_type.text.strip() if property_type is not None else None,
        'bedrooms': bedrooms,
        'bathrooms': bathrooms,
        'sqm': sqm,
        'year_built': year_built,
        'features': features,
        'city': city,
        'governorate': governorate,
        'address': address,
        'description': description,
    }


def collect_house_data(houses_links, timeout=30):
    """Scrape the detail page of every listing link into column-oriented data.

    Each link contributes exactly one entry to every column, so the columns
    always have the same length and stay aligned with the input order. A link
    that cannot be fetched or parsed is reported with a printed message and
    recorded as a row of None values.

    Args:
        houses_links: An iterable of listing URLs. (The name shadows the
            `houses_links` function inside this function only.)
        timeout: Seconds to wait for each HTTP response before treating the
            link as failed. Passed straight to ``requests.get``.

    Returns:
        A dict mapping each column name ('name', 'price', 'property_type',
        'bedrooms', 'bathrooms', 'sqm', 'year_built', 'features', 'city',
        'governorate', 'address', 'description') to a list of values.
    """
    all_houses_data = {
        'name': [],
        'price': [],
        'property_type': [],
        'bedrooms': [],
        'bathrooms': [],
        'sqm': [],
        'year_built': [],
        'features': [],
        'city': [],
        'governorate': [],
        'address': [],
        'description': []
    }

    for house_link in houses_links:
        try:
            house_response = requests.get(house_link, timeout=timeout)
            house_response.raise_for_status()  # treat HTTP error pages as failures
            house_soop = BeautifulSoup(house_response.content, "html.parser")
            record = _parse_house_page(house_soop)
        except Exception as e:
            print(f"Error scraping {house_link}: {e}")
            record = dict.fromkeys(all_houses_data)  # one all-None row

        # Append the whole row at once: a failure part-way through a page can
        # never leave some columns longer than others.
        for key, column in all_houses_data.items():
            column.append(record[key])

    return all_houses_data

In [58]:
# Keep the result in a variable: the scrape issues one request per link, so a
# discarded return value could only be recovered by scraping everything again.
houses_data = collect_house_data(links)

KeyboardInterrupt: 