<a href="https://colab.research.google.com/github/MinaAlberDS/Codveda-Internship/blob/master/Web_scrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scraping and collecting the data

### Collecting the house links

In [3]:
# Import the needed libraries
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import pandas as pd
# Make the function which will get the houses links
def houses_links(pages, verbose, timeout=30):
    """
    Scrape real-estate listing links from realestate.gov.eg.

    Args:
        pages: Number of result pages to scrape, starting at page 1.
        verbose: If True, print how long each successfully scraped page took.
        timeout: Seconds to wait for each HTTP response before giving up on
            that page (default 30).

    Returns:
        A list of absolute listing URLs in the order they were found.
        A page that fails (network error or non-200 status) is reported with
        a printed message and contributes no links; the crawl continues.
    """
    from urllib.parse import urljoin  # stdlib; imported here so the cell stays self-contained

    site_root = "https://realestate.gov.eg"
    base_url = site_root + "/properties?page={}"  # results-page URL template
    # The first three "col-md-4" divs of every results page are skipped.
    # NOTE(review): presumably these are non-listing layout blocks - confirm against the live page.
    leading_divs_to_skip = 3

    links = []
    for page in range(1, pages + 1):
        start_time = time.perf_counter()  # time each page individually
        page_url = base_url.format(page)

        try:
            response = requests.get(page_url, timeout=timeout)
        except requests.RequestException as error:
            # A dropped connection or timeout must not abort the whole crawl.
            print(f"failed to scrape this Page: {page} with error: {error}")
            continue

        if response.status_code != 200:
            print(f"failed to scrape this Page: {page} with code: {response.status_code}")
            continue

        soop = BeautifulSoup(response.content, "html.parser")
        houses = soop.find_all("div", class_="col-md-4")[leading_divs_to_skip:]

        for house in houses:
            anchor = house.find("a")
            href = anchor.get("href") if anchor is not None else None
            if not href:
                # A card without a link cannot be scraped later; skip it instead of crashing.
                continue
            # urljoin copes with both "/path" and already-absolute hrefs.
            links.append(urljoin(site_root, href))

        elapsed = time.perf_counter() - start_time
        if verbose:
            print(f"Page: {page} links have been appended in {elapsed:.4f}s")
    return links

In [None]:
links = houses_links(100, True) # scrape the first 100 result pages, with verbose = True

Page: 1 links have been appended in 1.6158s
Page: 2 links have been appended in 1.1227s
Page: 3 links have been appended in 1.1321s
Page: 4 links have been appended in 1.8412s
Page: 5 links have been appended in 1.1842s
Page: 6 links have been appended in 1.0489s
Page: 7 links have been appended in 1.2277s
Page: 8 links have been appended in 1.1783s
Page: 9 links have been appended in 1.2382s
Page: 10 links have been appended in 1.1127s
Page: 11 links have been appended in 1.1664s
Page: 12 links have been appended in 1.3009s
Page: 13 links have been appended in 1.0830s
Page: 14 links have been appended in 1.2597s
Page: 15 links have been appended in 1.1812s
Page: 16 links have been appended in 1.1803s
Page: 17 links have been appended in 1.3764s
Page: 18 links have been appended in 1.2706s
Page: 19 links have been appended in 1.1003s
Page: 20 links have been appended in 1.0703s
Page: 21 links have been appended in 1.1615s
Page: 22 links have been appended in 1.0774s
Page: 23 links have

In [None]:
# Sanity check: number of listing links collected
len(links)

900

We have 900 houses to scrape

In [None]:
import json
# Persist the collected listing links to disk as pretty-printed JSON
with open("Links.json", "w") as links_json:
    links_json.write(json.dumps(links, indent=4))

In [None]:
# Reload the saved listing links so the scraping below can start from the file
with open("Links.json", "r") as links_json:
    links = json.loads(links_json.read())
# Preview the first five links
links[:5]

['https://realestate.gov.eg/property-details/alamain-(latin-district)/north-coast/alexandria/building-f04-floor-6-unit-z02-cl05-f04-x7-06-11-alamain-(latin-district)-north-coast-alexandria-egypt/E358149',
 'https://realestate.gov.eg/property-details/beachfront-tower---b1/mersa-matruh/matrouh/building-btb1-floor-9th-floor-unit-btb1091-beachfront-tower---b1-mersa-matruh-(k)-matrouh-egypt/E111134',
 'https://realestate.gov.eg/property-details/podia/cairo/cairo/floor-13th-unit-mpo-1304-a-bin-zayednorth-podia-cairo-cairo-egypt/E109916',
 'https://realestate.gov.eg/property-details/mazarine-apartment/mersa-matruh/matrouh/building-mzcs15-floor-3rd-floor-unit-mzcs1531-mazarine-apartment-mersa-matruh-(k)-matrouh-egypt/E111209',
 'https://realestate.gov.eg/property-details/alamain-(latin-district)/north-coast/alexandria/building-e01-floor-1-unit-z02-cl09-e01-x4-01-03-alamain-(latin-district)-north-coast-alexandria-egypt/E104257']

### Collecting the house data

In [None]:
import json

# Column order shared by the returned dict and by every record in the JSON file.
_HOUSE_FIELDS = (
    'name', 'price', 'property_type', 'bedrooms', 'bathrooms', 'sqm',
    'year_built', 'features', 'city', 'governorate', 'address', 'description',
)


def _spec_value(spec_boxes, index):
    """Return the text of the second <span> in spec box `index`, or None if it is absent."""
    if index >= len(spec_boxes):
        return None
    spans = spec_boxes[index].find_all("span")
    return spans[1].text.strip() if len(spans) > 1 else None


def _second_paragraph_text(container):
    """Return the text of the second <p> inside `container`, or None if it is absent."""
    paragraphs = container.find_all("p")
    return paragraphs[1].get_text(strip=True) if len(paragraphs) > 1 else None


def _parse_house_page(house_soop):
    """Extract one listing's fields from its parsed page; anything missing becomes None."""
    record = dict.fromkeys(_HOUSE_FIELDS)

    name = house_soop.find("h6", class_="propertyInfo_name__bs0i7")
    record['name'] = name.get_text(strip=True) if name else None

    price = house_soop.find("p", class_="propertyInfo_price__8ecPp")
    record['price'] = price.text.strip() if price else None

    # Spec boxes are read by position: 0 = property type (first span),
    # 1..4 = bedrooms, bathrooms, sqm, year built (second span of each box).
    spec_boxes = house_soop.find_all("div", class_="propertySpecs_spec_box__xLU5D")
    if spec_boxes:
        property_type = spec_boxes[0].find("span")
        record['property_type'] = property_type.text.strip() if property_type else None
    record['bedrooms'] = _spec_value(spec_boxes, 1)
    record['bathrooms'] = _spec_value(spec_boxes, 2)
    record['sqm'] = _spec_value(spec_boxes, 3)
    record['year_built'] = _spec_value(spec_boxes, 4)

    # Every "col-md-6" div that carries an <h2> contributes one feature label.
    feature_headings = (
        feature_div.find("h2")
        for feature_div in house_soop.find_all("div", class_="col-md-6")
    )
    record['features'] = [
        heading.get_text(strip=True) for heading in feature_headings if heading
    ]

    # City and governorate are the 2nd and 3rd address-info rows; both need all three rows present.
    address_rows = house_soop.find_all("div", class_="propertyData_address_info_row__up8MX")
    if len(address_rows) > 2:
        record['city'] = _second_paragraph_text(address_rows[1])
        record['governorate'] = _second_paragraph_text(address_rows[2])

    address_div = house_soop.find(
        "div",
        class_="propertyData_address_info_row__up8MX propertyData_address_row__vZiYP",
    )
    if address_div:
        record['address'] = _second_paragraph_text(address_div)

    description_div = house_soop.find("div", class_="propertyData_property_long_description__mAJPf")
    if description_div:
        # Paragraph texts are concatenated without a separator.
        record['description'] = "".join(
            p.get_text(strip=True) for p in description_div.find_all("p")
        )

    return record


def collect_house_data(houses_links, verbose, timeout=30):
    """
    Scrape the detail page of every listing and collect its fields.

    Each successfully scraped listing is also written to "Houses_data.json"
    straight away, so an interrupted run keeps everything scraped so far.

    Args:
        houses_links: Iterable of listing URLs (must support len()).
        verbose: If True, print a progress line after each successful listing.
        timeout: Seconds to wait for each HTTP response (default 30).

    Returns:
        A dict mapping each field name to a list with exactly one entry per
        input link, in input order. A listing that could not be scraped
        contributes None in every column, so the columns always stay aligned.
        Failed listings are not written to the JSON file.
    """
    all_houses_data = {field: [] for field in _HOUSE_FIELDS}
    scraped_records = []  # mirror of what is currently stored in the JSON file

    # Start from an empty JSON list; this overwrites any previous run's file.
    with open("Houses_data.json", "w") as houses_data:
        json.dump(scraped_records, houses_data)

    for i, house_link in enumerate(houses_links):
        try:
            house_response = requests.get(house_link, timeout=timeout)
            house_response.raise_for_status()  # do not parse an error page as if it were a listing
            house_soop = BeautifulSoup(house_response.content, "html.parser")
            record = _parse_house_page(house_soop)

            # Checkpoint: rewrite the file with all records scraped so far.
            with open("Houses_data.json", "w") as houses_data:
                json.dump(scraped_records + [record], houses_data, indent=4)
            scraped_records.append(record)

            if verbose:
                print(f"Scraped data for house {i+1}/{len(houses_links)}")

        except Exception as e:
            # Best effort: report the failure and keep going with the next listing.
            print(f"Error scraping {house_link}: {e}")
            record = dict.fromkeys(_HOUSE_FIELDS)

        # Append the whole row at once so a mid-page failure can never leave
        # the columns with different lengths.
        for field in _HOUSE_FIELDS:
            all_houses_data[field].append(record[field])

    return all_houses_data

In [None]:
# Scrape every collected link; verbose=True prints one progress line per scraped house
Houses_data = collect_house_data(links, True)

Scraped data for house 1/900
Scraped data for house 2/900
Scraped data for house 3/900
Scraped data for house 4/900
Scraped data for house 5/900
Scraped data for house 6/900
Scraped data for house 7/900
Scraped data for house 8/900
Scraped data for house 9/900
Scraped data for house 10/900
Scraped data for house 11/900
Scraped data for house 12/900
Scraped data for house 13/900
Scraped data for house 14/900
Scraped data for house 15/900
Scraped data for house 16/900
Scraped data for house 17/900
Scraped data for house 18/900
Scraped data for house 19/900
Scraped data for house 20/900
Scraped data for house 21/900
Scraped data for house 22/900
Scraped data for house 23/900
Scraped data for house 24/900
Scraped data for house 25/900
Scraped data for house 26/900
Scraped data for house 27/900
Scraped data for house 28/900
Scraped data for house 29/900
Scraped data for house 30/900
Scraped data for house 31/900
Scraped data for house 32/900
Scraped data for house 33/900
Scraped data for ho

In [32]:
# Load the scraped records into a DataFrame.
# The path is relative so this cell reads the same "Houses_data.json" the scraper
# wrote to the working directory, instead of a Colab-only absolute path.
houses_df = pd.read_json("Houses_data.json")
houses_df

Unnamed: 0,name,price,property_type,bedrooms,bathrooms,sqm,year_built,features,city,governorate,address,description
0,Alamain (Latin District),"6,594,000 EGP","Residential, Apartment",1,0,92.86,2025,"[Other, Security Gate, Electricity Available]",North Coast,Alexandria,Building F04 Floor 6 Unit Z02-CL05-F04-X7-06-1...,Latini by Saudi Egyptian Developers (SED) – Ne...
1,Beachfront Tower - B1,"66,104,000 EGP","Residential, Apartment",3,3,398.00,2025,[Other],Mersa Matruh,Matrouh,Building BTB1 Floor 9th floor Unit BTB1091 Bea...,Explore a lifestyle that allows you to have yo...
2,PODIA,"12,824,000 EGP","Commercial, Office",0,0,93.00,2025,"[Other, 24 Hour Security, Fire Alarm, Gate Com...",Cairo,Cairo,Floor 13th Unit MPO-1304-A Bin ZayedNorth PODI...,Launched by Menassat Developments in cooperati...
3,Mazarine Apartment,"12,857,000 EGP","Residential, Apartment",3,3,252.00,2025,"[24 Hour Security, Electricity Available]",Mersa Matruh,Matrouh,Building MZCS15 Floor 3rd floor Unit MZCS1531 ...,The name ‘Mazarine’ came to fruition based on ...
4,Alamain (Latin District),"8,721,000 EGP","Residential, Apartment",3,0,209.42,2025,"[Other, Security Gate, Electricity Available]",North Coast,Alexandria,Building E01 Floor 1 Unit Z02-CL09-E01-X4-01-0...,One of the main districts in this new city nei...
...,...,...,...,...,...,...,...,...,...,...,...,...
895,Latin City,"7,967,000 EGP","Residential, Apartment",2,1,134.00,2025,"[24 Hour Security, Other]",North Coast,Matrouh,Building Z3CL0401 Floor 5 Unit LC301508 Latin ...,The Latin District is an expansion of the Alex...
896,Alamain (Latin District),"7,063,000 EGP","Residential, Apartment",4,0,229.35,2025,"[Other, Security Gate, Electricity Available]",North Coast,Alexandria,Building A13 Floor 6 Unit Z05-CL11-A13-M7-06-0...,Latini by Saudi Egyptian Developers (SED) – Ne...
897,Central,"41,470,000 EGP","Commercial, Office",0,0,217.17,2025,"[Elevator(s), Gate Community, Electricity Avai...",New Cairo,Cairo,Building O10 Floor 4 Unit O10-04-B Central New...,"True to its name, Central is situated at the c..."
898,Central,"17,350,000 EGP","Commercial, Office",0,0,176.52,2025,"[Elevator(s), Gate Community, Electricity Avai...",New Cairo,Cairo,Building O5 Floor 2 Unit O5-02-M Central New C...,"True to its name, Central is situated at the c..."


In [34]:
# Export the DataFrame to a CSV file, leaving out the index column
houses_df.to_csv("houses_data.csv", index=False)