<a href="https://colab.research.google.com/github/MinaAlberDS/Codveda-Internship/blob/master/Level%201/Task%201/Web_scrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scrapping, and collecting the data

### Collecting the Real Estates links

In [14]:
# Import the needed libraries
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import pandas as pd
# Make the function which will get the real_estates links
def real_estates_links(pages:int, verbose:bool):
    """
    Scrapes real estate listing links from realestate.gov.eg.

    Args:
        pages: An integer representing the number of pages to scrape.
        verbose: A boolean indicating whether to print the progress of scraping each page.

    Returns:
        A list of strings, where each string is a URL to a real estate listing.
    """
    base_url = r"https://realestate.gov.eg/properties?page={}" # Base url template

    links = [] # links list
    for page in range(1,pages+1): # This for loop to get the 9 real_estates links in each page
        start_time = time.perf_counter() # start a stop watch for each page
        page_url = base_url.format(page) # Set the page url from the baseline url using f-string or format
        response = requests.get(page_url) # Get the response of the page url
        if response.status_code == 200: # check if the page can be scrapped or not
          soop = BeautifulSoup(response.content, "html.parser") # set the soop variable to scrape the links from the page html
          real_estates = soop.find_all("div", class_="col-md-4")[3:] # get the real_estates links

          for real_estate in real_estates: # Add the real_estates links to the links list
              link = r"https://realestate.gov.eg" + real_estate.find("a")["href"] # Set the real_estate url
              links.append(link) # add it to the links list
          end_time = time.perf_counter() # stop the timer
          elapsed = end_time - start_time # count the elapsed time
          if verbose: # if the verbose is true, it will show some useful information about each page
              print(f"Page: {page} links have been appended in {elapsed:.4f}s")
        else: # if the page cannot be scraped
          print(f"failed to scrape this Page: {page} with code: {response.status_code}")
        time.sleep(0.5)
    return links # finally retrun the links list

In [11]:
links = real_estates_links(200, True) # running our function with 56 pages, and verbose = True

ConnectionError: HTTPSConnectionPool(host='realestate.gov.eg', port=443): Max retries exceeded with url: /properties?page=1 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000145C3054A50>: Failed to resolve 'realestate.gov.eg' ([Errno 11001] getaddrinfo failed)"))

In [None]:
len(links)

900

We have 504 real_estate to scrape

In [None]:
import json
links = []
# export the real_estates links to a json file
with open("Links.json", "w") as links_json:
    json.dump(links, links_json, indent =4)

In [14]:
# import the json file links to scrap them
with open("Links(1720 row).json", "r") as links_json:
  links = json.load(links_json)
# show the results
links[:5]

['https://realestate.gov.eg/property-details/almaqsad-park/new-administrative-capital/cairo/building-mpe13-floor-6-unit-mpe1362-almaqsad-park-new-administrative-capital-cairo-cairo-egypt/E110594',
 'https://realestate.gov.eg/property-details/mazarine-commercial/mersa-matruh/matrouh/building-mzspc5-floor-1-unit-mzspc513-mazarine-commercial-mersa-matruh-(k)-matrouh-egypt/E111723',
 'https://realestate.gov.eg/property-details/alamain-(latin-district)/north-coast/alexandria/building-a02-floor-7-unit-z03-cl02-a02-x7-07-02-alamain-(latin-district)-north-coast-alexandria-egypt/E360028',
 'https://realestate.gov.eg/property-details/t--residences/north-coast/matrouh/building-c-11-floor-1-unit-14-t--residences-north-coast-matrouh-egypt/E378616',
 'https://realestate.gov.eg/property-details/begonia/new-cairo/cairo/building-building-06-floor-1-unit-mb0612-begonia-new-cairo-cairo-egypt/E109810']

### Collect the real_estates data

In [15]:
import json

def collect_real_estate_data(real_estates_links:list, verbose:bool):
    # Initialize a dictionary to store data for all real_estates
    all_real_estates_data = {
        'name': [],
        'price': [],
        'property_type': [],
        'bedrooms': [],
        'bathrooms': [],
        'sqm': [],
        'year_built': [],
        'features': [],
        'city': [],
        'governorate': [],
        'address': [],
        'description': []
    }

    # Open the JSON file in append mode outside the loop
    with open("real_estates_data.json", "w") as real_estates_data:
        # Initialize the JSON file with an empty list
        json.dump([], real_estates_data)


    for i, real_estate_link in enumerate(real_estates_links): # get the data from each real_estate
        try:
            real_estate_response = requests.get(real_estate_link) # Get the request of the real_estate link
            real_estate_soop = BeautifulSoup(real_estate_response.content, "html.parser") # make the real_estate soop

            # Extract data, handling potential missing elements
            name = real_estate_soop.find("h6", class_= "propertyInfo_name__bs0i7")
            all_real_estates_data['name'].append(name.get_text(strip=True) if name else None)

            price = real_estate_soop.find("p", class_ = "propertyInfo_price__8ecPp")
            all_real_estates_data['price'].append(price.text.strip() if price else None)

            real_estate_div = real_estate_soop.find_all("div", class_= "propertySpecs_spec_box__xLU5D")
            if len(real_estate_div) > 0:
                property_type = real_estate_div[0].find("span")
                all_real_estates_data['property_type'].append(property_type.text.strip() if property_type else None)

                # Extract bedroom, bathroom, sqm, and year built if available

                bedroom = real_estate_div[1].find_all("span")[1]
                all_real_estates_data['bedrooms'].append(bedroom.text.strip() if bedroom else None)
                bathroom = real_estate_div[2].find_all("span")[1]
                all_real_estates_data['bathrooms'].append(bathroom.text.strip() if bathroom else None)
                sqm = real_estate_div[3].find_all("span")[1]
                all_real_estates_data['sqm'].append(sqm.text.strip() if sqm else None)
                year_built = real_estate_div[4].find_all("span")[1]
                all_real_estates_data['year_built'].append(year_built.text.strip() if year_built else None)

            else:
                # Append None for property type and other specs if the main div is not found
                all_real_estates_data['property_type'].append(None)
                all_real_estates_data['bedrooms'].append(None)
                all_real_estates_data['bathrooms'].append(None)
                all_real_estates_data['sqm'].append(None)
                all_real_estates_data['year_built'].append(None)


            real_estate_features = []
            real_estate_features_dv = real_estate_soop.find_all("div", class_ = "col-md-6")
            for real_estate_feature in real_estate_features_dv:
                feature_text = real_estate_feature.find("h2")
                if feature_text:
                    real_estate_features.append(feature_text.get_text(strip=True))
            all_real_estates_data['features'].append(real_estate_features) # Append the list of features


            real_estate_city_div = real_estate_soop.find_all("div", class_= "propertyData_address_info_row__up8MX")
            if len(real_estate_city_div) > 2:
                city = real_estate_city_div[1].find_all("p")
                all_real_estates_data['city'].append(city[1].get_text(strip=True) if len(city) > 1 else None)

                governorate = real_estate_city_div[2].find_all("p")
                all_real_estates_data['governorate'].append(governorate[1].get_text(strip=True) if len(governorate) > 1 else None)
            else:
                # Append None if city and governorate details are not found
                all_real_estates_data['city'].append(None)
                all_real_estates_data['governorate'].append(None)


            address_div = real_estate_soop.find("div", class_="propertyData_address_info_row__up8MX propertyData_address_row__vZiYP")
            if address_div:
                address = address_div.find_all("p")
                all_real_estates_data['address'].append(address[1].get_text(strip=True) if len(address) > 1 else None)
            else:
                # Append None if address is not found
                all_real_estates_data['address'].append(None)

            description_div = real_estate_soop.find("div", class_= "propertyData_property_long_description__mAJPf")
            if description_div:
                 # Extract text from each paragraph tag and join them
                 paragraph_texts = [p.get_text(strip=True) for p in description_div.find_all("p")]
                 full_description = "".join(paragraph_texts)
                 all_real_estates_data['description'].append(full_description)
            else:
                # Append None if description is not found
                all_real_estates_data['description'].append(None)


            # Append the data of the current real_estate to the JSON file
            with open("real_estates_data.json", "r+") as real_estates_data:
                data = json.load(real_estates_data)
                data.append({key: all_real_estates_data[key][-1] for key in all_real_estates_data})  # Append the last added data
                real_estates_data.seek(0) # the begining of the file
                json.dump(data, real_estates_data, indent=4) # Add the real_estate data
                real_estates_data.truncate() # The beining of the file

            if verbose:
                print(f"Scraped data for real_estate {i+1}/{len(real_estates_links)}")

        except Exception as e:
            print(f"Error scraping {real_estate_link}: {e}")
            # Append None for all fields if an error occurs during scraping
            for key in all_real_estates_data:
                all_real_estates_data[key].append(None)

        time.sleep(0.5)
    return all_real_estates_data

In [16]:
real_estates_data = collect_real_estate_data(links, True)

Scraped data for real_estate 1/1720
Scraped data for real_estate 2/1720
Scraped data for real_estate 3/1720
Scraped data for real_estate 4/1720
Scraped data for real_estate 5/1720
Scraped data for real_estate 6/1720
Scraped data for real_estate 7/1720
Scraped data for real_estate 8/1720
Scraped data for real_estate 9/1720
Scraped data for real_estate 10/1720
Scraped data for real_estate 11/1720
Scraped data for real_estate 12/1720
Scraped data for real_estate 13/1720
Scraped data for real_estate 14/1720
Scraped data for real_estate 15/1720
Scraped data for real_estate 16/1720
Scraped data for real_estate 17/1720
Scraped data for real_estate 18/1720
Scraped data for real_estate 19/1720
Scraped data for real_estate 20/1720
Scraped data for real_estate 21/1720
Scraped data for real_estate 22/1720
Scraped data for real_estate 23/1720
Scraped data for real_estate 24/1720
Scraped data for real_estate 25/1720
Scraped data for real_estate 26/1720
Scraped data for real_estate 27/1720
Scraped da

KeyboardInterrupt: 

### Exporting the data

In [6]:
real_estates_df = pd.read_json(r"real_estates_data.json")
real_estates_df # convert to a dataset

Unnamed: 0,name,price,property_type,bedrooms,bathrooms,sqm,year_built,features,city,governorate,address,description
0,AlMaqsad Park,"4,945,000 EGP","Residential, Apartment",3,0,155.00,2025,"[Other, Building Security, Electricity Available]",New Administrative Capital,Cairo,Building MPE13 Floor 6 Unit MPE1362 AlMaqsad P...,Carefully designed to maximize life within nat...
1,Mazarine Commercial,"24,308,000 EGP","Commercial, Retail",0,0,217.00,2025,"[Other, 24 Hour Security, Electricity Available]",Mersa Matruh,Matrouh,Building MZSPC5 Floor 1 Unit MZSPC513 Mazarine...,"Nestled in Egypt’s flourishing Mazarine, Mazar..."
2,Alamain (Latin District),"6,827,000 EGP","Residential, Apartment",1,0,92.86,2025,"[Other, Security Gate, Electricity Available]",North Coast,Alexandria,Building A02 Floor 7 Unit Z03-CL02-A02-X7-07-0...,Latini by Saudi Egyptian Developers (SED) – Ne...
3,T- Residences,"2,945,154 EGP","Residential, Apartment",3,1,130.00,2024,"[24 Hour Security, Electricity Available, Natu...",North Coast,Matrouh,Building C-11 Floor 1 Unit 14 T- Residences No...,"""Torec Developments is a subsidiary of the New..."
4,BEGONIA,"8,405,000 EGP","Residential, Apartment",3,3,143.50,2025,"[Other, Gate Community]",New Cairo,Cairo,Building Building 06 Floor 1 Unit MB0612 BEGON...,"At Begonia, your location is a catalyst for yo..."
...,...,...,...,...,...,...,...,...,...,...,...,...
72,T- Residences,"2,668,259 EGP","Residential, Apartment",3,1,120.00,2024,"[24 Hour Security, Electricity Available, Natu...",North Coast,Matrouh,Building C-12 Floor 6 Unit 62 T- Residences No...,"""Torec Developments is a subsidiary of the New..."
73,Alamain (Latin District),"3,226,000 EGP","Residential, Apartment",1,0,94.61,2025,"[Other, Security Gate, Electricity Available]",North Coast,Alexandria,Building D15 Floor 7 Unit Z02-CL07-D15-X7-07-0...,Latini by Saudi Egyptian Developers (SED) – Ne...
74,Downtown commercial,"20,540,000 EGP","Commercial, Retail",0,0,200.00,2025,"[Other, 24 Hour Security, Electricity Available]",Mersa Matruh,Matrouh,Building DT01 Floor Ground floor Unit DTRG0130...,Designed to be an integrated residential and c...
75,Alamain (Latin District),"6,512,000 EGP","Residential, Apartment",4,0,213.53,2025,"[Other, Security Gate, Electricity Available]",North Coast,Alexandria,Building C06 Floor 1 Unit Z05-CL11-C06-M7-01-0...,Latini by Saudi Egyptian Developers (SED) – Ne...


In [7]:
#Export it as a csv file
real_estates_df.to_csv("real_estates_data.csv", index=False)

I scrapped more 283 rows, so the final output is 1183 rows

In [15]:
re_df = pd.read_csv(r"real_estates_data.csv")
re_df_77 = pd.read_csv(r"real_estates_data(1).csv")
re_df_206 = pd.read_csv(r"real_estates_data(2).csv")

# Concatenate all dataframes vertically (row-wise)
re_df = pd.concat([re_df, re_df_77, re_df_206], ignore_index=True)

re_df

Unnamed: 0,name,price,property_type,bedrooms,bathrooms,sqm,year_built,features,city,governorate,address,description
0,Alamain (Latin District),"6,594,000 EGP","Residential, Apartment",1,0,92.86,2025,"['Other', 'Security Gate', 'Electricity Availa...",North Coast,Alexandria,Building F04 Floor 6 Unit Z02-CL05-F04-X7-06-1...,Latini by Saudi Egyptian Developers (SED) – Ne...
1,Beachfront Tower - B1,"66,104,000 EGP","Residential, Apartment",3,3,398.00,2025,['Other'],Mersa Matruh,Matrouh,Building BTB1 Floor 9th floor Unit BTB1091 Bea...,Explore a lifestyle that allows you to have yo...
2,PODIA,"12,824,000 EGP","Commercial, Office",0,0,93.00,2025,"['Other', '24 Hour Security', 'Fire Alarm', 'G...",Cairo,Cairo,Floor 13th Unit MPO-1304-A Bin ZayedNorth PODI...,Launched by Menassat Developments in cooperati...
3,Mazarine Apartment,"12,857,000 EGP","Residential, Apartment",3,3,252.00,2025,"['24 Hour Security', 'Electricity Available']",Mersa Matruh,Matrouh,Building MZCS15 Floor 3rd floor Unit MZCS1531 ...,The name ‘Mazarine’ came to fruition based on ...
4,Alamain (Latin District),"8,721,000 EGP","Residential, Apartment",3,0,209.42,2025,"['Other', 'Security Gate', 'Electricity Availa...",North Coast,Alexandria,Building E01 Floor 1 Unit Z02-CL09-E01-X4-01-0...,One of the main districts in this new city nei...
...,...,...,...,...,...,...,...,...,...,...,...,...
1178,T- Residences,"2,668,259 EGP","Residential, Apartment",3,1,120.00,2024,"['24 Hour Security', 'Electricity Available', ...",North Coast,Matrouh,Building C-12 Floor 6 Unit 62 T- Residences No...,"""Torec Developments is a subsidiary of the New..."
1179,Alamain (Latin District),"3,226,000 EGP","Residential, Apartment",1,0,94.61,2025,"['Other', 'Security Gate', 'Electricity Availa...",North Coast,Alexandria,Building D15 Floor 7 Unit Z02-CL07-D15-X7-07-0...,Latini by Saudi Egyptian Developers (SED) – Ne...
1180,Downtown commercial,"20,540,000 EGP","Commercial, Retail",0,0,200.00,2025,"['Other', '24 Hour Security', 'Electricity Ava...",Mersa Matruh,Matrouh,Building DT01 Floor Ground floor Unit DTRG0130...,Designed to be an integrated residential and c...
1181,Alamain (Latin District),"6,512,000 EGP","Residential, Apartment",4,0,213.53,2025,"['Other', 'Security Gate', 'Electricity Availa...",North Coast,Alexandria,Building C06 Floor 1 Unit Z05-CL11-C06-M7-01-0...,Latini by Saudi Egyptian Developers (SED) – Ne...


In [18]:
re_df.to_csv("real_estates_data(new).csv", index=False)