## House Listing Data Scraping using python
Scrape house listing data from `buyrentkenya.com` using python.

In [46]:
#import necessary libraries
import requests
from bs4 import BeautifulSoup
import time
import random
import csv
import json

import numpy as np
import pandas as pd


In [47]:
#clearly define the base url and add extensions from there by concactinating them to the base url
BASE_URL = "https://www.buyrentkenya.com"

url_template = BASE_URL + "/houses-for-rent"



In [51]:
#end_page variable
end_page = 17

# First page
response = requests.get(url_template)
soup = BeautifulSoup(response.text, "html.parser")


In [52]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width,initial-scale=1.0,viewport-fit=cover,user-scalable=no" name="viewport"/>
  <link href="https://www.buyrentkenya.com/manifest.json" rel="manifest"/>
  <meta content="#7d0810" name="theme-color"/>
  <meta content="yes" name="mobile-web-app-capable"/>
  <meta content="BRK" name="application-name"/>
  <link href="https://assets.buyrentkenya.com/theme/brk/assets/favicon-1d324121.ico" rel="icon"/>
  <meta content="yes" name="apple-mobile-web-app-capable"/>
  <meta content="black" name="apple-mobile-web-app-status-bar-style"/>
  <meta content="BRK" name="apple-mobile-web-app-title"/>
  <link href="https://assets.buyrentkenya.com/theme/brk/assets/icon-512x512-bd19106a.png" rel="apple-touch-icon"/>
  <link href="https://assets.buyrentkenya.com/theme/brk/assets/splash-640-1136-f4e755bb.png" media="(device-width: 320px) and (device-height:

Find all the tags containing the listing URLS and associated JSON data

In [53]:
# Find all <a> tags containing the listing URLs and associated JSON data
listing_links = soup.find_all("a", {"class": "no-underline"})
print(listing_links)

[<a class="mr-3 font-bold uppercase text-white no-underline" x-bind:href="redirect" x-text="action"></a>, <a class="border-b text-black block whitespace-nowrap py-4 no-underline transition-colors duration-75 visited:text-black hover:text-secondary-500 hover:transition-colors hover:duration-300" href="https://www.buyrentkenya.com/houses-for-sale">
Houses for Sale
</a>, <a class="border-b text-black block whitespace-nowrap py-4 no-underline transition-colors duration-75 visited:text-black hover:text-secondary-500 hover:transition-colors hover:duration-300" href="https://www.buyrentkenya.com/flats-apartments-for-sale">
Apartments for Sale
</a>, <a class="border-b text-black block whitespace-nowrap py-4 no-underline transition-colors duration-75 visited:text-black hover:text-secondary-500 hover:transition-colors hover:duration-300" href="https://www.buyrentkenya.com/land-for-sale">
Land for Sale
</a>, <a class="text-black block whitespace-nowrap py-4 no-underline transition-colors duration

Create a file named `house_listings` and save it as a `.csv`.

In [54]:
# Open the CSV file in append mode
with open("house_listings.csv", "a", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    
    for link in listing_links:
        href = link.get("href")  # Use get method to avoid KeyError
        if not href:
            print("Href attribute not found on the first page")
            continue  # Skip if href attribute is not present
        
        json_data = {}
        for attribute, value in link.attrs.items():
            if "wire:click.prevent" in value:
                json_data = json.loads(value.split(",")[-2])
                break
        
        if not json_data:
            print("JSON data not found for the link on the first page")
            continue  # Skip if JSON data is not found
        
        item_name = json_data.get("item_name")
        price = json_data.get("itemPrice")
        location = json_data.get("propertyArea")
        status = json_data.get("propertyStatus")
        
        # Extracting the location and unique ID from the href
        location_id = href.split("-rent-")[1]  # Extract anything after '-rent-', e.g., 'runda-3675245'
        unique_id, location_name = location_id.split("-", 1)  # Splitting the location_id into unique_id and location_name
        
        # Write data to CSV file
        writer.writerow([item_name, price, location_name, unique_id, status, BASE_URL + href])

    # Introduce a random delay before scraping subsequent pages
    time.sleep(random.uniform(1, 3))  # Wait between 1 and 3 seconds

Href attribute not found on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the first page
JSON data not found for the link on the

In [34]:
for page in range(1, end_page + 1):
        response = requests.get(url_template.format(page))
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all elements matching the provided XPath
        listing_elements = soup.find_all(xpath="/html/body/div[2]/div/div/div[4]/div[1]/div[1]/div/div[1]/div[3]/div/div[2]/div/div[1]/div/span/div[*]/div/div/div[2]")
# /html/body/div[2]/div/div/div[4]/div[1]/div[1]/div/div[1]/div[3]/div/div[2]/div/div[1]/div/span/div[8]/div/div/div[2]
        for element in listing_elements:
            # Extract href attribute
            href = element.get("href")
            if not href:
                print("Href attribute not found for a listing")
                continue

            # Extract JSON data
            json_data = {}
            for attribute, value in element.attrs.items():
                if "wire:click.prevent" in value:
                    json_data = json.loads(value.split(",")[-2])
                    break

            if not json_data:
                print("JSON data not found for a listing")
                continue

            item_name = json_data.get("item_name")
            price = json_data.get("itemPrice")
            location = json_data.get("propertyArea")
            status = json_data.get("propertyStatus")

            # Extracting the location and unique ID from the href
            location_id = href.split("-rent-")[1]
            unique_id, location_name = location_id.split("-", 1)

            # Write data to CSV file
            writer.writerow([item_name, price, location_name, unique_id, status, BASE_URL + href])

        # Introduce a random delay before scraping subsequent pages
        # time.sleep(random.uniform(1, 3))

In [35]:
fetch_unique_ids(url_template, end_page)


Href attribute not found on page 1


IndexError: list index out of range

In [33]:
df = pd.read_csv('./house_listings.csv')
df.head()

EmptyDataError: No columns to parse from file

In [58]:
import scrapy
import csv

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['https://www.buyrentkenya.com/houses-for-rent']

    def parse(self, response):
        # Extracting the desired element using XPath
        target_element = response.xpath("/html/body/div[2]/div/div/div[4]/div[1]/div[1]/div/div[1]/div[3]/div/div[2]/div/div[1]/div/span/div[7]/div/div/div[2]/div[1]/div[1]/h5[1]/a")

        # Extracting specific data from the target element, such as text or attribute values
        item_name = target_element.xpath("./text()").get()
        href = target_element.xpath("./@href").get()

        # Writing the data to a CSV file
        with open('data.csv', 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([item_name, href])



In [60]:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
# Initialize a CrawlerProcess
process = CrawlerProcess(get_project_settings())

# Start the spider
process.crawl(MySpider)
process.start()

2024-02-17 13:53:51 [scrapy.utils.log] INFO: Scrapy 2.8.0 started (bot: scrapybot)
2024-02-17 13:53:51 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.1.0, parsel 1.6.0, w3lib 1.21.0, Twisted 22.10.0, Python 3.11.4 (main, Jul  5 2023, 14:15:25) [GCC 11.2.0], pyOpenSSL 23.2.0 (OpenSSL 1.1.1w  11 Sep 2023), cryptography 41.0.2, Platform Linux-6.5.0-18-generic-x86_64-with-glibc2.35
2024-02-17 13:53:51 [scrapy.crawler] INFO: Overridden settings:
{}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-02-17 13:53:51 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2024-02-17 13:53:51 [scrapy.extensions.telnet] INFO: Telnet Password: 3e28dbcbd5fbecb4
2024-02-17 13:53:51 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage