<a href="https://colab.research.google.com/github/Joshuaiwuoha/Lekki-webscraping/blob/main/Lekki_web_scraping_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Web Scraping

In [None]:
import requests

In [None]:
# Landing pages of the Lekki (Lagos) listings: rentals first, then sales
my_urls = [
    'https://www.propertypro.ng/property-for-rent/in/lagos/lekki',
    'https://www.propertypro.ng/property-for-sale/in/lagos/lekki/',
]

# One requests.Response per landing page, in the same order as my_urls
page_url = []

for each in my_urls:
  # A timeout makes a stalled connection raise instead of hanging the notebook
  page_url.append(requests.get(each, timeout=30))

In [None]:
# Decoded HTML text of every landing-page response, in request order
page_content = [response.text for response in page_url]

In [None]:
from bs4 import BeautifulSoup

In [None]:
# One BeautifulSoup tree per landing page, parsed with the built-in 'html.parser'
doc = [BeautifulSoup(markup, 'html.parser') for markup in page_content]

In [None]:
# Number of result pages behind each landing page, in the same order as doc
num_pages = []
for each in doc:
  # Space-separated tokens of the first 'property-number-left' div.
  # The last token is read as the total number of listings and the fourth
  # token as the number of listings shown per page.
  counter_tokens = each.find_all('div', class_='property-number-left')[0].text.strip().split(' ')
  total_listings = int(counter_tokens[-1])
  listing_per_page = int(counter_tokens[3])
  # Round up: a final, partially filled page is still a page to visit
  num_pages.append((total_listings + listing_per_page - 1) // listing_per_page)

In [None]:
# Display the computed page counts (one entry per parsed landing page)
num_pages

[120, 307]

In [None]:
# Site root that the "next page" hrefs are appended to.
# Despite its name, this value is used as a URL prefix.
suffix = 'https://www.propertypro.ng'

# CSS class for navigation links
nav_bar_class = 'page-link'

# Each list starts with one landing page and grows by one URL for every
# "next" link that is followed: first_pages from my_urls[0], second_pages
# from my_urls[1].
first_pages = [my_urls[0]]
second_pages = [my_urls[1]]

# Walk the pagination of each landing page in turn
for page_list, page_count in zip((first_pages, second_pages), num_pages):

  # page_count pages means page_count - 1 "next" links to follow
  for k in range(page_count - 1):
    # Separate names are used so the page_url / page_content / doc lists
    # built in the earlier cells are not overwritten. The timeout makes a
    # stalled connection raise instead of hanging the notebook.
    listing_response = requests.get(page_list[k], timeout=30)
    listing_doc = BeautifulSoup(listing_response.text, 'html.parser')

    # Anchor pointing at the next results page
    nav_tags = listing_doc.find_all('a', class_=nav_bar_class, alt='view next property page')

    # No "next" anchor on this page: stop here instead of raising IndexError
    if not nav_tags:
      break

    page_list.append(suffix + nav_tags[0]['href'])



In [None]:
# Show how many page URLs were collected in first_pages and in second_pages
len(first_pages),len(second_pages)

(120, 307)

In [None]:
# Accumulators for the scraped values; each starts out as its own empty list
prices, location, apartment_type = [], [], []   # price, location, property type
update, status = [], []                         # update status, furnishing status
baths, toilets = [], []                         # bathroom count, toilet count

# CSS class names used to locate each attribute in a listing page.
# Location and update status are read from elements with the same class.
location_class = update_class = 'single-room-text'
price_class = 'listings-price'
apartment_class = 'listings-property-title2'
status_class = 'furnished-btn'


def extract(webpages):
  """
  Download every page in `webpages` and append the scraped values to the
  module-level lists prices, location, apartment_type, status, update,
  baths and toilets.

  webpages: iterable of page URLs to download.

  Returns None. Values are appended, so calling the function again adds to
  whatever the lists already hold. Each attribute is collected with its own
  find_all, so the lists are not guaranteed to end up with equal lengths.

  Raises IndexError or ValueError when the text of a matched element does
  not have the layout the parsing below expects.
  """
  for page in webpages:
    # The timeout makes a stalled connection raise instead of hanging
    response = requests.get(page, timeout=30)
    page_doc = BeautifulSoup(response.text, 'html.parser')

    # Price: second space-separated token, text before any '/', commas removed
    for tag in page_doc.find_all('h3', class_=price_class):
      prices.append(int(tag.text.strip().split(' ')[1].split('/')[0].replace(',', '')))

    # Location: third line of the element's raw text
    for tag in page_doc.find_all('div', class_=location_class):
      location.append(tag.text.split('\n')[2])

    # Property type: full text of the title element
    for tag in page_doc.find_all('h3', class_=apartment_class):
      apartment_type.append(tag.text)

    # Status: the lines of the stripped text joined with commas
    for tag in page_doc.find_all('div', class_=status_class):
      status.append(','.join(tag.text.strip().split('\n')))

    # Update status: seventh line of the stripped text
    for tag in page_doc.find_all('div', class_=update_class):
      update.append(tag.text.strip().split('\n')[6])

    # Bathrooms and toilets: second and third line of the 'fur-areea' text
    for tag in page_doc.find_all('div', class_='fur-areea'):
      details = tag.text.strip().split("\n")
      baths.append(details[1])
      toilets.append(details[2])


# First pages (rentals)

In [None]:
# Scrape every URL in first_pages; extract() appends to the module-level lists
extract(first_pages)

In [None]:
# Show how many values each list holds after the scrape
len(prices),len(location),len(apartment_type),len(status),len(update),len(baths),len(toilets)

(6000, 6240, 6000, 6240, 6240, 6240, 6240)

In [None]:
# Trim the longer lists to the number of scraped prices. The count is read
# from the data instead of being hard-coded, so the cell still works when the
# site returns a different number of listings.
# NOTE(review): this only equalises lengths by dropping the tail; it does not
# verify that position i of every list belongs to the same listing.
row_count = len(prices)
location = location[:row_count]
status = status[:row_count]
update = update[:row_count]
baths = baths[:row_count]
toilets = toilets[:row_count]

In [None]:
import pandas as pd

# Column name -> list of scraped values; the key order is the column order
frame = {
    'location': location,
    'apartment_type': apartment_type,
    'baths': baths,
    'toilets': toilets,
    'status': status,
    'last_updated': update,
    'price': prices,
}
data_df = pd.DataFrame(frame)

In [None]:
# Write the table to a file named 'lekki rentals' (no extension), without the index column
data_df.to_csv('lekki rentals',index = False)

In [None]:
# Read the saved file back from /content and show its (rows, columns) shape
pd.read_csv('/content/lekki rentals').shape

(6000, 7)

# Second pages (sales)

In [None]:
# Scrape every URL in second_pages. extract() appends to the same module-level
# lists, which still hold the rows kept from the first_pages run.
extract(second_pages)

In [None]:
# Show how many values each list holds now (earlier rows plus the new scrape)
len(prices),len(location),len(apartment_type),len(status),len(update),len(baths),len(toilets)

(21100, 21944, 21100, 21944, 21944, 21944, 21944)

In [None]:
# Every list still begins with the rows kept from the first_pages run, one per
# row of data_df (which at this point is the table built from that run).
# Drop that prefix so only the rows scraped from second_pages remain.
# Run this cell once per scrape: running it again would drop more rows.
already_saved = len(data_df)
prices = prices[already_saved:]
apartment_type = apartment_type[already_saved:]

# Trim the longer lists to the number of newly scraped prices.
# NOTE(review): this only equalises lengths by dropping the tail; it does not
# verify that position i of every list belongs to the same listing.
new_rows = len(prices)
location = location[already_saved:already_saved + new_rows]
status = status[already_saved:already_saved + new_rows]
update = update[already_saved:already_saved + new_rows]
baths = baths[already_saved:already_saved + new_rows]
toilets = toilets[already_saved:already_saved + new_rows]

In [None]:
import pandas as pd

# Rebuild the table from the current contents of the lists;
# the key order below is the column order of the resulting DataFrame
frame = {
    'location': location,
    'apartment_type': apartment_type,
    'baths': baths,
    'toilets': toilets,
    'status': status,
    'last_updated': update,
    'price': prices,
}
data_df = pd.DataFrame(frame)

In [None]:
# Write the table to a file named 'lekki sales' (no extension), without the index column
data_df.to_csv('lekki sales',index = False)