In [51]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [52]:
page_base_url = 'https://mansiondeal.com/public/aasearchrent.php?pn='
house_base_url = 'https://mansiondeal.com/public/'

In [53]:
def scrape_house_page(house_url):
    response = requests.get(house_url)
    if response.status_code != 200:
        print('Error: page fetch  failed with {} error code'.format(response.status_code))
        return
    soup = BeautifulSoup(response.text, 'html.parser')
    try:
        location = soup.find('div', class_='property-info').find('p', class_='area').text.strip()
    except:
        location = np.nan
    try:
        description = soup.find('div', class_='col-lg-9 col-sm-8').find('div', class_='row').find('div', class_='col-lg-8').find('div', class_='spacer').find('p').text.strip()
    except:
        description = np.nan
    
    details = details = soup.find('div', class_='col-lg-9 col-sm-8').find('div', class_='row').find('div', class_='col-lg-4').find('div', class_='col-lg-12 col-sm-6')
    house_details = details.find('div', class_='listing-detail')
    try:
        bedrooms = house_details.find('span', attrs={'data-original-title': 'Bed Room'}).text.strip()
    except:
        bedrooms = np.nan
    try:
        livingrooms = house_details.find('span', attrs={'data-original-title': 'Living Room'}).text.strip()
    except:
        livingrooms = np.nan
    try:
        bathrooms = house_details.find('span', attrs={'data-original-title': 'Bathrooms'}).text.strip()
    except:
        bathrooms = np.nan
    try:
        kitchen = house_details.find('span', attrs={'data-original-title': 'Kitchen'}).text.strip()
    except:
        kitchen = np.nan
    try:
        price = details.find('div', class_='property-info').find('p', class_='price').text.strip()
    except:
        price = np.nan

    amenities = soup.find('div', class_='col-lg-3 col-sm-4 hidden-xs').find('div', class_='hot-properties hidden-xs').find_all('div', class_='row')
    house_amenities = {}
    for amenity in amenities:
        name = amenity.find('div', class_='col-lg-8 col-sm-7').find('p', class_='price').text.strip('')
        if 'Water' in name or 'Electricity' in name:
            house_amenities[name[2:]] = name[0]
        elif 'Garden' in name:
            house_amenities['Garden'] = name
        else:
            house_amenities['Pool'] = name
    house_attributes = {
        'Location': location,
        'Description': description,
        'Bedrooms': bedrooms,
        'Bathrooms': bathrooms,
        'Living Rooms': livingrooms,
        'Kitchen': kitchen,
        'Price': price        
    }
    
    house_attributes.update(house_amenities)
    
    for key, value in house_attributes.items():
        if value == 'n/a' or value == '':
            house_attributes[key] = np.nan
    return house_attributes

In [56]:
def scrape_pages(url):
    page_empty = False
    page_no = 1
    data = pd.DataFrame(columns=['Location', 'Description', 'Bedrooms', 'Bathrooms', 'Living Rooms', 'Kitchen', 'Price'])
    while page_empty == False:
        if page_no == 1:
            response = requests.get('https://mansiondeal.com/public/aredirectrent.php?s=Kenya')
        else:
            response = requests.get(f'{url}{page_no}')
        if response.status_code != 200:
            print(f'Request failed to fetch page {page_no} with Error code {response.status_code}')
        soup = BeautifulSoup(response.text, 'html.parser')
        houses = soup.find_all('div', class_="result")
        house_details = []
        for house in houses:
            relative_url = house.find('a')['href']
            house_url = house_base_url + relative_url
            house_details.append(scrape_house_page(house_url=house_url))

        if page_no == 92:
            page_empty = True
        page_no += 1
        page_dataframe = pd.DataFrame(house_details)
        data = pd.concat([data, page_dataframe])
    return data

In [57]:
data = scrape_pages(page_base_url)

In [59]:
data.to_csv('../data/mansiondeal.csv', index=False)

In [60]:
data.head()

Unnamed: 0,Location,Description,Bedrooms,Bathrooms,Living Rooms,Kitchen,Price,Water,Electricity,Garden,Pool
0,"Karen, Nairobi, Kenya",Located in the affluent Karen suburb is this s...,5,5,1,?,"ksh 400,000",✅,✅,✅ A Garden,NO Swimming Pool
1,"Spring Valley, Nairobi, Kenya",5 Bedroom standalone house available for rent ...,5,5,1,?,"ksh 260,000",✅,✅,✅ A Garden,NO Swimming Pool
2,"Riverside, Nairobi, Kenya",RIVERSIDE FURNISHED 3 BEDROOM COTTAGE FOR RENT...,3,3,1,?,"ksh 180,000",✅,✅,✅ A Garden,✅ Swimming Pool
3,"Riverside, Nairobi, Kenya",Fully furnished five bedroom house plus one be...,5,5,1,?,"ksh 500,000",✅,✅,✅ A Garden,NO Swimming Pool
4,"Kitisuru, Nairobi, Kenya",Lovely 4 bedroom townhouse to let in Kitisuru....,4,5,1,?,"ksh 750,000",✅,✅,✅ A Garden,✅ Swimming Pool


In [61]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1099 entries, 0 to 6
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Location      1099 non-null   object
 1   Description   1099 non-null   object
 2   Bedrooms      1099 non-null   object
 3   Bathrooms     1091 non-null   object
 4   Living Rooms  1099 non-null   object
 5   Kitchen       1099 non-null   object
 6   Price         1099 non-null   object
 7   Water         1099 non-null   object
 8   Electricity   1099 non-null   object
 9   Garden        1099 non-null   object
 10  Pool          1099 non-null   object
dtypes: object(11)
memory usage: 103.0+ KB


In [64]:
data['Garden'].value_counts()

✅ A Garden     1054
NO Garden        45
Name: Garden, dtype: int64