### Web Scrapping with beautiful Soup

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

properties = []
properties_list = []
amenities_dfs_lst = []
base_url = 'https://www.buyrentkenya.com'

# Loop through all pages containing properties
for i in range(1, 69):
    if i == 1:
        url = base_url + '/houses-for-rent/nairobi'
    else:
        url = f'{base_url}/houses-for-rent/nairobi?page={i}'
    
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    houses = soup.find_all('div', {'class': 'propertyTile'})  

    for house in houses:
        # Grab all available details per house
        # Grab name
        if house.find('div', {'class': 'title'}) is None: 
            name = ""
        else:
            name = house.find('div', {'class': 'title'}).text.strip()

        # Grab price
        if house.find('div', {'class': 'price'}) is None: 
            price = ""
        else:
            price = house.find('div', {'class': 'price'}).text.strip()  

        # Grab location
        if house.find('div', {'class': 'location-row'}) is None:  
            location = ""
        else:
            location = house.find('div', {'class': 'location-row'}).find('div').text.strip()  

        # Grab area/size
        if house.find('div', {'class': 'key-attr'}) is None:  
            area = ""
        else:
            area = house.find('div', {'class': 'key-attr'}).text.strip()  

        # Grab no. of bedrooms
        if house.find('div', {'class': 'bedroom'}) is None:  
            bedrooms = ""
        else:
            bedrooms = house.find('div', {'class': 'bedroom'}).text.strip()  

        # Grab no. of bathrooms
        if house.find('div', {'class': 'bathroom'}) is None:  
            bathrooms = ""
        else:
            bathrooms = house.find('div', {'class': 'bathroom'}).text.strip()  

        # Create a dictionary with property details
        property_details = {'Name': name, 'Price': price, 'Location': location,
                            'Area': area, 'Bedrooms': bedrooms, 'Bathrooms': bathrooms}

        # Append the dictionary created above to the empty list initialized on top
        properties.append(property_details)

        # Click on property to access the next page with full property details
        # Fetch next page link 'a-href'
        next_page_link = house.find('a')['href']  

        # Concatenate the next page link to the base URL and create a soup object
        next_pg_url = base_url + next_page_link
        next_pg_response = requests.get(next_pg_url)
        next_pg_soup = BeautifulSoup(next_pg_response.content, 'html.parser')

        # Locate the table containing property amenities
        amenities = next_pg_soup.find('div', {'class': 'amenities'}).find_all('div', {'class': 'list'})

        amenities_lst = []

        # Loop through the amenities available
        for amenity in amenities:
            amenity = amenity.text.strip()

            # Create dictionary with a value of 1 for each available amenity and append dict in list
            amenity_dict = {amenity: 1}
            amenities_lst.append(amenity_dict)

        # Create a mega dictionary with all dictionaries in the amenities_lst list
        amenities_dict = {key: value for d in amenities_lst for key, value in d.items()}

        # Dump the dictionary into a pandas dataframe
        amenity_df = pd.DataFrame(amenities_dict, index=[0], dtype='string')

        # Add a new column for property name and set the column as an index - to help with merging later on
        amenity_df['Name'] = name
        amenity_df = amenity_df.set_index('Name')

        # Append the dataframe to the initialized empty list
        amenities_dfs_lst.append(amenity_df)

    time.sleep(3)

# Create a mega list using all property dictionaries in the properties list
properties_list = [{key: value.strip() for key, value in properti.items()} for properti in properties]

In [None]:
#Create a pandas dataframe using the mega list with property details from the main page
property_df = pd.DataFrame(properties_list)

#Set the index column to help with merging later on
property_df.set_index('Name')

#Create a pandas dataframe using the mega list with amenities available from the second -amenities- page
amenities_df = pd.concat(amenities_dfs_lst, axis=0, ignore_index=False)
amenities_df

#Merge the two dataframes - properties and amenities - on their index column
df = pd.merge(property_df, amenities_df, on='Name')

# Save your 'df' into a csv file
df.to_csv('apartment_price.csv', index=False)

In [25]:
import pandas as pd
import numpy as np

In [26]:
df = pd.read_csv("./apartment_price.csv")

In [27]:
df.head()

Unnamed: 0,Location,Distance_to_CBD,Bedrooms,Bathrooms,Floor_Size,Price,Amenities
0,Kileleshwa,5.4,2,1,145,94322,"electric fence, parking, swimming pool, CCTV, ..."
1,Lavington,5.9,4,2,202,299704,"garbage collection, garden"
2,Utawala,22.4,3,3,219,35000,"garden, CCTV, electric fence, parking, swimmin..."
3,Muthaiga,7.0,3,3,141,364585,"garden, gym, garbage collection, electric fence"
4,Lavington,5.9,4,1,230,280463,"parking, garden"


In [24]:
# Different Locations in our dataset
df["Location"].value_counts()

Ruaka         328
Utawala       312
Kileleshwa    310
Lavington     310
Westlands     303
Ruiru         299
Juja          297
Syokimau      285
Muthaiga      278
Kilimani      278
Name: Location, dtype: int64

In [29]:
# Check the minimum and maximum No of 'Bathrooms' in Dataset
df["Bedrooms"].value_counts()

4    805
2    754
1    722
3    719
Name: Bedrooms, dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Location         3000 non-null   object 
 1   Distance_to_CBD  3000 non-null   float64
 2   Bedrooms         3000 non-null   int64  
 3   Bathrooms        3000 non-null   int64  
 4   Floor_Size       3000 non-null   int64  
 5   Price            3000 non-null   int64  
 6   Amenities        2630 non-null   object 
dtypes: float64(1), int64(4), object(2)
memory usage: 164.2+ KB


In [10]:
# or fill missing values with the most frequent value
df['Amenities'] = df['Amenities'].fillna(df['Amenities'].mode()[0])

In [11]:
# Assuming the column with strings separated by commas is named 'column_name'
df['Amenities'] = df['Amenities'].str.split(',')

# Create a list of your five column names
valid_columns = ['garbage collection', 'parking', 'garden', 'swimming pool', 'gym']

# Create new columns and assign 1 or 0 for each match
for column in valid_columns:
    df[column] = df['Amenities'].apply(lambda x: 1 if column in x else 0)

In [12]:
# Delete the 'Amenities' column from the dataframe
df.drop(columns='Amenities', inplace = True)

In [21]:
df.head()

Unnamed: 0,Location,Distance_to_CBD,Bedrooms,Bathrooms,Floor_Size,Price,garbage collection,parking,garden,swimming pool,gym
0,1,5.4,2,1,145,94322,0,0,0,0,0
1,3,5.9,4,2,202,299704,1,0,0,0,0
2,8,22.4,3,3,219,35000,0,0,1,0,0
3,4,7.0,3,3,141,364585,0,0,1,0,0
4,3,5.9,4,1,230,280463,0,1,0,0,0


### Machine Learning Model Building

### Import Necessary Libraries

In [13]:
import xgboost as xgb
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

In [14]:
# Create an instance of LabelEncoder
encoder = LabelEncoder()

# Fit the encoder on the "location" column
df['Location'] = encoder.fit_transform(df['Location'])

In [15]:
X = df.drop('Price', axis=1) # Features
y = df['Price'] # Target column (house prices)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Baseline Model
 - To know if the model you built performs well, we need to have a <b>Naive or Baseline Model</b>. A model that always makes one prediction and it tells us if we are to build a model, then atleast for every Price prediction it has to be Above or Equal to the Baseline model price prediction.

In [16]:
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * len(y_train)

In [17]:
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

print("Mean apt Price:", round(y_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))

Mean apt Price: 104850.34
Baseline MAE: 73873.33


## Observation Above
- If we always predicted that an apartment price is Ksh.104850.34,our predictions would be off by an average of Ksh.73873.33. It also tells us that our model needs to have mean absolute error below Ksh.- If we always predicted that an apartment price is Ksh.104850.34,our predictions would be off by an average of Ksh.73873.33. It also tells us that our model needs to have mean absolute error below Ksh.73873.33 in order to be useful.

### XGBoost Model

In [18]:
# Convert train and test sets to DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

# Set XGBoost parameters
params = {
    'objective': 'reg:squarederror', # Use squared error as the objective function for regression
    'max_depth': 3, # Maximum depth of a tree
    'eta': 0.1, # Step size shrinkage used in each boosting step
    'subsample': 0.7, # Subsample ratio of the training instances
    'colsample_bytree': 0.7 # Subsample ratio of columns when constructing each tree
}

In [19]:
# Train the XGBoost model
model = xgb.train(params, dtrain) 

# Make predictions on the test set
predictions = model.predict(dtest)

# Evaluate the model
mae = mean_absolute_error(y_test, predictions)
# print(f"Mean Squared Error: {np.sqrt(mse)}")

In [20]:
print(f"Training MAE:{round(mae, 2)}")

Training MAE:31908.75


### Observation Above
- Good news. Our model beats the baseline by over Ksh.40,000! It's a good indicator that our model will be helpful in predicting apartment prices.

In [31]:
df["Floor_Size"].describe()

count    3000.000000
mean      154.947667
std        67.086142
min        50.000000
25%        93.000000
50%       162.000000
75%       203.000000
max       298.000000
Name: Floor_Size, dtype: float64

In [32]:
df["Distance_to_CBD"].describe()

count    3000.0000
mean       14.9659
std        10.3151
min         3.2000
25%         5.4000
50%        19.0000
75%        24.0000
max        32.8000
Name: Distance_to_CBD, dtype: float64