# Data Mining

In [3]:
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.keys import Keys
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import re

In [4]:
def load_and_accept_cookies():
    """Open the Zoopla London new-homes results page in Chrome and accept the cookie banner.

    The banner is only clicked when a button labelled exactly
    "Accept all cookies" is present; if the page shows no such button the
    driver is returned as-is instead of failing.

    Returns:
        The Chrome driver created by ``webdriver.Chrome``, after loading the
        results URL.

    Raises:
        Whatever selenium raises while starting Chrome or loading the page.
        If the failure happens after the browser was started, the browser is
        quit before the exception is re-raised.
    """
    driver = webdriver.Chrome('chrome_driver/chromedriver')
    try:
        URL = "https://www.zoopla.co.uk/new-homes/property/london/?q=London&results_sort=newest_listings&search_source=new-homes&page_size=10&pn=1&view_type=list"
        driver.get(URL)
        accept_cookies = driver.find_elements_by_xpath('//button[@data-responsibility="acceptAll"]')

        # Keep the last button whose label matches, as the original loop did.
        relevant_button = None
        for button in accept_cookies:
            if button.text == "Accept all cookies":
                relevant_button = button

        # No banner (already accepted, or layout changed): nothing to click.
        if relevant_button is not None:
            relevant_button.click()
    except Exception:
        # Do not leave an orphaned browser behind when setup fails.
        driver.quit()
        raise
    return driver


def next_button(driver):
    """Pause for three seconds, then click the link containing 'Next' under the content area."""
    next_link_xpath = "//*[@id='content']/div[7]//a[contains(text(), 'Next')]"
    sleep(3)
    driver.find_element_by_xpath(next_link_xpath).click()


def extract_sqft(driver):
    """Return the text of the first ``li`` element that mentions square feet.

    An element matches when its text, lower-cased and with spaces removed,
    contains ``"sqft"`` or ``"sq.ft"``.

    Args:
        driver: object exposing ``find_elements_by_tag_name``.

    Returns:
        The matching element's text exactly as read, or ``None`` when no
        ``li`` element matches.
    """
    for list_element in driver.find_elements_by_tag_name("li"):
        # Read .text once per element: the value tested is then the value
        # returned, and the element is not queried repeatedly.
        raw_text = list_element.text
        compact = raw_text.lower().replace(' ', '')
        if "sqft" in compact or 'sq.ft' in compact:
            return raw_text
    return None

def extract_bedrooms(driver):
    """Return the text of the first ``li`` element mentioning "bedroom", or ``None``.

    Matching is done on the element text lower-cased with spaces removed.
    """
    matching_texts = (
        item.text
        for item in driver.find_elements_by_tag_name("li")
        if "bedroom" in item.text.lower().replace(' ', '')
    )
    return next(matching_texts, None)

def extract_bathrooms(driver):
    """Return the text of the first ``li`` element mentioning "bathroom", or ``None``.

    Matching is done on the element text lower-cased with spaces removed.
    """
    matching_texts = (
        item.text
        for item in driver.find_elements_by_tag_name("li")
        if "bathroom" in item.text.lower().replace(' ', '')
    )
    return next(matching_texts, None)

def extract_reception(driver):
    """Return the text of the first ``li`` element mentioning "reception", or ``None``.

    Matching is done on the element text lower-cased with spaces removed.
    """
    matching_texts = (
        item.text
        for item in driver.find_elements_by_tag_name("li")
        if "reception" in item.text.lower().replace(' ', '')
    )
    return next(matching_texts, None)

def new_search(postcode, n_elements):
    """Search Zoopla new homes for ``postcode`` and scrape up to ``n_elements`` listings.

    A fresh browser is opened via ``load_and_accept_cookies``, the location
    box is filled with ``postcode`` and submitted, and the resulting pages are
    scraped with ``get_properties``.

    Args:
        postcode: text typed into the location search box.
        n_elements: number of listings requested from ``get_properties``.

    Returns:
        Whatever ``get_properties`` returns for this search.

    Raises:
        Any exception raised by selenium or by ``get_properties``; the browser
        is closed before the exception propagates.
    """
    driver = load_and_accept_cookies()
    try:
        search_bar = driver.find_element_by_xpath('//*[@id="form-search-and-refine"]/div[1]/div/div[1]/button')
        search_bar.click()
        search_bar_input = driver.find_element_by_xpath('//*[@id="location"]')
        search_bar_input.clear()
        search_bar_input.send_keys(postcode)
        search_bar_input.send_keys(Keys.ENTER)
        return get_properties(n_elements, driver=driver)
    finally:
        # One browser is started per search; always shut it down so repeated
        # searches do not accumulate Chrome processes.
        driver.quit()

def find_number(text):
    """Extract the first integer that appears in ``text``.

    Comma thousands separators are understood, so ``"£435,500"`` gives
    ``435500``. Only the first number is used: ``"407 sq. ft (38 sq. m)"``
    gives ``407`` rather than gluing the two values together.

    Args:
        text: the string to search. Non-string values (``None``, NaN, numbers)
            are treated as containing no number.

    Returns:
        The number as an ``int``, or ``None`` when ``text`` is not a string or
        contains no digits.
    """
    if not isinstance(text, str):
        return None
    # Either a comma-grouped number (1,250,000) or a plain run of digits.
    match = re.search(r'\d{1,3}(?:,\d{3})+(?!\d)|\d+', text)
    if match is None:
        return None
    return int(match.group(0).replace(',', ''))

def _first_text_by_class(driver, class_name):
    # Text of the first element carrying class_name, or None when there is none.
    matches = driver.find_elements_by_class_name(class_name)
    return matches[0].text if matches else None


def get_properties(num_to_get, driver):
    '''
    Scrape up to ``num_to_get`` listings, starting on the results page the driver shows.

    For each listing the price is read from the results page, the listing is
    opened, its bedroom / bathroom / reception / sqft lines, description and
    address are read, and the browser navigates back. When the current results
    page is exhausted and more listings are still wanted, ``next_button`` is
    used to move to the following page.

    Scraping stops early, returning what has been collected so far, when a
    results page contains no listings. A listing page without a description or
    address records ``None`` for that field.

    Args:
        num_to_get: maximum number of listings to collect.
        driver: selenium driver already showing a results page.

    Returns:
        dict mapping each of "sale_price", "num_bedrooms", "num_bathrooms",
        "reception_rooms", "sqft", "description" and "address" to a list.
        All lists have the same length (one entry per scraped listing) and
        hold the raw text as read from the page, or ``None``.

    Raises:
        Exceptions raised by selenium calls or by ``next_button`` (for example
        when there is no further page) propagate to the caller.
    '''
    data = {"sale_price": [], "num_bedrooms": [], "num_bathrooms": [],  'reception_rooms': [], "sqft": [], "description": [], "address": []}
    PROPERTY_BASE = "//ul[@class='listing-results clearfix js-gtm-list']/li"
    price_xpath = PROPERTY_BASE + "//a[@class='listing-results-price text-price']"
    link_xpath = PROPERTY_BASE + "//a[@class='photo-hover']"

    collected = 0   # listings scraped so far
    position = 0    # index of the next listing on the current results page
    while collected < num_to_get:
        # Give the results page time to (re)load before querying it. The
        # page-exhausted check happens here, after the wait, so it is never
        # evaluated against a page that is still navigating back.
        sleep(3)
        prices = driver.find_elements_by_xpath(price_xpath)
        links = driver.find_elements_by_xpath(link_xpath)
        listings_on_page = min(len(prices), len(links))

        if position >= listings_on_page:
            if position == 0:
                # Nothing to scrape on this page at all: out of results.
                break
            position = 0
            next_button(driver)
            continue

        # Build the whole record first so every column grows together even if
        # something raises half-way through a listing.
        record = {"sale_price": prices[position].text}
        links[position].click()
        sleep(3)

        record["num_bedrooms"] = extract_bedrooms(driver)
        record["num_bathrooms"] = extract_bathrooms(driver)
        record["reception_rooms"] = extract_reception(driver)
        record["sqft"] = extract_sqft(driver)
        record["description"] = _first_text_by_class(driver, "dp-description__text")
        record["address"] = _first_text_by_class(driver, "ui-property-summary__address")
        driver.execute_script("window.history.go(-1)")

        for column, value in record.items():
            data[column].append(value)
        collected += 1
        position += 1
        print(collected, "houses done!")

    return data

In [5]:
# London postcode districts to search, and one (initially empty) result slot per district.
postcode_list = [
    'E1', 'WC1', 'EC1', 'N1',
    'NW1', 'SE1', 'SW1', 'W1',
]
properties = dict((postcode, {}) for postcode in postcode_list)

In [6]:
# Run one search per postcode, asking for 300 listings each, and store the result under that postcode.
for pc in postcode_list:
    properties.update({pc: new_search(pc, 300)})

WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home


# Data Cleaning

In [254]:
# Build one table with a row per listing.
# The scraping cells store results as {postcode: {column: [values]}}; handing
# that nested mapping straight to pd.DataFrame would turn the postcodes into
# columns, so each postcode's listings are framed separately and stacked.
# Any other input (e.g. a flat {column: [values]} mapping) is framed directly.
is_per_postcode = (
    isinstance(properties, dict)
    and len(properties) > 0
    and all(
        isinstance(listings, dict)
        and all(isinstance(column, list) for column in listings.values())
        for listings in properties.values()
    )
)
if is_per_postcode:
    houses_df = pd.concat(
        [pd.DataFrame(listings) for listings in properties.values()],
        ignore_index=True,
    )
else:
    houses_df = pd.DataFrame(properties)

# A missing room line is treated as "0" so it parses to zero rooms later.
room_cols = ['num_bedrooms', 'num_bathrooms', 'reception_rooms']
houses_df[room_cols] = houses_df[room_cols].fillna('0')

# 1 when the price text mentions 'Shared', else 0; a missing price counts as 0.
houses_df['shared'] = (
    houses_df['sale_price'].str.contains('Shared', regex=False, na=False).astype(int)
)

In [255]:
# Turn the scraped text into numbers with find_number.
# Room counts and price are parsed first; rows with no sqft text or no
# parseable price are then dropped before sqft itself is parsed (its missing
# values must be gone before the text is handed to find_number).
text_cols = ['num_bedrooms', 'num_bathrooms', 'reception_rooms', 'sale_price']
houses_df = (
    houses_df
    .assign(**{col: houses_df[col].map(find_number) for col in text_cols})
    .dropna(subset=['sqft', 'sale_price'])
)
# A sqft line without any digits parses to a missing value; drop those rows too
# so no NaN feature reaches the scaler and the model.
houses_df = (
    houses_df
    .assign(sqft=houses_df['sqft'].map(find_number))
    .dropna(subset=['sqft'])
)

In [256]:
# Keep only the listings whose 'shared' flag is 0.
houses_clean = houses_df.loc[houses_df['shared'].eq(0)]

In [257]:
# from sklearn.model_selection import train_test_split
# Feature frame (room counts and floor area) and single-column target frame (price).
X = houses_clean.loc[:, ['num_bedrooms', 'num_bathrooms', 'reception_rooms', 'sqft']]
y = houses_clean.loc[:, ['sale_price']]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [258]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the full feature frame, then standardise that same frame.
sc = StandardScaler().fit(X)
X_train = sc.transform(X)
# X_test = sc.transform(X_test)

# Model Training

In [259]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestRegressor
 
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [80, 90, 100, 110],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [100, 200, 300, 1000]
# }
# regressor = RandomForestRegressor()
# grid_search = GridSearchCV(estimator = regressor, param_grid = param_grid, 
#                           cv = 3, verbose = 2)
# grid_search.fit(X_train, y_train)

In [260]:
# best_grid = grid_search.best_estimator_
# y_pred = best_grid.predict(X_test)

In [261]:
import tensorflow as tf
from tensorflow import keras
# Fully connected regressor: two ReLU hidden layers (64 and 32 units) and one
# linear output unit. Layers are referenced through `keras.layers` because no
# bare `layers` name is imported anywhere in this notebook.
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=[X_train.shape[1]]),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1)])

# Adam with learning rate 0.01, optimising mean absolute error.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss='mae',  optimizer=optimizer)

In [262]:
# Train on the scaled features against the price frame, one sample per batch, for 200 epochs.
history = model.fit(
    x=X_train,
    y=y,
    batch_size=1,
    epochs=200,
    verbose=1,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [263]:
# from sklearn import metrics
# y_pred = model.predict(X_test)
# print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
# print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
# print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [278]:
# Keep the listings whose predicted price exceeds the listed price by more than
# 200k ((prediction - price) / 10000 > 20). An explicit copy is taken so that
# adding columns to `under_value` later does not write into a slice of
# `houses_clean`.
under_value = houses_clean[(model.predict(X_train) - y.values)[:, 0] / 10000 > 20].copy()

In [281]:
# Attach the model's predicted price to each flagged listing.
# The model is run once and the same "more than 200k above the listed price"
# mask used to build `under_value` is applied to its output; `assign` returns
# a new frame instead of writing a column into a slice.
predicted_prices = model.predict(X_train)[:, 0]
price_gap = predicted_prices - y.values[:, 0]
under_value = under_value.assign(predicted=predicted_prices[price_gap / 10000 > 20])

In [282]:
# Display the flagged listings together with their 'predicted' price column.
under_value

Unnamed: 0,sale_price,num_bedrooms,num_bathrooms,reception_rooms,sqft,description,address,shared,predicted
5,435500.0,1,1,1,407,Available to purchase with a 5% deposit using ...,"Selsdon Way, London E14",0,640485.125
7,427000.0,1,1,0,596,This is a stunning 1 bedroom apartment situate...,"New Village Avenue, London E14",0,636847.6875
8,588000.0,2,2,0,844,"A striking, two-bedroom apartment situated on ...","New Village Avenue, London E14",0,790864.6875
9,419000.0,1,1,0,556,This is a prestige 1 bedroom apartment situate...,"New Village Avenue, London E14",0,636864.5625
19,560000.0,2,2,1,790,Last apartment remaining!\n\nReserve the final...,"City Island Way, London E14",0,944079.375
23,661000.0,2,2,1,903,"MA208\nMontague House, London City Island, E14...","Lyell Street, London E14",0,944035.0625
35,425000.0,1,1,1,545,Contract assignment\n\n- not available under t...,"Orchard Wharf, Silvocea Way, London E14",0,640414.8125
36,550000.0,0,0,0,486,Key features:\nSouth Quay Plaza E14\n486 Sq Ft...,"South Quay Plaza, South Quay, Canary Wharf E14",0,771265.0625
39,665000.0,2,2,1,520,This beautiful 2 bedroom apartment is in a new...,"Tiller Road, Crossharbour Plaza, Canary Wharf,...",0,944185.1875
40,380000.0,0,1,1,370,This beautiful studio apartment is in a new de...,"Tiller Road, Crossharbour Plaza, Canary Wharf,...",0,646581.5
