In [1]:
import pandas as pd 
import numpy as np
import re 
import os

from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

# Shared Nominatim geocoding client, created with the user agent "RentWeb".
# get_lat_long_from_address calls geolocator.geocode(...) on this object.
geolocator = Nominatim(user_agent="RentWeb")

In [2]:
# Postcode reference table; get_lat_long_from_postcode reads its
# 'postcode', 'latitude' and 'longitude' columns.
ukpostcodes = pd.read_csv('../data/ukpostcode_touse.csv')

# Raw listings, restricted to (and ordered by) the columns the cleaning steps use.
df = pd.read_csv('../data/data.csv')[[
    'PropertyId', 'Url', 'Address', 'Price', 'Description', 'PropertyType',
    'Bedrooms', 'Bathrooms', 'FurnishedType', 'ListingDate',
    'PetFriendly', 'Smokers', 'Gardens', 'Parking',
    'LetAvailableDate', 'Deposit',
]]

In [4]:
def extract_amount(currency_string):
    """Parse the first monetary amount found in ``currency_string``.

    The first run of digits (with optional thousands separators and an
    optional decimal part) is taken, so trailing text such as "p.c.m." or a
    second figure like "(£277 pw)" no longer corrupts the result.

    Parameters
    ----------
    currency_string : str or number
        Text such as "£1,200 pcm". Plain numbers are passed through as floats.

    Returns
    -------
    float
        The parsed amount, or ``np.nan`` when no number is present or the
        input is neither text nor a number (e.g. ``None``).
    """
    # Numeric cells (the column may already have been parsed by pandas) are
    # returned directly; bool is excluded because it is an int subclass.
    if isinstance(currency_string, (int, float, np.integer, np.floating)) \
            and not isinstance(currency_string, bool):
        return float(currency_string)
    if not isinstance(currency_string, str):
        return np.nan

    # Either "1,234.56"-style numbers or a bare fractional part such as ".5".
    amount_match = re.search(r'\d[\d,]*(?:\.\d+)?|\.\d+', currency_string)
    if amount_match is None:
        return np.nan
    return float(amount_match.group(0).replace(',', ''))

def extract_postcode(address):
    """Return the first UK-postcode-like token in ``address``, upper-cased.

    The pattern accepts an outward code (one or two letters, one or two
    digits, optionally one or two more letters) optionally followed by a space
    and an inward part (a digit plus one or two letters). Either a full
    postcode ("SW1A 2AA") or just the outward part ("NW1") can be returned.

    Parameters
    ----------
    address : str
        Free-text address. Non-string values (``None``, NaN) yield ``None``
        instead of raising inside ``re.search``.

    Returns
    -------
    str or None
        The matched text in upper case, or ``None`` when nothing matches.
    """
    if not isinstance(address, str):
        return None

    postcode_regex = r'\b[A-Za-z]{1,2}\d{1,2}(?:[A-Za-z]{1,2})?(?: \d[A-Za-z]{1,2})?\b'
    postcode_match = re.search(postcode_regex, address)
    return postcode_match.group(0).upper() if postcode_match else None
    
def extract_number(number_string):
    """Extract an integer count from a cell such as "3 bedrooms" or ``2.0``.

    Parameters
    ----------
    number_string : object
        Text containing digits, or an actual number.

    Returns
    -------
    int or float
        For numeric input, the value truncated to ``int`` (``np.nan`` if it is
        NaN or infinite). For anything else, all digit characters of
        ``str(number_string)`` joined and converted to ``int``; ``np.nan``
        when there are no digits.
    """
    # Numbers must not go through str(): "2.0" would contribute the digits
    # "2" and "0" and come back as 20. bool is excluded (int subclass).
    if isinstance(number_string, (int, float, np.integer, np.floating)) \
            and not isinstance(number_string, bool):
        if not np.isfinite(number_string):
            return np.nan
        return int(number_string)

    digits = "".join(ch for ch in str(number_string) if ch.isdigit())
    try:
        return int(digits)
    except ValueError:
        # Empty string (no digits) or digit-like characters int() rejects.
        return np.nan

def extract_deposit(string):
    """Return the first "£"-prefixed amount in ``string`` as text.

    Parameters
    ----------
    string : str
        Free text that may contain an amount such as "£1,200.00".
        Non-string values (``None``, NaN) yield ``np.nan`` instead of raising
        inside ``re.search``.

    Returns
    -------
    str or float
        The amount without the pound sign, thousands separators kept
        (e.g. "1,200.00"), or ``np.nan`` when no amount is found.
    """
    if not isinstance(string, str):
        return np.nan

    currency_regex = r'£(\d+(,\d{3})*(\.\d{2})?)'
    currency_match = re.search(currency_regex, string)
    if currency_match is None:
        return np.nan
    return currency_match.group(1)
    
def _mention_flag(string, string_regex):
    """Return 'Yes' if ``string_regex`` occurs in ``string`` (case-insensitive), else 'Ask Agents'.

    Non-string values (``None``, NaN) are treated as "not mentioned".
    """
    if not isinstance(string, str):
        return 'Ask Agents'
    if re.search(string_regex, string, re.IGNORECASE):
        return 'Yes'
    return 'Ask Agents'


def check_pet_mention(string):
    """Flag whether the whole word "pet" or "pets" appears in ``string``.

    Returns 'Yes' on a match, otherwise 'Ask Agents'.
    NOTE(review): only the presence of the word is tested, so negated text
    such as "No pets" also returns 'Yes' - confirm this is intended.
    """
    return _mention_flag(string, r'\bpet(s)?\b')


def check_smoker_mention(string):
    """Flag whether the whole word "smoker" or "smokers" appears in ``string``.

    Returns 'Yes' on a match, otherwise 'Ask Agents'.
    NOTE(review): "Non-smokers" also matches because "-" is a word boundary -
    confirm this is intended.
    """
    return _mention_flag(string, r'\bsmoker(s)?\b')


def check_parking_mention(string):
    """Flag whether the whole word "parking" or "parkings" appears in ``string``.

    Returns 'Yes' on a match, otherwise 'Ask Agents'.
    """
    return _mention_flag(string, r'\bparking(s)?\b')


def check_garden_mention(string):
    """Flag whether the whole word "garden" or "gardens" appears in ``string``.

    Returns 'Yes' on a match, otherwise 'Ask Agents'.
    """
    return _mention_flag(string, r'\bgarden(s)?\b')

def get_lat_long_from_address(address):
    """Geocode ``address`` with the module-level ``geolocator``.

    ", United Kingdom" is appended when the text does not already contain
    "United Kingdom".

    Parameters
    ----------
    address : str
        Free-text address. Non-string values (``None``, NaN) yield ``None``.

    Returns
    -------
    tuple or None
        ``(latitude, longitude)`` of the geocoder's result, or ``None`` when
        nothing was found or the lookup failed.

    Notes
    -----
    The lookup stays best-effort, but only ``Exception`` subclasses are
    swallowed: a bare ``except`` also caught ``KeyboardInterrupt``, so a long
    ``DataFrame.apply`` over this function could not be stopped.
    NOTE(review): calls are not throttled here; check the geocoding service's
    usage limits before running this over a large frame.
    """
    if not isinstance(address, str):
        return None
    if 'United Kingdom' not in address:
        address += ', United Kingdom'

    try:
        location = geolocator.geocode(address)
        if location:
            return location.latitude, location.longitude
    except Exception:
        # Timeouts, service errors, malformed responses: treat as "not found".
        return None
    return None
    
def get_lat_long_from_postcode(postcode):
    """Look up ``postcode`` in the module-level ``ukpostcodes`` table.

    The comparison is an exact match of ``postcode.upper()`` against the
    table's 'postcode' column; the first matching row is used.

    Parameters
    ----------
    postcode : str
        Postcode text. Non-string values (``None``, NaN) yield ``None``.

    Returns
    -------
    tuple or None
        ``(latitude, longitude)`` from the first matching row, or ``None``
        when the postcode is missing or not in the table.

    Notes
    -----
    The expected "no result" cases are handled explicitly instead of through a
    bare ``except``, so genuine problems (for example a missing column in
    ``ukpostcodes``) surface as errors rather than as ``None`` for every row.
    """
    if not isinstance(postcode, str):
        return None

    matches = ukpostcodes[ukpostcodes['postcode'] == postcode.upper()]
    if matches.empty:
        return None
    return matches['latitude'].iloc[0], matches['longitude'].iloc[0]

In [4]:
# Normalise the raw listing columns in place: every statement below overwrites
# or adds one column on df, so the order of the statements matters.
df['Price'] = df['Price'].apply(extract_amount)
df['Postcode'] = df['Address'].apply(extract_postcode)
# Deposit is parsed in three passes: pull the "£..." token out of the text,
# cast to str (a non-match NaN becomes the text "nan"), then convert the
# token to a float with extract_amount ("nan" has no digits, so it maps to NaN).
df['Deposit'] = df['Deposit'].apply(extract_deposit)
df['Deposit'] = df['Deposit'].astype(str)
df['Deposit'] = df['Deposit'].apply(extract_amount)
df['Bedrooms'] = df['Bedrooms'].apply(extract_number)
df['Bathrooms'] = df['Bathrooms'].apply(extract_number)
# Keyword flags: each column becomes 'Yes' when its keyword occurs in the
# column's own text, otherwise 'Ask Agents'.
df['PetFriendly'] = df['PetFriendly'].apply(check_pet_mention)
df['Smokers'] = df['Smokers'].apply(check_smoker_mention)
df['Gardens'] = df['Gardens'].apply(check_garden_mention)
df['Parking'] = df['Parking'].apply(check_parking_mention)
# Two coordinate sources per row, each holding a (lat, lon) tuple or None:
# geocoding the full address (one geolocator.geocode call per row) and a
# lookup of the extracted postcode in the ukpostcodes table.
df['Geolocation_Address'] = df['Address'].apply(get_lat_long_from_address)
df['Geolocation_Postcode'] = df['Postcode'].apply(get_lat_long_from_postcode)

In [13]:
# Prefer the address-based coordinates and fall back to the postcode lookup
# where geocoding returned nothing. The result is assigned back explicitly:
# df['Geolocation'].fillna(..., inplace=True) is chained in-place assignment,
# which is deprecated and does not update df under pandas Copy-on-Write.
df['Geolocation'] = df['Geolocation_Address'].fillna(df['Geolocation_Postcode'])

# Keep only listings that have both a price and coordinates.
df_filtered = df.dropna(subset=['Price', 'Geolocation'], how='any').reset_index(drop=True)

In [19]:
def _split_geolocation(value):
    """Return ``(latitude, longitude)`` as floats from one Geolocation cell.

    Accepts the ``(lat, lon)`` tuple produced by the lookup helpers as well as
    the text form "(lat, lon)" that the previous string-slicing code expected.
    """
    if isinstance(value, str):
        lat_text, lon_text = value.strip().strip('()').split(',')
        return float(lat_text), float(lon_text)
    lat, lon = value
    return float(lat), float(lon)


# .copy() makes df_final an independent frame, so adding columns below does
# not act on a slice of df_filtered (SettingWithCopyWarning).
df_final = df_filtered[[
    'PropertyId', 'Url', 'Address', 'Price', 'Description', 'PropertyType',
    'Bedrooms', 'Bathrooms', 'FurnishedType', 'ListingDate',
    'PetFriendly', 'Smokers', 'Gardens', 'Parking',
    'LetAvailableDate', 'Deposit', 'Geolocation',
]].copy()

# Geolocation holds tuples when the notebook is run top to bottom; the former
# .str.split(", ") parsing only works on text and yields NaN for tuples, which
# left both coordinate columns empty.
_coords = [_split_geolocation(value) for value in df_final['Geolocation']]
df_final['latitude'] = [lat for lat, _ in _coords]
df_final['longitude'] = [lon for _, lon in _coords]

df_final.to_csv('../../final_data.csv', index=False)