### Import Packages

In [37]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sqlean as sqlite3
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

### Import Data

In [2]:
DATABASE_PATH = 'C:/University/6G7V0007_MSC_Project/Project/Data/joblistings_transformed.db'
con = sqlite3.connect(DATABASE_PATH)

In [3]:
job = pd.read_sql('SELECT * FROM job', con)
company = pd.read_sql('SELECT * FROM company', con)
website = pd.read_sql('SELECT * FROM website', con)

  job = pd.read_sql('SELECT * FROM job', con)
  company = pd.read_sql('SELECT * FROM company', con)
  website = pd.read_sql('SELECT * FROM website', con)


In [4]:
job.set_index('id', inplace=True)
company.set_index('id', inplace=True)
website.set_index('id', inplace=True)

In [5]:
test = job.merge(company, left_on='company_id', right_index=True)

In [6]:
test.rename(columns={'name':'company_name'}, inplace=True)

In [7]:
test.head()

Unnamed: 0_level_0,website_id,company_id,title,location,pay,description,timestamp,company_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,1,Graduate Electronics/Computer Science Software...,Cirencester (Gloucestershire),"£30,000",About the role A unique opportunity to join a ...,2024-06-28,AccuLink Technology
145,1,1,Project Engineer,Cirencester (Gloucestershire),Competitive,AccuLink Technology specialises in the design ...,2024-06-28,AccuLink Technology
2,1,2,Graduate Software Development Engineer,London,Competitive,nFocus Graduate Academy Accelerate your career...,2024-06-28,nfocus
3,1,3,Graduate Technology Consultant,Manchester (Hybrid),Competitive,About Arlanis Reply: Arlanis Reply is the Repl...,2024-06-28,Reply
20,1,3,Graduate Software Developer,London and Manchester (Hybrid),Competitive,About Arlanis Reply: Arlanis Reply is the Repl...,2024-06-28,Reply


### Clean location

In [10]:
# Remove characters in brackets
test['location'].replace(r'\(.*\)', '', regex=True, inplace=True)

In [49]:
# Remove postcodes
test['location'].replace(r'[A-Z]{2}[0-9]{1,2}[A-Z]?', '', regex=True, inplace=True)

id
1867                                     London 
1872                                      London
1873                                      Remote
1874                                 Nottingham 
1876                              United Kingdom
1881                          Welwyn Garden City
1882    The Minstry, 79-81 Borough Road, London 
1884                                      Harrow
1885                                    Croydon 
1889                        Newcastle upon Tyne 
1892                                      London
1893                                  Birmingham
1894                                      London
1897                                     Bristol
1900                                     Preston
1901                                      Remote
1909                                   Doncaster
1912                                     Cardiff
1914                   15 Cotswold Road, Sutton 
1916                                     Morley 
1917             

In [12]:
# Strip whitespace
test['location'] = test['location'].str.strip()

In [33]:
# Map multiple locations to separate category
test['location'].replace(r'.*\sand\s.*', 'Multiple Locations', regex=True, inplace=True)
test['location'].replace(r'.*\s[&+]\s.*', 'Multiple Locations', regex=True, inplace=True)
test['location'].replace('Multiple UK Locations', 'Multiple Locations', inplace=True)
test['location'].replace('Multiple Worldwide Locations', 'Multiple Locations', inplace=True)

In [34]:
test['location'].unique()

array(['Cirencester', 'London', 'Manchester', 'Multiple Locations',
       'Bristol', 'Cambridge', 'Geneva', 'Leeds', 'Warsaw', 'Brixworth',
       'Glasgow', 'Barrow-in-Furness', 'Warton', 'Broad Oak', 'Brough',
       'Scotstoun', 'Frimley', 'Filton', 'Weymouth', 'Barrow',
       'Prestwick', 'Barrow-In-Furness', 'Chippenham', 'Pontyclun',
       'Leatherhead', 'Redditch', 'Trafford Park', 'Stafford',
       'Cheltenham', 'Didsbury', 'Solihull', 'Brighton', 'Remote',
       'Malvern Hills', 'Chiswick', 'Chiswick Park', 'Luton',
       'Crewe Toll, Edinburgh', 'Basildon', 'Southampton', 'Edinburgh',
       'Leicester', 'Oxford', 'Tokyo', 'Bridgend', 'Burton',
       'London, Chesterfield, Olney', 'Silverstone', 'Derby', 'Shipley',
       'Poole', 'Newport', 'Newcastle', 'Norwich', 'West Midlands',
       'Hemel Hempstead', 'Bath', 'Reading', 'Crownhill', 'Dublin',
       'Eastbourne', 'Heathrow', 'Stockley Park, Uxbridge',
       'Wolverhampton', 'Fareham', 'East Kilbride', 'Thame', '

In [35]:
test['location'].value_counts(sort=True).head(25)

location
London                 232
Remote                  86
Multiple Locations      71
Geneva                  36
Leeds                   32
Cambridge               31
Manchester              25
London EC4N             20
Bristol                 15
Bridgend                14
Budapest                12
Edinburgh               12
Birmingham              10
Barrow-in-Furness        8
Oxford                   8
Douglas                  8
Reading                  7
Southampton              7
Glasgow                  7
Bollington               6
Liverpool                6
London W1T               5
Newcastle upon Tyne      5
Stafford                 5
Manchester M4            5
Name: count, dtype: int64

In [38]:
geolocator = Nominatim(user_agent='msc_project')
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

In [None]:
def get_full_location(location, geocode):
    """Returns the full address, latitude and longitude for the given location"""
    if location in ('Remote', 'Multiple Locations'):
        return None, None, None
    else:
        full_loc = geocode(location)
        lat = full_loc.latitude
        long = full_loc.longitude

In [39]:
test['full_loc'] = test['location'].apply(geocode)

In [45]:
test[['location', 'full_loc']].tail(30)

Unnamed: 0_level_0,location,full_loc
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1867,London EC3N,"(Royal Mail, Crosswall, Aldgate, City of Londo..."
1872,London,"(London, Greater London, England, United Kingd..."
1873,Remote,"(Remote, Coos County, Oregon, United States, (..."
1874,Nottingham NG8,"(NG8 285D, Newport Drive, Old Basford, Bulwell..."
1876,United Kingdom,"(United Kingdom, (54.7023545, -3.2765753))"
1881,Welwyn Garden City,"(Welwyn Garden City, Welwyn Hatfield, Hertford..."
1882,"The Minstry, 79-81 Borough Road, London SE1",
1884,Harrow,"(London Borough of Harrow, London, Greater Lon..."
1885,Croydon CR0,"(Art & Craft CR0, 46, Surrey Street, Broad Gre..."
1889,Newcastle upon Tyne NE1,"(Royal Mail, Northumberland Street, Haymarket,..."
