### Import Packages

In [46]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sqlean as sqlite3
from functools import partial
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

### Import Data

In [2]:
DATABASE_PATH = 'C:/University/6G7V0007_MSC_Project/Project/Data/joblistings_transformed.db'
con = sqlite3.connect(DATABASE_PATH)

In [3]:
job = pd.read_sql('SELECT * FROM job', con)
company = pd.read_sql('SELECT * FROM company', con)
website = pd.read_sql('SELECT * FROM website', con)

  job = pd.read_sql('SELECT * FROM job', con)
  company = pd.read_sql('SELECT * FROM company', con)
  website = pd.read_sql('SELECT * FROM website', con)


In [4]:
job.set_index('id', inplace=True)
company.set_index('id', inplace=True)
website.set_index('id', inplace=True)

In [5]:
test = job.merge(company, left_on='company_id', right_index=True)

In [6]:
test.rename(columns={'name':'company_name'}, inplace=True)

In [7]:
test.head()

Unnamed: 0_level_0,website_id,company_id,title,location,pay,description,timestamp,company_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,1,Graduate Electronics/Computer Science Software...,Cirencester (Gloucestershire),"£30,000",About the role A unique opportunity to join a ...,2024-06-28,AccuLink Technology
145,1,1,Project Engineer,Cirencester (Gloucestershire),Competitive,AccuLink Technology specialises in the design ...,2024-06-28,AccuLink Technology
2,1,2,Graduate Software Development Engineer,London,Competitive,nFocus Graduate Academy Accelerate your career...,2024-06-28,nfocus
3,1,3,Graduate Technology Consultant,Manchester (Hybrid),Competitive,About Arlanis Reply: Arlanis Reply is the Repl...,2024-06-28,Reply
20,1,3,Graduate Software Developer,London and Manchester (Hybrid),Competitive,About Arlanis Reply: Arlanis Reply is the Repl...,2024-06-28,Reply


### Clean location

In [8]:
# Remove characters in brackets
test['location'].replace(r'\(.*\)', '', regex=True, inplace=True)

In [9]:
# Remove postcodes
test['location'].replace(r'[A-Z]{2}[0-9]{1,2}[A-Z]?', '', regex=True, inplace=True)

In [10]:
# Strip whitespace
test['location'] = test['location'].str.strip()

In [11]:
# Map multiple locations to separate category
test['location'].replace(r'.*\sand\s.*', 'Multiple Locations', regex=True, inplace=True)
test['location'].replace(r'.*\s[&+]\s.*', 'Multiple Locations', regex=True, inplace=True)
test['location'].replace('Multiple UK Locations', 'Multiple Locations', inplace=True)
test['location'].replace('Multiple Worldwide Locations', 'Multiple Locations', inplace=True)

In [12]:
test['location'].unique()

array(['Cirencester', 'London', 'Manchester', 'Multiple Locations',
       'Bristol', 'Cambridge', 'Geneva', 'Leeds', 'Warsaw', 'Brixworth',
       'Glasgow', 'Barrow-in-Furness', 'Warton', 'Broad Oak', 'Brough',
       'Scotstoun', 'Frimley', 'Filton', 'Weymouth', 'Barrow',
       'Prestwick', 'Barrow-In-Furness', 'Chippenham', 'Pontyclun',
       'Leatherhead', 'Redditch', 'Trafford Park', 'Stafford',
       'Cheltenham', 'Didsbury', 'Solihull', 'Brighton', 'Remote',
       'Malvern Hills', 'Chiswick', 'Chiswick Park', 'Luton',
       'Crewe Toll, Edinburgh', 'Basildon', 'Southampton', 'Edinburgh',
       'Leicester', 'Oxford', 'Tokyo', 'Bridgend', 'Burton',
       'London, Chesterfield, Olney', 'Silverstone', 'Derby', 'Shipley',
       'Poole', 'Newport', 'Newcastle', 'Norwich', 'West Midlands',
       'Hemel Hempstead', 'Bath', 'Reading', 'Crownhill', 'Dublin',
       'Eastbourne', 'Heathrow', 'Stockley Park, Uxbridge',
       'Wolverhampton', 'Fareham', 'East Kilbride', 'Thame', '

In [13]:
test['location'].value_counts(sort=True).head(25)

location
London                 289
Remote                  86
Multiple Locations      71
Cambridge               38
Leeds                   37
Geneva                  36
Manchester              25
Bristol                 17
Edinburgh               15
Bridgend                14
Budapest                12
Oxford                  11
Birmingham              10
Belfast                  8
Douglas                  8
Derby                    8
Newcastle upon Tyne      8
Barrow-in-Furness        8
Southampton              7
Reading                  7
Glasgow                  7
Liverpool                6
Bollington               6
Aberdeen                 5
Luton                    5
Name: count, dtype: int64

In [48]:
geolocator = Nominatim(user_agent='msc_project')
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

In [49]:
test['full_loc'] = test['location'].apply(lambda x: partial(geocode, addressdetails=True) if x not in ('Remote') else None)

In [50]:
test['point'] = test['full_loc'].apply(lambda x: tuple(x.point) if x else None)

In [51]:
test['raw_loc'] = test['full_loc'].apply(lambda x: x.raw if x else None)

In [52]:
test[['location', 'full_loc', 'point', 'raw_loc']].tail(30)

Unnamed: 0_level_0,location,full_loc,point,raw_loc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1867,London,"(London, Greater London, England, United Kingd...","(51.4893335, -0.14405508452768728, 0.0)","{'place_id': 242052818, 'licence': 'Data © Ope..."
1872,London,"(London, Greater London, England, United Kingd...","(51.4893335, -0.14405508452768728, 0.0)","{'place_id': 242052818, 'licence': 'Data © Ope..."
1873,Remote,"(Remote, Coos County, Oregon, United States, (...","(43.0059455, -123.8925908, 0.0)","{'place_id': 280722614, 'licence': 'Data © Ope..."
1874,Nottingham,"(City of Nottingham, Nottinghamshire, England,...","(52.9534193, -1.1496461, 0.0)","{'place_id': 240532913, 'licence': 'Data © Ope..."
1876,United Kingdom,"(United Kingdom, (54.7023545, -3.2765753))","(54.7023545, -3.2765753, 0.0)","{'place_id': 237575883, 'licence': 'Data © Ope..."
1881,Welwyn Garden City,"(Welwyn Garden City, Welwyn Hatfield, Hertford...","(51.8031083, -0.2068872, 0.0)","{'place_id': 241502290, 'licence': 'Data © Ope..."
1882,"The Minstry, 79-81 Borough Road, London",,,
1884,Harrow,"(London Borough of Harrow, London, Greater Lon...","(51.596827149999996, -0.3373046180437286, 0.0)","{'place_id': 242111003, 'licence': 'Data © Ope..."
1885,Croydon,"(London Borough of Croydon, London, Greater Lo...","(51.3550556, -0.0643103753173489, 0.0)","{'place_id': 242419284, 'licence': 'Data © Ope..."
1889,Newcastle upon Tyne,"(Newcastle upon Tyne, North of Tyne, England, ...","(54.9738474, -1.6131572, 0.0)","{'place_id': 237126205, 'licence': 'Data © Ope..."


In [66]:
test['raw_loc'].loc[1920]

{'place_id': 85530579,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright',
 'osm_type': 'way',
 'osm_id': 107900524,
 'lat': '51.9165361',
 'lon': '0.9263447',
 'class': 'highway',
 'type': 'unclassified',
 'place_rank': 26,
 'importance': 0.10000999999999993,
 'addresstype': 'road',
 'name': 'Phoenix Square',
 'display_name': 'Phoenix Square, Colchester Business Park, Highwoods, Colchester, Essex, England, CO4 9HR, United Kingdom',
 'address': {'road': 'Phoenix Square',
  'industrial': 'Colchester Business Park',
  'suburb': 'Highwoods',
  'city': 'Colchester',
  'municipality': 'Essex',
  'ISO3166-2-lvl6': 'GB-ESS',
  'county': 'Essex',
  'state': 'England',
  'ISO3166-2-lvl4': 'GB-ENG',
  'postcode': 'CO4 9HR',
  'country': 'United Kingdom',
  'country_code': 'gb'},
 'boundingbox': ['51.9155857', '51.9174571', '0.9254837', '0.9270890']}

In [69]:
def get_settlement(raw_loc):
    try:
        address = raw_loc['address']
    except:
        return None
    try:
        return address['town']
    except:
        try:
            return address['village']
        except:
            try:
                return address['city']
            except:
                return None

In [85]:
def get_state(raw_loc):
    try:
        address = raw_loc['address']
        return address['state']
    except:
        return None

In [88]:
def get_country(raw_loc):
    try:
        address = raw_loc['address']
        return address['country_code']
    except:
        return None

In [89]:
test['settlement'] = test['raw_loc'].apply(get_settlement)
test['state'] = test['raw_loc'].apply(get_state)
test['country'] = test['raw_loc'].apply(get_country)

In [75]:
test[['location', 'settlement']].tail(30)

Unnamed: 0_level_0,location,settlement
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1867,London,London
1872,London,London
1873,Remote,
1874,Nottingham,City of Nottingham
1876,United Kingdom,
1881,Welwyn Garden City,Welwyn Garden City
1882,"The Minstry, 79-81 Borough Road, London",
1884,Harrow,London
1885,Croydon,London
1889,Newcastle upon Tyne,Newcastle upon Tyne


In [76]:
test['settlement'].unique()

array(['Cirencester', 'London', 'Manchester', None, 'Bristol',
       'Cambridge', 'Genève', 'Leeds', 'Brixworth', 'Glasgow', 'Stank',
       'Lancaster', 'Broad Oak', 'Brough', 'Frimley', 'Filton',
       'Weymouth', 'Prestwick', 'Chippenham', 'Pontyclun', 'Leatherhead',
       'Redditch', 'Trafford', 'Cheltenham', 'Solihull', 'Brighton',
       'Malvern Hills', 'Chiswick', 'Luton', 'City of Edinburgh',
       'Basildon', 'Southampton', 'Leicester', 'Oxford', 'Bridgend',
       'Burton-on-Trent', 'Silverstone', 'Shipley', 'Poole', 'Newport',
       'Newcastle upon Tyne', 'Norwich', 'Hemel Hempstead', 'Bath',
       'Reading', 'Milton Keynes', 'Dublin', 'Eastbourne',
       'Wolverhampton', 'Fareham', 'East Kilbride', 'Thame', 'Birmingham',
       'Gravesend', 'Aylesbury', 'Sandhurst', 'Liverpool',
       'Stratford-upon-Avon', 'Plymouth', 'Rosyth', 'Stowmarket',
       'Pencoed', 'Cardiff', 'Telford', 'Worthing', 'Exeter', 'Stroud',
       'Belfast', 'Bollington', 'Andover', 'Maidston

In [80]:
test['settlement'].value_counts(sort=True).head(20)

settlement
London                 335
Cambridge               38
Leeds                   38
Manchester              36
Genève                  36
Bristol                 17
City of Edinburgh       16
Bridgend                14
Oxford                  12
Birmingham              11
Glasgow                 11
Newcastle upon Tyne     10
Stank                   10
Belfast                  8
Reading                  8
Liverpool                7
Southampton              7
Bollington               6
Luton                    5
Stoke-on-Trent           5
Name: count, dtype: int64

In [84]:
test.loc[test['settlement'] == 'Stank', 'raw_loc'].iloc[0]

{'place_id': 239373006,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright',
 'osm_type': 'relation',
 'osm_id': 176965,
 'lat': '54.128879600000005',
 'lon': '-3.2269008205428933',
 'class': 'boundary',
 'type': 'historic:administrative',
 'place_rank': 25,
 'importance': 0.5067591574273398,
 'addresstype': 'historic:administrative',
 'name': 'Barrow-in-Furness',
 'display_name': 'Barrow-in-Furness, Stank, Westmorland and Furness, England, United Kingdom',
 'address': {'historic:administrative': 'Barrow-in-Furness',
  'village': 'Stank',
  'county': 'Westmorland and Furness',
  'ISO3166-2-lvl6': 'GB-WES',
  'state': 'England',
  'ISO3166-2-lvl4': 'GB-ENG',
  'country': 'United Kingdom',
  'country_code': 'gb'},
 'boundingbox': ['54.0396284', '54.2184678', '-3.3248213', '-3.1242815']}

In [87]:
test['state'].unique()

array(['England', None, 'Genève', 'województwo mazowieckie',
       'Alba / Scotland', 'Georgia', 'Cymru / Wales', 'Kansas', 'Oregon',
       'Rhode Island', '香港 Hong Kong',
       'Northern Ireland / Tuaisceart Éireann', 'Minnesota',
       'Mecklenburg-Vorpommern', 'Nova Scotia', 'Illinois',
       'Massachusetts', 'Florida', 'Nouvelle-Aquitaine', 'Maine',
       'Alabama', 'Shqipëria Qendrore', 'Nebraska', 'North Carolina',
       'Virginia', 'Grand Est'], dtype=object)

In [90]:
test['country'].unique()

array(['gb', None, 'ch', 'pl', 'us', 'jp', 'ie', 'cn', 'de', 'ca', 'fr',
       'hu', 'al'], dtype=object)

In [100]:
test.loc[test['country'] == 'al']

Unnamed: 0_level_0,website_id,company_id,title,location,pay,description,timestamp,company_name,full_loc,point,raw_loc,settlement,state,country
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
660,2,308,Data Engineer (Azure),North London,"£60,000 - £75,000 a year",Data Engineer Wanted: Join the Gaming Revoluti...,2024-06-28,Metrica Recruitment,"(North london, Rruga Muhamet Gjollesha, 21 Dhj...","(41.332052, 19.8048247, 0.0)","{'place_id': 81057678, 'licence': 'Data © Open...",Tiranë,Shqipëria Qendrore,al
