### Import Packages

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sqlean as sqlite3
from functools import partial
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

### Import Data

In [2]:
DATABASE_PATH = 'C:/University/6G7V0007_MSC_Project/Project/Data/joblistings_transformed.db'
con = sqlite3.connect(DATABASE_PATH)

In [3]:
job = pd.read_sql('SELECT * FROM job', con)
company = pd.read_sql('SELECT * FROM company', con)
website = pd.read_sql('SELECT * FROM website', con)

  job = pd.read_sql('SELECT * FROM job', con)
  company = pd.read_sql('SELECT * FROM company', con)
  website = pd.read_sql('SELECT * FROM website', con)


In [4]:
job.set_index('id', inplace=True)
company.set_index('id', inplace=True)
website.set_index('id', inplace=True)

In [5]:
test = job.merge(company, left_on='company_id', right_index=True)

In [6]:
test.rename(columns={'name':'company_name'}, inplace=True)

In [8]:
test.tail(10)

Unnamed: 0_level_0,website_id,company_id,title,location,pay,description,timestamp,company_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1917,2,671,Product Manager,Bracknell,"£41,000 - £59,000 a year",Product Manager – Bracknell – Hybrid – Up to £...,2024-07-05,Step Ahead Recruitment Ltd
1920,2,672,Software Architect,"4 Phoenix Square, Colchester CO4","£52,000 - £65,000 a year","Nordson Test & Inspection, a leading provider ...",2024-07-05,Nordson
1921,2,673,Graduate Technology Consultant - PhD Level,Guildford,"£34,000 a year",This hugely successful and rapidly expanding c...,2024-07-05,Graduate Recruitment Bureau
1922,2,674,BI Analyst,Alderley Edge,,CurrentBody is seeking a BI Analyst to join ou...,2024-07-05,CurrentBody
1923,2,675,Full Stack Developer,Remote,"£45,397.82 - £55,353.46 a year",Please visit our website before applying https...,2024-07-05,PureCode Software
1926,2,676,Network Engineer,4it Recruitment Ltd in Manchester,"Up to £55,000 a year",Network Engineer – Palo Alto/Cisco - £55k – Ma...,2024-07-05,4it Recruitment Ltd
1927,2,677,Senior Software Developer,prosperIS Recruitment Ltd in Monmouth,"£40,000 - £50,000 a year",*Are you a experienced Software Developer who'...,2024-07-05,prosperIS Recruitment Ltd
1928,2,678,Junior Systems Engineer (UK-based),Didcot,,"Work location: United Kingdom, England, Didcot...",2024-07-05,D-Orbit
1929,2,679,Engineer (Test & Validation),Telford TF7,"£26,000 - £36,000 a year",ENGINEERING TECHNICIAN: TEST & VALIDATION JOB ...,2024-07-05,VA Technology Ltd.
1933,2,680,IT Resource Planning & Capacity Support Analyst,Chiswick,,Great that you're thinking about a career with...,2024-07-05,BSI


In [14]:
test.loc[test['location'].str.contains('Morley')]

Unnamed: 0_level_0,website_id,company_id,title,location,pay,description,timestamp,company_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1916,2,670,Manual Test Lead,Morley LS27,"From £57,500 a year",At Evri we understand that searching for your ...,2024-07-05,Evri


### Clean location

In [15]:
# Remove mentions of hybrid in location
test['location'].replace(r'\(Hybrid\)', '', regex=True, inplace=True)

In [20]:
# Remove first line of address
test['location'].replace(r'[0-9]*[a-zA-Z\s]*\,', '', regex=True, inplace=True)

In [22]:
# Strip whitespace
test['location'] = test['location'].str.strip()

In [18]:
# Map multiple locations to separate category
test['location'].replace(r'.*\sand\s.*', 'Multiple Locations', regex=True, inplace=True)
test['location'].replace(r'.*\s[&+]\s.*', 'Multiple Locations', regex=True, inplace=True)
test['location'].replace('Multiple UK Locations', 'Multiple Locations', inplace=True)
test['location'].replace('Multiple Worldwide Locations', 'Multiple Locations', inplace=True)

In [23]:
test['location'].unique()

array(['Cirencester (Gloucestershire)', 'London', 'Manchester',
       'Multiple Locations', 'Bristol', 'Cambridge',
       'Geneva (Switzerland)', 'Leeds', 'Warsaw (Poland)',
       'Brixworth (Northamptonshire)', 'Glasgow',
       'Broad Oak (Dorchester)', 'Barrow-in-Furness (Cumbria)',
       'Filton (Bristol)', 'Broad Oak (Portsmouth)', 'Frimley (Surrey)',
       'Weymouth (Dorset)', 'Barrow (Cumbria)', 'Brough (Hull)',
       'Prestwick (South Ayrshire)', 'Barrow-In-Furness', 'Brough',
       'Chippenham (Wiltshire)', 'Pontyclun (South Wales)',
       'Leatherhead (Surrey)', 'Pontyclun (Wales)',
       'Redditch (Worcestershire)', 'Trafford Park (Manchester)',
       'Stafford (West Midlands)', 'Cheltenham (Gloucestershire)',
       'Didsbury (Manchester)', 'Solihull', 'Brighton', 'Remote',
       'Malvern Hills (Worcestershire)', 'Chiswick (London)',
       'Chiswick Park', 'Luton (Bedfordshire)', 'Edinburgh',
       'Basildon (Essex)', 'Southampton', 'Leicester', 'Oxford',
     

In [24]:
test['location'].value_counts(sort=True).head(25)

location
London                         245
Remote                          86
Multiple Locations              84
Geneva (Switzerland)            36
Leeds                           32
Cambridge                       31
Manchester                      25
London EC4N                     20
Bristol                         15
Bridgend                        14
Edinburgh                       13
Budapest                        12
Birmingham                      10
Oxford                           9
Douglas                          8
Southampton                      7
Glasgow                          7
Reading                          7
Bollington (Cheshire)            6
Barrow-in-Furness (Cumbria)      6
London SE1                       6
Liverpool                        6
Swindon                          5
Newcastle upon Tyne              5
Stafford (West Midlands)         5
Name: count, dtype: int64

In [25]:
geolocator = Nominatim(user_agent='msc_project')
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

In [29]:
test['full_loc'] = test['location'].apply(partial(geocode, language='en', addressdetails=True))

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Brighton BN1',), **{'language': 'en', 'addressdetails': True}).
Traceback (most recent call last):
  File "c:\Users\walte\AppData\Local\Programs\Python\Python312\Lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "c:\Users\walte\AppData\Local\Programs\Python\Python312\Lib\site-packages\urllib3\connection.py", line 461, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\walte\AppData\Local\Programs\Python\Python312\Lib\http\client.py", line 1411, in getresponse
    response.begin()
  File "c:\Users\walte\AppData\Local\Programs\Python\Python312\Lib\http\client.py", line 324, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\walte\AppData\Local\Programs\Python\Python312\Lib\http\clie

In [63]:
test['lat'] = test['full_loc'].apply(lambda x: x.latitude if x else None)
test['long'] = test['full_loc'].apply(lambda x: x.longitude if x else None)

In [31]:
test['raw_loc'] = test['full_loc'].apply(lambda x: x.raw if x else None)

In [32]:
test[['location', 'full_loc', 'point', 'raw_loc']].tail(30)

Unnamed: 0_level_0,location,full_loc,point,raw_loc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1867,London EC3N,"(Royal Mail, Crosswall, Aldgate, City of Londo...","(51.5115049, -0.0772257, 0.0)","{'place_id': 243623237, 'licence': 'Data © Ope..."
1872,London,"(London, Greater London, England, United Kingd...","(51.5074456, -0.1277653, 0.0)","{'place_id': 243408926, 'licence': 'Data © Ope..."
1873,Remote,"(Remote, Coos County, Oregon, United States, (...","(43.0059455, -123.8925908, 0.0)","{'place_id': 281400128, 'licence': 'Data © Ope..."
1874,Nottingham NG8,"(NG8 285D, Newport Drive, Old Basford, Bulwell...","(52.9756471, -1.1823866, 0.0)","{'place_id': 271634807, 'licence': 'Data © Ope..."
1876,United Kingdom,"(United Kingdom, (54.7023545, -3.2765753))","(54.7023545, -3.2765753, 0.0)","{'place_id': 268752651, 'licence': 'Data © Ope..."
1881,Welwyn Garden City,"(Welwyn Garden City, Welwyn Hatfield, Hertford...","(51.8031083, -0.2068872, 0.0)","{'place_id': 272671780, 'licence': 'Data © Ope..."
1882,79- London SE1,,,
1884,Harrow,"(London Borough of Harrow, London, Greater Lon...","(51.596827149999996, -0.3373046180437286, 0.0)","{'place_id': 274023033, 'licence': 'Data © Ope..."
1885,Croydon CR0,"(Art & Craft CR0, 46, Surrey Street, Broad Gre...","(51.3725551, -0.1009094, 0.0)","{'place_id': 390117006, 'licence': 'Data © Ope..."
1889,Newcastle upon Tyne NE1,"(Royal Mail, Northumberland Street, Haymarket,...","(54.976966, -1.6131654, 0.0)","{'place_id': 268182257, 'licence': 'Data © Ope..."


In [33]:
test['raw_loc'].loc[1920]

{'place_id': 390617660,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright',
 'osm_type': 'node',
 'osm_id': 11934478504,
 'lat': '51.9235869',
 'lon': '0.9236342',
 'class': 'amenity',
 'type': 'post_box',
 'place_rank': 30,
 'importance': 6.33745326924089e-05,
 'addresstype': 'amenity',
 'name': 'Royal Mail',
 'display_name': 'Royal Mail, The Crescent, Colchester Business Park, Highwoods, Colchester, Essex, England, CO4 9QQ, United Kingdom',
 'address': {'amenity': 'Royal Mail',
  'road': 'The Crescent',
  'industrial': 'Colchester Business Park',
  'suburb': 'Highwoods',
  'city': 'Colchester',
  'municipality': 'Essex',
  'ISO3166-2-lvl6': 'GB-ESS',
  'county': 'Essex',
  'state': 'England',
  'ISO3166-2-lvl4': 'GB-ENG',
  'postcode': 'CO4 9QQ',
  'country': 'United Kingdom',
  'country_code': 'gb'},
 'boundingbox': ['51.9235369', '51.9236369', '0.9235842', '0.9236842']}

In [34]:
def get_settlement(raw_loc):
    try:
        address = raw_loc['address']
    except:
        return None
    try:
        return address['town']
    except:
        try:
            return address['village']
        except:
            try:
                return address['city']
            except:
                return None

In [35]:
def get_state(raw_loc):
    try:
        address = raw_loc['address']
        return address['state']
    except:
        return None

In [36]:
def get_country(raw_loc):
    try:
        address = raw_loc['address']
        return address['country_code']
    except:
        return None

In [37]:
test['settlement'] = test['raw_loc'].apply(get_settlement)
test['state'] = test['raw_loc'].apply(get_state)
test['country'] = test['raw_loc'].apply(get_country)

In [38]:
test[['location', 'settlement']].tail(30)

Unnamed: 0_level_0,location,settlement
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1867,London EC3N,City of London
1872,London,London
1873,Remote,
1874,Nottingham NG8,Bulwell
1876,United Kingdom,
1881,Welwyn Garden City,Welwyn Garden City
1882,79- London SE1,
1884,Harrow,London
1885,Croydon CR0,London
1889,Newcastle upon Tyne NE1,Newcastle upon Tyne


In [39]:
test['settlement'].unique()

array(['Cirencester', 'London', 'Manchester', None, 'Bristol',
       'Cambridge', 'Geneva', 'Leeds', 'Warsaw', 'Brixworth', 'Glasgow',
       'Barrow-in-Furness', 'Filton', 'Portsmouth', 'Frimley', 'Weymouth',
       'Brantingham', 'Prestwick', 'Stank', 'Brough', 'Chippenham',
       'Leatherhead', 'Pontyclun', 'Redditch', 'Trafford', 'Stafford',
       'Cheltenham', 'Metropolitan Borough of Solihull', 'Brighton',
       'Malvern Hills', 'City of Edinburgh', 'Basildon', 'Southampton',
       'Leicester', 'Oxford', 'Bridgend', 'Burton-on-Trent', 'Derby',
       'Shipley', 'Poole', 'Newport', 'Newcastle upon Tyne', 'Norwich',
       'Hemel Hempstead', 'Bath', 'Reading', 'Milton Keynes', 'Dublin',
       'Eastbourne', 'Wolverhampton', 'Fareham', 'Nerston Village',
       'Thame', 'Birmingham', 'Gravesend', 'Aylesbury', 'Clermont',
       'Liverpool', 'Stratford-upon-Avon', 'Plymouth', 'Rosyth',
       'Stowmarket', 'Cardiff', 'Telford', 'Worthing', 'Exeter', 'Stroud',
       'Belfast', '

In [40]:
test['settlement'].value_counts(sort=True).head(20)

settlement
London                 312
Cambridge               38
Manchester              36
Geneva                  36
Leeds                   35
City of London          31
City of Edinburgh       17
Bristol                 17
Bridgend                15
Oxford                  12
Budapest                12
Birmingham              11
Newcastle upon Tyne     10
Glasgow                 10
Derby                    9
Liverpool                9
Barrow-in-Furness        8
Belfast                  8
Reading                  7
Southampton              7
Name: count, dtype: int64

In [43]:
test.loc[test['settlement'] == 'Stank']

Unnamed: 0_level_0,website_id,company_id,title,location,pay,description,timestamp,company_name,full_loc,point,raw_loc,settlement,state,country
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
431,2,14,Graduate Software Engineer,Barrow-In-Furness,"£34,000 a year",Description Job Title Graduate Submarines Soft...,2024-06-28,BAE Systems,"(Barrow-in-Furness, Stank, Westmorland and Fur...","(54.128879600000005, -3.2269008205428933, 0.0)","{'place_id': 239373006, 'licence': 'Data © Ope...",Stank,England,gb


In [82]:
test['state'].value_counts(dropna=False)

state
England                   837
None                      141
Oregon                     86
Scotland                   42
Geneva                     36
Wales                      27
Illinois                    9
Northern Ireland            9
Nouvelle-Aquitaine          2
KwaZulu-Natal               1
Hong Kong                   1
Rhode Island                1
Mecklenburg-Vorpommern      1
Nova Scotia                 1
Masovian Voivodeship        1
Maine                       1
Alabama                     1
Central Albania             1
Nebraska                    1
Virginia                    1
Name: count, dtype: int64

In [44]:
test['state'].unique()

array(['England', None, 'Geneva', 'Masovian Voivodeship', 'Scotland',
       'Wales', 'Oregon', 'Rhode Island', 'KwaZulu-Natal', 'Hong Kong',
       'Northern Ireland', 'Mecklenburg-Vorpommern', 'Nova Scotia',
       'Illinois', 'Nouvelle-Aquitaine', 'Maine', 'Alabama',
       'Central Albania', 'Nebraska', 'Virginia'], dtype=object)

In [45]:
test['country'].unique()

array(['gb', None, 'ch', 'pl', 'us', 'jp', 'ie', 'za', 'cn', 'de', 'ca',
       'fr', 'hu', 'al'], dtype=object)

In [60]:
test.loc[test['country'] == 'gb']

Unnamed: 0_level_0,website_id,company_id,title,location,pay,description,timestamp,company_name,full_loc,point,raw_loc,settlement,state,country
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,1,Graduate Electronics/Computer Science Software...,Cirencester (Gloucestershire),"£30,000",About the role A unique opportunity to join a ...,2024-06-28,AccuLink Technology,"(Cirencester, Chesterton, Cotswold District, G...","(51.7171029, -1.9661616, 0.0)","{'place_id': 244681365, 'licence': 'Data © Ope...",Cirencester,England,gb
145,1,1,Project Engineer,Cirencester (Gloucestershire),Competitive,AccuLink Technology specialises in the design ...,2024-06-28,AccuLink Technology,"(Cirencester, Chesterton, Cotswold District, G...","(51.7171029, -1.9661616, 0.0)","{'place_id': 244681365, 'licence': 'Data © Ope...",Cirencester,England,gb
2,1,2,Graduate Software Development Engineer,London,Competitive,nFocus Graduate Academy Accelerate your career...,2024-06-28,nfocus,"(London, Greater London, England, United Kingd...","(51.5074456, -0.1277653, 0.0)","{'place_id': 243408926, 'licence': 'Data © Ope...",London,England,gb
3,1,3,Graduate Technology Consultant,Manchester,Competitive,About Arlanis Reply: Arlanis Reply is the Repl...,2024-06-28,Reply,"(Manchester, Greater Manchester, England, Unit...","(53.4794892, -2.2451148, 0.0)","{'place_id': 239720255, 'licence': 'Data © Ope...",Manchester,England,gb
21,1,3,Graduate Software Developer,Manchester,Competitive,About Reply: Reply specialises in the design a...,2024-06-28,Reply,"(Manchester, Greater Manchester, England, Unit...","(53.4794892, -2.2451148, 0.0)","{'place_id': 239720255, 'licence': 'Data © Ope...",Manchester,England,gb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1921,2,673,Graduate Technology Consultant - PhD Level,Guildford,"£34,000 a year",This hugely successful and rapidly expanding c...,2024-07-05,Graduate Recruitment Bureau,"(Guildford, Surrey, England, United Kingdom, (...","(51.2356068, -0.5732063, 0.0)","{'place_id': 275885144, 'licence': 'Data © Ope...",Guildford,England,gb
1922,2,674,BI Analyst,Alderley Edge,,CurrentBody is seeking a BI Analyst to join ou...,2024-07-05,CurrentBody,"(Alderley Edge, Cheshire East, England, United...","(53.3058355, -2.2375778, 0.0)","{'place_id': 271044802, 'licence': 'Data © Ope...",Alderley Edge,England,gb
1928,2,678,Junior Systems Engineer (UK-based),Didcot,,"Work location: United Kingdom, England, Didcot...",2024-07-05,D-Orbit,"(Didcot, East Hagbourne, South Oxfordshire, Ox...","(51.6056908, -1.2467923, 0.0)","{'place_id': 274225985, 'licence': 'Data © Ope...",Didcot,England,gb
1929,2,679,Engineer (Test & Validation),Telford TF7,"£26,000 - £36,000 a year",ENGINEERING TECHNICIAN: TEST & VALIDATION JOB ...,2024-07-05,VA Technology Ltd.,"(TF7 136D, Ironbridge Road, Woodside, Madeley,...","(52.6328659, -2.4723725, 0.0)","{'place_id': 394921967, 'licence': 'Data © Ope...",Madeley,England,gb


In [83]:
test['settlement'].value_counts()

settlement
London           312
Cambridge         38
Manchester        36
Geneva            36
Leeds             35
                ... 
Newmarket          1
Newport            1
Belper CP          1
Carrickfergus      1
Chiswick           1
Name: count, Length: 202, dtype: int64

### Plot on map

In [68]:
import plotly.express as px
import plotly.io as pio

In [69]:
pio.renderers.default = 'notebook_connected'

In [104]:
geodf = test.groupby('settlement').agg({'lat':'mean', 'long':'mean', 'settlement':'count'})

In [107]:
geodf.rename(columns={'settlement':'num_jobs'}, inplace=True)

In [131]:
fig = px.scatter_mapbox(geodf, lat='lat', lon='long', size='num_jobs', size_max=25, hover_name=geodf.index, zoom=4, mapbox_style='open-street-map')
fig.show()

In [126]:
fig = px.density_mapbox(geodf, lat='lat', lon='long', z='num_jobs', opacity=0.9, radius=30, center=dict(lat=52, lon=0), zoom=4, mapbox_style='open-street-map')
fig.show()