In [2]:
# Import the libraries used for the data requests and clean-up

import io # wraps downloaded bytes in a file-like object for pandas
import json # to parse json
import os # enables me to retrieve api key from another file
from datetime import datetime

import numpy as np
import pandas as pd # for all things pandas
import pgeocode
import requests # this library handles http requests... for eg API GET requests.
from dotenv import load_dotenv # allows me to access env file with api key
 # no longer required from geopy.geocoders import Nominatim # to clean up location data

load_dotenv("store.env") # load the variables defined in store.env into the process environment (the API key is later read from there via os.getenv)

True

In [3]:
# --- Search parameters for the Reed jobs API ---
versionnumber = "1.0"
keywords = "data"
locationName = "birmingham"  # place the search is centred on
employerId = ""
distanceinmiles = "40"
resultsToTake = 100  # number of results requested per call
resultsToSkip = 0  # paging offset; advanced by the request cells further down

# The key is read from the environment (populated from the .env file) so it is never hard-coded here.
reed_api_key = os.getenv("reed_api_key")

# Date stamp (dd.mm.YYYY) that is embedded in the exported Excel filename.
today = datetime.today().strftime('%d.%m.%Y')

# Assemble the search URL for the first page from the parameters above.
query_string = "&".join([
    f"keywords={keywords}",
    f"locationName={locationName}",
    f"employerId={employerId}",
    f"distanceFromLocation={distanceinmiles}",
    f"resultsToTake={resultsToTake}",
    f"resultsToSkip={resultsToSkip}",
])
url = f"https://www.reed.co.uk/api/{versionnumber}/search?{query_string}"
url

'https://www.reed.co.uk/api/1.0/search?keywords=data&locationName=birmingham&employerId=&distanceFromLocation=40&resultsToTake=100&resultsToSkip=0'

In [4]:
# Fetch the FIRST page of results and learn how many results exist in total.
#
# The offset and URL are rebuilt here (rather than relying on whatever `url` /
# `resultsToSkip` currently hold) so that re-running this cell always requests
# page one and leaves the paging state in the same place.
resultsToSkip = 0
url = f"https://www.reed.co.uk/api/{versionnumber}/search?keywords={keywords}&locationName={locationName}&employerId={employerId}&distanceFromLocation={distanceinmiles}&resultsToTake={resultsToTake}&resultsToSkip={resultsToSkip}"

# The API key is sent as the basic-auth username with an empty password.
# A timeout makes a stalled connection raise instead of hanging the notebook forever.
response = requests.get(url, auth=(reed_api_key, ''), timeout=30)
json_for_df = [] # accumulates the job records from every page; later turned into a DataFrame
if response.status_code == 200:
    print(f"Response Success {response.status_code}")
    reed_data = response.json() # parsed JSON body of the first page
    json_for_df.extend(reed_data["results"])
    resultsToSkip = resultsToSkip + resultsToTake # the next request must start after the page just stored
    total_results = (reed_data["totalResults"]) # used by the paging cell to work out how many pages remain
else:
    # Stop here: nothing downstream can work without the first page.
    raise Exception (f"Unsuccesfull API request: {response.status_code}")




Response Success 200


In [None]:
# Pages still to fetch: ceil(total_results / resultsToTake), minus the first page that has already been stored.
total_pages_minus_1 = ((total_results + resultsToTake - 1) // resultsToTake) - 1

for num_page in range(total_pages_minus_1):
    print("-----")
    url = f"https://www.reed.co.uk/api/{versionnumber}/search?keywords={keywords}&locationName={locationName}&employerId={employerId}&distanceFromLocation={distanceinmiles}&resultsToTake={resultsToTake}&resultsToSkip={resultsToSkip}"
    print("requesting:", url) # logged before the call so a failing request can be identified
    # Timeout so one stalled page cannot hang the whole run.
    response = requests.get(url, auth=(reed_api_key, ''), timeout=30)
    if response.status_code != 200:
        # Fail with the same message as the first-page request instead of an opaque JSON/KeyError below.
        raise Exception (f"Unsuccesfull API request: {response.status_code}")
    reed_data = response.json()
    json_for_df.extend(reed_data["results"])
    resultsToSkip = resultsToSkip + resultsToTake # advance the offset by one page
    # NOTE(review): 9900 is presumably the largest offset the API accepts - confirm against the API docs.
    if resultsToSkip >= 9900:
        print("API request reached limit. exiting loop")
        break

In [None]:
# Turn the accumulated job records into a table.
df = pd.DataFrame(json_for_df)

# Data clean-up:
# 1. represent missing values as NaN
df.fillna(np.nan, inplace=True)
# 2. parse the day/month/year strings in 'date' into real datetimes
# 3. keep a single row per jobId
df = (
    df
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format="%d/%m/%Y"))
    .drop_duplicates(subset='jobId')
)

# Row count after de-duplication; later cells check against it and use it in the filename.
df_rows = df.shape[0]

print(df_rows)

In [7]:
# Download the GB postal-code table (tab-separated, no header row) into df_geo.
# It is later merged with the main df to attach location details to each job.
country = "GB"

download_url = f"https://symerio.github.io/postal-codes-data/data/geonames/{country}.txt"

# Names assigned to the 12 columns of the file, in file order.
column_names = [
    "countryCode",
    "outwardCode", 
    "nonPostcodeValues", 
    "countryName", 
    "countryAbv", 
    "countyName", 
    "admin_1", 
    "districtName", 
    "onsCode", 
    "latitude", 
    "longitude", 
    "accuracy"
]

# Timeout so a stalled download raises instead of hanging the notebook.
response = requests.get(download_url, timeout=30)

if response.status_code == 200:
    print(f'Response sucess {response.status_code}')
    # Parse the bytes that were just downloaded instead of fetching the URL a second time.
    # The raw bytes (not response.text) are handed over so pandas does the decoding itself.
    df_geo = pd.read_csv(io.BytesIO(response.content), sep="\t", header=None, names=column_names, dtype=str)
else:
    # Fail here: continuing would hit an undefined (or stale) df_geo a few lines below.
    raise Exception(f'Unsuccesful {response.status_code}')

# drop duplicate rows for outward_code to avoid multiple matches at the join stage:
df_geo_unique = df_geo.drop_duplicates(subset='outwardCode')
# drop unwanted columns from df_geo_unique:
df_geo_unique = df_geo_unique.drop(columns=['countryAbv', 'admin_1', 'districtName', 'onsCode', 'latitude', 'longitude', 'accuracy'])

Response sucess 200


In [8]:
def postcode_breakdown(value):
    """Return the outward code of a postcode-like value, or NaN otherwise.

    A value is treated as a postcode when it is a string of at most 8
    characters that contains at least one digit. The result is used as the
    key for looking rows up in the geo table.

    Whitespace is removed and the text upper-cased before slicing, so that
    "B37 7YN" and "B377YN" both give "B37" (slicing the raw string would
    leave a trailing space that can never match the geo table). A value of
    4 characters or fewer is taken to already be an outward code and is
    returned as-is rather than being sliced down to an empty string.

    Parameters
    ----------
    value : Any
        Cell value from the location column; non-strings are accepted.

    Returns
    -------
    str or float
        The outward code, or ``np.nan`` when ``value`` is not postcode-like.
    """
    looks_like_postcode = (
        isinstance(value, str)
        and len(value) <= 8
        and any(character.isdigit() for character in value)
    )
    if not looks_like_postcode:
        return np.nan

    compact = "".join(value.split()).upper()  # drop all whitespace, normalise case
    if len(compact) <= 4:
        # Too short to carry the 3-character inward part: already an outward code.
        return compact
    return compact[:-3]  # strip the 3-character inward part

In [9]:
# Derive two complementary columns from locationName:
#   outwardCode       - filled when the location looks like a postcode, NaN otherwise
#   nonPostcodeValues - the original location text for rows with no outward code, NaN otherwise
df['outwardCode'] = df['locationName'].map(postcode_breakdown)
is_place_name = df['outwardCode'].isna()
df['nonPostcodeValues'] = df['locationName'].where(is_place_name)



In [10]:
# Partition the jobs by whether an outward code was extracted; each subset is merged on a different key.
has_outward_code = df['outwardCode'].notna()

df_postcode = df.loc[has_outward_code]
df_non_postcode = df.loc[~has_outward_code]


In [11]:

# Attach geo details to the postcode rows via the outward code.
# Both frames carry a 'nonPostcodeValues' column, so the merge suffixes them;
# the job-side copy (_x) is dropped and the geo-side copy (_y) takes the plain name.
df_postcode_merged = (
    df_postcode
    .merge(df_geo_unique, on='outwardCode', how='left')
    .drop(columns='nonPostcodeValues_x')
    .rename(columns={'nonPostcodeValues_y': 'nonPostcodeValues'})
)
len(df_postcode_merged)


1002

In [12]:
# Attach geo details to the rows whose location is a place name rather than a postcode.
#
# df_geo_unique is unique per outward code, not per place name, so one place can
# appear on many geo rows. Keeping only the first geo row per place BEFORE merging
# avoids a many-to-many join that multiplies every job row just to discard the
# extras afterwards; the surviving match (the first one) is the same either way.
df_geo_by_place = df_geo_unique.drop_duplicates(subset='nonPostcodeValues')

df_non_postcode_merged = pd.merge(df_non_postcode, df_geo_by_place, on='nonPostcodeValues', how='left')
df_non_postcode_merged = df_non_postcode_merged.drop_duplicates(subset='jobId') # safety net: exactly one row per job
# Both frames carry an 'outwardCode' column; keep the geo-side value under the plain name.
df_non_postcode_merged = df_non_postcode_merged.drop(columns='outwardCode_x')
df_non_postcode_merged = df_non_postcode_merged.rename(columns={'outwardCode_y': 'outwardCode'})
len(df_non_postcode_merged)


1604

In [13]:
# Sanity check: the two merged subsets together must contain exactly the rows of the
# de-duplicated jobs table, otherwise a merge dropped or multiplied rows.
merged_row_count = len(df_postcode_merged) + len(df_non_postcode_merged)
if merged_row_count != df_rows:
    raise Exception ('data lost or gained')

df_export = pd.concat([df_postcode_merged, df_non_postcode_merged])


In [14]:
# Column layout of the exported sheet, left to right.
new_col_order = [
    'jobId',
    'employerId',
    'employerName',
    'employerProfileId',
    'employerProfileName',
    'jobTitle',
    'locationName',
    'minimumSalary',
    'maximumSalary',
    'currency',
    'date',
    'expirationDate',
    'applications',
    'outwardCode',
    'countryCode',
    'nonPostcodeValues',
    'countryName',
    'countyName',
    'jobUrl',
    'jobDescription',
]

# Select the columns in that order (a missing column raises a KeyError).
df_export = df_export.loc[:, new_col_order]

In [None]:
# Write the final table to an Excel workbook.
#
# The output folder can be overridden with the REED_RESULTS_DIR environment variable
# (e.g. in the .env file) so the notebook is not tied to one machine; when it is not
# set, the original location is used unchanged.
results_dir = os.getenv(
    "REED_RESULTS_DIR",
    "C:/Users/Henry/Documents/Programming/Python/Python Jobs Data v1/reed_scrapper/results",
)
os.makedirs(results_dir, exist_ok=True) # make sure the folder exists before writing into it

# Filename records the search terms, the row count and the run date.
file_path = f"{results_dir}/reed_{keywords}_{locationName}_{df_rows}_jobs_{today}.xlsx"
print(f'File contains: {df_rows}')
print(f'File location: {file_path}')
df_export.to_excel(file_path, index=False) # index=False: the row index is not written to the sheet