# AI-Powered Career Recommandation System

In [131]:
# Import libraries
import requests
import json
import time
import re
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [93]:
app_id = os.env.API_ID
app_key = os.env.API_KEY
base_url = f"http://api.adzuna.com/v1/api/jobs/"
countries = ["gb", "us", "br", "ca", "au", "nl", "it", "be", "mx"]

In [94]:
def get_job_data(country, page_number):
    url = f"{base_url}{country}/search/{page_number}?app_id={app_id}&app_key={app_key}&content-type=application/json"
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {country} page {page_number}: {e}")
        return None

In [95]:
def fetch_all_jobs_from_all_countries():
    all_jobs = []
    for country in countries:
        print(f"Fetching jobs from {country}...")
        page_number = 1
        while True:
            page_data = get_job_data(country, page_number)

            if page_data and 'results' in page_data and page_data['results']:
                all_jobs.extend(page_data['results'])
                page_number += 1
            else:
                if page_data and 'results' in page_data and not page_data['results']:
                    print(f"No results on {country} page {page_number}. Stopping.")
                elif page_data is None:
                    print(f"Error on {country} page {page_number}. Stopping.")
                break

            time.sleep(1)

    return all_jobs

In [96]:
all_job_data = fetch_all_jobs_from_all_countries()

if all_job_data:
    print(f"Successfully retrieved {len(all_job_data)} job postings from all countries.")
    with open("all_jobs_all_countries.json", "w", encoding="utf-8") as f:
        json.dump(all_job_data, f, indent=4, ensure_ascii=False)
else:
    print("Failed to retrieve job data from one or more countries.")

Fetching jobs from gb...
Error fetching gb page 38: 503 Server Error: Service Temporarily Unavailable for url: https://api.adzuna.com:443/v1/api/jobs/gb/search/38?app_id=55daa0f8&app_key=1c87360eec3e3f190f41be4cd0e178ea&content-type=application/json
Error on gb page 38. Stopping.
Fetching jobs from us...
Error fetching us page 14: 503 Server Error: Service Temporarily Unavailable for url: https://api.adzuna.com:443/v1/api/jobs/us/search/14?app_id=55daa0f8&app_key=1c87360eec3e3f190f41be4cd0e178ea&content-type=application/json
Error on us page 14. Stopping.
Fetching jobs from br...
Error fetching br page 17: 503 Server Error: Service Temporarily Unavailable for url: https://api.adzuna.com:443/v1/api/jobs/br/search/17?app_id=55daa0f8&app_key=1c87360eec3e3f190f41be4cd0e178ea&content-type=application/json
Error on br page 17. Stopping.
Fetching jobs from ca...
Error fetching ca page 6: 503 Server Error: Service Temporarily Unavailable for url: https://api.adzuna.com:443/v1/api/jobs/ca/searc

In [97]:
all_job_data

[{'latitude': 52.497398,
  '__CLASS__': 'Adzuna::API::Response::Job',
  'title': 'Quality Inspector',
  'salary_is_predicted': '0',
  'adref': 'eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNTAzNTI0NjM5NCIsInMiOiJhT2stVTU3cjd4R0Z2N1RWNkttTF9RIn0.sxzFB66VlvKaA8C1ZlUPWwaKGNB9GWJu6xRXd5ZFQes',
  'company': {'__CLASS__': 'Adzuna::API::Response::Company',
   'display_name': 'Premier Technical Recruitment'},
  'location': {'display_name': 'Coleshill, Birmingham',
   '__CLASS__': 'Adzuna::API::Response::Location',
   'area': ['UK', 'West Midlands', 'Birmingham', 'Coleshill']},
  'created': '2025-02-03T14:32:54Z',
  'longitude': -1.70547,
  'salary_max': 30000,
  'description': 'Quality Inspector Near Coleshill, Warwickshire c£26k - £30k neg dep exp  benefits Our client has been established for almost 40 years and are recognised market leaders in the design and development, manufacture and distribution of an impressive portfolio of cabling and connection solutions for supply throughout the world, and as a resu

In [98]:
all_job_data[0]

{'latitude': 52.497398,
 '__CLASS__': 'Adzuna::API::Response::Job',
 'title': 'Quality Inspector',
 'salary_is_predicted': '0',
 'adref': 'eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNTAzNTI0NjM5NCIsInMiOiJhT2stVTU3cjd4R0Z2N1RWNkttTF9RIn0.sxzFB66VlvKaA8C1ZlUPWwaKGNB9GWJu6xRXd5ZFQes',
 'company': {'__CLASS__': 'Adzuna::API::Response::Company',
  'display_name': 'Premier Technical Recruitment'},
 'location': {'display_name': 'Coleshill, Birmingham',
  '__CLASS__': 'Adzuna::API::Response::Location',
  'area': ['UK', 'West Midlands', 'Birmingham', 'Coleshill']},
 'created': '2025-02-03T14:32:54Z',
 'longitude': -1.70547,
 'salary_max': 30000,
 'description': 'Quality Inspector Near Coleshill, Warwickshire c£26k - £30k neg dep exp  benefits Our client has been established for almost 40 years and are recognised market leaders in the design and development, manufacture and distribution of an impressive portfolio of cabling and connection solutions for supply throughout the world, and as a result of continue

In [102]:
# Convert json data to dataframe
df = pd.DataFrame().from_dict(all_job_data).set_index("id")
df.head()

Unnamed: 0_level_0,latitude,__CLASS__,title,salary_is_predicted,adref,company,location,created,longitude,salary_max,description,contract_time,salary_min,redirect_url,contract_type,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5035246394,52.497398,Adzuna::API::Response::Job,Quality Inspector,0,eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNTAzNTI0NjM5NCIsI...,{'__CLASS__': 'Adzuna::API::Response::Company'...,"{'display_name': 'Coleshill, Birmingham', '__C...",2025-02-03T14:32:54Z,-1.70547,30000.0,"Quality Inspector Near Coleshill, Warwickshire...",full_time,26000.0,https://www.adzuna.co.uk/jobs/details/50352463...,permanent,"{'tag': 'engineering-jobs', 'label': 'Engineer..."
5005137865,50.2136,Adzuna::API::Response::Job,Self-Employed Childminder (Part Time),0,eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNTAwNTEzNzg2NSIsI...,"{'display_name': 'Tiney', '__CLASS__': 'Adzuna...","{'area': ['UK', 'South West England', 'Cornwal...",2025-01-10T11:07:10Z,-5.30078,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,https://www.adzuna.co.uk/jobs/details/50051378...,,"{'label': 'Part time Jobs', '__CLASS__': 'Adzu..."
5005138141,51.572201,Adzuna::API::Response::Job,Self-Employed Childcare Provider (Full Time),0,eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNTAwNTEzODE0MSIsI...,"{'display_name': 'Tiney', '__CLASS__': 'Adzuna...","{'area': ['UK', 'Eastern England', 'Essex', 'B...",2025-01-10T11:07:11Z,0.462461,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,https://www.adzuna.co.uk/jobs/details/50051381...,,"{'tag': 'social-work-jobs', '__CLASS__': 'Adzu..."
5005138479,50.921101,Adzuna::API::Response::Job,Self-Employed Childcare Provider (Full Time),0,eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNTAwNTEzODQ3OSIsI...,{'__CLASS__': 'Adzuna::API::Response::Company'...,"{'area': ['UK', 'South West England', 'Somerse...",2025-01-10T11:07:13Z,-3.08184,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,https://www.adzuna.co.uk/jobs/details/50051384...,,"{'label': 'Social work Jobs', '__CLASS__': 'Ad..."
5005137332,51.6106,Adzuna::API::Response::Job,Self-Employed Childminder (Part Time),0,eyJhbGciOiJIUzI1NiJ9.eyJzIjoiYU9rLVU1N3I3eEdGd...,"{'display_name': 'Tiney', '__CLASS__': 'Adzuna...","{'area': ['UK', 'Eastern England', 'Essex', 'W...",2025-01-10T11:07:07Z,0.523168,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,https://www.adzuna.co.uk/jobs/details/50051373...,,"{'tag': 'part-time-jobs', '__CLASS__': 'Adzuna..."


In [100]:
print(df.shape)

(1310, 17)


In [112]:
df.to_csv("jobs.csv", index=True)

In [114]:
df_p = pd.read_csv("jobs.csv")
print(df_p.shape)
print(df_p.info())

(1310, 17)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1310 entries, 0 to 1309
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   1310 non-null   int64  
 1   latitude             978 non-null    float64
 2   __CLASS__            1310 non-null   object 
 3   title                1310 non-null   object 
 4   salary_is_predicted  1310 non-null   int64  
 5   adref                1310 non-null   object 
 6   company              1310 non-null   object 
 7   location             1310 non-null   object 
 8   created              1310 non-null   object 
 9   longitude            978 non-null    float64
 10  salary_max           700 non-null    float64
 11  description          1310 non-null   object 
 12  contract_time        594 non-null    object 
 13  salary_min           701 non-null    float64
 14  redirect_url         1310 non-null   object 
 15  contract_type        152 no

In [115]:
df_p.head()

Unnamed: 0,id,latitude,__CLASS__,title,salary_is_predicted,adref,company,location,created,longitude,salary_max,description,contract_time,salary_min,redirect_url,contract_type,category
0,5035246394,52.497398,Adzuna::API::Response::Job,Quality Inspector,0,eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNTAzNTI0NjM5NCIsI...,{'__CLASS__': 'Adzuna::API::Response::Company'...,"{'display_name': 'Coleshill, Birmingham', '__C...",2025-02-03T14:32:54Z,-1.70547,30000.0,"Quality Inspector Near Coleshill, Warwickshire...",full_time,26000.0,https://www.adzuna.co.uk/jobs/details/50352463...,permanent,"{'tag': 'engineering-jobs', 'label': 'Engineer..."
1,5005137865,50.2136,Adzuna::API::Response::Job,Self-Employed Childminder (Part Time),0,eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNTAwNTEzNzg2NSIsI...,"{'display_name': 'Tiney', '__CLASS__': 'Adzuna...","{'area': ['UK', 'South West England', 'Cornwal...",2025-01-10T11:07:10Z,-5.30078,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,https://www.adzuna.co.uk/jobs/details/50051378...,,"{'label': 'Part time Jobs', '__CLASS__': 'Adzu..."
2,5005138141,51.572201,Adzuna::API::Response::Job,Self-Employed Childcare Provider (Full Time),0,eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNTAwNTEzODE0MSIsI...,"{'display_name': 'Tiney', '__CLASS__': 'Adzuna...","{'area': ['UK', 'Eastern England', 'Essex', 'B...",2025-01-10T11:07:11Z,0.462461,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,https://www.adzuna.co.uk/jobs/details/50051381...,,"{'tag': 'social-work-jobs', '__CLASS__': 'Adzu..."
3,5005138479,50.921101,Adzuna::API::Response::Job,Self-Employed Childcare Provider (Full Time),0,eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNTAwNTEzODQ3OSIsI...,{'__CLASS__': 'Adzuna::API::Response::Company'...,"{'area': ['UK', 'South West England', 'Somerse...",2025-01-10T11:07:13Z,-3.08184,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,https://www.adzuna.co.uk/jobs/details/50051384...,,"{'label': 'Social work Jobs', '__CLASS__': 'Ad..."
4,5005137332,51.6106,Adzuna::API::Response::Job,Self-Employed Childminder (Part Time),0,eyJhbGciOiJIUzI1NiJ9.eyJzIjoiYU9rLVU1N3I3eEdGd...,"{'display_name': 'Tiney', '__CLASS__': 'Adzuna...","{'area': ['UK', 'Eastern England', 'Essex', 'W...",2025-01-10T11:07:07Z,0.523168,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,https://www.adzuna.co.uk/jobs/details/50051373...,,"{'tag': 'part-time-jobs', '__CLASS__': 'Adzuna..."


In [120]:
df_p.columns

Index(['id', 'latitude', '__CLASS__', 'title', 'salary_is_predicted', 'adref',
       'company', 'location', 'created', 'longitude', 'salary_max',
       'description', 'contract_time', 'salary_min', 'redirect_url',
       'contract_type', 'category'],
      dtype='object')

### Data Wranggling

In [121]:
# Basic Information about the data 
print("Shape: ", df_p.shape)
df_p.info()

Shape:  (1310, 17)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1310 entries, 0 to 1309
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   1310 non-null   int64  
 1   latitude             978 non-null    float64
 2   __CLASS__            1310 non-null   object 
 3   title                1310 non-null   object 
 4   salary_is_predicted  1310 non-null   int64  
 5   adref                1310 non-null   object 
 6   company              1310 non-null   object 
 7   location             1310 non-null   object 
 8   created              1310 non-null   object 
 9   longitude            978 non-null    float64
 10  salary_max           700 non-null    float64
 11  description          1310 non-null   object 
 12  contract_time        594 non-null    object 
 13  salary_min           701 non-null    float64
 14  redirect_url         1310 non-null   object 
 15  contract_type      

In [122]:
# Remove the unnecessary columns
df_p.drop(columns=["latitude", "__CLASS__", "adref", "location", "longitude", "redirect_url"], inplace=True)
print("Shape: ", df_p.shape)
df_p.head()

Shape:  (1310, 11)


Unnamed: 0,id,title,salary_is_predicted,company,created,salary_max,description,contract_time,salary_min,contract_type,category
0,5035246394,Quality Inspector,0,{'__CLASS__': 'Adzuna::API::Response::Company'...,2025-02-03T14:32:54Z,30000.0,"Quality Inspector Near Coleshill, Warwickshire...",full_time,26000.0,permanent,"{'tag': 'engineering-jobs', 'label': 'Engineer..."
1,5005137865,Self-Employed Childminder (Part Time),0,"{'display_name': 'Tiney', '__CLASS__': 'Adzuna...",2025-01-10T11:07:10Z,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,,"{'label': 'Part time Jobs', '__CLASS__': 'Adzu..."
2,5005138141,Self-Employed Childcare Provider (Full Time),0,"{'display_name': 'Tiney', '__CLASS__': 'Adzuna...",2025-01-10T11:07:11Z,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,,"{'tag': 'social-work-jobs', '__CLASS__': 'Adzu..."
3,5005138479,Self-Employed Childcare Provider (Full Time),0,{'__CLASS__': 'Adzuna::API::Response::Company'...,2025-01-10T11:07:13Z,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,,"{'label': 'Social work Jobs', '__CLASS__': 'Ad..."
4,5005137332,Self-Employed Childminder (Part Time),0,"{'display_name': 'Tiney', '__CLASS__': 'Adzuna...",2025-01-10T11:07:07Z,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,,"{'tag': 'part-time-jobs', '__CLASS__': 'Adzuna..."


In [123]:
df_p.set_index("id")
df_p.head(2)

Unnamed: 0,id,title,salary_is_predicted,company,created,salary_max,description,contract_time,salary_min,contract_type,category
0,5035246394,Quality Inspector,0,{'__CLASS__': 'Adzuna::API::Response::Company'...,2025-02-03T14:32:54Z,30000.0,"Quality Inspector Near Coleshill, Warwickshire...",full_time,26000.0,permanent,"{'tag': 'engineering-jobs', 'label': 'Engineer..."
1,5005137865,Self-Employed Childminder (Part Time),0,"{'display_name': 'Tiney', '__CLASS__': 'Adzuna...",2025-01-10T11:07:10Z,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,,"{'label': 'Part time Jobs', '__CLASS__': 'Adzu..."


In [129]:
print("Number of Unique values in [salary_is_predicted]", df_p.salary_is_predicted.nunique())
print("Values ", df_p.salary_is_predicted.unique())


Number of Unique values in [salary_is_predicted] 2
Values  [0 1]


In [130]:
df_p[df_p["contract_time"].isnull()].head()

Unnamed: 0,id,title,salary_is_predicted,company,created,salary_max,description,contract_time,salary_min,contract_type,category
340,5029873208,Clinical Development Nurse - South West,1,{'__CLASS__': 'Adzuna::API::Response::Company'...,2025-01-30T15:19:23Z,47990.71,ABOUT THE ROLE As a Regional Clinical Developm...,,47990.71,permanent,"{'tag': 'healthcare-nursing-jobs', '__CLASS__'..."
351,5049655700,Head of Maintenance - Care Home,0,{'__CLASS__': 'Adzuna::API::Response::Company'...,2025-02-14T11:39:46Z,27456.0,ABOUT THE ROLE As a Head of Maintenance at a B...,,27456.0,permanent,"{'label': 'Maintenance Jobs', 'tag': 'maintena..."
370,4981204709,"Field Nurse Practitioner - Mecklenburg County,...",1,{'__CLASS__': 'Adzuna::API::Response::Company'...,2024-12-17T12:57:07Z,107347.25,About Advantmed Advantmed is a leading provide...,,107347.25,contract,"{'tag': 'healthcare-nursing-jobs', '__CLASS__'..."
371,4981204722,"Field Nurse Practitioner - Ingham County, Mich...",1,{'__CLASS__': 'Adzuna::API::Response::Company'...,2024-12-17T12:57:07Z,105508.36,About Advantmed Advantmed is a leading provide...,,105508.36,contract,"{'label': 'Healthcare & Nursing Jobs', 'tag': ..."
372,4981204752,"Field Nurse Practitioner - Polk County, Florid...",1,"{'display_name': 'Advantmed', '__CLASS__': 'Ad...",2024-12-17T12:57:12Z,102617.13,About Advantmed Advantmed is a leading provide...,,102617.13,contract,"{'label': 'Healthcare & Nursing Jobs', '__CLAS..."


In [132]:
# Create a function to remove paranthesis in column content
def remove_parantheses(text):
    return re.sub(r'\(.*?\)', '', text).strip()

In [133]:
df_p["title"] = df_p["title"].apply(remove_parantheses)
df_p.head() 

Unnamed: 0,id,title,salary_is_predicted,company,created,salary_max,description,contract_time,salary_min,contract_type,category
0,5035246394,Quality Inspector,0,{'__CLASS__': 'Adzuna::API::Response::Company'...,2025-02-03T14:32:54Z,30000.0,"Quality Inspector Near Coleshill, Warwickshire...",full_time,26000.0,permanent,"{'tag': 'engineering-jobs', 'label': 'Engineer..."
1,5005137865,Self-Employed Childminder,0,"{'display_name': 'Tiney', '__CLASS__': 'Adzuna...",2025-01-10T11:07:10Z,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,,"{'label': 'Part time Jobs', '__CLASS__': 'Adzu..."
2,5005138141,Self-Employed Childcare Provider,0,"{'display_name': 'Tiney', '__CLASS__': 'Adzuna...",2025-01-10T11:07:11Z,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,,"{'tag': 'social-work-jobs', '__CLASS__': 'Adzu..."
3,5005138479,Self-Employed Childcare Provider,0,{'__CLASS__': 'Adzuna::API::Response::Company'...,2025-01-10T11:07:13Z,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,,"{'label': 'Social work Jobs', '__CLASS__': 'Ad..."
4,5005137332,Self-Employed Childminder,0,"{'display_name': 'Tiney', '__CLASS__': 'Adzuna...",2025-01-10T11:07:07Z,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,,"{'tag': 'part-time-jobs', '__CLASS__': 'Adzuna..."


In [153]:
# Print the content of a company column
cell_value = df_p.loc[340, "company"]
print(cell_value)

Barchester Healthcare


In [138]:
# Function to extract company name
def extract_display_name(text):
    # Use regular expression to find the value after 'display_name':
    match = re.search(r"'display_name':\s*'([^']*)'", text)
    if match:
        return match.group(1)
    return None

In [139]:
# Apply the function to the company column
df_p["company"] = df_p["company"].apply(extract_display_name)
df_p.head()

Unnamed: 0,id,title,salary_is_predicted,company,created,salary_max,description,contract_time,salary_min,contract_type,category
0,5035246394,Quality Inspector,0,Premier Technical Recruitment,2025-02-03T14:32:54Z,30000.0,"Quality Inspector Near Coleshill, Warwickshire...",full_time,26000.0,permanent,"{'tag': 'engineering-jobs', 'label': 'Engineer..."
1,5005137865,Self-Employed Childminder,0,Tiney,2025-01-10T11:07:10Z,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,,"{'label': 'Part time Jobs', '__CLASS__': 'Adzu..."
2,5005138141,Self-Employed Childcare Provider,0,Tiney,2025-01-10T11:07:11Z,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,,"{'tag': 'social-work-jobs', '__CLASS__': 'Adzu..."
3,5005138479,Self-Employed Childcare Provider,0,Tiney,2025-01-10T11:07:13Z,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,,"{'label': 'Social work Jobs', '__CLASS__': 'Ad..."
4,5005137332,Self-Employed Childminder,0,Tiney,2025-01-10T11:07:07Z,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,,"{'tag': 'part-time-jobs', '__CLASS__': 'Adzuna..."


In [157]:
cell_value2 = df_p.loc[3, "category"]
print(cell_value2)

{'label': 'Social work Jobs', '__CLASS__': 'Adzuna::API::Response::Category', 'tag': 'social-work-jobs'}


In [158]:
# Function to extract company name
def extract_job_category(text):
    # Use regular expression to find the value after 'label':
    match = re.search(r"'label':\s*'([^']*)'", text)
    if match:
        return match.group(1)
    return None

In [159]:
# Apply the function to this
df_p["category"] = df_p["category"].apply(extract_job_category)
df_p.head()

Unnamed: 0,id,title,salary_is_predicted,company,created,salary_max,description,contract_time,salary_min,contract_type,category
0,5035246394,Quality Inspector,0,Premier Technical Recruitment,2025-02-03T14:32:54Z,30000.0,"Quality Inspector Near Coleshill, Warwickshire...",full_time,26000.0,permanent,Engineering Jobs
1,5005137865,Self-Employed Childminder,0,Tiney,2025-01-10T11:07:10Z,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,,Part time Jobs
2,5005138141,Self-Employed Childcare Provider,0,Tiney,2025-01-10T11:07:11Z,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,,Social work Jobs
3,5005138479,Self-Employed Childcare Provider,0,Tiney,2025-01-10T11:07:13Z,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,,Social work Jobs
4,5005137332,Self-Employed Childminder,0,Tiney,2025-01-10T11:07:07Z,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,,Part time Jobs


In [161]:
df_p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1310 entries, 0 to 1309
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   1310 non-null   int64  
 1   title                1310 non-null   object 
 2   salary_is_predicted  1310 non-null   int64  
 3   company              1307 non-null   object 
 4   created              1310 non-null   object 
 5   salary_max           700 non-null    float64
 6   description          1310 non-null   object 
 7   contract_time        594 non-null    object 
 8   salary_min           701 non-null    float64
 9   contract_type        152 non-null    object 
 10  category             1310 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 112.7+ KB


In [167]:
# A regulare expresion fuction to remove dash in a content
def remove_dash_space(text):
    return re.sub(r' - [^ ]*$', '', text)

In [170]:
df_p["title"] = df_p["title"].str.replace(r' - [^ -]*$', '', regex=True)
df_p[df_p["contract_time"].isnull()].head()

Unnamed: 0,id,title,salary_is_predicted,company,created,salary_max,description,contract_time,salary_min,contract_type,category
340,5029873208,Clinical Development Nurse - South West,1,Barchester Healthcare,2025-01-30T15:19:23Z,47990.71,ABOUT THE ROLE As a Regional Clinical Developm...,,47990.71,permanent,Healthcare & Nursing Jobs
351,5049655700,Head of Maintenance - Care Home,0,Barchester Healthcare,2025-02-14T11:39:46Z,27456.0,ABOUT THE ROLE As a Head of Maintenance at a B...,,27456.0,permanent,Maintenance Jobs
370,4981204709,"Field Nurse Practitioner - Mecklenburg County,...",1,Advantmed,2024-12-17T12:57:07Z,107347.25,About Advantmed Advantmed is a leading provide...,,107347.25,contract,Healthcare & Nursing Jobs
371,4981204722,"Field Nurse Practitioner - Ingham County, Mich...",1,Advantmed,2024-12-17T12:57:07Z,105508.36,About Advantmed Advantmed is a leading provide...,,105508.36,contract,Healthcare & Nursing Jobs
372,4981204752,"Field Nurse Practitioner - Polk County, Florid...",1,Advantmed,2024-12-17T12:57:12Z,102617.13,About Advantmed Advantmed is a leading provide...,,102617.13,contract,Healthcare & Nursing Jobs


In [169]:
df_p.head()

Unnamed: 0,id,title,salary_is_predicted,company,created,salary_max,description,contract_time,salary_min,contract_type,category
0,5035246394,Quality Inspector,0,Premier Technical Recruitment,2025-02-03T14:32:54Z,30000.0,"Quality Inspector Near Coleshill, Warwickshire...",full_time,26000.0,permanent,Engineering Jobs
1,5005137865,Self-Employed Childminder,0,Tiney,2025-01-10T11:07:10Z,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,,Part time Jobs
2,5005138141,Self-Employed Childcare Provider,0,Tiney,2025-01-10T11:07:11Z,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,,Social work Jobs
3,5005138479,Self-Employed Childcare Provider,0,Tiney,2025-01-10T11:07:13Z,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,,Social work Jobs
4,5005137332,Self-Employed Childminder,0,Tiney,2025-01-10T11:07:07Z,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,,Part time Jobs


In [171]:
# Add new columns
df_p['Self-Employed'] = df_p["title"].str.contains("Self-Employed").astype(int)
df_p['Contracted'] = (~df_p["title"].str.contains("Self-Employed").astype(int))

df_p.head()

Unnamed: 0,id,title,salary_is_predicted,company,created,salary_max,description,contract_time,salary_min,contract_type,category,Self-Employed,Contracted
0,5035246394,Quality Inspector,0,Premier Technical Recruitment,2025-02-03T14:32:54Z,30000.0,"Quality Inspector Near Coleshill, Warwickshire...",full_time,26000.0,permanent,Engineering Jobs,0,-1
1,5005137865,Self-Employed Childminder,0,Tiney,2025-01-10T11:07:10Z,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,,Part time Jobs,1,-2
2,5005138141,Self-Employed Childcare Provider,0,Tiney,2025-01-10T11:07:11Z,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,,Social work Jobs,1,-2
3,5005138479,Self-Employed Childcare Provider,0,Tiney,2025-01-10T11:07:13Z,40000.0,Become a Childminder with Tiney – Unlock a Rew...,full_time,40000.0,,Social work Jobs,1,-2
4,5005137332,Self-Employed Childminder,0,Tiney,2025-01-10T11:07:07Z,20000.0,Become a Childminder with Tiney – Unlock a Rew...,part_time,20000.0,,Part time Jobs,1,-2


In [172]:
# Remove more unncessary columns
df_p.drop(columns=["Contracted", "contract_type"], inplace=True)
df_p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1310 entries, 0 to 1309
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   1310 non-null   int64  
 1   title                1310 non-null   object 
 2   salary_is_predicted  1310 non-null   int64  
 3   company              1307 non-null   object 
 4   created              1310 non-null   object 
 5   salary_max           700 non-null    float64
 6   description          1310 non-null   object 
 7   contract_time        594 non-null    object 
 8   salary_min           701 non-null    float64
 9   category             1310 non-null   object 
 10  Self-Employed        1310 non-null   int32  
dtypes: float64(2), int32(1), int64(2), object(6)
memory usage: 107.6+ KB


In [178]:
# Add an average salary
df_p.drop(columns=["salary_avg"], inplace=True)
df_p.head(1)

Unnamed: 0,id,title,salary_is_predicted,company,created,salary_max,description,contract_time,salary_min,category,Self-Employed
0,5035246394,Quality Inspector,0,Premier Technical Recruitment,2025-02-03T14:32:54Z,30000.0,"Quality Inspector Near Coleshill, Warwickshire...",full_time,26000.0,Engineering Jobs,0
