Scrape the Python ITJobsWatch page. Showcase:
1. Data Ingestion
2. Data Wrangling
3. Data Analysis
4. Data Visualisation

In [142]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

x = requests.get('https://www.itjobswatch.co.uk/jobs/uk/python.do')
soup = BeautifulSoup(x.text)

Parse a single table into pandas

In [143]:
def get_info_from_row(row):
    return [tag.get_text() for tag in row.find_all('tr') if len(tag.get_text()) < 50]

In [144]:
def get_info_from_section(section):
    return [get_info_from_row(row) for row in section.find_all('td') if len(get_info_from_row(row)) != 0][:14]

In [225]:
def get_skill_stats_df(skill, soup):
    related_skill_texts = [text.split("%") for table in get_info_from_section(soup.find(attrs={'id':'related_skills'})) for text in table]

    dict = {'Primary Skill' : skill,
        'Secondary Skill' : [skill_text[1][1:].strip() for skill_text in related_skill_texts],
        'Percentage' : [skill_text[0].split("(")[1] for skill_text in related_skill_texts]}

    return pd.DataFrame(dict)

In [146]:
print(get_skill_stats_df(soup))

   Percentage                    Skill
0       38.57                      AWS
1       33.40                      SQL
2       32.02                    Agile
3       30.93     Software Engineering
4       28.88                    Azure
5       28.06                  Finance
6       27.35                     Java
7       25.72                   DevOps
8       22.29          Problem-Solving
9       22.14                    CI/CD
10      20.62                   Degree
11      20.26            Social Skills
12      20.04                    Linux
13      19.24               JavaScript
14      17.03               Kubernetes
15      16.65         Machine Learning
16      16.35                   Docker
17      15.34                Analytics
18      14.98                      GCP
19      14.70         Security Cleared
20      14.53                     Bash
21      13.19                       C#
22      13.16  Artificial Intelligence
23      13.00         Computer Science
24      12.97            

In [147]:
def get_category_stats_df(soup, i, name):
    info = get_info_from_section(soup.find_all('table')[6].find_all('table')[i])
    
    dict =  {
            'Category' : name,
            'Ranking' : [row[0] for row in info],
            'Percentage' : [row[1] for row in info],
            'Skill' : [row[2] for row in info]
            }

    return pd.DataFrame(dict)

Scrape results page to get Skill pages in descending order
- Build df of Skill features
- Build association table

In [175]:
page_num = 1
search_page = 'https://www.itjobswatch.co.uk/default.aspx?ql=&ll=&id=0&p=6&e=200&page=' + str(page_num) + '&sortby=0&orderby=0'
search_soup = BeautifulSoup(requests.get(search_page).text)

In [183]:
skill_pages = [(tag.a.get_text(), tag.a['href']) for tag in search_soup.find_all(attrs={'class':'c2'})]
print(skill_pages)

[('Social Skills', '/jobs/uk/social%20skills.do'), ('Agile', '/jobs/uk/agile.do'), ('Finance', '/jobs/uk/finance.do'), ('Azure', '/jobs/uk/azure.do'), ('Microsoft', '/jobs/uk/microsoft.do'), ('Developer', '/jobs/uk/developer.do'), ('Problem-Solving', '/jobs/uk/problem-solving.do'), ('Senior', '/jobs/uk/senior.do'), ('Degree', '/jobs/uk/degree.do'), ('SQL', '/jobs/uk/sql.do'), ('AWS', '/jobs/uk/aws.do'), ('Analyst', '/jobs/uk/analyst.do'), ('Software Engineering', '/jobs/uk/software%20engineering.do'), ('Python', '/jobs/uk/python.do'), ('Windows', '/jobs/uk/windows.do'), ('DevOps', '/jobs/uk/devops.do'), ('Security Cleared', '/jobs/uk/security%20cleared.do'), ('C#', '/jobs/uk/csharp.do'), ('JavaScript', '/jobs/uk/javascript.do'), ('Analytical Skills', '/jobs/uk/analytical%20skills.do'), ('Java', '/jobs/uk/java.do'), ('Microsoft 365', '/jobs/uk/microsoft%20365.do'), ('Lead', '/jobs/uk/lead.do'), ('.NET', '/jobs/uk/.net.do'), ('Mentoring', '/jobs/uk/mentoring.do'), ('Project Management', 

In [232]:
skills = [skill for skill, _ in skill_pages]
data = []
for skill_1 in skills:
    for skill_2 in skills:
        data.append([skill_1, skill_2, 0])
df_association = pd.DataFrame(columns=["Primary Skill", "Secondary Skill", "Percentage"], data=data)
print(df_association)

      Primary Skill          Seconday Skill  Percentage
0     Social Skills           Social Skills           0
1     Social Skills                   Agile           0
2     Social Skills                 Finance           0
3     Social Skills                   Azure           0
4     Social Skills               Microsoft           0
...             ...                     ...         ...
2495     Kubernetes                  Retail           0
2496     Kubernetes  Full Stack Development           0
2497     Kubernetes  Stakeholder Management           0
2498     Kubernetes           Microservices           0
2499     Kubernetes              Kubernetes           0

[2500 rows x 3 columns]


Create df containing the following features for a skill:
- Name of skill
- Rank change
- % of all permanent jobs
- Category
- % of category
- Median annual salary
- Median annual salary (excl London)


In [219]:
def get_job_stats(skill, soup):
    info = [tag.get_text() for tag in soup.find('table').find_all('td')]
    return [skill, info[1], info[5], info[13], info[16].split("As % of the ")[1][:-9], info[17], info[33], info[49]]

In [217]:
columns = ["Skill", "Rank", "Rank Change", "% Jobs", "Category", "% Category", "Median Salary", "Median Salary (Excluding London)"]
data = []
for skill, page in skill_pages:
    soup = BeautifulSoup(requests.get("https://www.itjobswatch.co.uk/" + page).text)
    features = get_job_stats(skill, soup)
    data.append(features)
df = pd.DataFrame(columns=columns, data=data)
print(df)
    

                      Skill Rank Rank Change  % Jobs  \
0             Social Skills    1          +1  25.30%   
1                     Agile    2          -1  21.21%   
2                   Finance    3           0  20.67%   
3                     Azure    4           0  19.68%   
4                 Microsoft    5           0  18.09%   
5                 Developer    6          +3  15.76%   
6           Problem-Solving    7          +6  15.57%   
7                    Senior    8          +2  14.58%   
8                    Degree    9          -1  14.51%   
9                       SQL   10          -3  13.99%   
10                      AWS   11          -5  12.53%   
11                  Analyst   12          +3  11.18%   
12     Software Engineering   13          +3  11.06%   
13                   Python   14          -3  10.72%   
14                  Windows   15          +5  10.48%   
15                   DevOps   16          -4   9.77%   
16         Security Cleared   17         +21   9

In [227]:
skill_association = get_skill_stats_df("Kubernetes", soup)
print(skill_association)

   Primary Skill         Secondary Skill Percentage
0     Kubernetes                  Docker      69.52
1     Kubernetes                     AWS      51.74
2     Kubernetes                  DevOps      51.05
3     Kubernetes                   Azure      45.93
4     Kubernetes                   Agile      44.71
5     Kubernetes                   CI/CD      40.96
6     Kubernetes                  Python      38.35
7     Kubernetes                    Java      38.00
8     Kubernetes               Terraform      35.44
9     Kubernetes    Software Engineering      28.44
10    Kubernetes                 Finance      25.92
11    Kubernetes           Microservices      25.54
12    Kubernetes                 Jenkins      23.53
13    Kubernetes        Containerisation      23.07
14    Kubernetes                   Linux      22.98
15    Kubernetes  Infrastructure as Code      21.59
16    Kubernetes                 Ansible      19.44
17    Kubernetes                     GCP      19.06
18    Kubern

In [150]:
df_genr = get_category_stats_df(soup, 7, "General")
df_jobs = get_category_stats_df(soup, 8, "Job")
df_libs = get_category_stats_df(soup, 9, "Library")
df_lang = get_category_stats_df(soup, 13, "Language")

df = pd.concat([df_genr, df_jobs, df_libs, df_lang])