Scrape the Python ITJobsWatch page. Showcase:
1. Data Ingestion
2. Data Wrangling
3. Data Analysis
4. Data Visualisation

In [331]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Download the Python skill page. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() stops an HTTP error page from
# being parsed as if it were real data.
x = requests.get('https://www.itjobswatch.co.uk/jobs/uk/python.do', timeout=30)
x.raise_for_status()
# NOTE(review): no parser is named, so BeautifulSoup uses whichever one is
# installed; consider pinning one once the parsed tree is confirmed unchanged.
soup = BeautifulSoup(x.text)

Parse a single table into pandas

In [332]:
def get_info_from_row(row):
    """Collect the short texts of the ``tr`` elements found inside ``row``.

    Despite the parameter name, the lookup is for ``tr`` descendants of the
    element that is passed in.

    Args:
        row: Element exposing ``find_all``; each match must expose ``get_text``.

    Returns:
        The text of every match whose text is shorter than 50 characters,
        in document order.
    """
    short_texts = []
    for tag in row.find_all('tr'):
        text = tag.get_text()
        # Longer texts are dropped (presumably headings/notes — TODO confirm).
        if len(text) < 50:
            short_texts.append(text)
    return short_texts

In [333]:
def get_info_from_section(section, max_rows=14):
    """Parse every ``td`` inside ``section`` and keep the first non-empty results.

    Each cell is parsed exactly once (the previous version parsed every cell
    twice: once for the filter and once for the value).

    Args:
        section: Element exposing ``find_all``.
        max_rows: Maximum number of non-empty results to return. Defaults to
            14, the cap that used to be hard-coded.

    Returns:
        A list of the non-empty lists produced by ``get_info_from_row``, in
        document order, truncated to ``max_rows`` entries.
    """
    parsed_cells = []
    for cell in section.find_all('td'):
        info = get_info_from_row(cell)
        # Cells that yield no short texts are skipped entirely.
        if len(info) != 0:
            parsed_cells.append(info)
    return parsed_cells[:max_rows]

In [334]:
def get_skill_stats_df(skill, soup):
    """Build a frame of the skills listed alongside ``skill`` on its page.

    The section is looked up by id ``related_skills`` and, failing that, by
    id ``skill-set``.

    Args:
        skill: Name stored in the ``Primary Skill`` column of every row.
        soup: Parsed page exposing ``find``.

    Returns:
        A DataFrame with columns ``Primary Skill``, ``Secondary Skill`` and
        ``Percentage`` (percentages are kept as strings). The frame is empty
        when the page contains neither section.
    """
    column_names = ['Primary Skill', 'Secondary Skill', 'Percentage']

    section = soup.find(attrs={'id': 'related_skills'})
    if section is None:
        section = soup.find(attrs={'id': 'skill-set'})
    if section is None:
        # Nothing to parse: hand back an empty frame with the usual columns
        # instead of failing with an AttributeError on None.
        return pd.DataFrame(columns=column_names)

    # Split each text at "%": the piece before it holds "(<percentage>", the
    # piece after it holds one leading character followed by the skill name.
    related_skill_texts = [text.split("%") for table in get_info_from_section(section) for text in table]

    skill_stats = {
        'Primary Skill': skill,
        'Secondary Skill': [skill_text[1][1:].strip() for skill_text in related_skill_texts],
        'Percentage': [skill_text[0].split("(")[1] for skill_text in related_skill_texts],
    }

    return pd.DataFrame(skill_stats)

Scrape results page to get Skill pages in descending order
- Build df of Skill features
- Build association table

In [344]:
def get_skill_pages(page_num, timeout=30):
    """Return ``(skill name, href)`` pairs from one page of the skill search results.

    Args:
        page_num: Page number placed in the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        A list of ``(link text, link href)`` tuples, one for each element
        with class ``c2`` that contains an ``<a>`` tag, in document order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    search_page = 'https://www.itjobswatch.co.uk/default.aspx?ql=&ll=&id=0&p=6&e=200&page=' + str(page_num) + '&sortby=0&orderby=0'
    response = requests.get(search_page, timeout=timeout)
    # Fail loudly rather than silently returning [] for an error page.
    response.raise_for_status()
    search_soup = BeautifulSoup(response.text)
    return [
        (tag.a.get_text(), tag.a['href'])
        for tag in search_soup.find_all(attrs={'class': 'c2'})
        # Elements without a link carry no skill page and are skipped.
        if tag.a is not None
    ]

Initialisation

In [348]:
# Result pages 1 to 3; each entry is the list of (skill name, href) pairs
# returned by get_skill_pages for that page.
skill_pages = [get_skill_pages(page_num) for page_num in range(1, 4)]
skills = [name for results in skill_pages for name, _href in results]
# One row per ordered (primary, secondary) pair of skills, starting at 0;
# the scraping cell later fills in the percentages it finds.
data = [[skill_1, skill_2, 0] for skill_1 in skills for skill_2 in skills]
df_association = pd.DataFrame(data, columns=["Primary Skill", "Secondary Skill", "Percentage"])

Create df containing the following features for a skill:
- Name of skill
- Rank
- Rank change
- % of all permanent jobs
- Category
- % of category
- Median annual salary
- Median annual salary (excl. London)


In [338]:
def get_job_stats(skill, soup):
    """Pick the headline statistics for ``skill`` out of the first table in ``soup``.

    Args:
        skill: Name placed first in the returned list.
        soup: Parsed page exposing ``find``.

    Returns:
        An eight-item list: ``skill``, the texts of cells 1, 5 and 13, the
        category name extracted from cell 16, and the texts of cells 17, 33
        and 49 (counting the table's ``td`` elements from 0).
    """
    cells = soup.find('table').find_all('td')
    texts = [cell.get_text() for cell in cells]

    # Cell 16 reads "As % of the <name> category": keep what follows the
    # prefix and cut the trailing 9 characters (" category").
    after_prefix = texts[16].split("As % of the ")[1]
    category = after_prefix[:-9]

    stats = [skill, texts[1], texts[5], texts[13], category]
    stats.extend([texts[17], texts[33], texts[49]])
    return stats

In [351]:
columns = ["Skill", "Rank", "Rank Change", "% Jobs", "Category", "% Category", "Median Salary", "Median Salary (Excluding London)"]
data = []
for search_results_page in skill_pages:
    for skill, page in search_results_page:
        # The timeout stops one stalled request from hanging the whole scrape;
        # raise_for_status() surfaces HTTP errors instead of parsing an error page.
        response = requests.get("https://www.itjobswatch.co.uk/" + page, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text)
        features = get_job_stats(skill, soup)
        print(features)
        data.append(features)
        # skill association: left-join this skill's percentages onto the pair
        # table. After the merge, Percentage_x is the value already stored and
        # Percentage_y the newly scraped one; the new value wins where present.
        df_skills = get_skill_stats_df(skill, soup)
        df_association = (
            df_association
            .merge(df_skills, on=["Primary Skill", "Secondary Skill"], how="left")
            .assign(Percentage=lambda merged: merged['Percentage_y'].fillna(merged['Percentage_x']))
            .drop(columns=['Percentage_x', 'Percentage_y'])
        )

# One row of headline statistics per scraped skill.
df = pd.DataFrame(columns=columns, data=data)


['Social Skills', '1', '+1', '25.30%', 'General', '41.10%', '£50,000', '£43,500']
['Agile', '2', '-1', '21.21%', 'Processes & Methodologies', '23.76%', '£65,000', '£59,526']
['Finance', '3', '0', '20.67%', 'General', '33.57%', '£65,000', '£55,000']
['Azure', '4', '0', '19.68%', 'Cloud Services', '50.21%', '£60,000', '£53,426']
['Microsoft', '5', '0', '18.09%', 'Vendors', '46.70%', '£49,000', '£45,000']
['Developer', '6', '+3', '15.76%', 'Job Titles', '16.54%', '£60,000', '£52,500']
['Problem-Solving', '7', '+6', '15.57%', 'Processes & Methodologies', '17.45%', '£50,000', '£45,000']
['Senior', '8', '+2', '14.58%', 'Job Titles', '15.31%', '£65,000', '£60,000']
['Degree', '9', '-1', '14.51%', 'Qualifications', '49.50%', '£55,000', '£50,000']
['SQL', '10', '-3', '13.99%', 'Programming Languages', '36.66%', '£57,500', '£50,000']
['AWS', '11', '-5', '12.53%', 'Cloud Services', '31.98%', '£70,000', '£60,000']
['Analyst', '12', '+3', '11.18%', 'Job Titles', '11.74%', '£45,000', '£40,000']
['So

Explore with starting skill

In [361]:
# Keep only the pairs whose Percentage is not 0, i.e. drop the rows that
# still hold the initial placeholder value.
df_association = df_association.loc[df_association['Percentage'] != 0]

In [419]:
def query_primary_skill(skill):
    """List the skills recorded against ``skill`` when it is the primary skill.

    Reads the notebook-level frames ``df_association`` and ``df``.

    Args:
        skill: Value to match in the ``Primary Skill`` column.

    Returns:
        A DataFrame with one row per matching secondary skill that also
        appears in ``df``: its association ``Percentage`` followed by that
        skill's columns from ``df``. ``Percentage``, ``% Jobs`` and
        ``% Category`` are converted to floats, and rows are sorted by
        ``% Jobs``, largest first.
    """
    def _strip_last_char_to_float(text):
        # Scraped percentages end in "%", e.g. "25.30%".
        return float(text[:-1])

    matches = df_association[df_association['Primary Skill'] == skill]
    related = (
        matches
        .drop("Primary Skill", axis=1)
        .rename(columns={"Secondary Skill": "Skill"})
        .merge(df, on="Skill")
    )

    for column in ("% Jobs", "% Category"):
        related[column] = related[column].apply(_strip_last_char_to_float)
    related["Percentage"] = related["Percentage"].apply(float)

    return related.sort_values(["% Jobs"], ascending=False)

In [422]:
def query_secondary_skill(skill):
    """List the primary skills whose pages record ``skill`` as a secondary skill.

    Reads the notebook-level frames ``df_association`` and ``df``.

    Args:
        skill: Value to match in the ``Secondary Skill`` column.

    Returns:
        A DataFrame with one row per matching primary skill that also
        appears in ``df``: its association ``Percentage`` followed by that
        skill's columns from ``df``, with ``Percentage``, ``% Jobs`` and
        ``% Category`` converted to floats. A ``Weighted Percentage`` column
        holds ``Percentage * % Jobs`` divided by the sum of ``% Jobs`` over
        the returned rows; rows are sorted by it, largest first.
    """
    def _strip_last_char_to_float(text):
        # Scraped percentages end in "%", e.g. "25.30%".
        return float(text[:-1])

    matches = df_association[df_association['Secondary Skill'] == skill]
    related = (
        matches
        .drop("Secondary Skill", axis=1)
        .rename(columns={"Primary Skill": "Skill"})
        .merge(df, on="Skill")
    )

    for column in ("% Jobs", "% Category"):
        related[column] = related[column].apply(_strip_last_char_to_float)
    related["Percentage"] = related["Percentage"].apply(float)

    total_jobs_share = related["% Jobs"].sum()
    related["Weighted Percentage"] = related["Percentage"] * related["% Jobs"] / total_jobs_share

    return related.sort_values(["Weighted Percentage"], ascending=False)

In [424]:
print(query_secondary_skill("DevOps"))

                Skill  Percentage Rank Rank Change  % Jobs  \
3               Azure       29.43    4           0   19.68   
1               Agile       23.12    2          -1   21.21   
10                AWS       27.57   11          -5   12.53   
21              CI/CD       50.65   29          -7    6.66   
69       Azure DevOps       99.10   98          +2    2.91   
..                ...         ...  ...         ...     ...   
66   Service Delivery        8.07   95         +16    2.96   
73  Change Management        8.78  107          +1    2.71   
87    Line Management        9.64  137         -32    2.28   
90         DV Cleared       10.07  141        +220    2.18   
92         Validation       10.39  145          +2    2.10   

                     Category  % Category Median Salary  \
3              Cloud Services       50.21       £60,000   
1   Processes & Methodologies       23.76       £65,000   
10             Cloud Services       31.98       £70,000   
21  Processes & Met

In [420]:
print(query_primary_skill("Azure"))

                     Skill  Percentage Rank Rank Change  % Jobs  \
0            Social Skills       21.56    1          +1   25.30   
1                    Agile       34.12    2          -1   21.21   
2                  Finance       20.06    3           0   20.67   
3                Microsoft       42.53    5           0   18.09   
4          Problem-Solving       14.90    7          +6   15.57   
5                   Degree       11.77    9          -1   14.51   
6                      SQL       24.35   10          -3   13.99   
7                      AWS       30.43   11          -5   12.53   
8     Software Engineering       11.74   13          +3   11.06   
9                   Python       15.73   14          -3   10.72   
10                 Windows       20.34   15          +5   10.48   
11                  DevOps       29.43   16          -4    9.77   
12        Security Cleared       10.06   17         +21    9.40   
13                      C#       22.07   18          +1    9.2