Scrape the ITJobsWatch page for Python (and, further down, the pages of other skills). This notebook showcases:
1. Data Ingestion
2. Data Wrangling
3. Data Analysis
4. Data Visualisation

In [331]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Download the ITJobsWatch page for Python and parse it for the cells below.
# The timeout stops the cell from hanging indefinitely if the site does not
# answer, and raise_for_status() fails immediately on an HTTP error instead of
# silently parsing an error page.
x = requests.get('https://www.itjobswatch.co.uk/jobs/uk/python.do', timeout=30)
x.raise_for_status()
soup = BeautifulSoup(x.text)

Parse a single table into pandas

In [332]:
def get_info_from_row(row):
    """Collect the short text snippets found in the ``tr`` tags under ``row``.

    Parameters
    ----------
    row : object exposing ``find_all`` (a BeautifulSoup element in this notebook)

    Returns
    -------
    list of str
        The text of every tag returned by ``row.find_all('tr')``, in that
        order, keeping only texts shorter than 50 characters.
    """
    short_texts = []
    for tr_tag in row.find_all('tr'):
        text = tr_tag.get_text()
        # Anything 50 characters or longer is discarded.
        if len(text) < 50:
            short_texts.append(text)
    return short_texts

In [333]:
def get_info_from_section(section, limit=14):
    """Return the non-empty text groups found in the ``td`` cells of ``section``.

    Each ``td`` returned by ``section.find_all('td')`` is passed to
    ``get_info_from_row``; cells that yield no text are skipped.

    Parameters
    ----------
    section : object exposing ``find_all`` (a BeautifulSoup element in this notebook)
    limit : int or None, default 14
        Maximum number of non-empty groups to return (expected to be
        non-negative). ``None`` returns all of them.

    Returns
    -------
    list of list of str
        At most ``limit`` groups, in the order the cells were found.
    """
    groups = []
    for cell in section.find_all('td'):
        # Stop as soon as enough groups were collected instead of parsing
        # every remaining cell and slicing afterwards.
        if limit is not None and len(groups) >= limit:
            break
        # Parse each cell once (it used to be parsed twice: once for the
        # emptiness check and once for the value).
        texts = get_info_from_row(cell)
        if texts:
            groups.append(texts)
    return groups

In [334]:
def get_skill_stats_df(skill, soup):
    """Build a DataFrame of the skills listed as related to ``skill``.

    The section with id ``related_skills`` is used when present, otherwise the
    one with id ``skill-set``.

    Parameters
    ----------
    skill : str
        Name stored in the ``Primary Skill`` column of every row.
    soup : parsed skill page (a BeautifulSoup object in this notebook)

    Returns
    -------
    pandas.DataFrame
        Columns ``Primary Skill``, ``Secondary Skill`` and ``Percentage``
        (the percentage is kept as the text scraped from the page). The frame
        is empty when the page has neither section.
    """
    section = soup.find(attrs={'id':'related_skills'})
    if section is None:
        section = soup.find(attrs={'id':'skill-set'})
    if section is None:
        # Neither section exists: return an empty frame with the usual columns
        # so one unusual page does not abort a whole scraping run.
        return pd.DataFrame(columns=['Primary Skill', 'Secondary Skill', 'Percentage'], dtype=object)

    # Each entry is split on "%": the part before it holds "(<percentage>",
    # the part after it holds one separator character followed by the name
    # (presumably entries look like "N (12.34%) Skill name" - confirm against
    # the live page).
    related_skill_texts = [text.split("%") for table in get_info_from_section(section) for text in table]

    skill_stats = {'Primary Skill' : skill,
        'Secondary Skill' : [skill_text[1][1:].strip() for skill_text in related_skill_texts],
        'Percentage' : [skill_text[0].split("(")[1] for skill_text in related_skill_texts]}

    return pd.DataFrame(skill_stats)

Scrape the search results pages to get the individual skill pages, in descending order of popularity (rank 1 first), then:
- Build a DataFrame of per-skill features
- Build a skill association table

In [344]:
def get_skill_pages(page_num, timeout=30):
    """Return the skills listed on one ITJobsWatch search results page.

    Parameters
    ----------
    page_num : int
        Value placed in the ``page`` query parameter of the search URL.
    timeout : float, default 30
        Seconds to wait for the server before giving up.

    Returns
    -------
    list of (str, str)
        ``(link text, href)`` for every element with class ``c2`` that
        contains a link, in page order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    search_page = 'https://www.itjobswatch.co.uk/default.aspx?ql=&ll=&id=0&p=6&e=200&page=' + str(page_num) + '&sortby=0&orderby=0'
    response = requests.get(search_page, timeout=timeout)
    # Fail loudly on an HTTP error rather than returning an empty list parsed
    # from an error page.
    response.raise_for_status()
    search_soup = BeautifulSoup(response.text)

    skill_links = []
    for tag in search_soup.find_all(attrs={'class':'c2'}):
        anchor = tag.a
        # Skip elements of that class that carry no usable link instead of
        # crashing on them.
        if anchor is None or anchor.get('href') is None:
            continue
        skill_links.append((anchor.get_text(), anchor['href']))
    return skill_links

Initialisation

In [348]:
# Fetch the first three result pages; each element of skill_pages is the list
# of (skill name, link) tuples returned for one page.
skill_pages = [get_skill_pages(page_num) for page_num in range(1, 4)]
# Flatten to the skill names only, keeping page order.
skills = [name for listing in skill_pages for name, _link in listing]
# One row per ordered pair of skills, with 0 as the placeholder percentage.
data = [[primary, secondary, 0] for primary in skills for secondary in skills]
df_association = pd.DataFrame(data, columns=["Primary Skill", "Secondary Skill", "Percentage"])

Create a DataFrame containing the following features for each skill:
- Name of skill
- Rank
- Rank change
- % of all permanent jobs
- Category
- % of category
- Median annual salary
- Median annual salary (excl London)


In [338]:
def get_job_stats(skill, soup):
    """Extract the headline statistics for ``skill`` from its page.

    The values are read by position from the ``td`` cells of the first
    ``table`` in ``soup``.

    Parameters
    ----------
    skill : str
        Skill name, returned as the first element.
    soup : parsed skill page (a BeautifulSoup object in this notebook)

    Returns
    -------
    list of str
        ``[skill, rank, rank change, % jobs, category, % category,
        median salary, median salary excluding London]``.
    """
    summary_table = soup.find('table')
    cells = [cell.get_text() for cell in summary_table.find_all('td')]

    rank = cells[1]
    rank_change = cells[5]
    pct_jobs = cells[13]
    # Keep what follows "As % of the " and drop the last 9 characters
    # (presumably the trailing " category").
    category = cells[16].split("As % of the ")[1][:-9]
    pct_category = cells[17]
    median_salary = cells[33]
    median_salary_excl_london = cells[49]

    return [
        skill,
        rank,
        rank_change,
        pct_jobs,
        category,
        pct_category,
        median_salary,
        median_salary_excl_london,
    ]

In [339]:
columns = ["Skill", "Rank", "Rank Change", "% Jobs", "Category", "% Category", "Median Salary", "Median Salary (Excluding London)"]
association_keys = ["Primary Skill", "Secondary Skill"]
data = []
skill_frames = []

# Walk the listing already downloaded in the Initialisation cell rather than
# requesting the result pages a second time: it saves requests and guarantees
# the scraped skills are exactly the ones the association table was built from.
for listing in skill_pages:
    for skill, page in listing:
        response = requests.get("https://www.itjobswatch.co.uk/" + page, timeout=30)
        # Fail with a clear HTTP error instead of an obscure parsing error.
        response.raise_for_status()
        # A dedicated name keeps the `soup` of the first cell intact.
        skill_soup = BeautifulSoup(response.text)
        features = get_job_stats(skill, skill_soup)
        print(features)
        data.append(features)
        # skill association: collected here, merged once after the loop
        skill_frames.append(get_skill_stats_df(skill, skill_soup))

# Merge all scraped associations in a single pass instead of re-merging the
# whole table once per skill.
scraped_frames = [frame for frame in skill_frames if not frame.empty]
if scraped_frames:
    # A repeated (primary, secondary) pair would duplicate rows in the left
    # merge, so keep a single entry per pair.
    df_scraped = pd.concat(scraped_frames, ignore_index=True).drop_duplicates(subset=association_keys, keep="last")
    df_association = df_association.merge(df_scraped, on=association_keys, how="left")
    # Scraped value where available, previous value (initially 0) otherwise.
    df_association['Percentage'] = df_association['Percentage_y'].fillna(df_association['Percentage_x'])
    df_association = df_association.drop(['Percentage_x', 'Percentage_y'], axis=1)

df = pd.DataFrame(columns=columns, data=data)


['Social Skills', '1', '+1', '25.30%', 'General', '41.10%', '£50,000', '£43,500']
['Agile', '2', '-1', '21.21%', 'Processes & Methodologies', '23.76%', '£65,000', '£59,526']
['Finance', '3', '0', '20.67%', 'General', '33.57%', '£65,000', '£55,000']
['Azure', '4', '0', '19.68%', 'Cloud Services', '50.21%', '£60,000', '£53,426']
['Microsoft', '5', '0', '18.09%', 'Vendors', '46.70%', '£49,000', '£45,000']
['Developer', '6', '+3', '15.76%', 'Job Titles', '16.54%', '£60,000', '£52,500']
['Problem-Solving', '7', '+6', '15.57%', 'Processes & Methodologies', '17.45%', '£50,000', '£45,000']
['Senior', '8', '+2', '14.58%', 'Job Titles', '15.31%', '£65,000', '£60,000']
['Degree', '9', '-1', '14.51%', 'Qualifications', '49.50%', '£55,000', '£50,000']
['SQL', '10', '-3', '13.99%', 'Programming Languages', '36.66%', '£57,500', '£50,000']
['AWS', '11', '-5', '12.53%', 'Cloud Services', '31.98%', '£70,000', '£60,000']
['Analyst', '12', '+3', '11.18%', 'Job Titles', '11.74%', '£45,000', '£40,000']
['So

In [342]:
# Summary statistics for each column of the association table.
df_association.describe()

Unnamed: 0,Primary Skill,Secondary Skill,Percentage
count,2500,2500,2500
unique,50,50,861
top,JavaScript,JavaScript,0
freq,50,50,1481
