# manoalga.lt Scraper

Extract salary information for job categories and their positions from manoalga.lt and save it to `position_list.csv`.

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

# Position categories

In [24]:
def get_response(url, timeout=10):
    """Fetch *url* and return the response, or ``None`` on failure.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole scrape.

    Returns:
        The ``requests`` response object when the server answers with
        HTTP 200; ``None`` for any other status code or when the request
        itself fails (connection error, timeout, malformed URL, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # The exception classes live on the ``requests`` module; the local
        # ``response`` name is not even bound when ``get`` raises.
        return None

    if response.status_code != 200:
        return None
    return response

In [21]:
def extract_categories():
    """Download the salary overview page and list its categories.

    Returns:
        Whatever ``extract_group_items`` extracts from the overview page
        (a list with one entry per category), or an empty list when the
        page could not be fetched.
    """
    categories_url = 'https://www.manoalga.lt/en/salaryinfo'
    response_categories = get_response(categories_url)
    # ``get_response`` signals failure with None; compare by identity.
    if response_categories is None:
        return []

    # parse the html and extract categories data
    return extract_group_items(response_categories)

In [4]:
def extract_category_info(category_url):
    """Collect the salary range and the position list of one category.

    Args:
        category_url: Address of the category page.

    Returns:
        The dict produced by ``extract_salary`` for the category page,
        extended with a ``'positions'`` key holding the result of
        ``extract_group_items`` for the same page. When the page cannot
        be fetched the result is ``{'positions': []}``.
    """
    response_category = get_response(category_url)
    if response_category is None:
        # Failed download: no salary data and nothing to iterate over,
        # instead of crashing while parsing a missing response.
        return {'positions': []}

    category_info = extract_salary(response_category)
    category_info['positions'] = extract_group_items(response_category)
    return category_info


In [5]:
def extract_group_items(response):
    """Extract the name and link of every ``a.category-full`` element.

    Args:
        response: Object whose ``text`` attribute holds the page HTML,
            or ``None`` when the page could not be fetched.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts, one per matching
        anchor that contains an ``<h3>`` heading. The name is the heading
        text and the url is the anchor's ``href`` attribute (``None``
        when the attribute is absent). Empty list for a ``None`` response.
    """
    if response is None:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    items = []
    for link in soup.find_all("a", class_="category-full"):
        heading = link.find('h3')
        if heading is None:
            # Anchor without a heading has no name to report; skip it
            # rather than failing on ``None.get_text()``.
            continue
        items.append({'name': heading.get_text(), 'url': link.get('href')})
    return items

# Positions

In [31]:
def extract_salary(response):
    """Read the minimum and maximum monthly salary from a page.

    Args:
        response: Object whose ``text`` attribute holds the page HTML,
            or ``None`` when the page could not be fetched.

    Returns:
        ``{'min_salary': float, 'max_salary': float}`` when the page has
        exactly two ``span.value`` elements whose ``data-monthly-value``
        attributes parse as numbers; otherwise an empty dict.
    """
    if response is None:
        return {}

    soup = BeautifulSoup(response.text, 'html.parser')
    salary_range = soup.find_all("span", class_="value")
    if len(salary_range) != 2:
        return {}

    try:
        salary_min = float(salary_range[0].get('data-monthly-value'))
        salary_max = float(salary_range[1].get('data-monthly-value'))
    except (TypeError, ValueError):
        # TypeError: attribute missing (``get`` returned None);
        # ValueError: attribute present but not numeric.
        return {}
    return {'min_salary': salary_min, 'max_salary': salary_max}

In [7]:
# request the raw html of position
# response_position = requests.get(positions['position_url'][0])

In [8]:
# get min and max salary for position
# position_soup = BeautifulSoup(response_position.text, 'html.parser')
# position_salary_range = position_soup.find_all("span", class_="value")
# position_salary_min = position_salary_range[0].get('data-monthly-value')
# position_salary_max = position_salary_range[1].get('data-monthly-value')
# positions.loc[0, 'min_salary'] = float(position_salary_min)
# positions.loc[0, 'max_salary'] = float(position_salary_max)
# positions['avg_salary'] = (positions['min_salary'] + positions['max_salary'])/2
# positions.loc[[0]]

# Data scraping functions

In [22]:
def main():
    """Scrape every category and position and write them to a CSV file.

    For each category returned by ``extract_categories`` the category page
    is read, then every position page listed on it. One row per position
    is collected and the result is written to ``position_list.csv`` in the
    current working directory. Salary fields are left empty when a page
    cannot be fetched or holds no salary data.
    """
    categories = extract_categories()

    positions = []
    for category in categories:
        category_info = extract_category_info(category['url'])
        for position in category_info.get('positions', []):
            position_response = get_response(position['url'])
            # A failed download yields no salary data instead of a crash.
            if position_response is None:
                position_salary = {}
            else:
                position_salary = extract_salary(position_response)
            positions.append({
                'position_name': position['name'],
                'position_url': position['url'],
                'position_min_salary': position_salary.get('min_salary'),
                'position_max_salary': position_salary.get('max_salary'),
                'category_name': category['name'],
                'category_url': category['url'],
                'category_min_salary': category_info.get('min_salary'),
                'category_max_salary': category_info.get('max_salary'),
            })
            # Pause between requests to avoid hammering the site.
            sleep(0.5)
        sleep(0.5)

    pd.DataFrame(positions).to_csv('position_list.csv', index=False)

In [32]:
# Run the full scrape; main() writes its results to position_list.csv
main()