In [200]:
import requests
import scrapy
import time
import pandas as pd
from pprint import pprint
import re
import json
import datetime

In [201]:
# Root address of the site being crawled, and the job-listing page the crawl starts from.
base_url = "https://staff.am"
main_url = "{}/en/jobs".format(base_url)

In [202]:
# Function to get relative paths to job postings from a given page response
def get_relative_paths_from_one_page(page_response):
    """Return the job-posting hrefs found on a single listing page.

    ``page_response`` must expose a ``css()`` method whose result offers
    ``getall()``. The href values are returned exactly as they appear in
    the markup; nothing is prepended to them here.
    """
    posting_link_selector = "div.web_item_card.hs_job_list_item a.load-more::attr(href)"
    return page_response.css(posting_link_selector).getall()

In [203]:
# Function to get relative paths for all pages
def get_relative_paths_for_all_pages(first_page_response, base_url, delay=3, timeout=30):
    """Collect posting paths from the first listing page and every page after it.

    Starting from ``first_page_response``, the href of the "next" pagination
    link (``li.next a``) is followed until a page has no such href. The
    posting paths of every visited page, including the last one, are
    gathered in visiting order.

    Parameters
    ----------
    first_page_response
        Response of the first listing page; must expose ``css()``.
    base_url : str
        Prefix prepended to each "next" href before it is requested.
    delay : int or float
        Seconds to sleep after each page request.
    timeout : int or float
        Seconds passed to ``requests.get`` so a stalled connection raises
        instead of blocking the crawl indefinitely.

    Returns
    -------
    list
        Posting paths as returned by ``get_relative_paths_from_one_page``
        for each page.
    """
    response = first_page_response
    relative_paths_for_all_pages = []

    while True:
        relative_paths_for_all_pages.extend(get_relative_paths_from_one_page(response))

        # Decide on the href itself rather than on the anchor element:
        # an anchor without an href would otherwise lead to `str + None`.
        next_href = response.css("li.next a::attr(href)").get()
        if next_href is None:
            break

        rs = requests.get(base_url + next_href, timeout=timeout)
        response = scrapy.http.HtmlResponse(url=rs.url, body=rs.text, encoding="utf-8")
        time.sleep(delay)

    return relative_paths_for_all_pages

In [204]:
# Function to collect data from one job posting
def get_info_from_one_posting(absolute_path, timeout=30):
    """Download one job posting and extract its fields into a dict.

    Parameters
    ----------
    absolute_path : str
        Full URL of the posting page.
    timeout : int or float
        Seconds passed to ``requests.get`` so a stalled connection raises
        instead of blocking indefinitely.

    Returns
    -------
    dict
        Maps every name in ``all_default_keys`` (plus any extra section
        heading found on the page) to the extracted value. Section bodies
        and skills are lists of strings; the other values are whatever the
        selector returned. A default field that could not be extracted is
        set to the string "None"; for fields outside the job-description
        list and the skills blocks a message is printed as well.
    """
    rs = requests.get(absolute_path, timeout=timeout)
    response = scrapy.http.HtmlResponse(url=rs.url, body=rs.text, encoding="utf-8")

    all_default_keys = (
        "Company_Title", "Total_views", "Followers", "Active_Jobs",
        "Jobs_History", "Job_Views", "Job_Title", "Application_Deadline",
        "Industry", "Employment_term", "Job_Category", "Job_type",
        "Job_Location", "Job_description", "Job_responsibilities", "Required_qualifications",
        "Required_candidate_level", "Salary", "Additional_information", "Professional_skills",
        "Soft_skills"
    )

    extracted_data = {"Company_Title": response.css("h1.job_company_title::text").get()}

    # The first two span texts are taken as total views and followers.
    # A field whose span is absent is left unset here and filled by the
    # final default-key pass instead of raising IndexError.
    page_view_counts = response.css(
        "div.col-lg-7.company_info_container p.company-page-views span::text"
    ).getall()
    for key, count in zip(("Total_views", "Followers"), page_view_counts):
        extracted_data[key] = count

    extracted_data["Active_Jobs"] = response.css("p.company-active-job span::text").get()
    extracted_data["Jobs_History"] = response.css("p.company-job-history span::text").get()

    # First run of digits in the statistics paragraph.
    statistics_text = response.css("div.statistics p::text").get()
    views_match = re.search("[0-9]+", statistics_text) if statistics_text else None
    if views_match:
        extracted_data["Job_Views"] = views_match.group()

    extracted_data["Job_Title"] = response.css("div.col-lg-8 h2::text").get()

    # Newlines are flattened to spaces before matching, so the capture
    # runs up to the last whitespace character of the paragraph.
    deadline_text = response.css("div.col-lg-4.apply-btn-top p::text").get()
    deadline_match = (
        re.search(r"Deadline: (.*)\s", deadline_text.replace("\n", " "))
        if deadline_text else None
    )
    if deadline_match:
        extracted_data["Application_Deadline"] = deadline_match.group(1)

    industry_spans = response.css(
        "div.col-lg-7.company_info_container p.professional-skills-description span::text"
    ).getall()
    extracted_data["Industry"] = industry_spans[-1] if industry_spans else "None"

    # The job-info texts are mapped to these four fields by position.
    job_info = [i.strip() for i in response.css("div.col-lg-6.job-info p::text").getall() if i != "\n"]
    job_info_keys = ("Employment_term", "Job_Category", "Job_type", "Job_Location")
    for key, value in zip(job_info_keys, job_info):
        extracted_data[key] = value

    default_job_list_keys = [
        "Job_description", "Job_responsibilities", "Required_qualifications",
        "Required_candidate_level", "Salary", "Additional_information"
    ]
    job_list = response.css("div.job-list-content-desc.hs_line_break")
    job_list_keys = [i.strip().replace(":", "").replace(" ", "_") for i in job_list.css("h3::text").getall()]

    # Warn when a heading is not one of the known fields. Comparing list
    # lengths would miss a new heading on a page that lacks some default
    # ones, and would count whitespace-only text nodes as headings.
    new_job_list_keys = [key for key in job_list_keys if key and key not in default_job_list_keys]
    if new_job_list_keys:
        print("New 'h3' fields in job description list are present: \nCheck the output.")

    for key in default_job_list_keys:
        if key not in job_list_keys:
            extracted_data[key] = "None"

    for cnt, h3 in enumerate(job_list.css('h3'), start=1):

        heading = h3.css("::text").get()
        if heading is None:
            # Heading without any text: nothing to use as a field name.
            continue
        key = heading.strip().replace(":", "").replace(" ", "_")

        if key in ("Required_candidate_level", "Salary"):
            # These two carry their value in a span inside the heading itself.
            extracted_data[key] = h3.css("span::text").get()
        else:
            # Following siblings that have exactly `cnt` preceding h3
            # siblings; the last selected element is discarded by [:-1].
            # NOTE(review): presumably the discarded element is the next
            # heading; for the final heading this would drop a content
            # element instead. Confirm against the page markup.
            values = h3.xpath("following-sibling::*[count(preceding-sibling::h3)=$cnt]", cnt=cnt)[:-1].css("::text").getall()
            extracted_data[key] = [value.strip() for value in values if value != "\n"]

    # The position of a heading in this list is used as the index of the
    # skills block that holds its entries.
    skills_blocks = response.css("div.soft-skills-list.clearfix")
    skills_info_keys = [i.replace(" ", "_") for i in response.css("div.soft-skills-list.clearfix h3::text").getall()]

    for key in ("Professional_skills", "Soft_skills"):
        if key in skills_info_keys:
            ind = skills_info_keys.index(key)
            # NOTE(review): ".p" selects elements with class "p", not
            # paragraph elements. Confirm that this is intended.
            extracted_data[key] = skills_blocks[ind].css("p span::text, .p::text").getall()
        else:
            extracted_data[key] = "None"

    # Anything still missing is reported and filled with the placeholder.
    for key in all_default_keys:
        if key not in extracted_data:
            print("Default field: '{}' is missing in extracted data.".format(key))
            extracted_data[key] = "None"

    return extracted_data

In [205]:
# Function to collect data from all job postings given absolute paths
def crawl_all_postings(absolute_paths, delay=5):
    """Scrape every posting and gather the results column by column.

    For each URL the progress and a remaining-time estimate are printed,
    the posting is scraped with ``get_info_from_one_posting``, the
    accumulated data is written to "log.json" (overwriting it), and the
    function sleeps for ``delay`` seconds.

    Parameters
    ----------
    absolute_paths : list of str
        URLs of the postings to scrape.
    delay : int or float
        Seconds to sleep after each posting.

    Returns
    -------
    dict
        Maps each field name to a list with one entry per posting, in the
        order of ``absolute_paths``. All lists have the same length: when
        a posting lacks a field, or introduces a field that earlier
        postings did not have, the gaps are filled with the string "None".
    """
    extracted_data = {}
    total = len(absolute_paths)

    # Per-posting duration, in seconds, used only for the printed estimate.
    # NOTE(review): this is a fixed figure and does not follow `delay`.
    # Confirm that this is intended.
    est_seconds_per_posting = 10

    for i, path in enumerate(absolute_paths, start=1):

        print(path)
        print(str(i) + "/" + str(total), "Est.: " + str(round((est_seconds_per_posting * (total - i))/60, 2)) + "m.")
        extracted_data_from_posting = get_info_from_one_posting(path)

        for key, value in extracted_data_from_posting.items():
            if key not in extracted_data:
                # Field first seen now: give the i - 1 earlier postings a
                # placeholder so this column stays aligned with the rest.
                extracted_data[key] = ["None"] * (i - 1)
            extracted_data[key].append(value)

        # Fields known from earlier postings but absent from this one.
        for key, column in extracted_data.items():
            if key not in extracted_data_from_posting:
                column.append("None")

        # Checkpoint after every posting so an interrupted crawl keeps
        # what was collected so far.
        with open("log.json", "w") as l:
            l.write(json.dumps(extracted_data))

        time.sleep(delay)

    return extracted_data

In [206]:
# Function to collect data about companies
def crawl_all_companies(absolute_paths_companies):
    """Placeholder for the company crawl; not implemented yet.

    The argument is currently ignored and an empty string is returned.
    """
    # TODO: implement the company crawl.
    return ""

In [207]:
# Function to make dict data to be saved as csv
def format_to_csv(file):
    """Return a copy of ``file`` whose multi-line fields are newline-joined strings.

    Two layouts are accepted:

    * a single posting, where each multi-line field holds a list of
      strings; that list is joined into one string;
    * column-oriented data, where every field holds a list with one entry
      per posting; each entry that is a list of strings is joined
      separately.

    The data is treated as column-oriented when a field outside
    ``keys_to_format`` holds a list, or when a multi-line field holds a
    list that itself contains a list. Plain string values, such as the
    "None" placeholder, are kept unchanged in both layouts.

    Parameters
    ----------
    file : dict
        Field name to value mapping. It is not modified.

    Returns
    -------
    dict
        A new dict with the same keys; only the ``keys_to_format`` entries
        differ from the input.
    """
    keys_to_format = (
        "Job_description", "Job_responsibilities", "Required_qualifications",
        "Additional_information", "Professional_skills", "Soft_skills"
    )

    def join_lines(cell):
        # Joining a plain string would split it into single characters,
        # so only real sequences of lines are joined.
        if isinstance(cell, (list, tuple)):
            return "\n".join(cell)
        return cell

    is_column_oriented = any(
        isinstance(value, list)
        for key, value in file.items()
        if key not in keys_to_format
    ) or any(
        isinstance(cell, (list, tuple))
        for key in keys_to_format
        if isinstance(file.get(key), list)
        for cell in file[key]
    )

    # Work on a copy so the caller's data keeps its lists.
    formatted = dict(file)

    for key in keys_to_format:
        if key not in file:
            continue
        value = file[key]
        if is_column_oriented and isinstance(value, list):
            formatted[key] = [join_lines(cell) for cell in value]
        else:
            formatted[key] = join_lines(value)

    return formatted

In [208]:
# Function to save the data in a given file format
def save_files(file, *formats):
    """Save ``file`` as "staff_<dd.mm.yy>.<ext>" for each requested format.

    Parameters
    ----------
    file : dict
        Data to save; it is not modified.
    *formats : str
        Any of "csv" and "json". An unrecognised format makes the function
        save both "csv" and "json" and stop processing further formats.
        If the CSV cannot be built, "json" is saved instead and the
        remaining formats are skipped.
    """
    date = datetime.date.today().strftime("%d.%m.%y")

    for ext in formats:

        if ext not in ("csv", "json"):
            print("Can't save in {} format.\nSaving both to 'csv' and 'json'.".format(ext))
            save_files(file, "csv", "json")
            break
        elif ext == "csv":
            try:
                # Format a shallow copy: formatting that replaces values in
                # place must not leak into `file`, which the JSON branch
                # still has to write with its original lists.
                file_csv = pd.DataFrame(format_to_csv(dict(file)))
                file_csv.to_csv("staff_" + date + ".csv")
            except ValueError:
                print("Unable to save 'csv': missing values are present.\nSaving to 'json'")
                save_files(file, "json")
                break
            except TypeError as error:
                # Values that cannot be turned into CSV cells: keep the
                # data by falling back to JSON rather than saving nothing.
                print("Unable to save 'csv': {}.\nSaving to 'json'".format(error))
                save_files(file, "json")
                break
        elif ext == "json":
            with open("staff_" + date + ".json", "w") as f:
                json.dump(file, f)

In [209]:
# Main function
def main():
    """Crawl all job postings, print a summary, save the data and return it.

    The listing pages reachable from ``main_url`` are walked to collect
    posting URLs, every posting is scraped, the number of collected values
    per field is printed, and the result is saved as both CSV and JSON.

    Returns
    -------
    dict
        The column-oriented data returned by ``crawl_all_postings``.
    """
    start_time = time.ctime()
    print(start_time)
    print("Starting crawling '{}'.".format(base_url))

    # The timeout keeps a stalled connection from blocking the run forever.
    rs = requests.get(main_url, timeout=30)
    response = scrapy.http.HtmlResponse(url=rs.url, body=rs.text, encoding="utf-8")

    relative_paths_for_all_pages = get_relative_paths_for_all_pages(response, base_url, delay=5)
    absolute_paths = [base_url + path for path in relative_paths_for_all_pages]

    print(str(len(absolute_paths)) + " jobs detected.")
    print("Starting crawling job postings..")

    extracted_data = crawl_all_postings(absolute_paths)

    end_time = time.ctime()
    print(end_time)
    print("Ended crawling '{}'.".format(base_url))

    # An empty crawl yields an empty dict, so do not assume the key exists.
    scraped_count = len(extracted_data.get("Company_Title", []))
    print(str(scraped_count) + "/" + str(len(absolute_paths)) + " jobs scraped.")
    print("Saving files.")

    # Report on the data that was just scraped, not on a notebook-level
    # variable that may not exist in a fresh kernel.
    for key in extracted_data.keys():
        print(key + ": " + str(len(extracted_data[key])))

    save_files(extracted_data, "csv", "json")

    return extracted_data

In [196]:
# Run the full crawl defined above.
main()

Sun Feb  2 07:44:47 2020
Starting crawling 'https://staff.am'.
796 jobs detected.
Starting crawling job postings..
https://staff.am/en/brand-manager-60
1/796 Est.: 132.5m.
https://staff.am/en/1c-operator-14
2/796 Est.: 132.33m.
https://staff.am/en/sales-consultant-252
3/796 Est.: 132.17m.
https://staff.am/en/user-success-support-specialist-6
4/796 Est.: 132.0m.
https://staff.am/en/senior-ios-developer-63
5/796 Est.: 131.83m.
https://staff.am/en/java-engineer-33
6/796 Est.: 131.67m.
https://staff.am/en/it-monitoring-specialist-247-shift-9
7/796 Est.: 131.5m.
https://staff.am/en/senior-c-developer-91
8/796 Est.: 131.33m.
https://staff.am/en/mobile-software-engineer-in-test-15
9/796 Est.: 131.17m.
https://staff.am/en/release-engineer-mobile-18
10/796 Est.: 131.0m.
https://staff.am/en/dizayner-22
11/796 Est.: 130.83m.
https://staff.am/en/loan-analyst-8
12/796 Est.: 130.67m.
https://staff.am/en/horeca-manager-3
13/796 Est.: 130.5m.
https://staff.am/en/data-analyst-41
14/796 Est.: 130.33m.
h

https://staff.am/en/accountant-349
112/796 Est.: 114.0m.
https://staff.am/en/mediaplaner-2
113/796 Est.: 113.83m.
https://staff.am/en/manual-qa-engineer-78
114/796 Est.: 113.67m.
https://staff.am/en/wordpress-developer-68
115/796 Est.: 113.5m.
https://staff.am/en/hr-specialist-91
116/796 Est.: 113.33m.
https://staff.am/en/hasvetar-operator-22
117/796 Est.: 113.17m.
https://staff.am/en/sr-software-engineer-front-end-21
118/796 Est.: 113.0m.
https://staff.am/en/javascript-developer-vuejs
119/796 Est.: 112.83m.
https://staff.am/en/inspektor-po-personalu-kontrol-kacestva
120/796 Est.: 112.67m.
https://staff.am/en/pomosnik-v-tehniceskij-otdel-specialist-po-rng
121/796 Est.: 112.5m.
https://staff.am/en/senior-golang-developer-4
122/796 Est.: 112.33m.
https://staff.am/en/ip-and-transportation-networks-exploitation-senior-engineer-1
123/796 Est.: 112.17m.
https://staff.am/en/junior-computer-vision-engineer
124/796 Est.: 112.0m.
https://staff.am/en/leasing-leading-specialist-1
125/796 Est.: 111

https://staff.am/en/vacarki-menejer-79
223/796 Est.: 95.5m.
https://staff.am/en/office-activity-coordinator
224/796 Est.: 95.33m.
https://staff.am/en/chief-accountant-62
225/796 Est.: 95.17m.
https://staff.am/en/web-engineer-3
226/796 Est.: 95.0m.
https://staff.am/en/vacarki-timi-gekavar-2
227/796 Est.: 94.83m.
https://staff.am/en/accountant-347
228/796 Est.: 94.67m.
https://staff.am/en/hh-armaviri-marzi-vagarsapati-m-gorku-anvan-tiv-5-avag-dproc-poak-i-tnoreni-paston-zbagecnelu-hamar-4
229/796 Est.: 94.5m.
https://staff.am/en/contract-management-senior-lawyer-5
230/796 Est.: 94.33m.
https://staff.am/en/marketing-communications-and-events-manager-2
231/796 Est.: 94.17m.
https://staff.am/en/technical-support-specialist---night-shift-7
232/796 Est.: 94.0m.
https://staff.am/en/midsenior-nodejs-developer
233/796 Est.: 93.83m.
https://staff.am/en/ios-developer-161
234/796 Est.: 93.67m.
https://staff.am/en/manager-34
235/796 Est.: 93.5m.
https://staff.am/en/artadrutyan-hasvetar-2
236/796 Est

https://staff.am/en/digital-product-owner-4
331/796 Est.: 77.5m.
https://staff.am/en/hh-siraki-marzi-gusanagyugi-mijnakarg-dproc-poak-i-gorcadir-marmni-tnoreni-paston-zbagecnelu-hamar
332/796 Est.: 77.33m.
https://staff.am/en/junior-customer-success-manager-6
333/796 Est.: 77.17m.
https://staff.am/en/senior-accountant-47
334/796 Est.: 77.0m.
https://staff.am/en/javadata-engineer
335/796 Est.: 76.83m.
https://staff.am/en/copywriter-43
336/796 Est.: 76.67m.
https://staff.am/en/data-scientist-48
337/796 Est.: 76.5m.
https://staff.am/en/google-adwords-facebook-ads-expert
338/796 Est.: 76.33m.
https://staff.am/en/sales-consultant-248
339/796 Est.: 76.17m.
https://staff.am/en/specialist-po-zakupkam-19
340/796 Est.: 76.0m.
https://staff.am/en/service-specialist-burberry-armani-ermenegildo-zegna
341/796 Est.: 75.83m.
https://staff.am/en/incoming-tour-manager-22
342/796 Est.: 75.67m.
https://staff.am/en/hardware-services-project-manager
343/796 Est.: 75.5m.
https://staff.am/en/senior-qa-enginee

https://staff.am/en/hh-tavusi-marzi-baganisi-mijnakarg-dproc-poak-i-tnoreni-paston-zbagecnelu-hamar
443/796 Est.: 58.83m.
https://staff.am/en/inzener-18
444/796 Est.: 58.67m.
https://staff.am/en/office-manager-123
445/796 Est.: 58.5m.
https://staff.am/en/accountant-341
446/796 Est.: 58.33m.
https://staff.am/en/finansakan-bazhni-ognakan-8
447/796 Est.: 58.17m.
https://staff.am/en/database-developer-ms-sql-1
448/796 Est.: 58.0m.
https://staff.am/en/incouming-tour-manager
449/796 Est.: 57.83m.
https://staff.am/en/ui-coder-frontend-developer
450/796 Est.: 57.67m.
https://staff.am/en/housekeeping-manager-6
451/796 Est.: 57.5m.
https://staff.am/en/graphic-designer-297
452/796 Est.: 57.33m.
https://staff.am/en/retail-store-representative
453/796 Est.: 57.17m.
https://staff.am/en/internal-sales-representative
454/796 Est.: 57.0m.
https://staff.am/en/mid-net-engineer
455/796 Est.: 56.83m.
https://staff.am/en/users-support-and-incidents-management-engineer-1
456/796 Est.: 56.67m.
https://staff.a

https://staff.am/en/quality-engineering-manager-23614
542/796 Est.: 42.33m.
https://staff.am/en/senior-rd-engineer-23332
543/796 Est.: 42.17m.
https://staff.am/en/junior-asic-digital-design-engineer-23322
544/796 Est.: 42.0m.
https://staff.am/en/senior-qa-engineer-23102
545/796 Est.: 41.83m.
https://staff.am/en/junior-ams-circuit-design-engineer-23084
546/796 Est.: 41.67m.
https://staff.am/en/mid-level-rd-engineer-22796
547/796 Est.: 41.5m.
https://staff.am/en/ams-layout-design-supervisor-22744
548/796 Est.: 41.33m.
https://staff.am/en/mid-level-ams-circuit-design-engineer22378br-9
549/796 Est.: 41.17m.
https://staff.am/en/mid-level-ams-layout-design-engineer22346br-6
550/796 Est.: 41.0m.
https://staff.am/en/project-admin-manager22313br-12
551/796 Est.: 40.83m.
https://staff.am/en/junior-quality-engineer20793br-12
552/796 Est.: 40.67m.
https://staff.am/en/senior-software-engineer22294br-12
553/796 Est.: 40.5m.
https://staff.am/en/senior-ams-circuit-design-engineer22155br-12
554/796 Est

https://staff.am/en/mid-graphic-designer-1
660/796 Est.: 22.67m.
https://staff.am/en/software-engineering-manager-6
661/796 Est.: 22.5m.
https://staff.am/en/senior-reactjs-engineer
662/796 Est.: 22.33m.
https://staff.am/en/zangeri-kentroni-operatorner-2
663/796 Est.: 22.17m.
https://staff.am/en/senior-ios-developer-57
664/796 Est.: 22.0m.
https://staff.am/en/nergna-tur-menejer-germaneren-lezvi-imacutyamb
665/796 Est.: 21.83m.
https://staff.am/en/tumo-center-manager-gyumri
666/796 Est.: 21.67m.
https://staff.am/en/fundraising-officer
667/796 Est.: 21.5m.
https://staff.am/en/database-administrator-73
668/796 Est.: 21.33m.
https://staff.am/en/it-security-specialist-8
669/796 Est.: 21.17m.
https://staff.am/en/contact-center-specialist-25
670/796 Est.: 21.0m.
https://staff.am/en/videotesahskman-masnaget-1
671/796 Est.: 20.83m.
https://staff.am/en/customer-service-specialist-51
672/796 Est.: 20.67m.
https://staff.am/en/net-developer-127
673/796 Est.: 20.5m.
https://staff.am/en/cto
674/796 Es

https://staff.am/en/front-end-developer-204
773/796 Est.: 3.83m.
https://staff.am/en/qa-4
774/796 Est.: 3.67m.
https://staff.am/en/hacaxordneri-spasarkman-masnaget-20
775/796 Est.: 3.5m.
https://staff.am/en/hacaxordneri-spasarkman-masnaget-17
776/796 Est.: 3.33m.
https://staff.am/en/full-stack-javascript-developer-10
777/796 Est.: 3.17m.
https://staff.am/en/incoming-and-outgoing-tourism-manager-airtickets-specialist-1
778/796 Est.: 3.0m.
https://staff.am/en/graphic-designer-293
779/796 Est.: 2.83m.
https://staff.am/en/arevayin-elektrakayanneri-spasarkog
780/796 Est.: 2.67m.
https://staff.am/en/inzener-naxagcog-energetikayi-olortum-4
781/796 Est.: 2.5m.
https://staff.am/en/mardkayin-resursneri-karavarman-masnaget-hr
782/796 Est.: 2.33m.
https://staff.am/en/artcraft-teacher
783/796 Est.: 2.17m.
https://staff.am/en/logistics-company-branch-manager-all-state
784/796 Est.: 2.0m.
https://staff.am/en/salesperson-9
785/796 Est.: 1.83m.
https://staff.am/en/it-project-manager-55
786/796 Est.: 1.

TypeError: can only concatenate str (not "int") to str