In [1]:

import numpy as np
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests

In [2]:
def extract_title(div):
	"""
	Wyszukanie informacji o tytule ogloszenia
	:param div:znacznik html w obrebie ktorego poszukujemy tytulu
	:return: tytul w formie stringa
	"""
	titles = 0
	for title in div.find_all(name='div', attrs={'class': 'title'}):
		for a in title.find_all(name='a', attrs={'data-tn-element': 'jobTitle'}):
			titles = (a['title'])
	return (titles)

In [3]:
def extract_company(div):
	"""
	Wyszukanie informacji o nazwie firmy
	:param div: znacznik html w obrebie ktorego poszukujemy nazwy firmy
	:return: nazwa firmy w formie stringa, lub informacja o jej nie znalezieniu
	"""
	companies=0
	for compa in div.find_all(name='div',attrs={'class':'sjcl'}):
		for comp in compa.find_all(name='span',attrs={'class':'company'}):
			if comp.find(name='a',attrs={'data-tn-element':'companyName'}):
				for company in comp.find_all(name='a',attrs={'data-tn-element':'companyName'}):
					companies=(company.text.strip())
			else:
				if(len(comp.text)>0):
					companies=(comp.text.strip())
				else:
					companies='company not found'
	return(companies)


In [4]:
def extract_location(div):
	"""
	Wyszukanie informacji o lokalizacji
	:param div: znacnzik html w obrebie ktorego poszukujemy lokalizacji
	:return: lokalizacja w formie stringa
	"""
	locations=0
	for location in div.find_all(name='div',attrs={'class':'sjcl'}):
		for loc in location.find_all(name='span',attrs={'class':'location'}):
			locations=loc.text.strip()
	return(locations)
	

In [5]:
def extract_summary(div):
	"""
	wyszukanie opisu stanowiska z ogloszenia
	:param div: znacznik html w obrebie ktorego poszukujemy opisu
	:return: opis w formie stringa
	"""
	summary=0
	for summ in div.find_all(name='div',attrs={'class','summary'}):
		summary=summ.text.strip()
	return(summary)

In [6]:
def extract_salary(div):
	"""
	Wyszukanie informacji o stawce w ogloszeniu
	:param div: znacnzik html w obrebie ktorego poszukujemy stawki
	:return: stawka w formie stringa
	"""
	salary=0
	if div.find('div',attrs={'class':'salarySnippet'}):
		for salar in div.find_all(name='div',attrs={'class':'salarySnippet'}):
			for span in salar.find_all(name='span',attrs={'class':'salary'}):
				salary=span.text.strip()
	else:
		salary='nothing found'
	return(salary)

In [7]:
def scrap_web(cities,max_results_per_city,name):
	"""
	Funkcja sluzaca do scrapowania informacji ze strony indeed.com
	:param cities: tabela miast w ktorych chcemy wyszukac ogloszenia (w przypadku miasta z 2 czlonami podajemy je z '+',np New+York)
	:param max_results_per_city: liczba okreslajaca ile maksymalnie ogloszen dla danego miasta chcemy przejrzec
	:param name: nazwa pliku csv do ktorego chcemy zapsiac nasze wyniki
	:return: none
	"""
	max_results_per_city=max_results_per_city
	cities=cities
	columns=['city','job_title','company_name','location','summary','salary']
	excel_name_=name
	sample_df=pd.DataFrame(columns=columns)
	for city in cities:
		for start in range(0,max_results_per_city,10):
			page=requests.get('https://www.indeed.com/jobs?q=data+scientist&l='+str(city)+'&start='+str(start))
			time.sleep(1)
			soup=BeautifulSoup(page.text,'lxml',from_encoding='utf-8')
			for div in soup.find_all(name='div',attrs={'class':'row'}):
				num=(len(sample_df)+1)
				job_post=[]
				job_post.append(city)
				job_post.append(extract_title(div))
				job_post.append(extract_company(div))
				job_post.append(extract_location(div))
				job_post.append(extract_summary(div))
				job_post.append(extract_salary(div))

				sample_df.loc[num]=job_post
		
	sample_df.to_csv(f'{name}.csv',encoding='utf-8')

In [9]:
cities=['New+York','Wrocław','Radom']
n=100
name='test'
scrap_web(cities,n,name)





In [10]:
df=pd.read_csv('test.csv')

In [11]:
df

Unnamed: 0.1,Unnamed: 0,city,job_title,company_name,location,summary,salary
0,1,New+York,"Data Scientist,Creator Marketplace – all levels",Spotify,"New York, NY 10011 (Chelsea area)",The Creator Product Insights Team is a thrivin...,nothing found
1,2,New+York,"Data Scientist, Creator Marketing",Spotify,"New York, NY 10011 (Chelsea area)",The Creator Product Insights Team is a thrivin...,nothing found
2,3,New+York,MODA Data Scientist,New York City DEPT OF INFO TECH & TELECOMM,"Manhattan, NY",MODA seeks a Data Scientist to join a diverse ...,"$52,524 - $79,000 a year"
3,4,New+York,Head of Data Science,EQUINOX,"New York, NY","Collaborate hands on with business, data scien...",nothing found
4,5,New+York,Junior Data Scientist,Viacom,"New York, NY 10036",Familiarity with statistical modeling and mach...,nothing found
5,6,New+York,"Data Scientist, Studios Analytics",Spotify,"New York, NY 10011 (Chelsea area)",2+ years (5+ years for senior role) of relevan...,nothing found
6,7,New+York,Data Scientist – Personalization,Spotify,"New York, NY 10011 (Chelsea area)",Contribute to the development of the Product I...,nothing found
7,8,New+York,Data Scientist,1-800-Flowers,"Carle Place, NY 11514",This individual will manage one or more analys...,nothing found
8,9,New+York,"Data Scientist, Risk",Square,"New York, NY 10032 (Washington Heights area)",Experience developing and deploying machine le...,nothing found
9,10,New+York,Data Scientist,AETNA,"New York, NY 10016 (Gramercy area)",Aetna's Provider & Network Analytics team is f...,nothing found
