In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
initial_url = 'https://jobinja.ir/jobs?&b=&filters%5Bjob_categories%5D%5B0%5D=&filters%5Bkeywords%5D%5B0%5D=&filters%5Blocations%5D%5B0%5D='

# Seconds to wait on any single HTTP request before giving up, so a stalled
# connection cannot hang the crawl indefinitely.
REQUEST_TIMEOUT = 30

# Create an empty list to store job data dictionaries
job_data_list = []

# Visit listing pages 1 through 9. The loop variable is the page number itself:
# the earlier version looped over `i` but built the URL from a `page_number`
# that was never incremented, so page 1 was downloaded nine times.
for page_number in range(1, 10):
    url = initial_url + f'&page={page_number}'
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        break  # Treat a network failure like an inaccessible page

    if response.status_code != 200:
        break  # Stop crawling if the page is not accessible or doesn't exist

    soup = BeautifulSoup(response.content, 'html.parser')
    job_links = soup.find_all('a', class_='c-jobListView__titleLink')

    if not job_links:
        break  # Stop crawling if there are no job links on the page

    for job_link in job_links:
        job_url = job_link['href']
        try:
            job_response = requests.get(job_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            continue  # Skip this posting, exactly like a non-200 response

        if job_response.status_code != 200:
            continue

        job_soup = BeautifulSoup(job_response.content, 'html.parser')

        # Company name: keep only the text before the first '|'
        company_name_element = job_soup.find('h2', class_='c-companyHeader__name')
        company_name = (
            company_name_element.text.strip().split('|')[0].strip()
            if company_name_element is not None else ''
        )

        # Find the company meta information
        company_meta_elements = job_soup.find_all('span', class_='c-companyHeader__metaItem')

        # The meta items are positional and carry no distinguishing class.
        # When a header has more than three of them, category/size/website
        # are read from positions 1-3 instead of 0/1/last. Before this branch
        # existed, roughly 10% of ~26k scraped rows had their company
        # category and size swapped and the crawl had to be redone.
        if len(company_meta_elements) > 3:
            company_category_element = company_meta_elements[1].find('a', class_='c-companyHeader__metaLink')
            company_category = company_category_element.text.strip() if company_category_element is not None else ''
            company_size = company_meta_elements[2].text.strip()
            # In this layout the website is taken from the link text, not its href
            company_website_element = company_meta_elements[3].find('a', class_='c-companyHeader__metaLink')
            company_website = company_website_element.text.strip() if company_website_element is not None else ''
        else:
            # Get the company category (text of the first meta item's link)
            company_category_element = (
                company_meta_elements[0].find('a', class_='c-companyHeader__metaLink')
                if company_meta_elements else None
            )
            company_category = company_category_element.text.strip() if company_category_element is not None else ''

            # Get the company size (text of the second meta item, if present)
            company_size = company_meta_elements[1].text.strip() if len(company_meta_elements) > 1 else ''

            # Get the company website (if available): href without scheme or slashes
            company_website_element = (
                company_meta_elements[-1].find('a', class_='c-companyHeader__metaLink', target='_blank')
                if company_meta_elements else None
            )
            company_website = (
                company_website_element['href'].replace('https://', '').replace('http://', '').replace('/', '')
                if company_website_element is not None else ''
            )

        # Get the job position
        job_title_box = job_soup.find('div', class_='c-jobView__title')
        job_position_element = job_title_box.find('h1') if job_title_box is not None else None
        job_position = job_position_element.text.strip() if job_position_element is not None else ''

        # Pull up to 5 items from the job info box. They are read by position:
        # category, location, employment type, minimum experience, salary.
        # NOTE(review): assumes every posting lists them in this order - confirm.
        info_box = job_soup.find('ul', class_='c-jobView__firstInfoBox c-infoBox')
        info_box_element = info_box.find_all('li', class_='c-infoBox__item') if info_box is not None else []
        info_values = []
        for info_item in info_box_element[:5]:
            value_element = info_item.find('span', class_='black')
            info_values.append(value_element.text.strip() if value_element is not None else '')
        # Pad so that a posting with fewer than 5 items yields '' instead of an IndexError
        info_values += [''] * (5 - len(info_values))
        job_category, job_location, job_employment_type, job_experience, job_salary = info_values
        # Keep only the part of the location before the first Persian comma
        job_location = job_location.split('،')[0].strip()

        # Find the skills box
        skills_box_element = job_soup.find('ul', class_='c-infoBox u-mB0')

        # Skills: every "black" span under the skills heading, comma-joined.
        # A missing box or heading yields no skills rather than a crash.
        try:
            skills_li_element = skills_box_element.find('h4', text='مهارت‌های مورد نیاز').parent
            skill_tags = skills_li_element.find_all('span', class_='black')
        except AttributeError:
            skill_tags = []
        skills = ', '.join(skill_tag.text.strip() for skill_tag in skill_tags)

        # Find the gender box
        try:
            gender_li_element = skills_box_element.find('h4', text='جنسیت').parent
            gender = gender_li_element.find('span', class_='black').text.strip()
        except AttributeError:
            gender = ''

        # Find the military service box (is None when gender is set to 'female')
        try:
            military_service_li_element = skills_box_element.find('h4', text='وضعیت نظام وظیفه').parent
            military_service = military_service_li_element.find('span', class_='black').text.strip()
        except AttributeError:
            military_service = ''

        # Find the education box
        try:
            education_li_element = skills_box_element.find('h4', text='حداقل مدرک تحصیلی').parent
            education = education_li_element.find('span', class_='black').text.strip()
        except AttributeError:
            education = ''

        # Find the job description
        job_description_element = job_soup.find('div', class_='o-box__text s-jobDesc c-pr40p')
        job_description = job_description_element.get_text(strip=True) if job_description_element is not None else ''

        # Append the job data to the list
        job_data_list.append({
            'Job Position': job_position,
            'Job Category': job_category,
            'Job Location': job_location,
            'Employment Type': job_employment_type,
            'Experience': job_experience,
            'Salary': job_salary,
            'Company Name': company_name,
            'Company Category': company_category,
            'Company Size': company_size,
            'Company Website': company_website,
            'Skills': skills,
            'Gender': gender,
            'Military Service': military_service,
            'Education': education,
            'Job Description': job_description,
            'Job URL': job_url
        })

        # Checks
        print(company_name)
        print(company_category)
        print(company_size)
        print(company_website)
        print(job_position)
        print(job_category)
        print(job_location)
        print(job_employment_type)
        print(job_experience)
        print(job_salary)
        print(skills)
        print(gender)
        print(military_service)
        print(education)
        print(job_description)
        print(job_url)
        print('-' * 50)

# Create a DataFrame from the collected job data list
df = pd.DataFrame(job_data_list)

# Print the DataFrame with the collected job specifications
print(df)

In [None]:
def _stripped_text(element):
    """Return the whitespace-stripped text of ``element``, or '' when it is None."""
    return element.text.strip() if element is not None else ''


def _labelled_values(details_box, label):
    """Return the stripped "black" span texts that share a parent with the <h4> reading ``label``.

    Returns an empty list when the box, the heading or its parent is missing.
    """
    if details_box is None:
        return []
    heading = details_box.find('h4', text=label)
    if heading is None or heading.parent is None:
        return []
    return [span.text.strip() for span in heading.parent.find_all('span', class_='black')]


def extract_job_data(job_soup):
    """Extract one job posting's fields from its parsed detail page.

    Parameters
    ----------
    job_soup : bs4.BeautifulSoup
        Parsed HTML of a single job page. Only ``find`` and ``find_all`` are
        called on it.

    Returns
    -------
    dict
        Keys: 'Job Category', 'Job Position', 'Job Location',
        'Employment Type', 'Experience', 'Salary', 'Company Name',
        'Company Category', 'Company Size', 'Company Website', 'Skills',
        'Gender', 'Military Service', 'Education', 'Job Description'.
        Every value is a string; a field whose element is not found is ''.

    Notes
    -----
    Earlier versions returned ``job_category`` without ever assigning it
    (a NameError on a fresh kernel, or a stale global otherwise) and omitted
    the location / employment type / experience / salary fields.
    """
    # Company name: keep only the text before the first '|'
    company_name = _stripped_text(
        job_soup.find('h2', class_='c-companyHeader__name')
    ).split('|')[0].strip()

    # Company meta items are positional and carry no distinguishing class.
    meta_items = job_soup.find_all('span', class_='c-companyHeader__metaItem')
    meta_link_class = 'c-companyHeader__metaLink'
    if len(meta_items) > 3:
        # With more than three items, category/size/website sit at positions
        # 1-3; reading 0/1/last here swaps category and size. This mirrors the
        # branch used by the inline crawl loop earlier in this notebook.
        company_category = _stripped_text(meta_items[1].find('a', class_=meta_link_class))
        company_size = _stripped_text(meta_items[2])
        # In this layout the website is taken from the link text, not its href
        company_website = _stripped_text(meta_items[3].find('a', class_=meta_link_class))
    else:
        company_category = (
            _stripped_text(meta_items[0].find('a', class_=meta_link_class)) if meta_items else ''
        )
        company_size = _stripped_text(meta_items[1]) if len(meta_items) > 1 else ''
        website_link = (
            meta_items[-1].find('a', class_=meta_link_class, target='_blank') if meta_items else None
        )
        if website_link is not None:
            # Reduce the href to a bare host: drop the scheme and all slashes
            company_website = (
                website_link['href'].replace('https://', '').replace('http://', '').replace('/', '')
            )
        else:
            company_website = ''

    # Job position: the <h1> inside the title container
    title_box = job_soup.find('div', class_='c-jobView__title')
    job_position = _stripped_text(title_box.find('h1') if title_box is not None else None)

    # First info box, read by position: category, location, employment type,
    # minimum experience, salary.
    # NOTE(review): assumes every posting lists them in this order - confirm.
    info_box = job_soup.find('ul', class_='c-jobView__firstInfoBox c-infoBox')
    info_items = info_box.find_all('li', class_='c-infoBox__item') if info_box is not None else []
    info_values = [_stripped_text(item.find('span', class_='black')) for item in info_items[:5]]
    # Pad so that a posting with fewer than 5 items yields '' instead of an IndexError
    info_values += [''] * (5 - len(info_values))
    job_category, job_location, job_employment_type, job_experience, job_salary = info_values
    # Keep only the part of the location before the first Persian comma
    job_location = job_location.split('،')[0].strip()

    # Second info box: sections identified by their <h4> heading text
    details_box = job_soup.find('ul', class_='c-infoBox u-mB0')

    # Skills: all values under the heading, comma-joined
    skills = ', '.join(_labelled_values(details_box, 'مهارت‌های مورد نیاز'))

    # Gender / military service / education: first value under each heading.
    # Military service is absent when gender is set to 'female'.
    gender_values = _labelled_values(details_box, 'جنسیت')
    gender = gender_values[0] if gender_values else ''

    military_service_values = _labelled_values(details_box, 'وضعیت نظام وظیفه')
    military_service = military_service_values[0] if military_service_values else ''

    education_values = _labelled_values(details_box, 'حداقل مدرک تحصیلی')
    education = education_values[0] if education_values else ''

    # Job description
    job_description_element = job_soup.find('div', class_='o-box__text s-jobDesc c-pr40p')
    job_description = (
        job_description_element.get_text(strip=True) if job_description_element is not None else ''
    )

    return {
        'Job Category': job_category,
        'Job Position': job_position,
        'Job Location': job_location,
        'Employment Type': job_employment_type,
        'Experience': job_experience,
        'Salary': job_salary,
        'Company Name': company_name,
        'Company Category': company_category,
        'Company Size': company_size,
        'Company Website': company_website,
        'Skills': skills,
        'Gender': gender,
        'Military Service': military_service,
        'Education': education,
        'Job Description': job_description
    }

In [None]:
def crawl_job_data(max_pages=5, timeout=30):
    """Crawl Jobinja listing pages and collect one record per job posting.

    Listing pages are requested in order starting at page 1. Crawling stops
    early at the first listing page that cannot be fetched, does not answer
    with HTTP 200, or contains no job links. An individual posting that
    cannot be fetched or does not answer with HTTP 200 is skipped.

    Parameters
    ----------
    max_pages : int, optional
        Highest listing page number to request. Defaults to 5, the limit that
        used to be hard-coded. A value below 1 requests nothing.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up, so a stalled
        connection cannot hang the crawl. Defaults to 30.

    Returns
    -------
    list of dict
        One dictionary per posting, as returned by ``extract_job_data`` with
        an added 'Job URL' key.
    """
    initial_url = 'https://jobinja.ir/jobs?&b=&filters%5Bjob_categories%5D%5B0%5D=&filters%5Bkeywords%5D%5B0%5D=&filters%5Blocations%5D%5B0%5D='

    # Create an empty list to store job data dictionaries
    job_data_list = []

    for page_number in range(1, max_pages + 1):
        url = initial_url + f'&page={page_number}'
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            break  # Treat a network failure like an inaccessible page

        if response.status_code != 200:
            break  # Stop crawling if the page is not accessible or doesn't exist

        soup = BeautifulSoup(response.content, 'html.parser')
        job_links = soup.find_all('a', class_='c-jobListView__titleLink')

        if not job_links:
            break  # Stop crawling if there are no job links on the page

        entries_added = 0
        for entry, job_link in enumerate(job_links, 1):
            job_url = job_link['href']
            try:
                job_response = requests.get(job_url, timeout=timeout)
            except requests.RequestException:
                continue  # Skip this posting, exactly like a non-200 response

            if job_response.status_code != 200:
                continue

            job_soup = BeautifulSoup(job_response.content, 'html.parser')
            job_data = extract_job_data(job_soup)
            job_data['Job URL'] = job_url

            # Append the job data to the list
            job_data_list.append(job_data)
            entries_added += 1

            # Print processing information for each job entry
            print(f"Processing Page: {page_number} - Entry: {entry}")

        # Report the page that was just processed and how many records it
        # actually contributed. The old summary ran after the page counter was
        # incremented (naming the next page) and always claimed 20 entries.
        print(f'{entries_added} Entries from page {page_number} appended to the job data list')

    return job_data_list

In [None]:
# Module-level page marker. crawl_job_data tracks its own local page number,
# so this value does not influence the crawl below.
page_number = 1

# Run the crawler and keep the list of per-posting dictionaries it returns.
job_data_list = crawl_job_data()

# Build the table of postings: one row per collected record.
df = pd.DataFrame(data=job_data_list)

# Plain-text dump of the resulting table.
print(df)

In [None]:
# Preview the first five collected postings.
df.head(n=5)

In [None]:
# Show the employment-type column for every collected posting.
df.loc[:, 'Employment Type']

In [None]:
# Save the collected postings to Jobinja.csv without the index column.
# 'utf-8-sig' writes a UTF-8 byte-order mark at the start of the file.
df.to_csv(path_or_buf='Jobinja.csv', index=False, encoding='utf-8-sig')