# In [2]:
import requests
from bs4 import BeautifulSoup
import time
import re
import urllib3
import json
import scipy.io as sio
import os
import pandas as pd
from queue import Queue
import logging
from logging.handlers import RotatingFileHandler

# Configure logging with a rotating file handler.
# Records are written to 'jobinja_scraper.log'; the file is rotated once it
# reaches 5*1024*1024 bytes and at most 2 rotated backups are kept.
log_handler = RotatingFileHandler('jobinja_scraper.log', maxBytes=5*1024*1024, backupCount=2)
log_handler.setLevel(logging.ERROR)  # The handler only emits ERROR and above
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
log_handler.setFormatter(log_formatter)
# getLogger() without a name returns the root logger, so the level and the
# handler set here apply process-wide, not only to this module.
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
logger.addHandler(log_handler)

class Jobinja:
    """Scraper that collects Persian job features from pages linked on Jobinja.

    The scraper downloads the page at ``base_url``, visits every distinct link
    found on it exactly once, extracts a fixed set of features from each
    visited page and keeps the results both in memory (``all_jobs_data``) and
    on disk (one ``.txt`` file per page plus ``.mat``/``.json`` dumps in
    ``save_dir``).
    """

    # Compiled once instead of on every call to clean_persian_text().
    # U+0600-U+06FF is the Unicode "Arabic" block used for Persian script.
    _NON_PERSIAN_RE = re.compile(r'[^\u0600-\u06FF\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    # File stem used when a page has no usable (non-empty) job title.
    _DEFAULT_TITLE = 'JobInja'

    # (result key, tag name, CSS class) for every class-based feature.
    # NOTE(review): the class names are assumptions about the site's markup;
    # verify them against the real pages.
    _FEATURE_SELECTORS = (
        ('job_description', 'div', 'job-description'),
        ('job_category', 'div', 'job-category'),
        ('job_location', 'div', 'job-location'),
        ('employment_type', 'div', 'employment-type'),
        ('min_experience', 'div', 'min-experience'),
        ('salary', 'div', 'salary'),
        ('gender', 'div', 'gender'),
        ('military_status', 'div', 'military-status'),
        ('education_level', 'div', 'education-level'),
        ('company_intro', 'div', 'company-intro'),
        ('skills_required', 'div', 'skills-required'),
    )

    def __init__(self, base_url="https://jobinja.ir/", save_dir="JobInja/v3"):
        """Store the configuration and make sure the output directory exists.

        Args:
            base_url: Page whose links are crawled.
            save_dir: Directory that receives all output files; created if
                it does not exist.
        """
        self.base_url = base_url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        self.save_dir = save_dir
        os.makedirs(self.save_dir, exist_ok=True)
        self.all_jobs_data = []
        # Requests are sent with verify=False, which would otherwise emit an
        # InsecureRequestWarning on every call.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    @staticmethod
    def _absolute_url(base_url, href):
        """Resolve ``href`` against ``base_url``; return None if not crawlable.

        Root-relative links are resolved with ``urljoin`` so that a base URL
        ending in ``/`` does not produce a double slash. Absolute ``http(s)``
        links are returned unchanged. Anything else (``mailto:``, ``#anchor``,
        ``javascript:`` ...) yields None.
        """
        from urllib.parse import urljoin

        if href.startswith('/'):
            try:
                return urljoin(base_url, href)
            except ValueError:
                # urljoin rejects some malformed hrefs (e.g. a broken IPv6 host).
                return None
        if href.startswith('http'):
            return href
        return None

    @classmethod
    def _job_filename(cls, job_data):
        """Return the ``.txt`` file name for one job record.

        Falls back to the default stem when the title is missing *or* empty;
        cleaning removes all non-Persian characters, so a Latin-only title
        becomes an empty string.
        """
        title = job_data.get('job_title') or cls._DEFAULT_TITLE
        return f"{title}.txt"

    def get_links(self):
        """Fetch ``base_url`` and return the distinct links found on it.

        Returns:
            queue.Queue: Absolute URLs in first-seen order. The queue is empty
            when the request fails; the failure is logged, not raised.
        """
        subpage_links = Queue()
        try:
            # NOTE(review): verify=False disables TLS certificate verification.
            response = requests.get(self.base_url, headers=self.headers, verify=False, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.error("An error occurred while getting links: %s", e)
            return subpage_links

        soup = BeautifulSoup(response.text, 'html.parser')
        unique_links = set()  # Guards against enqueueing the same URL twice
        for link in soup.find_all('a', href=True):
            full_url = self._absolute_url(self.base_url, link['href'])
            if full_url is None or full_url in unique_links:
                continue
            unique_links.add(full_url)
            subpage_links.put(full_url)
        return subpage_links

    @staticmethod
    def clean_persian_text(text):
        """Keep only Arabic-block characters and single spaces in ``text``.

        Every character outside U+0600-U+06FF that is not whitespace is
        replaced by a space, whitespace runs are collapsed to one space and
        the result is stripped.
        """
        cleaned_text = Jobinja._NON_PERSIAN_RE.sub(' ', text)
        cleaned_text = Jobinja._WHITESPACE_RE.sub(' ', cleaned_text)
        return cleaned_text.strip()

    def extract_job_features(self, subpage_soup):
        """Extract cleaned job features from a parsed page.

        Args:
            subpage_soup: Parsed page; the code calls ``find`` and
                ``get_text`` on it.

        Returns:
            dict: ``job_title`` (first ``<h1>``), one entry per matching
            selector in ``_FEATURE_SELECTORS`` and ``content_snippet`` (the
            cleaned text of the whole page, always present).
        """
        job_features = {}

        job_title_tag = subpage_soup.find('h1')
        if job_title_tag:
            job_features['job_title'] = self.clean_persian_text(job_title_tag.get_text())

        for feature, tag, class_name in self._FEATURE_SELECTORS:
            feature_tag = subpage_soup.find(tag, class_=class_name)
            if feature_tag:
                job_features[feature] = self.clean_persian_text(feature_tag.get_text())

        job_features['content_snippet'] = self.clean_persian_text(subpage_soup.get_text())
        return job_features

    def _save_job_text(self, job_data):
        """Write one ``Key: value`` line per feature to the job's .txt file."""
        file_path = os.path.join(self.save_dir, self._job_filename(job_data))
        with open(file_path, 'w', encoding='utf-8') as file:
            file.writelines(f"{key.capitalize()}: {value}\n" for key, value in job_data.items())

    def scrape_jobs(self):
        """Visit every link from ``get_links`` and record its job features.

        Each page's features are appended to ``all_jobs_data``, written to a
        text file in ``save_dir`` and printed. Request failures are logged and
        the crawl continues with the next link.
        """
        subpage_links = self.get_links()

        while not subpage_links.empty():
            subpage_url = subpage_links.get()
            try:
                # NOTE(review): verify=False disables TLS certificate verification.
                subpage_response = requests.get(subpage_url, headers=self.headers, verify=False, timeout=10)
                subpage_response.raise_for_status()

                subpage_soup = BeautifulSoup(subpage_response.text, 'html.parser')
                job_data = self.extract_job_features(subpage_soup)

                if job_data:
                    self.all_jobs_data.append(job_data)
                    self._save_job_text(job_data)

                    print(f"URL: {subpage_url}")
                    for key, value in job_data.items():
                        print(f"{key.capitalize()}: {value}")
                    print("\n")
            except requests.exceptions.RequestException as e:
                logger.error("An error occurred while accessing %s: %s", subpage_url, e)
            finally:
                # Throttle after failed requests as well, so a struggling
                # server is not hit in a tight loop.
                time.sleep(1)

    def save_dataset(self):
        """Dump ``all_jobs_data`` to ``jobinja_data.mat`` and ``jobinja_data.json``."""
        mat_file_path = os.path.join(self.save_dir, "jobinja_data.mat")
        json_file_path = os.path.join(self.save_dir, "jobinja_data.json")

        # savemat needs a dict of named variables: job_0, job_1, ...
        job_data_dict = {f"job_{i}": job for i, job in enumerate(self.all_jobs_data)}
        sio.savemat(mat_file_path, job_data_dict)

        # ensure_ascii=False keeps Persian text readable in the JSON file.
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(self.all_jobs_data, json_file, ensure_ascii=False, indent=4)

    def display_dataset(self):
        """Print the collected jobs as a DataFrame and return that DataFrame."""
        df = pd.DataFrame(self.all_jobs_data)
        print(df)
        return df

    def descriptive_statistics(self):
        """Print ``describe(include='all')`` for the data, or a notice if empty."""
        df = pd.DataFrame(self.all_jobs_data)
        if not df.empty:
            print(df.describe(include='all'))
        else:
            print("No data available to generate statistics.")

# Example usage: when run as a script, execute the whole pipeline
# (scrape -> save -> display -> statistics) with the default settings.
if __name__ == "__main__":
    # Create an instance of the Jobinja class (default base URL and save directory)
    jobinja_scraper = Jobinja()
    
    # Scrape job listings; results accumulate on the scraper instance
    jobinja_scraper.scrape_jobs()
    
    # Save the dataset to .mat and .json files
    jobinja_scraper.save_dataset()
    
    # Display the dataset; the returned DataFrame is kept for later inspection
    df = jobinja_scraper.display_dataset()
    
    # Generate descriptive statistics
    jobinja_scraper.descriptive_statistics()
import requests
from bs4 import BeautifulSoup
import time

# Flat script: fetch the Jobinja home page, follow every distinct link on it
# and print the first 1000 characters of each linked page's text.

# Define the target URL
url = "https://jobinja.ir/"

# Browser-like User-Agent header sent with every request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
try:
    # NOTE(review): verify=False disables TLS certificate verification.
    response = requests.get(url, headers=headers, verify=False, timeout=10)
    response.raise_for_status()  # Turn HTTP error statuses into exceptions
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract all anchor tags that carry an href attribute
    links = soup.find_all('a', href=True)
    
    # Collect unique links to avoid repetition.
    # base_url has no trailing slash, so base_url + "/path" forms a valid URL.
    base_url = "https://jobinja.ir"
    subpage_links = set()

    for link in links:
        href = link['href']
        if href.startswith('/'):  # It's a relative link
            full_url = base_url + href
        elif href.startswith('http'):  # It's an absolute link, used as-is
            full_url = href
        else:
            continue  # Skip if the href doesn't look like a valid link

        subpage_links.add(full_url)

    # Extract text from each subpage (set iteration order is arbitrary)
    for subpage_url in subpage_links:
        try:
            subpage_response = requests.get(subpage_url, headers=headers, verify=False, timeout=10)
            subpage_response.raise_for_status()

            # Parse the HTML content of the subpage
            subpage_soup = BeautifulSoup(subpage_response.text, 'html.parser')
            
            # Extract all text from the subpage
            subpage_text = subpage_soup.get_text(strip=True)

            # Print a snippet (first 1000 characters) of the extracted text
            print(f"URL: {subpage_url}\nContent Snippet: {subpage_text[:1000]}\n")

            # To avoid overwhelming the server, add a short delay.
            # The delay is only reached when the request above succeeded.
            time.sleep(1)

        except requests.exceptions.RequestException as e:
            # A failing subpage is reported and the loop moves on to the next URL
            print(f"An error occurred while accessing {subpage_url}: {e}")

# These handlers only cover the initial home-page request; subpage errors are
# handled inside the loop. SSLError is listed first so it is reported
# separately from the generic RequestException handler below.
except requests.exceptions.SSLError as e:
    print(f"SSL error occurred: {e}")
except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")
import requests
from bs4 import BeautifulSoup
import time
import re
import urllib3
import json
import scipy.io as sio
import os

# Flat script: crawl every link on the Jobinja home page, extract cleaned
# Persian job features from each page, write one .txt file per page and dump
# the whole dataset to .mat and .json files.

# Suppress InsecureRequestWarning (all requests below use verify=False)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Define the target URL
url = "https://jobinja.ir/"

# Browser-like User-Agent header sent with every request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# Directory to save text files
save_dir = "JobInja"
os.makedirs(save_dir, exist_ok=True)  # Create the directory if it doesn't exist

# Initialize list to store all job data
all_jobs_data = []

try:
    # Send a GET request to the main page.
    # NOTE(review): verify=False disables TLS certificate verification.
    response = requests.get(url, headers=headers, verify=False, timeout=10)
    response.raise_for_status()  # Raise an error if the request failed

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract all anchor tags that carry an href attribute
    links = soup.find_all('a', href=True)

    # Collect unique links to avoid repetition.
    # base_url has no trailing slash, so base_url + "/path" forms a valid URL.
    base_url = "https://jobinja.ir"
    subpage_links = set()  # Use a set to store unique links

    # Iterate through all the links found on the main page
    for link in links:
        href = link['href']
        if href.startswith('/'):  # It's a relative link, prepend base URL
            full_url = base_url + href
        elif href.startswith('http'):  # It's an absolute link
            full_url = href
        else:
            continue  # Skip if the href doesn't look like a valid link

        subpage_links.add(full_url)  # Add the link to the set of subpage links

    def clean_persian_text(text):
        """Keep only Arabic-block (U+0600-U+06FF) characters and single spaces."""
        cleaned_text = re.sub(r'[^\u0600-\u06FF\s]', ' ', text)  # Everything else becomes a space
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Collapse whitespace runs
        return cleaned_text.strip()  # Strip leading and trailing spaces

    def extract_job_features(subpage_soup):
        """Return a dict with the cleaned job title, description and company name.

        A key is only present when the corresponding tag exists on the page,
        so the result may be empty.
        NOTE(review): the CSS class names are placeholders; replace them with
        the actual class names used by the site.
        """
        job_features = {}

        # Job title: first <h1> on the page
        job_title_tag = subpage_soup.find('h1')
        if job_title_tag:
            job_features['job_title'] = clean_persian_text(job_title_tag.get_text())

        # Job description: <div class="job-description">
        job_description_tag = subpage_soup.find('div', class_='job-description')
        if job_description_tag:
            job_features['job_description'] = clean_persian_text(job_description_tag.get_text())

        # Company name: <div class="company-name">
        company_name_tag = subpage_soup.find('div', class_='company-name')
        if company_name_tag:
            job_features['company_name'] = clean_persian_text(company_name_tag.get_text())

        return job_features

    # Iterate through each subpage link to extract job information
    for subpage_url in subpage_links:
        try:
            # Send a GET request to each subpage
            subpage_response = requests.get(subpage_url, headers=headers, verify=False, timeout=10)
            subpage_response.raise_for_status()  # Raise an error if the request failed

            # Parse the HTML content of the subpage
            subpage_soup = BeautifulSoup(subpage_response.text, 'html.parser')

            # Extract job features from the subpage
            job_data = extract_job_features(subpage_soup)

            # Save the extracted job features if available
            if job_data:
                all_jobs_data.append(job_data)

                # Save Persian text to a .txt file. `or` (rather than a .get
                # default) also covers a title that became empty after
                # cleaning, which would otherwise produce a file named ".txt".
                job_title = job_data.get('job_title') or 'JobInja'
                file_path = os.path.join(save_dir, f"{job_title}.txt")
                with open(file_path, 'w', encoding='utf-8') as file:
                    for key, value in job_data.items():
                        file.write(f"{key.capitalize()}: {value}\n")

                # Print the extracted job features to the console
                print(f"URL: {subpage_url}")
                for key, value in job_data.items():
                    print(f"{key.capitalize()}: {value}")
                print("\n")

            # To avoid overwhelming the server, add a short delay between requests
            time.sleep(1)

        except requests.exceptions.RequestException as e:
            # Print an error message if a request fails, then continue with the next URL
            print(f"An error occurred while accessing {subpage_url}: {e}")

    # Save the dataset to .mat and .json files
    mat_file_path = os.path.join(save_dir, "jobinja_data.mat")
    json_file_path = os.path.join(save_dir, "jobinja_data.json")

    # savemat needs a dict of named variables: job_0, job_1, ...
    job_data_dict = {f"job_{i}": job for i, job in enumerate(all_jobs_data)}

    # Save as .mat file using SciPy's savemat function
    sio.savemat(mat_file_path, job_data_dict)

    # Save as .json file; ensure_ascii=False keeps Persian text readable
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(all_jobs_data, json_file, ensure_ascii=False, indent=4)

except requests.exceptions.RequestException as e:
    # Print an error message if the initial request fails
    print(f"An error occurred: {e}")

import requests
from bs4 import BeautifulSoup
import time
import re
import urllib3
import json
import scipy.io as sio
import os
import pandas as pd

class Jobinja:
    """Scraper that collects Persian job features from pages linked on Jobinja.

    The scraper downloads the page at ``base_url``, visits every distinct link
    found on it, extracts the job title, description and company name from
    each page and keeps the results in memory (``all_jobs_data``) and on disk
    (one ``.txt`` file per page plus ``.mat``/``.json`` dumps in ``save_dir``).
    """

    # File stem used when a page has no usable (non-empty) job title.
    _DEFAULT_TITLE = 'JobInja'

    def __init__(self, base_url="https://jobinja.ir/", save_dir="JobInja"):
        """Store the configuration and make sure the output directory exists.

        Args:
            base_url: Page whose links are crawled.
            save_dir: Directory that receives all output files; created if
                it does not exist.
        """
        self.base_url = base_url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        self.save_dir = save_dir
        os.makedirs(self.save_dir, exist_ok=True)
        self.all_jobs_data = []
        # Requests are sent with verify=False, which would otherwise emit an
        # InsecureRequestWarning on every call.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    @staticmethod
    def _absolute_url(base_url, href):
        """Resolve ``href`` against ``base_url``; return None if not crawlable.

        Root-relative links are resolved with ``urljoin`` so that a base URL
        ending in ``/`` does not produce a double slash. Absolute ``http(s)``
        links are returned unchanged; every other href yields None.
        """
        from urllib.parse import urljoin

        if href.startswith('/'):
            try:
                return urljoin(base_url, href)
            except ValueError:
                # urljoin rejects some malformed hrefs (e.g. a broken IPv6 host).
                return None
        if href.startswith('http'):
            return href
        return None

    @classmethod
    def _job_filename(cls, job_data):
        """Return the ``.txt`` file name for one job record.

        Falls back to the default stem when the title is missing *or* empty;
        cleaning removes all non-Persian characters, so a Latin-only title
        becomes an empty string.
        """
        title = job_data.get('job_title') or cls._DEFAULT_TITLE
        return f"{title}.txt"

    def get_links(self):
        """Fetch ``base_url`` and return the distinct links found on it.

        Returns:
            set: Absolute URLs. Empty when the request fails; the failure is
            printed, not raised.
        """
        try:
            # NOTE(review): verify=False disables TLS certificate verification.
            response = requests.get(self.base_url, headers=self.headers, verify=False, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"An error occurred while getting links: {e}")
            return set()

        soup = BeautifulSoup(response.text, 'html.parser')
        candidates = (
            self._absolute_url(self.base_url, link['href'])
            for link in soup.find_all('a', href=True)
        )
        # The set removes duplicates; None marks hrefs that are not crawlable.
        return {candidate for candidate in candidates if candidate is not None}

    @staticmethod
    def clean_persian_text(text):
        """Keep only Arabic-block characters and single spaces in ``text``.

        Every character outside U+0600-U+06FF that is not whitespace is
        replaced by a space, whitespace runs are collapsed to one space and
        the result is stripped.
        """
        cleaned_text = re.sub(r'[^\u0600-\u06FF\s]', ' ', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        return cleaned_text.strip()

    def extract_job_features(self, subpage_soup):
        """Extract the cleaned title, description and company name of a page.

        Args:
            subpage_soup: Parsed page; the code calls ``find`` on it.

        Returns:
            dict: Contains ``job_title``, ``job_description`` and
            ``company_name`` only for tags that exist, so it may be empty.
        """
        # NOTE(review): the two CSS class names are placeholders; replace
        # them with the actual class names used by the site.
        lookups = (
            ('job_title', subpage_soup.find('h1')),
            ('job_description', subpage_soup.find('div', class_='job-description')),
            ('company_name', subpage_soup.find('div', class_='company-name')),
        )
        return {
            feature: self.clean_persian_text(tag.get_text())
            for feature, tag in lookups
            if tag
        }

    def _save_job_text(self, job_data):
        """Write one ``Key: value`` line per feature to the job's .txt file."""
        file_path = os.path.join(self.save_dir, self._job_filename(job_data))
        with open(file_path, 'w', encoding='utf-8') as file:
            file.writelines(f"{key.capitalize()}: {value}\n" for key, value in job_data.items())

    def scrape_jobs(self):
        """Visit every link from ``get_links`` and record its job features.

        Non-empty feature dicts are appended to ``all_jobs_data``, written to
        a text file in ``save_dir`` and printed. Request failures are printed
        and the crawl continues with the next link.
        """
        for subpage_url in self.get_links():
            try:
                # NOTE(review): verify=False disables TLS certificate verification.
                subpage_response = requests.get(subpage_url, headers=self.headers, verify=False, timeout=10)
                subpage_response.raise_for_status()

                subpage_soup = BeautifulSoup(subpage_response.text, 'html.parser')
                job_data = self.extract_job_features(subpage_soup)

                # Pages without any recognised tag yield an empty dict and are skipped.
                if job_data:
                    self.all_jobs_data.append(job_data)
                    self._save_job_text(job_data)

                    print(f"URL: {subpage_url}")
                    for key, value in job_data.items():
                        print(f"{key.capitalize()}: {value}")
                    print("\n")
            except requests.exceptions.RequestException as e:
                print(f"An error occurred while accessing {subpage_url}: {e}")
            finally:
                # Throttle after failed requests as well, so a struggling
                # server is not hit in a tight loop.
                time.sleep(1)

    def save_dataset(self):
        """Dump ``all_jobs_data`` to ``jobinja_data.mat`` and ``jobinja_data.json``."""
        mat_file_path = os.path.join(self.save_dir, "jobinja_data.mat")
        json_file_path = os.path.join(self.save_dir, "jobinja_data.json")

        # savemat needs a dict of named variables: job_0, job_1, ...
        job_data_dict = {f"job_{i}": job for i, job in enumerate(self.all_jobs_data)}
        sio.savemat(mat_file_path, job_data_dict)

        # ensure_ascii=False keeps Persian text readable in the JSON file.
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(self.all_jobs_data, json_file, ensure_ascii=False, indent=4)

    def display_dataset(self):
        """Print the collected jobs as a DataFrame and return that DataFrame."""
        df = pd.DataFrame(self.all_jobs_data)
        print(df)
        return df

    def descriptive_statistics(self):
        """Print ``describe(include='all')`` for the data, or a notice if empty."""
        df = pd.DataFrame(self.all_jobs_data)
        if not df.empty:
            print(df.describe(include='all'))
        else:
            print("No data available to generate statistics.")

# Example usage: when run as a script, execute the whole pipeline
# (scrape -> save -> display -> statistics) with the default settings.
if __name__ == "__main__":
    # Create an instance of the Jobinja class (default base URL and save directory)
    jobinja_scraper = Jobinja()
    
    # Scrape job listings; results accumulate on the scraper instance
    jobinja_scraper.scrape_jobs()
    
    # Save the dataset to .mat and .json files
    jobinja_scraper.save_dataset()
    
    # Display the dataset; the returned DataFrame is kept for later inspection
    df = jobinja_scraper.display_dataset()
    
    # Generate descriptive statistics
    jobinja_scraper.descriptive_statistics()

## import requests
from bs4 import BeautifulSoup
import time
import re
import urllib3
import json
import scipy.io as sio
import os
import pandas as pd

class Jobinja:
    """Scraper that collects Persian job features from pages linked on Jobinja.

    The scraper downloads the page at ``base_url``, visits every distinct link
    found on it, extracts job features (including a summary of suggested jobs)
    from each page and keeps the results in memory (``all_jobs_data``) and on
    disk (one ``.txt`` file per page plus ``.mat``/``.json`` dumps in
    ``save_dir``).
    """

    # File stem used when a page has no usable (non-empty) job title.
    _DEFAULT_TITLE = 'JobInja'

    def __init__(self, base_url="https://jobinja.ir/", save_dir="JobInja"):
        """Store the configuration and make sure the output directory exists.

        Args:
            base_url: Page whose links are crawled.
            save_dir: Directory that receives all output files; created if
                it does not exist.
        """
        self.base_url = base_url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        self.save_dir = save_dir
        os.makedirs(self.save_dir, exist_ok=True)
        self.all_jobs_data = []
        # Requests are sent with verify=False, which would otherwise emit an
        # InsecureRequestWarning on every call.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    @staticmethod
    def _absolute_url(base_url, href):
        """Resolve ``href`` against ``base_url``; return None if not crawlable.

        Root-relative links are resolved with ``urljoin`` so that a base URL
        ending in ``/`` does not produce a double slash. Absolute ``http(s)``
        links are returned unchanged; every other href yields None.
        """
        from urllib.parse import urljoin

        if href.startswith('/'):
            try:
                return urljoin(base_url, href)
            except ValueError:
                # urljoin rejects some malformed hrefs (e.g. a broken IPv6 host).
                return None
        if href.startswith('http'):
            return href
        return None

    @classmethod
    def _job_filename(cls, job_data):
        """Return the ``.txt`` file name for one job record.

        Falls back to the default stem when the title is missing *or* empty;
        cleaning removes all non-Persian characters, so a Latin-only title
        becomes an empty string.
        """
        title = job_data.get('job_title') or cls._DEFAULT_TITLE
        return f"{title}.txt"

    def get_links(self):
        """Fetch ``base_url`` and return the distinct links found on it.

        Returns:
            set: Absolute URLs. Empty when the request fails; the failure is
            printed, not raised.
        """
        try:
            # NOTE(review): verify=False disables TLS certificate verification.
            response = requests.get(self.base_url, headers=self.headers, verify=False, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"An error occurred while getting links: {e}")
            return set()

        soup = BeautifulSoup(response.text, 'html.parser')
        subpage_links = set()
        for link in soup.find_all('a', href=True):
            full_url = self._absolute_url(self.base_url, link['href'])
            if full_url is not None:
                subpage_links.add(full_url)
        return subpage_links

    @staticmethod
    def clean_persian_text(text):
        """Keep only Arabic-block characters and single spaces in ``text``.

        Every character outside U+0600-U+06FF that is not whitespace is
        replaced by a space, whitespace runs are collapsed to one space and
        the result is stripped.
        """
        cleaned_text = re.sub(r'[^\u0600-\u06FF\s]', ' ', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        return cleaned_text.strip()

    def extract_job_features(self, subpage_soup):
        """Extract cleaned job features from a parsed page.

        Args:
            subpage_soup: Parsed page; the code calls ``find``, ``find_all``
                (via ``extract_suggested_jobs``) and ``get_text`` on it.

        Returns:
            dict: ``job_title``, ``job_description`` and ``company_name`` for
            tags that exist, ``content_snippet`` (cleaned text of the whole
            page, always present) and ``suggested_jobs`` when any were found.
        """
        job_features = {}

        # NOTE(review): the two CSS class names are placeholders; replace
        # them with the actual class names used by the site.
        lookups = (
            ('job_title', subpage_soup.find('h1')),
            ('job_description', subpage_soup.find('div', class_='job-description')),
            ('company_name', subpage_soup.find('div', class_='company-name')),
        )
        for feature, tag in lookups:
            if tag:
                job_features[feature] = self.clean_persian_text(tag.get_text())

        job_features['content_snippet'] = self.clean_persian_text(subpage_soup.get_text())

        suggested_jobs = self.extract_suggested_jobs(subpage_soup)
        if suggested_jobs:
            job_features['suggested_jobs'] = suggested_jobs

        return job_features

    def extract_suggested_jobs(self, soup):
        """Summarise the job listings of the page's "suggested ads" section.

        Returns:
            str | None: ``"title - company"`` entries joined by ``" | "``, or
            None when the section is missing or has no complete entries.
        """
        # NOTE(review): 'suggested-jobs', 'job-item' and 'company-name' are
        # placeholder class names; replace them with the site's actual ones.
        suggested_jobs_section = soup.find('div', class_='suggested-jobs')
        if not suggested_jobs_section:
            return None

        suggested_jobs = []
        for job_item in suggested_jobs_section.find_all('div', class_='job-item'):
            job_title = job_item.find('h2')  # Title is assumed to be in an <h2>
            company_name = job_item.find('div', class_='company-name')
            # Entries lacking either part are skipped.
            if job_title and company_name:
                suggested_jobs.append(
                    f"{self.clean_persian_text(job_title.get_text())} - {self.clean_persian_text(company_name.get_text())}"
                )
        return ' | '.join(suggested_jobs) if suggested_jobs else None

    def _save_job_text(self, job_data):
        """Write all features as ``Key: value | `` entries on a single line."""
        file_path = os.path.join(self.save_dir, self._job_filename(job_data))
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write("".join(f"{key.capitalize()}: {value} | " for key, value in job_data.items()))

    def scrape_jobs(self):
        """Visit every link from ``get_links`` and record its job features.

        Each page's features are appended to ``all_jobs_data``, written to a
        text file in ``save_dir`` and printed. Request failures are printed
        and the crawl continues with the next link.
        """
        for subpage_url in self.get_links():
            try:
                # NOTE(review): verify=False disables TLS certificate verification.
                subpage_response = requests.get(subpage_url, headers=self.headers, verify=False, timeout=10)
                subpage_response.raise_for_status()

                subpage_soup = BeautifulSoup(subpage_response.text, 'html.parser')
                job_data = self.extract_job_features(subpage_soup)

                if job_data:
                    self.all_jobs_data.append(job_data)
                    self._save_job_text(job_data)

                    print(f"URL: {subpage_url}")
                    for key, value in job_data.items():
                        print(f"{key.capitalize()}: {value}")
                    print("\n")
            except requests.exceptions.RequestException as e:
                print(f"An error occurred while accessing {subpage_url}: {e}")
            finally:
                # Throttle after failed requests as well, so a struggling
                # server is not hit in a tight loop.
                time.sleep(1)

    def save_dataset(self):
        """Dump ``all_jobs_data`` to ``jobinja_data.mat`` and ``jobinja_data.json``."""
        mat_file_path = os.path.join(self.save_dir, "jobinja_data.mat")
        json_file_path = os.path.join(self.save_dir, "jobinja_data.json")

        # savemat needs a dict of named variables: job_0, job_1, ...
        job_data_dict = {f"job_{i}": job for i, job in enumerate(self.all_jobs_data)}
        sio.savemat(mat_file_path, job_data_dict)

        # ensure_ascii=False keeps Persian text readable in the JSON file.
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(self.all_jobs_data, json_file, ensure_ascii=False, indent=4)

    def display_dataset(self):
        """Print the collected jobs as a DataFrame and return that DataFrame."""
        df = pd.DataFrame(self.all_jobs_data)
        print(df)
        return df

    def descriptive_statistics(self):
        """Print ``describe(include='all')`` for the data, or a notice if empty."""
        df = pd.DataFrame(self.all_jobs_data)
        if not df.empty:
            print(df.describe(include='all'))
        else:
            print("No data available to generate statistics.")

# Example usage: when run as a script, execute the whole pipeline
# (scrape -> save -> display -> statistics) with the default settings.
if __name__ == "__main__":
    # Create an instance of the Jobinja class (default base URL and save directory)
    jobinja_scraper = Jobinja()
    
    # Scrape job listings; results accumulate on the scraper instance
    jobinja_scraper.scrape_jobs()
    
    # Save the dataset to .mat and .json files
    jobinja_scraper.save_dataset()
    
    # Display the dataset; the returned DataFrame is kept for later inspection
    df = jobinja_scraper.display_dataset()
    
    # Generate descriptive statistics
    jobinja_scraper.descriptive_statistics()

import requests
from bs4 import BeautifulSoup
import time
import re
import urllib3
import json
import scipy.io as sio
import os
import pandas as pd

class Jobinja:
    """Scrape job-posting text from the pages linked on the Jobinja home page.

    Typical flow: ``scrape_jobs()`` fills ``all_jobs_data`` (one dict of
    cleaned text fields per fetched page) and writes one ``.txt`` file per
    page into ``save_dir``; ``save_dataset()`` then exports everything to
    ``.mat`` and ``.json``.
    """

    # (feature name, tag, CSS class) looked up on every page, in output order.
    # NOTE: the original author marked these selectors as placeholders
    # ("replace with the actual class name"); verify them against the live
    # markup before relying on the extracted fields.
    _FEATURE_SELECTORS = (
        ('job_description', 'div', 'job-description'),
        ('company_name', 'div', 'company-name'),
        ('job_category', 'div', 'job-category'),
        ('job_location', 'div', 'job-location'),
        ('employment_type', 'div', 'employment-type'),
        ('min_experience', 'div', 'min-experience'),
        ('salary', 'div', 'salary'),
        ('gender', 'div', 'gender'),
        ('military_status', 'div', 'military-status'),
        ('education_level', 'div', 'education-level'),
        ('company_intro', 'div', 'company-intro'),
        ('skills_required', 'div', 'skills-required'),
    )

    def __init__(self, base_url="https://jobinja.ir/", save_dir="JobInja/v2"):
        """Store the crawl settings and make sure the output directory exists.

        Args:
            base_url: Page whose links are crawled; also the base against
                which root-relative links are resolved.
            save_dir: Directory receiving the per-page ``.txt`` files and the
                exported dataset. Created if it does not exist.
        """
        self.base_url = base_url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        self.save_dir = save_dir
        os.makedirs(self.save_dir, exist_ok=True)
        self.all_jobs_data = []
        # All requests below use verify=False, which would otherwise raise an
        # InsecureRequestWarning on every call.
        # NOTE(review): verify=False disables TLS certificate checks — confirm
        # this is intentional.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    @staticmethod
    def _resolve_link(base_url, href):
        """Return the absolute URL for ``href``, or None if it is not crawlable.

        Root-relative links (starting with ``/``) are joined onto ``base_url``
        with ``urljoin`` so that a trailing slash on the base does not yield
        ``//`` in the path and protocol-relative ``//host/...`` links resolve
        to the right host. Links starting with ``http`` are returned as-is;
        anything else (anchors, ``mailto:``, ``javascript:`` ...) is skipped.
        """
        from urllib.parse import urljoin

        if href.startswith('/'):
            return urljoin(base_url, href)
        if href.startswith('http'):
            return href
        return None

    def get_links(self):
        """Fetch ``base_url`` and return the set of absolute links found on it.

        Returns:
            A set of unique URLs; an empty set if the request fails (the
            error is printed, not raised).
        """
        try:
            response = requests.get(self.base_url, headers=self.headers, verify=False, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"An error occurred while getting links: {e}")
            return set()

        soup = BeautifulSoup(response.text, 'html.parser')
        resolved = (
            self._resolve_link(self.base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
        )
        return {url for url in resolved if url is not None}

    @staticmethod
    def clean_persian_text(text):
        """Reduce ``text`` to characters in U+0600..U+06FF separated by single spaces.

        Every character outside that range that is not whitespace is replaced
        by a space, runs of whitespace are collapsed to one space, and the
        result is stripped.
        """
        cleaned_text = re.sub(r'[^؀-ۿ\s]', ' ', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        return cleaned_text.strip()

    def extract_job_features(self, subpage_soup):
        """Collect the cleaned text features of one parsed page.

        Args:
            subpage_soup: Parsed page (a BeautifulSoup document).

        Returns:
            A dict that always contains ``content_snippet`` (the cleaned text
            of the whole page) and contains ``job_title``, the entries of
            ``_FEATURE_SELECTORS`` and ``suggested_jobs`` only when the
            corresponding element was found.
        """
        job_features = {}

        # The first <h1> on the page is taken to be the job title.
        job_title_tag = subpage_soup.find('h1')
        if job_title_tag:
            job_features['job_title'] = self.clean_persian_text(job_title_tag.get_text())

        for feature, tag, class_name in self._FEATURE_SELECTORS:
            feature_tag = subpage_soup.find(tag, class_=class_name)
            if feature_tag:
                job_features[feature] = self.clean_persian_text(feature_tag.get_text())

        job_features['content_snippet'] = self.clean_persian_text(subpage_soup.get_text())

        suggested_jobs = self.extract_suggested_jobs(subpage_soup)
        if suggested_jobs:
            job_features['suggested_jobs'] = suggested_jobs

        return job_features

    def extract_suggested_jobs(self, soup):
        """Summarise the page's suggested-ads section as one string.

        Returns:
            ``"<title> - <company>"`` entries joined by ``" | "``, or None
            when the section is missing or has no complete entries.
            NOTE: the CSS class names are placeholders from the original
            author and should be checked against the live markup.
        """
        suggested_jobs_section = soup.find('div', class_='suggested-jobs')
        if not suggested_jobs_section:
            return None

        suggested_jobs = []
        for job_item in suggested_jobs_section.find_all('div', class_='job-item'):
            job_title = job_item.find('h2')
            company_name = job_item.find('div', class_='company-name')
            # Entries missing either part are skipped.
            if job_title and company_name:
                suggested_jobs.append(
                    f"{self.clean_persian_text(job_title.get_text())} - {self.clean_persian_text(company_name.get_text())}"
                )
        return ' | '.join(suggested_jobs) if suggested_jobs else None

    def _job_file_path(self, job_data):
        """Return the ``.txt`` path for one record inside ``save_dir``.

        Falls back to ``JobInja`` when the title is missing *or empty* (the
        cleaned title is empty whenever the heading had no Persian text).
        """
        job_title = job_data.get('job_title') or 'JobInja'
        return os.path.join(self.save_dir, f"{job_title}.txt")

    def scrape_jobs(self):
        """Fetch every link from ``get_links()`` and record its features.

        For each page fetched successfully the features are appended to
        ``all_jobs_data``, written to a ``.txt`` file in ``save_dir`` and
        printed. Request and file-write failures are printed and the crawl
        continues with the next link.
        """
        for subpage_url in self.get_links():
            try:
                subpage_response = requests.get(subpage_url, headers=self.headers, verify=False, timeout=10)
                subpage_response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"An error occurred while accessing {subpage_url}: {e}")
            else:
                subpage_soup = BeautifulSoup(subpage_response.text, 'html.parser')
                job_data = self.extract_job_features(subpage_soup)

                if job_data:
                    self.all_jobs_data.append(job_data)

                    file_path = self._job_file_path(job_data)
                    try:
                        with open(file_path, 'w', encoding='utf-8') as file:
                            file.writelines(
                                f"{key.capitalize()}: {value} | "
                                for key, value in job_data.items()
                            )
                    except OSError as e:
                        # One unwritable file must not discard the whole crawl.
                        print(f"An error occurred while writing {file_path}: {e}")

                    print(f"URL: {subpage_url}")
                    for key, value in job_data.items():
                        print(f"{key.capitalize()}: {value}")
                    print("\n")

            # Pause after every attempt, failed ones included, so the server
            # is never hit in a tight loop.
            time.sleep(1)

    def save_dataset(self):
        """Export ``all_jobs_data`` to ``jobinja_data.mat`` and ``jobinja_data.json`` in ``save_dir``."""
        mat_file_path = os.path.join(self.save_dir, "jobinja_data.mat")
        json_file_path = os.path.join(self.save_dir, "jobinja_data.json")

        # savemat takes a name -> value mapping, so each record gets a name.
        job_data_dict = {f"job_{i}": job for i, job in enumerate(self.all_jobs_data)}
        sio.savemat(mat_file_path, job_data_dict)

        # ensure_ascii=False keeps the Persian text readable in the file.
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(self.all_jobs_data, json_file, ensure_ascii=False, indent=4)

    def display_dataset(self):
        """Print ``all_jobs_data`` as a pandas DataFrame and return the DataFrame."""
        df = pd.DataFrame(self.all_jobs_data)
        print(df)
        return df

    def descriptive_statistics(self):
        """Print ``describe(include='all')`` for the records, or a notice if there are none."""
        df = pd.DataFrame(self.all_jobs_data)
        if df.empty:
            print("No data available to generate statistics.")
            return
        print(df.describe(include='all'))

# Example usage: run the full pipeline (scrape -> export -> inspect).
# The guard keeps the crawl from starting when this code is imported as a
# module; it only runs when executed directly.
if __name__ == "__main__":
    # Build a scraper with the defaults (base URL https://jobinja.ir/,
    # output directory "JobInja/v2", created if needed).
    jobinja_scraper = Jobinja()
    
    # Fetch every linked page and collect its features; this performs
    # network requests and writes one .txt file per page.
    jobinja_scraper.scrape_jobs()
    
    # Export everything collected so far to .mat and .json files
    jobinja_scraper.save_dataset()
    
    # Print the records as a DataFrame and keep it for further analysis
    df = jobinja_scraper.display_dataset()
    
    # Print summary statistics (or a notice if nothing was collected)
    jobinja_scraper.descriptive_statistics()

import requests
from bs4 import BeautifulSoup
import time
import re
import urllib3
import json
import scipy.io as sio
import os
import pandas as pd
from queue import Queue
import logging
from logging.handlers import RotatingFileHandler

# Configure logging: ERROR-and-above records from the root logger go to a
# rotating file (5 MiB per file, two backups kept).
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# Re-running this code in the same interpreter (e.g. re-executing a notebook
# cell) must not stack a second handler on the root logger, otherwise every
# error is written to the file once per run. Reuse the handler that already
# targets this log file if there is one.
log_handler = next(
    (
        existing
        for existing in logger.handlers
        if isinstance(existing, RotatingFileHandler)
        and existing.baseFilename == os.path.abspath('jobinja_scraper.log')
    ),
    None,
)
if log_handler is None:
    log_handler = RotatingFileHandler('jobinja_scraper.log', maxBytes=5*1024*1024, backupCount=2)
    logger.addHandler(log_handler)
log_handler.setLevel(logging.ERROR)
log_handler.setFormatter(log_formatter)

class Jobinja:
    """Scrape job-posting text from the pages linked on the Jobinja home page.

    Typical flow: ``scrape_jobs()`` fills ``all_jobs_data`` (one dict of
    cleaned text fields per fetched page) and writes one ``.txt`` file per
    page into ``save_dir``; ``save_dataset()`` then exports everything to
    ``.mat`` and ``.json``. Failures are reported through the module-level
    ``logger``.
    """

    # (feature name, tag, CSS class) looked up on every page, in output order.
    # NOTE(review): these selectors look like guesses at the site's markup —
    # verify them against the live pages before relying on the fields.
    _FEATURE_SELECTORS = (
        ('job_description', 'div', 'job-description'),
        ('job_category', 'div', 'job-category'),
        ('job_location', 'div', 'job-location'),
        ('employment_type', 'div', 'employment-type'),
        ('min_experience', 'div', 'min-experience'),
        ('salary', 'div', 'salary'),
        ('gender', 'div', 'gender'),
        ('military_status', 'div', 'military-status'),
        ('education_level', 'div', 'education-level'),
        ('company_intro', 'div', 'company-intro'),
        ('skills_required', 'div', 'skills-required'),
    )

    def __init__(self, base_url="https://jobinja.ir/", save_dir="JobInja/v3"):
        """Store the crawl settings and make sure the output directory exists.

        Args:
            base_url: Page whose links are crawled; also the base against
                which root-relative links are resolved.
            save_dir: Directory receiving the per-page ``.txt`` files and the
                exported dataset. Created if it does not exist.
        """
        self.base_url = base_url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        self.save_dir = save_dir
        os.makedirs(self.save_dir, exist_ok=True)
        self.all_jobs_data = []
        # All requests below use verify=False, which would otherwise raise an
        # InsecureRequestWarning on every call.
        # NOTE(review): verify=False disables TLS certificate checks — confirm
        # this is intentional.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    @staticmethod
    def _resolve_link(base_url, href):
        """Return the absolute URL for ``href``, or None if it is not crawlable.

        Root-relative links (starting with ``/``) are joined onto ``base_url``
        with ``urljoin`` so that a trailing slash on the base does not yield
        ``//`` in the path and protocol-relative ``//host/...`` links resolve
        to the right host. Links starting with ``http`` are returned as-is;
        anything else (anchors, ``mailto:``, ``javascript:`` ...) is skipped.
        """
        from urllib.parse import urljoin

        if href.startswith('/'):
            return urljoin(base_url, href)
        if href.startswith('http'):
            return href
        return None

    def get_links(self):
        """Fetch ``base_url`` and queue the unique absolute links found on it.

        Returns:
            A ``queue.Queue`` of URLs in page order without duplicates; an
            empty queue if the request fails (the error is logged, not
            raised).
        """
        try:
            response = requests.get(self.base_url, headers=self.headers, verify=False, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.error("An error occurred while getting links: %s", e)
            return Queue()

        soup = BeautifulSoup(response.text, 'html.parser')

        subpage_links = Queue()
        seen = set()  # the queue has no membership test, so track URLs here
        for anchor in soup.find_all('a', href=True):
            full_url = self._resolve_link(self.base_url, anchor['href'])
            if full_url is None or full_url in seen:
                continue
            seen.add(full_url)
            subpage_links.put(full_url)
        return subpage_links

    @staticmethod
    def clean_persian_text(text):
        """Reduce ``text`` to characters in U+0600..U+06FF separated by single spaces.

        Every character outside that range that is not whitespace is replaced
        by a space, runs of whitespace are collapsed to one space, and the
        result is stripped.
        """
        cleaned_text = re.sub(r'[^؀-ۿ\s]', ' ', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        return cleaned_text.strip()

    def extract_job_features(self, subpage_soup):
        """Collect the cleaned text features of one parsed page.

        Args:
            subpage_soup: Parsed page (a BeautifulSoup document).

        Returns:
            A dict that always contains ``content_snippet`` (the cleaned text
            of the whole page) and contains ``job_title`` and the entries of
            ``_FEATURE_SELECTORS`` only when the corresponding element was
            found.
        """
        job_features = {}

        # The first <h1> on the page is taken to be the job title.
        job_title_tag = subpage_soup.find('h1')
        if job_title_tag:
            job_features['job_title'] = self.clean_persian_text(job_title_tag.get_text())

        for feature, tag, class_name in self._FEATURE_SELECTORS:
            feature_tag = subpage_soup.find(tag, class_=class_name)
            if feature_tag:
                job_features[feature] = self.clean_persian_text(feature_tag.get_text())

        job_features['content_snippet'] = self.clean_persian_text(subpage_soup.get_text())

        return job_features

    def _job_file_path(self, job_data):
        """Return the ``.txt`` path for one record inside ``save_dir``.

        Falls back to ``JobInja`` when the title is missing *or empty* (the
        cleaned title is empty whenever the heading had no Persian text).
        """
        job_title = job_data.get('job_title') or 'JobInja'
        return os.path.join(self.save_dir, f"{job_title}.txt")

    def scrape_jobs(self):
        """Fetch every queued link from ``get_links()`` and record its features.

        For each page fetched successfully the features are appended to
        ``all_jobs_data``, written to a ``.txt`` file in ``save_dir`` (one
        ``Key: value`` line per feature) and printed. Request and file-write
        failures are logged and the crawl continues with the next link.
        """
        subpage_links = self.get_links()

        while not subpage_links.empty():
            subpage_url = subpage_links.get()
            try:
                subpage_response = requests.get(subpage_url, headers=self.headers, verify=False, timeout=10)
                subpage_response.raise_for_status()
            except requests.exceptions.RequestException as e:
                logger.error("An error occurred while accessing %s: %s", subpage_url, e)
            else:
                subpage_soup = BeautifulSoup(subpage_response.text, 'html.parser')
                job_data = self.extract_job_features(subpage_soup)

                if job_data:
                    self.all_jobs_data.append(job_data)

                    file_path = self._job_file_path(job_data)
                    try:
                        with open(file_path, 'w', encoding='utf-8') as file:
                            file.writelines(
                                f"{key.capitalize()}: {value}\n"
                                for key, value in job_data.items()
                            )
                    except OSError as e:
                        # One unwritable file must not discard the whole crawl.
                        logger.error("An error occurred while writing %s: %s", file_path, e)

                    print(f"URL: {subpage_url}")
                    for key, value in job_data.items():
                        print(f"{key.capitalize()}: {value}")
                    print("\n")

            # Pause after every attempt, failed ones included, so the server
            # is never hit in a tight loop.
            time.sleep(1)

    def save_dataset(self):
        """Export ``all_jobs_data`` to ``jobinja_data.mat`` and ``jobinja_data.json`` in ``save_dir``."""
        mat_file_path = os.path.join(self.save_dir, "jobinja_data.mat")
        json_file_path = os.path.join(self.save_dir, "jobinja_data.json")

        # savemat takes a name -> value mapping, so each record gets a name.
        job_data_dict = {f"job_{i}": job for i, job in enumerate(self.all_jobs_data)}
        sio.savemat(mat_file_path, job_data_dict)

        # ensure_ascii=False keeps the Persian text readable in the file.
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(self.all_jobs_data, json_file, ensure_ascii=False, indent=4)

    def display_dataset(self):
        """Print ``all_jobs_data`` as a pandas DataFrame and return the DataFrame."""
        df = pd.DataFrame(self.all_jobs_data)
        print(df)
        return df

    def descriptive_statistics(self):
        """Print ``describe(include='all')`` for the records, or a notice if there are none."""
        df = pd.DataFrame(self.all_jobs_data)
        if df.empty:
            print("No data available to generate statistics.")
            return
        print(df.describe(include='all'))

# Example usage: run the full pipeline (scrape -> export -> inspect).
# The guard keeps the crawl from starting when this code is imported as a
# module; it only runs when executed directly.
if __name__ == "__main__":
    # Build a scraper with the defaults (base URL https://jobinja.ir/,
    # output directory "JobInja/v3", created if needed).
    jobinja_scraper = Jobinja()
    
    # Fetch every linked page and collect its features; this performs
    # network requests and writes one .txt file per page.
    jobinja_scraper.scrape_jobs()
    
    # Export everything collected so far to .mat and .json files
    jobinja_scraper.save_dataset()
    
    # Print the records as a DataFrame and keep it for further analysis
    df = jobinja_scraper.display_dataset()
    
    # Print summary statistics (or a notice if nothing was collected)
    jobinja_scraper.descriptive_statistics()



URL: https://jobinja.ir
Job_title: ۲۶ ۱۶۲ آگهی استخدام فعال در ۱۱ ۳۱۶ شرکت ایرانی
Content_snippet: استخدام آگهی استخدام سایت کاریابی جابینجا جابینجا استخدام، آگهی استخدام، کاریابی، استخدامی جدید، بازار کار، نیازمندیها، ساخت رزومه رایگان، کار و استخدام دولتی، اشتغال، جستجوگر حرفه ای استخدام و شغل خانه جستجوی مشاغل رزومه ساز ۵۰ شرکت برتر بخش کارفرمایان ورود کارجو ثبت نام کارجو ورود کارجو ثبت نام کارجو ۱ جابینجا چطور به استخدام شدن من کمک می کند؟ مشاهده مشاهده بیشتر ۲۶ ۱۶۲ آگهی استخدام فعال در ۱۱ ۳۱۶ شرکت ایرانی جابینجا سامانه کاریابی آنلاین با بیشترین تعداد آگهی استخدام در ایران همه ی استان ها تهران خراسان رضوی اصفهان البرز فارس قم آذربایجان شرقی مازندران گیلان خوزستان یزد کرمان قزوین مرکزی هرمزگان گلستان زنجان بوشهر آذربایجان غربی کرمانشاه سیستان و بلوچستان سمنان همدان اردبیل لرستان کردستان خراسان شمالی خراسان جنوبی چهارمحال بختیاری ایلام کهکیلویه و بویراحمد همه ی دسته بندی ها فروش و بازاریابی وب، برنامه نویسی و نرم افزار مالی و حسابداری مسئول دفتر، اجرائی و اداری تولید و مدیریت محتوا د