In [1]:
import requests
from bs4 import BeautifulSoup
import time
import re
import urllib3
import json
import scipy.io as sio
import os
import pandas as pd
from queue import Queue
import logging
from logging.handlers import RotatingFileHandler

# Configure logging with a rotating file handler
log_handler = RotatingFileHandler('jobinja_scraper.log', maxBytes=5*1024*1024, backupCount=2)
log_handler.setLevel(logging.ERROR)
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
log_handler.setFormatter(log_formatter)
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
logger.addHandler(log_handler)

class Jobinja:
    def __init__(self, base_url="https://jobinja.ir/", save_dir="JobInja/v3"):
        self.base_url = base_url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        self.save_dir = save_dir
        os.makedirs(self.save_dir, exist_ok=True)
        self.all_jobs_data = []
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def get_links(self):
        try:
            response = requests.get(self.base_url, headers=self.headers, verify=False, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a', href=True)
            subpage_links = Queue()
            unique_links = set()

            for link in links:
                href = link['href']
                if href.startswith('/'):
                    full_url = self.base_url + href
                elif href.startswith('http'):
                    full_url = href
                else:
                    continue

                if full_url not in unique_links:
                    subpage_links.put(full_url)
                    unique_links.add(full_url)

            print(f"Found {len(unique_links)} unique links.")  # Debugging line
            return subpage_links
        except requests.exceptions.RequestException as e:
            logger.error(f"An error occurred while getting links: {e}")
            return Queue()

    @staticmethod
    def clean_persian_text(text):
        cleaned_text = re.sub(r'[^؀-ۿ\s]', ' ', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        return cleaned_text.strip()

    def extract_job_features(self, subpage_soup):
        job_features = {}

        job_title_tag = subpage_soup.find('h1')
        job_features['job_title'] = self.clean_persian_text(job_title_tag.get_text()) if job_title_tag else "اطلاعات موجود نیست"

        job_description_tag = subpage_soup.find('div', class_='job-description')
        job_features['job_description'] = self.clean_persian_text(job_description_tag.get_text()) if job_description_tag else "اطلاعات موجود نیست"

        additional_features = {
            'job_category': ('div', 'job-category'),
            'job_location': ('div', 'job-location'),
            'employment_type': ('div', 'employment-type'),
            'min_experience': ('div', 'min-experience'),
            'salary': ('div', 'salary'),
            'gender': ('div', 'gender'),
            'military_status': ('div', 'military-status'),
            'education_level': ('div', 'education-level'),
            'company_intro': ('div', 'company-intro'),
            'skills_required': ('div', 'skills-required'),
        }

        for feature, (tag, class_name) in additional_features.items():
            feature_tag = subpage_soup.find(tag, class_=class_name)
            job_features[feature] = self.clean_persian_text(feature_tag.get_text()) if feature_tag else "اطلاعات موجود نیست"

        job_features['content_snippet'] = self.clean_persian_text(subpage_soup.get_text())
        
        print(f"Extracted features: {job_features}")  # Debug: print extracted features
        return job_features

    def scrape_jobs(self):
        subpage_links = self.get_links()

        while not subpage_links.empty():
            subpage_url = subpage_links.get()
            try:
                subpage_response = requests.get(subpage_url, headers=self.headers, verify=False, timeout=10)
                subpage_response.raise_for_status()
                subpage_soup = BeautifulSoup(subpage_response.text, 'html.parser')
                
                job_data = self.extract_job_features(subpage_soup)

                if job_data['job_title'] != "اطلاعات موجود نیست":  # Ensure job title exists before saving
                    self.all_jobs_data.append(job_data)

                    # Save each job's features in a separate .txt file
                    job_title = job_data.get('job_title', 'JobInja')
                    file_path = os.path.join(self.save_dir, f"{job_title}.txt")
                    with open(file_path, 'w', encoding='utf-8') as file:
                        for key, value in job_data.items():
                            file.write(f"{key.capitalize()}: {value}\n")

                    print(f"Saved job data to {file_path}")  # Print confirmation of saving the file

                time.sleep(1)

            except requests.exceptions.RequestException as e:
                logger.error(f"An error occurred while accessing {subpage_url}: {e}")

    def save_dataset(self):
        mat_file_path = os.path.join(self.save_dir, "jobinja_data.mat")
        json_file_path = os.path.join(self.save_dir, "jobinja_data.json")

        job_data_dict = {f"job_{i}": job for i, job in enumerate(self.all_jobs_data)}
        
        if job_data_dict:
            sio.savemat(mat_file_path, job_data_dict)
            with open(json_file_path, 'w', encoding='utf-8') as json_file:
                json.dump(self.all_jobs_data, json_file, ensure_ascii=False, indent=4)

    def display_dataset(self):
        df = pd.DataFrame(self.all_jobs_data)
        if df.empty:
            print("No job data available.")
        else:
            print(df)
        return df

    def descriptive_statistics(self):
        df = pd.DataFrame(self.all_jobs_data)
        if not df.empty:
            print(df.describe(include='all'))
        else:
            print("No data available to generate statistics.")

# Example usage
if __name__ == "__main__":
    jobinja_scraper = Jobinja()
    jobinja_scraper.scrape_jobs()
    jobinja_scraper.save_dataset()
    df = jobinja_scraper.display_dataset()
    jobinja_scraper.descriptive_statistics()


No job data available.
No data available to generate statistics.


In [3]:
import requests
from bs4 import BeautifulSoup
import time
import re
import urllib3
import json
import scipy.io as sio
import os
import pandas as pd
from queue import Queue
import logging
from logging.handlers import RotatingFileHandler

# Configure logging with a rotating file handler
log_handler = RotatingFileHandler('jobinja_scraper.log', maxBytes=5*1024*1024, backupCount=2)
log_handler.setLevel(logging.ERROR)
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
log_handler.setFormatter(log_formatter)
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
logger.addHandler(log_handler)

class Jobinja:
    def __init__(self, base_url="https://jobinja.ir/", save_dir="JobInja/v3"):
        self.base_url = base_url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        self.save_dir = save_dir
        os.makedirs(self.save_dir, exist_ok=True)
        self.all_jobs_data = []
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def get_links(self):
        try:
            response = requests.get(self.base_url, headers=self.headers, verify=False, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a', href=True)
            subpage_links = Queue()
            unique_links = set()

            for link in links:
                href = link['href']
                if href.startswith('/'):
                    full_url = self.base_url + href
                elif href.startswith('http'):
                    full_url = href
                else:
                    continue

                if full_url not in unique_links:
                    subpage_links.put(full_url)
                    unique_links.add(full_url)

            print(f"Found {len(unique_links)} unique links.")  # Debugging line
            return subpage_links
        except requests.exceptions.RequestException as e:
            logger.error(f"An error occurred while getting links: {e}")
            return Queue()

    @staticmethod
    def clean_persian_text(text):
        cleaned_text = re.sub(r'[^؀-ۿ\s]', ' ', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        return cleaned_text.strip()

    def extract_job_features(self, subpage_soup):
        job_features = {}

        job_title_tag = subpage_soup.find('h1')
        job_features['job_title'] = self.clean_persian_text(job_title_tag.get_text()) if job_title_tag else "اطلاعات موجود نیست"

        job_description_tag = subpage_soup.find('div', class_='job-description')
        job_features['job_description'] = self.clean_persian_text(job_description_tag.get_text()) if job_description_tag else "اطلاعات موجود نیست"

        additional_features = {
            'job_category': ('div', 'job-category'),
            'job_location': ('div', 'job-location'),
            'employment_type': ('div', 'employment-type'),
            'min_experience': ('div', 'min-experience'),
            'salary': ('div', 'salary'),
            'gender': ('div', 'gender'),
            'military_status': ('div', 'military-status'),
            'education_level': ('div', 'education-level'),
            'company_intro': ('div', 'company-intro'),
            'skills_required': ('div', 'skills-required'),
        }

        for feature, (tag, class_name) in additional_features.items():
            feature_tag = subpage_soup.find(tag, class_=class_name)
            job_features[feature] = self.clean_persian_text(feature_tag.get_text()) if feature_tag else "اطلاعات موجود نیست"

        job_features['content_snippet'] = self.clean_persian_text(subpage_soup.get_text())
        
        print(f"Extracted features: {job_features}")  # Debug: print extracted features
        return job_features

    def scrape_jobs(self):
        subpage_links = self.get_links()

        while not subpage_links.empty():
            subpage_url = subpage_links.get()
            try:
                subpage_response = requests.get(subpage_url, headers=self.headers, verify=False, timeout=10)
                subpage_response.raise_for_status()
                subpage_soup = BeautifulSoup(subpage_response.text, 'html.parser')
                
                job_data = self.extract_job_features(subpage_soup)

                if job_data['job_title'] != "اطلاعات موجود نیست":  # Ensure job title exists before saving
                    self.all_jobs_data.append(job_data)

                    # Save each job's features in a separate .txt file
                    job_title = job_data.get('job_title', 'JobInja')
                    file_path = os.path.join(self.save_dir, f"{job_title}.txt")
                    with open(file_path, 'w', encoding='utf-8') as file:
                        for key, value in job_data.items():
                            file.write(f"{key.capitalize()}: {value}\n")

                    print(f"Saved job data to {file_path}")  # Print confirmation of saving the file
                else:
                    print(f"No valid job title for URL: {subpage_url}")  # Debug message for invalid title

                time.sleep(1)  # Add delay to avoid overwhelming the server

            except requests.exceptions.RequestException as e:
                logger.error(f"An error occurred while accessing {subpage_url}: {e}")
                print(f"Request failed for URL: {subpage_url}. Error: {e}")  # Print error for visibility

    def save_dataset(self):
        if self.all_jobs_data:
            mat_file_path = os.path.join(self.save_dir, "jobinja_data.mat")
            json_file_path = os.path.join(self.save_dir, "jobinja_data.json")

            job_data_dict = {f"job_{i}": job for i, job in enumerate(self.all_jobs_data)}
            sio.savemat(mat_file_path, job_data_dict)

            with open(json_file_path, 'w', encoding='utf-8') as json_file:
                json.dump(self.all_jobs_data, json_file, ensure_ascii=False, indent=4)
            print("Dataset saved successfully.")  # Confirmation message for saved dataset
        else:
            print("No job data to save.")  # Informative message if no data

    def display_dataset(self):
        df = pd.DataFrame(self.all_jobs_data)
        if df.empty:
            print("No job data available.")
        else:
            print(df)
        return df

    def descriptive_statistics(self):
        df = pd.DataFrame(self.all_jobs_data)
        if not df.empty:
            print(df.describe(include='all'))
        else:
            print("No data available to generate statistics.")

# Example usage
if __name__ == "__main__":
    jobinja_scraper = Jobinja()
    jobinja_scraper.scrape_jobs()
    jobinja_scraper.save_dataset()
    df = jobinja_scraper.display_dataset()
    jobinja_scraper.descriptive_statistics()


No job data to save.
No job data available.
No data available to generate statistics.


In [5]:
import requests
from bs4 import BeautifulSoup
import time
import re
import urllib3
import json
import scipy.io as sio
import os
import pandas as pd
from queue import Queue
import logging
from logging.handlers import RotatingFileHandler

# Configure logging with a rotating file handler
log_handler = RotatingFileHandler('jobinja_scraper.log', maxBytes=5*1024*1024, backupCount=2)
log_handler.setLevel(logging.ERROR)
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
log_handler.setFormatter(log_formatter)
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
logger.addHandler(log_handler)

class Jobinja:
    def __init__(self, base_url="https://jobinja.ir/", save_dir="JobInja/v3"):
        self.base_url = base_url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        self.save_dir = save_dir
        os.makedirs(self.save_dir, exist_ok=True)
        self.all_jobs_data = []
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def get_links(self):
        try:
            response = requests.get(self.base_url, headers=self.headers, verify=False, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a', href=True)
            subpage_links = Queue()
            unique_links = set()

            for link in links:
                href = link['href']
                if href.startswith('/'):
                    full_url = self.base_url + href
                elif href.startswith('http'):
                    full_url = href
                else:
                    continue

                if full_url not in unique_links:
                    subpage_links.put(full_url)
                    unique_links.add(full_url)

            print(f"Found {len(unique_links)} unique links.")  # Debugging line
            return subpage_links
        except requests.exceptions.RequestException as e:
            logger.error(f"An error occurred while getting links: {e}")
            return Queue()

    @staticmethod
    def clean_persian_text(text):
        cleaned_text = re.sub(r'[^؀-ۿ\s]', ' ', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        return cleaned_text.strip()

    def extract_job_features(self, subpage_soup):
        job_features = {}

        job_title_tag = subpage_soup.find('h1')
        job_features['job_title'] = self.clean_persian_text(job_title_tag.get_text()) if job_title_tag else "اطلاعات موجود نیست"

        job_description_tag = subpage_soup.find('div', class_='job-description')
        job_features['job_description'] = self.clean_persian_text(job_description_tag.get_text()) if job_description_tag else "اطلاعات موجود نیست"

        additional_features = {
            'job_category': ('div', 'job-category'),
            'job_location': ('div', 'job-location'),
            'employment_type': ('div', 'employment-type'),
            'min_experience': ('div', 'min-experience'),
            'salary': ('div', 'salary'),
            'gender': ('div', 'gender'),
            'military_status': ('div', 'military-status'),
            'education_level': ('div', 'education-level'),
            'company_intro': ('div', 'company-intro'),
            'skills_required': ('div', 'skills-required'),
        }

        for feature, (tag, class_name) in additional_features.items():
            feature_tag = subpage_soup.find(tag, class_=class_name)
            job_features[feature] = self.clean_persian_text(feature_tag.get_text()) if feature_tag else "اطلاعات موجود نیست"

        job_features['content_snippet'] = self.clean_persian_text(subpage_soup.get_text())
        
        print(f"Extracted features: {job_features}")  # Debug: print extracted features
        return job_features

    def scrape_jobs(self):
        subpage_links = self.get_links()

        while not subpage_links.empty():
            subpage_url = subpage_links.get()
            try:
                subpage_response = requests.get(subpage_url, headers=self.headers, verify=False, timeout=10)
                subpage_response.raise_for_status()
                subpage_soup = BeautifulSoup(subpage_response.text, 'html.parser')
                
                job_data = self.extract_job_features(subpage_soup)

                # Debug: Check if job_data has a valid title
                if job_data['job_title'] and job_data['job_title'] != "اطلاعات موجود نیست":
                    self.all_jobs_data.append(job_data)

                    # Save each job's features in a separate .txt file
                    job_title = job_data.get('job_title', 'JobInja')
                    file_path = os.path.join(self.save_dir, f"{job_title}.txt")
                    with open(file_path, 'w', encoding='utf-8') as file:
                        for key, value in job_data.items():
                            file.write(f"{key.capitalize()}: {value}\n")

                    print(f"Saved job data to {file_path}")  # Print confirmation of saving the file
                else:
                    print(f"No valid job title for URL: {subpage_url}")  # Debug message for invalid title

                time.sleep(1)  # Add delay to avoid overwhelming the server

            except requests.exceptions.RequestException as e:
                logger.error(f"An error occurred while accessing {subpage_url}: {e}")
                print(f"Request failed for URL: {subpage_url}. Error: {e}")  # Print error for visibility

    def save_dataset(self):
        if self.all_jobs_data:
            mat_file_path = os.path.join(self.save_dir, "jobinja_data.mat")
            json_file_path = os.path.join(self.save_dir, "jobinja_data.json")

            job_data_dict = {f"job_{i}": job for i, job in enumerate(self.all_jobs_data)}
            sio.savemat(mat_file_path, job_data_dict)

            with open(json_file_path, 'w', encoding='utf-8') as json_file:
                json.dump(self.all_jobs_data, json_file, ensure_ascii=False, indent=4)
            print("Dataset saved successfully.")  # Confirmation message for saved dataset
        else:
            print("No job data to save.")  # Informative message if no data

    def display_dataset(self):
        df = pd.DataFrame(self.all_jobs_data)
        if df.empty:
            print("No job data available.")
        else:
            print(df)
        return df

    def descriptive_statistics(self):
        df = pd.DataFrame(self.all_jobs_data)
        if not df.empty:
            print(df.describe(include='all'))
        else:
            print("No data available to generate statistics.")

# Example usage
if __name__ == "__main__":
    jobinja_scraper = Jobinja()
    jobinja_scraper.scrape_jobs()
    jobinja_scraper.save_dataset()
    df = jobinja_scraper.display_dataset()
    jobinja_scraper.descriptive_statistics()


No job data to save.
No job data available.
No data available to generate statistics.


In [7]:
import requests
from bs4 import BeautifulSoup
import time
import re
import urllib3
import json
import scipy.io as sio
import os
import pandas as pd
from queue import Queue
import logging
from logging.handlers import RotatingFileHandler

# Configure logging with a rotating file handler
log_handler = RotatingFileHandler('jobinja_scraper.log', maxBytes=5*1024*1024, backupCount=2)
log_handler.setLevel(logging.ERROR)
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
log_handler.setFormatter(log_formatter)
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
logger.addHandler(log_handler)

class Jobinja:
    def __init__(self, base_url="https://jobinja.ir/", save_dir="JobInja/v3"):
        self.base_url = base_url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        self.save_dir = save_dir
        os.makedirs(self.save_dir, exist_ok=True)
        self.all_jobs_data = []
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def get_links(self):
        try:
            response = requests.get(self.base_url, headers=self.headers, verify=False, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a', href=True)
            subpage_links = Queue()
            unique_links = set()

            for link in links:
                href = link['href']
                if href.startswith('/'):
                    full_url = self.base_url + href
                elif href.startswith('http'):
                    full_url = href
                else:
                    continue

                if full_url not in unique_links:
                    subpage_links.put(full_url)
                    unique_links.add(full_url)

            print(f"Found {len(unique_links)} unique links.")  # Debugging line
            return subpage_links
        except requests.exceptions.RequestException as e:
            logger.error(f"An error occurred while getting links: {e}")
            return Queue()

    @staticmethod
    def clean_persian_text(text):
        cleaned_text = re.sub(r'[^؀-ۿ\s]', ' ', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        return cleaned_text.strip()

    def extract_job_features(self, subpage_soup):
        job_features = {}

        job_title_tag = subpage_soup.find('h1')
        job_features['job_title'] = self.clean_persian_text(job_title_tag.get_text()) if job_title_tag else "اطلاعات موجود نیست"

        job_description_tag = subpage_soup.find('div', class_='job-description')
        job_features['job_description'] = self.clean_persian_text(job_description_tag.get_text()) if job_description_tag else "اطلاعات موجود نیست"

        additional_features = {
            'job_category': ('div', 'job-category'),
            'job_location': ('div', 'job-location'),
            'employment_type': ('div', 'employment-type'),
            'min_experience': ('div', 'min-experience'),
            'salary': ('div', 'salary'),
            'gender': ('div', 'gender'),
            'military_status': ('div', 'military-status'),
            'education_level': ('div', 'education-level'),
            'company_intro': ('div', 'company-intro'),
            'skills_required': ('div', 'skills-required'),
        }

        for feature, (tag, class_name) in additional_features.items():
            feature_tag = subpage_soup.find(tag, class_=class_name)
            job_features[feature] = self.clean_persian_text(feature_tag.get_text()) if feature_tag else "اطلاعات موجود نیست"

        job_features['content_snippet'] = self.clean_persian_text(subpage_soup.get_text())
        
        print(f"Extracted features: {job_features}")  # Debug: print extracted features
        return job_features

    def scrape_jobs(self):
        subpage_links = self.get_links()

        while not subpage_links.empty():
            subpage_url = subpage_links.get()
            try:
                subpage_response = requests.get(subpage_url, headers=self.headers, verify=False, timeout=10)
                subpage_response.raise_for_status()
                subpage_soup = BeautifulSoup(subpage_response.text, 'html.parser')
                
                job_data = self.extract_job_features(subpage_soup)

                # Only append if job data is valid and title is not empty
                if job_data['job_title'] and job_data['job_title'] != "اطلاعات موجود نیست":
                    self.all_jobs_data.append(job_data)

                    # Save each job's features in a separate .txt file
                    job_title = re.sub(r'[<>:"/\\|?*]', '_', job_data['job_title'])  # Clean job title for file naming
                    file_path = os.path.join(self.save_dir, f"{job_title}.txt")
                    with open(file_path, 'w', encoding='utf-8') as file:
                        for key, value in job_data.items():
                            file.write(f"{key.capitalize()}: {value}\n")

                    print(f"Saved job data to {file_path}")  # Print confirmation of saving the file
                else:
                    print(f"No valid job title for URL: {subpage_url}")  # Debug message for invalid title

                time.sleep(1)  # Add delay to avoid overwhelming the server

            except requests.exceptions.RequestException as e:
                logger.error(f"An error occurred while accessing {subpage_url}: {e}")
                print(f"Request failed for URL: {subpage_url}. Error: {e}")  # Print error for visibility

    def save_dataset(self):
        if self.all_jobs_data:
            mat_file_path = os.path.join(self.save_dir, "jobinja_data.mat")
            json_file_path = os.path.join(self.save_dir, "jobinja_data.json")

            job_data_dict = {f"job_{i}": job for i, job in enumerate(self.all_jobs_data)}
            sio.savemat(mat_file_path, job_data_dict)

            with open(json_file_path, 'w', encoding='utf-8') as json_file:
                json.dump(self.all_jobs_data, json_file, ensure_ascii=False, indent=4)
            print("Dataset saved successfully.")  # Confirmation message for saved dataset
        else:
            print("No job data to save.")  # Informative message if no data

    def display_dataset(self):
        df = pd.DataFrame(self.all_jobs_data)
        if df.empty:
            print("No job data available.")
        else:
            print(df)
        return df

    def descriptive_statistics(self):
        df = pd.DataFrame(self.all_jobs_data)
        if not df.empty:
            print(df.describe(include='all'))
        else:
            print("No data available to generate statistics.")

# Example usage
if __name__ == "__main__":
    jobinja_scraper = Jobinja()
    jobinja_scraper.scrape_jobs()
    jobinja_scraper.save_dataset()
    df = jobinja_scraper.display_dataset()
    jobinja_scraper.descriptive_statistics()


No job data to save.
No job data available.
No data available to generate statistics.
