In [None]:
import json
import faiss
import numpy as np
import re
import os
import pandas as pd
from tqdm import tqdm, trange
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from chatgroq import ChatGroq

# Load environment variables (e.g., API keys for Llama RAG) via python-dotenv
load_dotenv()

# Read the Groq API key from the environment.
# NOTE(review): this value is not passed to ChatGroq below -- presumably the
# client reads GROQ_API_KEY from the environment on its own; confirm.
groq_api_key = os.getenv('GROQ_API_KEY')

# Module-level chat client; ask_question() refers to it as the global `llm`.
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'chatgroq'", so the import that
# provides ChatGroq needs checking before this constructor can run.
llm = ChatGroq(
    model="llama3-8b-8192",
    temperature=0, 
    max_tokens=None,
    timeout=None,
    max_retries=2
)

  from tqdm.autonotebook import tqdm, trange


ModuleNotFoundError: No module named 'chatgroq'

# # Job Description Generator


In [None]:
# Load the three datasets as DataFrames; `lines=True` reads each file as one
# JSON object per line. Paths are relative to the notebook's working directory.
# NOTE(review): `salaries` and `resumes` are not referenced by any later cell
# visible in this notebook -- confirm they are still needed.
salaries = pd.read_json("data/json/salaries.json", lines=True)
resumes = pd.read_json("data/json/Entity Recognition in Resumes.json", lines=True)
# `it_jobs` is the object later handed to load_dataset_and_build_index().
it_jobs = pd.read_json("data/json/IT Job Desc Annotated Detailed.json", lines=True)

In [None]:
# ## Step 1: Initialization

class JobDescriptionGenerator:
    """State container for the interactive job description workflow.

    Holds the dictionary of collected answers (`data`), the sentence
    embedding model (`model`), and the similarity-search state (`index`,
    `documents`).
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Set default answers and load the embedding model.

        Args:
            model_name: Name passed to SentenceTransformer. Defaults to the
                model this notebook has always used.
        """
        # Initialize data with default values
        self.data = {
            "Position": "N/A",
            "Specialization": "N/A",
            "Work Model": "N/A",
            "Remote Location": "N/A",
            "Remote Timezone": "N/A",
            "Technical Equipment": "N/A",
            "Remote Percentage": "N/A",
            "BI Tools": "N/A",
            "Required Tools": "N/A",
            # The collection and rendering code reads and writes "Tools", so
            # it needs a default too; otherwise rendering a description
            # before the questionnaire has run raises KeyError.
            "Tools": "N/A",
            "Visualization Tools": "N/A",
            "Statistical Methods": "N/A",
            "Big Data Tools": "N/A",
            "Experience Level": "N/A",
            "Leadership Skills": "None",
            "Educational Requirements": "None",
            "Project Leadership": "No",
            "Compensation": "N/A",
            "Home Office Allowance": "None",
            "Remote Benefits": "None",
            "Additional Benefits": "None"
        }
        # Load pre-trained model for embedding generation
        self.model = SentenceTransformer(model_name)
        # FAISS index for similarity search; stays None until one is built
        self.index = None
        # Texts stored in the same order as the vectors in the index
        self.documents = []

In [None]:
    # ## Step 2: Data Cleaning and Preprocessing
def clean_and_preprocess_dataset(self, dataset_path, text_key='job_description'):
        """Return normalised, de-duplicated job description texts.

        Args:
            dataset_path: Where the records come from. May be a path to a
                JSON or JSON Lines file, a pandas DataFrame (anything with
                ``to_dict(orient='records')``), a single record dict, or an
                iterable of record dicts.
            text_key: Record field holding the description text.
                NOTE(review): the default matches the original code; confirm
                the dataset actually uses this field name.

        Returns:
            list[str]: Lower-cased descriptions with every character other
            than letters, digits and whitespace removed. Order of first
            appearance is kept, so the result is reproducible across runs.
            Records whose text is missing, not a string, or empty after
            cleaning are skipped.
        """
        records = _load_job_records(dataset_path)

        # dict keys give an order-preserving set; de-duplicating AFTER the
        # clean-up means texts that differ only in punctuation collapse too.
        cleaned = {}
        for item in records:
            text = item.get(text_key) if isinstance(item, dict) else None
            if not isinstance(text, str):
                # covers missing keys, None, and NaN from DataFrame rows
                continue
            normalised = re.sub(r'[^a-zA-Z0-9\s]', '', text.strip().lower())
            if normalised.strip():
                cleaned[normalised] = None

        return list(cleaned)


def _load_job_records(dataset):
        """Return `dataset` as a list of records, whatever form it arrives in."""
        if isinstance(dataset, (str, bytes, os.PathLike)):
            with open(dataset, 'r', encoding='utf-8') as f:
                raw_text = f.read()
            try:
                loaded = json.loads(raw_text)
            except json.JSONDecodeError:
                # Not one JSON document: treat it as JSON Lines (one object per line).
                loaded = [json.loads(line) for line in raw_text.splitlines() if line.strip()]
            return loaded if isinstance(loaded, list) else [loaded]
        if isinstance(dataset, dict):
            return [dataset]
        if hasattr(dataset, 'to_dict'):
            # Duck-typed pandas DataFrame
            return dataset.to_dict(orient='records')
        return list(dataset)

In [None]:
    # ## Step 3: Loading Dataset and Building FAISS Index
def load_dataset_and_build_index(self, dataset):
        """Clean the dataset, embed every description and build the FAISS index.

        Args:
            dataset: Forwarded unchanged to ``self.clean_and_preprocess_dataset``.

        Side effects:
            ``self.documents`` is replaced with the cleaned descriptions.
            ``self.index`` becomes a new ``faiss.IndexFlatL2`` over their
            embeddings, or ``None`` when there is nothing to index, so a
            stale index is never searched against new documents.
        """
        # Clean and preprocess dataset
        print("Cleaning and preprocessing dataset...")
        job_descriptions = self.clean_and_preprocess_dataset(dataset)
        self.documents = job_descriptions
        # Invalidate any index from an earlier call before building a new one.
        self.index = None

        if not job_descriptions:
            print("Error: No embeddings found to build the FAISS index.")
            return

        # Encode the whole list in one call so the model can batch internally;
        # the encoder draws its own progress bar.
        print("Generating embeddings...")
        embeddings = self.model.encode(job_descriptions, show_progress_bar=True)
        # FAISS expects a C-contiguous float32 matrix.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

        # Create a FAISS index and add embeddings
        print("Building FAISS index...")
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        self.index = index

In [None]:
 # ## Step 4: Finding Similar Job Descriptions
def find_similar_jobs(self, query, k=3):
        """Return up to `k` stored job descriptions closest to `query`.

        Args:
            query: Free-text job role or description.
            k: Maximum number of results; capped at the number of stored
                documents.

        Returns:
            list[str]: Matching entries of ``self.documents``, nearest first.

        Raises:
            RuntimeError: If no index has been built yet.
        """
        if self.index is None:
            raise RuntimeError(
                "No FAISS index available; call load_dataset_and_build_index() first."
            )

        total = len(self.documents)
        k = min(int(k), total)
        if k <= 0:
            return []

        # Create embedding for the query
        print("Generating embedding for the query...")
        query_embedding = np.asarray(self.model.encode([query]), dtype=np.float32)

        # Search the FAISS index for similar job descriptions
        print("Searching for similar job descriptions...")
        _, indices = self.index.search(query_embedding, k)

        # FAISS pads missing neighbours with -1; without this filter a -1
        # would silently index the LAST document.
        return [self.documents[idx] for idx in indices[0] if 0 <= idx < total]


In [None]:
    # ## Step 5: Interactive User Input
def ask_question(self, question, options=None, multiple=False):
        """Print a question and return the user's answer.

        Args:
            question: Text shown to the user.
            options: Optional list of choices, shown numbered from 1.
            multiple: When True (and `options` is given) the user may enter
                several numbers separated by commas.

        Returns:
            * single choice: the selected option;
            * multiple choice: a list of the selected options in the order
              entered, without repeats (an empty entry yields ``[]``);
            * no options: the reply from the module-level `llm`.

        Invalid entries (non-numbers or numbers outside the displayed range)
        are rejected with a message and the prompt is repeated.
        """
        print(question)

        if not options:
            # Use Llama RAG to get an enhanced response for a more complex question
            # NOTE(review): relies on the global `llm` exposing an `ask()`
            # method -- verify against the client library in use.
            user_input = input("Your question or query: ").strip()
            response = llm.ask(user_input)
            print(f"AI Assistant Response: {response}")
            return response

        for i, option in enumerate(options, 1):
            print(f"{i}. {option}")

        if multiple:
            while True:
                selected_options = input("Enter the numbers of all applicable options, separated by commas: ")
                tokens = [token.strip() for token in selected_options.split(",") if token.strip()]
                try:
                    picks = [int(token) - 1 for token in tokens]
                except ValueError:
                    print("Invalid input. Please enter numbers separated by commas.")
                    continue
                # Explicit range check: a bare options[-1] would otherwise
                # turn an entry of "0" into the last option.
                if all(0 <= pick < len(options) for pick in picks):
                    return [options[pick] for pick in dict.fromkeys(picks)]
                print("Invalid choice. Please enter only the displayed numbers.")

        while True:
            try:
                choice = int(input("Please choose an option: ")) - 1
            except ValueError:
                print("Invalid input. Please enter a number.")
                continue
            if 0 <= choice < len(options):
                return options[choice]
            print("Invalid choice. Please enter one of the displayed numbers.")

def ask_text_input(self, prompt):
        """Show `prompt`, read one line from the user and return it stripped of surrounding whitespace."""
        answer = input(prompt)
        return answer.strip()

In [None]:
    # ## Step 6: Collecting Job Information
def collect_position_info(self):
        """Ask about the role and its focus, storing every answer in ``self.data``.

        Always writes "Position". For a Data Scientist it also writes
        "Specialization" plus the follow-ups of the chosen specialization;
        for a Data Analyst it writes "Focus Area" plus the follow-ups of the
        chosen focus. Both roles end with the "Tools" question.
        """
        role = self.ask_question("What position are you hiring for?", ["Data Scientist", "Data Analyst"])
        self.data["Position"] = role

        if role == "Data Scientist":
            focus = self.ask_question("Is there a specific focus for this role?", ["Machine Learning", "Statistics", "Big Data"])
            self.data["Specialization"] = focus

            if focus == "Machine Learning":
                techniques = self.ask_question("Are there specific machine learning techniques required?", ["Deep Learning", "NLP", "Reinforcement Learning"], multiple=True)
                self.data["Machine Learning Focus"] = techniques

                # One multi-select follow-up per technique that has one,
                # asked in this order.
                technique_followups = (
                    ("Deep Learning", "Frameworks", "Are there specific frameworks required?", ["TensorFlow", "Keras", "PyTorch"]),
                    ("NLP", "NLP Tools", "Are there specific NLP libraries or tools that should be used?", ["spaCy", "Hugging Face", "NLTK"]),
                )
                for technique, key, followup, choices in technique_followups:
                    if technique in techniques:
                        self.data[key] = self.ask_question(followup, choices, multiple=True)
            elif focus == "Statistics":
                self.data["Statistical Methods"] = self.ask_text_input("Which statistical methods are particularly important (e.g., regression analysis, ANOVA)?: ")
            elif focus == "Big Data":
                self.data["Big Data Tools"] = self.ask_text_input("Are there specific Big Data tools the candidate should be proficient with (e.g., Spark, Hadoop)?: ")

            tools_prompt = "Please list any specific tools required for this role (e.g., Python, Java, Spark): "

        elif role == "Data Analyst":
            area = self.ask_question("What is the main focus for this role?", ["Statistical Analysis", "Business Intelligence", "Data Visualization"])
            self.data["Focus Area"] = area

            if area == "Business Intelligence":
                bi_tools = self.ask_question("Which BI tools should the candidate use?", ["PowerBI", "Tableau", "QlikView"], multiple=True)
                self.data["BI Tools"] = bi_tools
                if "PowerBI" in bi_tools:
                    self.data["PowerBI Features"] = self.ask_text_input("Are there specific PowerBI features the candidate should know (e.g., DAX, Power Query)?: ")
            elif area == "Data Visualization":
                self.data["Visualization Tools"] = self.ask_text_input("Which visualization tools are required (e.g., Matplotlib, D3.js, ggplot)?: ")

            tools_prompt = "Please list any specific tools required for this role (e.g., Excel, PowerBI, SQL): "

        else:
            # Any other answer has no follow-up questions.
            return

        self.data["Tools"] = self.ask_text_input(tools_prompt)

def collect_work_model_info(self):
        """Ask where the work happens and record the answers in ``self.data``.

        Always writes "Work Model". A Hybrid role adds "Remote Percentage";
        a Remote role adds "Remote Location", "Remote Timezone" and
        "Technical Equipment". On-Site has no follow-up questions.
        """
        chosen_model = self.ask_question("Is the position On-Site, Remote, or Hybrid?", ["On-Site", "Remote", "Hybrid"])
        self.data["Work Model"] = chosen_model

        if chosen_model == "Hybrid":
            self.data["Remote Percentage"] = self.ask_question("What percentage of work is Remote vs. On-Site?", ["70% Remote / 30% On-Site", "50% Remote / 50% On-Site"])
            return

        if chosen_model != "Remote":
            return

        # Remote-only follow-ups: (data key, question, options), asked in order.
        remote_followups = (
            ("Remote Location", "Can the role be remote anywhere in Germany, EU-wide, or globally?", ["Germany", "EU-wide", "Worldwide"]),
            ("Remote Timezone", "Are there timezone or work hour requirements?", ["No specific requirements", "CET timezone preferred", "Fixed working hours required"]),
            ("Technical Equipment", "Will technical equipment be provided for remote work?", ["Yes", "No"]),
        )
        for key, followup, choices in remote_followups:
            self.data[key] = self.ask_question(followup, choices)

def collect_qualifications_info(self):
        """Ask for the experience level and its level-specific follow-up.

        Always writes "Experience Level". Junior adds "Educational
        Requirements", Mid-Level adds "Project Experience", and Senior adds
        "Project Leadership" plus "Leadership Skills" when leadership is
        required.
        """
        level = self.ask_question("What level of experience is required for this role?", ["Junior", "Mid-Level", "Senior"])
        self.data["Experience Level"] = level

        if level == "Senior":
            needs_leadership = self.ask_question("Is project leadership experience required?", ["Yes", "No"])
            self.data["Project Leadership"] = needs_leadership
            if needs_leadership == "Yes":
                self.data["Leadership Skills"] = self.ask_text_input("What leadership skills are particularly important (e.g., team leadership, strategic planning)?: ")
            return

        # The other levels each have a single free-text follow-up.
        text_followups = (
            ("Junior", "Educational Requirements", "Are there specific educational requirements (e.g., Bachelor's in Computer Science)?: "),
            ("Mid-Level", "Project Experience", "What project experience should a mid-level candidate have (e.g., data analysis projects, model training)?: "),
        )
        for name, key, prompt in text_followups:
            if level == name:
                self.data[key] = self.ask_text_input(prompt)
                break

def collect_compensation_info(self):
        """Ask about compensation and remote benefits, storing answers in ``self.data``.

        Always writes "Compensation". When remote benefits apply, each
        benefit that gets a non-empty answer is joined into "Remote
        Benefits", and the home office allowance is stored as
        "<periodicity> <amount>". Questions left blank leave the existing
        value untouched instead of replacing it with an empty string.
        """
        self.data["Compensation"] = self.ask_question("What does the compensation package include?", ["Fixed salary", "Variable compensation", "Both"])

        remote_benefits = self.ask_question("Are there specific benefits for remote employees?", ["Yes", "No"])
        if remote_benefits != "Yes":
            return

        # Additional follow-up questions for remote benefits: (label, prompt)
        benefit_prompts = (
            ("Health Benefits", "Specify any health benefits (e.g., health insurance, wellness programs): "),
            ("Internet Stipend", "Specify if there's an internet stipend or reimbursement: "),
            ("Professional Development", "Specify if there are professional development funds (e.g., training, courses): "),
            ("Equipment Allowance", "Specify any equipment allowance for remote work: "),
        )
        benefits = []
        for label, prompt in benefit_prompts:
            answer = self.ask_text_input(prompt)
            if answer:
                benefits.append(f"{label}: {answer}")

        # Only overwrite the stored value when something was actually entered.
        if benefits:
            self.data["Remote Benefits"] = ", ".join(benefits)

        # Allowance follow-up question
        periodicity = self.ask_question("Is the home office allowance provided monthly or yearly?", ["Monthly", "Yearly"])
        amount = self.ask_text_input(f"Enter the {periodicity.lower()} allowance amount (e.g., 50 Euro): ")
        if amount:
            self.data["Home Office Allowance"] = f"{periodicity} {amount}"

In [None]:
    # ## Step 7: Generating Job Description
def generate_job_description(self):
        """Print the collected answers as a "Label: value" listing.

        The standard fields are printed first in a fixed order; any other
        key present in ``self.data`` (for example "Frameworks" or "Focus
        Area") follows in insertion order so no collected answer is dropped.
        Missing keys print as "N/A", and list answers are joined with commas.
        """
        # (printed label, key in self.data)
        fields = (
            ("Position", "Position"),
            ("Specialization", "Specialization"),
            ("Work Model", "Work Model"),
            ("Remote Location", "Remote Location"),
            ("Remote Timezone", "Remote Timezone"),
            ("Technical Equipment", "Technical Equipment"),
            ("Remote Percentage", "Remote Percentage"),
            ("BI Tools", "BI Tools"),
            ("Required Tools", "Tools"),
            ("Visualization Tools", "Visualization Tools"),
            ("Statistical Methods", "Statistical Methods"),
            ("Big Data Tools", "Big Data Tools"),
            ("Experience Level", "Experience Level"),
            ("Leadership Skills", "Leadership Skills"),
            ("Educational Requirements", "Educational Requirements"),
            ("Project Leadership", "Project Leadership"),
            ("Compensation", "Compensation"),
            ("Home Office Allowance", "Home Office Allowance"),
            ("Remote Benefits", "Remote Benefits"),
            ("Additional Benefits", "Additional Benefits"),
        )

        def render(value):
            # Multi-select answers arrive as lists; show them as plain text.
            if isinstance(value, (list, tuple)):
                return ", ".join(str(entry) for entry in value) or "N/A"
            return str(value)

        data = self.data
        # The questionnaire stores tools under "Tools"; fall back to the
        # "Required Tools" default so a fresh instance does not raise KeyError.
        tools = data.get("Tools", data.get("Required Tools", "N/A"))

        lines = []
        for label, key in fields:
            value = tools if key == "Tools" else data.get(key, "N/A")
            lines.append(f"{label}: {render(value)}")

        already_shown = {key for _, key in fields} | {"Required Tools"}
        for key, value in data.items():
            if key not in already_shown:
                lines.append(f"{key}: {render(value)}")

        print("\n--- Generated Job Description ---")
        # One field per line, without the source-code indentation the old
        # triple-quoted template leaked into the output.
        print("\n".join(lines))

In [None]:
if __name__ == "__main__":
    # The step functions in the cells above take `self` but are written at
    # module level, outside the class body, so the class does not have them
    # as methods. Attach any that exist at module level and are still
    # missing on the class; nothing is overwritten.
    _STEP_FUNCTION_NAMES = (
        "clean_and_preprocess_dataset",
        "load_dataset_and_build_index",
        "find_similar_jobs",
        "ask_question",
        "ask_text_input",
        "collect_position_info",
        "collect_work_model_info",
        "collect_qualifications_info",
        "collect_compensation_info",
        "generate_job_description",
    )
    for _name in _STEP_FUNCTION_NAMES:
        _func = globals().get(_name)
        if callable(_func) and not hasattr(JobDescriptionGenerator, _name):
            setattr(JobDescriptionGenerator, _name, _func)

    # `run` is called below but is not defined in any cell shown; provide it
    # only when the class does not already have one.
    # NOTE(review): the question order follows the order of the cells above --
    # confirm it is the intended flow.
    if not hasattr(JobDescriptionGenerator, "run"):
        def _run(self):
            """Ask every question group in order, then print the description."""
            self.collect_position_info()
            self.collect_work_model_info()
            self.collect_qualifications_info()
            self.collect_compensation_info()
            self.generate_job_description()

        JobDescriptionGenerator.run = _run

    # Create an instance of the JobDescriptionGenerator
    generator = JobDescriptionGenerator()

    # Print all available methods to check if the class is properly defined
    print(dir(generator))  # This will list all attributes and methods of the object

    # Load Dataset and Build Index
    generator.load_dataset_and_build_index(it_jobs)

    # Run Job Description Generator
    generator.run()

    # Find Similar Job Descriptions
    query = input("Enter a job role or description to find similar roles: ")
    similar_jobs = generator.find_similar_jobs(query)
    print("\n--- Similar Job Descriptions ---")
    for job in similar_jobs:
        print(job)


In [None]:
    # ### Step 9: Load Dataset and Build Index
# Create a generator and index the `it_jobs` dataset.
# NOTE(review): this repeats the instance creation and index build done in
# the `__main__` cell above and rebinds the global `generator`.
generator = JobDescriptionGenerator()
generator.load_dataset_and_build_index(it_jobs)

In [None]:
    # ### Step 10: Run Job Description Generator
# Presumably starts the interactive questionnaire on the global `generator`.
# NOTE(review): no `run` method appears in the JobDescriptionGenerator class
# body shown in this notebook -- confirm where it is defined.
generator.run()

In [None]:
    # ### Step 11: Find Similar Job Descriptions
# Ask for a free-text query and look up similar stored job descriptions
# (find_similar_jobs uses k=3 unless told otherwise).
query = input("Enter a job role or description to find similar roles: ")
similar_jobs = generator.find_similar_jobs(query)
print("\n--- Similar Job Descriptions ---")
for job in similar_jobs:
        # One description per output line
        print(job)