# Enhanced NLP Workbook for Recruitment Chatbot with Advanced Feature Engineering, State Machine, Sentiment Analysis, Personalization and Graph Database Integration

In [7]:
!conda list

# packages in environment at C:\Users\gerri\.conda\envs\capstone:
#
# Name                    Version                   Build  Channel
ace-tools                 0.0                      pypi_0    pypi
aiohappyeyeballs          2.4.3              pyhd8ed1ab_0    conda-forge
aiohttp                   3.10.10         py310h38315fa_0    conda-forge
aiosignal                 1.3.1              pyhd8ed1ab_0    conda-forge
altair                    5.4.1              pyhd8ed1ab_1    conda-forge
annotated-types           0.7.0              pyhd8ed1ab_0    conda-forge
anyio                     4.6.2.post1        pyhd8ed1ab_0    conda-forge
arrow-cpp                 16.1.0               h7cd61ee_0  
asttokens                 2.4.1              pyhd8ed1ab_0    conda-forge
async-timeout             4.0.3              pyhd8ed1ab_0    conda-forge
attrs                     24.2.0             pyh71513ae_0    conda-forge
aws-c-auth                0.6.21               h1ab79aa_0    conda-forge
aws-c-cal

### Step 1: Importing Required Libraries

We import necessary Python libraries for data manipulation, feature extraction, modeling, evaluation, and visualization.

- **Numpy and Pandas** for data manipulation.
- **Scikit-learn** for model building, feature extraction, and evaluation.
- **VaderSentiment** for sentiment analysis.
- **Matplotlib and Seaborn** for data visualization.
- **Tqdm** for progress bars to monitor loops and training processes.
- **Transitions** for implementing state machines to manage the conversation flow.
- **Neo4j** for using graph databases for a flexible conversation flow.

In [None]:

# Step 1: Import Libraries
import warnings
warnings.filterwarnings("ignore")
import json
import os
import random
import re

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.prompts.prompt import PromptTemplate
from langchain_groq import ChatGroq
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from transitions import Machine

# Read settings from a local .env file into the process environment
# (e.g., API keys for Llama RAG).
load_dotenv()

# Groq credential taken from the environment; None when GROQ_API_KEY is unset.
groq_api_key = os.environ.get('GROQ_API_KEY')


In [None]:
# Prompt text for the job-advertisement generator; `{information}` is the
# placeholder that is filled with the vacancy details.
query = (
    "\n"
    "    given the information {information} about a vacancy I want you to create a professional job advertisement\n"
    "    "
)

In [None]:
# Wrap the raw prompt text in a template whose only input variable is `information`.
prompt_template = PromptTemplate(template=query, input_variables=["information"])

In [None]:

# Step 3: Set up Llama RAG
class LlamaRAG:
    """Retrieval-Augmented Generation helper built on a local Llama model.

    On construction the class loads the LLM, wraps it in a service context and
    builds a vector index over the documents found in ``data_dir``.

    NOTE(review): ``LlamaCpp``, ``ServiceContext``, ``SimpleDirectoryReader`` and
    ``VectorStoreIndex`` are not imported in the notebook cells shown; they must
    be imported before this class is instantiated.
    """

    # Folder indexed when no ``data_dir`` is passed. Written as a raw string so the
    # backslashes of the Windows path are never read as escape sequences.
    DEFAULT_DATA_DIR = r'F:\Capstone\Github Repo\Recruitment_Need_Analysis_Wepapp_DS_Capstone\data'

    def __init__(self, model_path, data_dir=None):
        """
        Initializes Retrieval-Augmented Generation model using Llama.

        Args:
            model_path: Path of the model file handed to ``LlamaCpp``.
            data_dir: Folder whose documents are indexed; defaults to
                ``DEFAULT_DATA_DIR`` when omitted.
        """
        self.model_path = model_path
        self.data_dir = data_dir if data_dir is not None else self.DEFAULT_DATA_DIR
        # NOTE(review): a Groq API key is passed to a local-model wrapper here — confirm
        # that `LlamaCpp` accepts/needs an `api_key` argument.
        self.llm = LlamaCpp(model_path=self.model_path, api_key=groq_api_key)
        self.service_context = ServiceContext.from_defaults(llm=self.llm)
        self.index = self.build_index()

    def build_index(self):
        """
        Build the VectorStoreIndex from the documents in ``self.data_dir``.

        Returns:
            The index created by ``VectorStoreIndex.from_documents``.
        """
        documents = SimpleDirectoryReader(self.data_dir).load_data()
        index = VectorStoreIndex.from_documents(documents, service_context=self.service_context)
        return index

    def query(self, user_input):
        """
        Query the index with user input and return the response text.

        Index objects that still expose a direct ``query`` method are called as
        before; otherwise the query goes through ``as_query_engine()``.

        Args:
            user_input: The question or statement to send to the index.

        Returns:
            The ``response`` attribute of the object returned by the query.
        """
        direct_query = getattr(self.index, "query", None)
        if callable(direct_query):
            response = direct_query(user_input)
        else:
            response = self.index.as_query_engine().query(user_input)
        return response.response

# Create the notebook-wide RAG helper; the path below is a placeholder for the real model file.
rag_model = LlamaRAG('./path/to/your/llama/model.bin')

In [None]:
# Step 3: Set up Llama RAG
class LlamaRAG:
    """Retrieval-Augmented Generation helper built on a local Llama model.

    This cell redefines ``LlamaRAG``: the notebook contains an earlier class of
    the same name, and whichever cell runs last is the definition in effect.

    NOTE(review): ``LlamaCpp``, ``ServiceContext``, ``SimpleDirectoryReader`` and
    ``GPTSimpleVectorIndex`` are not imported in the notebook cells shown; they
    must be imported before this class is instantiated.
    """

    # Folder indexed when no ``data_dir`` is passed. Written as a raw string so the
    # backslashes of the Windows path are never read as escape sequences.
    DEFAULT_DATA_DIR = r'F:\Capstone\Github Repo\Recruitment_Need_Analysis_Wepapp_DS_Capstone\data'

    def __init__(self, model_path, data_dir=None):
        """
        Initializes Retrieval-Augmented Generation model using Llama.

        Args:
            model_path: Path of the model file handed to ``LlamaCpp``.
            data_dir: Folder whose documents are indexed; defaults to
                ``DEFAULT_DATA_DIR`` when omitted.
        """
        self.model_path = model_path
        self.data_dir = data_dir if data_dir is not None else self.DEFAULT_DATA_DIR
        self.llm = LlamaCpp(model_path=self.model_path)
        self.service_context = ServiceContext.from_defaults(llm=self.llm)
        self.index = self.build_index()

    def build_index(self):
        """
        Build the GPTSimpleVectorIndex from the documents in ``self.data_dir``.

        Returns:
            The index created by ``GPTSimpleVectorIndex.from_documents``.
        """
        documents = SimpleDirectoryReader(self.data_dir).load_data()
        index = GPTSimpleVectorIndex.from_documents(documents, service_context=self.service_context)
        return index

    def query(self, user_input):
        """
        Query the index with user input and return the response text.

        The index's own ``query`` method is used when it has one; otherwise the
        query goes through ``as_query_engine()``.

        Args:
            user_input: The question or statement to send to the index.

        Returns:
            The ``response`` attribute of the object returned by the query.
        """
        direct_query = getattr(self.index, "query", None)
        if callable(direct_query):
            response = direct_query(user_input)
        else:
            response = self.index.as_query_engine().query(user_input)
        return response.response

# Create the notebook-wide RAG helper; the path below is a placeholder for the real model file.
rag_model = LlamaRAG('./path/to/your/llama/model.bin')


In [None]:
# Step 4: Update Recruitment Bot Workflow
class RecruitmentBot:
    """Conversation bot whose flow is driven by a small state machine.

    States run ``initial -> gathering_info -> presenting_summary -> done`` and are
    advanced by the triggers ``start_info_gathering``, ``complete_summary`` and
    ``finish``, which the state machine attaches to this object.
    """

    def __init__(self):
        """Create the state machine and register the three transitions."""
        # State machine setup for managing bot conversation flow
        self.states = ['initial', 'gathering_info', 'presenting_summary', 'done']
        self.machine = Machine(model=self, states=self.states, initial='initial')
        self.machine.add_transition(trigger='start_info_gathering', source='initial', dest='gathering_info')
        self.machine.add_transition(trigger='complete_summary', source='gathering_info', dest='presenting_summary')
        self.machine.add_transition(trigger='finish', source='presenting_summary', dest='done')

    def ask_question(self, user_input):
        """
        Send the user's input to the module-level Llama RAG helper.

        Args:
            user_input: Text entered by the user.

        Returns:
            Whatever ``rag_model.query`` returns for that input.
        """
        # `rag_model` is a LlamaRAG instance; it exposes `query()` and is not callable itself.
        response = rag_model.query(user_input)
        return response

    def collect_input(self, user_input):
        """
        Collect input from the user and answer according to the current state.

        Args:
            user_input: Text entered by the user.

        Returns:
            The RAG response while in ``gathering_info``, a placeholder summary
            string while in ``presenting_summary``, and ``None`` in any other state.
        """
        if self.state == 'gathering_info':
            # Process input using RAG for enhanced responses
            return self.ask_question(user_input)
        if self.state == 'presenting_summary':
            # Summarize details collected
            return "Here is the summary of information gathered."  # Placeholder
        # No response is defined for 'initial' or 'done'.
        return None

In [None]:
# Step 5: Interaction with the Bot
if __name__ == "__main__":
    bot = RecruitmentBot()
    bot.start_info_gathering()

    print("Welcome to the Recruitment Assistant!")
    while True:
        user_input = input("You: ")
        if user_input.strip().lower() == "quit":
            # `finish` is only registered from 'presenting_summary'. The loop normally
            # quits while still in 'gathering_info', so step through the summary state
            # first instead of firing a trigger that is invalid for the current state.
            if bot.state == 'gathering_info':
                bot.complete_summary()
            bot.finish()
            break
        # Collect input and respond using RAG
        response = bot.collect_input(user_input)
        print(f"Bot: {response}")

# Step 6: Deployment and Fine-tuning
# Deployment:
# 1. Dockerize the application: Create a Dockerfile that contains all necessary dependencies for the bot, including Python, required libraries, and environment variables.
# 2. Create a requirements.txt file to list all Python dependencies:
#
# transitions
# langchain
# python-dotenv
# scikit-learn
# faiss-cpu
#
# 3. Build the Docker image:
#    docker build -t recruitment_bot .
# 4. Run the bot in a container:
#    docker run -p 8080:8080 recruitment_bot
# 5. Deploy on a cloud service such as AWS, Azure, or Google Cloud Platform for scalability.
#
# Fine-Tuning:
# 1. Optimize Retrieval Strategy:
#    - Modify the FAISS document store to include more documents for richer context.
#    - Adjust the indexing parameters in FAISS to balance between retrieval speed and accuracy.
#
# 2. Tune OpenAI Hyperparameters:
#    - Experiment with model parameters such as temperature, max_tokens, and frequency_penalty to control the style and detail of responses.
#
# 3. Enhance NLP capabilities:
#    - Use additional NLP techniques, such as Named Entity Recognition (NER), to better extract and understand user inputs.
#
# 4. Customization for Specific Roles:
#    - Pre-train the model on documents related to specific industries (e.g., finance, healthcare) to provide more nuanced and role-specific responses.
#
# 5. Update Document Store:
#    - Regularly update the FAISS document store with new job postings, company data, and industry insights to keep responses fresh and relevant.


### Step 3: Loading, Merging, and Integrating New Datasets

In this step, we merge various data sources (`sample_skills.csv`, `sample_job_summary.csv`, and `gd_rev_preprocessed.csv`) to form a unified dataset for further analysis. This helps provide a complete understanding of the job descriptions.

- **Data Sources**: Skills, job summaries, and interview Q&A.
- **Purpose**: To enrich the dataset with all possible information to produce insightful NLP analysis.
- **Merging Strategy**: Merge on `job_title` to ensure that all related information is brought together.


In [None]:
# Read the job-postings CSV from the local data folder into a DataFrame.
gsearch_jobs = pd.read_csv(filepath_or_buffer='data/gsearch_jobs.csv')

In [None]:
# Print a summary of the loaded frame to check what was read from the CSV.
gsearch_jobs.info()

In [None]:
# Preview the first rows of the loaded frame.
gsearch_jobs.head()

### Step 4: Data Cleaning and Preprocessing

We clean text data to remove any unnecessary characters and prepare the dataset for NLP operations. This involves removing punctuation, converting text to lowercase, and combining key textual information into a single column for analysis.


In [None]:
def clean_text(text):
    """Normalize a string for NLP use.

    Every run of non-word characters is collapsed into a single space, and the
    result is then lowercased.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    non_word_runs = re.compile(r'\W+')
    return non_word_runs.sub(' ', text).lower()

# Drop rows with missing descriptions *before* cleaning. The cleaning step casts every
# value to str, which would turn a missing description into the literal text "nan";
# a dropna on the cleaned column afterwards would then never remove anything.
gsearch_jobs.dropna(subset=['description'], inplace=True)

# Apply text cleaning to relevant columns
gsearch_jobs['description_clean'] = gsearch_jobs['description'].apply(lambda x: clean_text(str(x)))

### Step 5: Advanced Feature Engineering

#### Step 5.1: TF-IDF Vectorization

We use **TF-IDF Vectorizer** to convert the textual data into numerical feature vectors that the model can process.

- **Why TF-IDF**: It captures the importance of words in a document relative to the corpus, making it a powerful feature extraction technique for NLP.

In [None]:
# Fit a TF-IDF model (English stop words removed, vocabulary capped at 5000 terms)
# on the cleaned descriptions and keep the resulting document-term matrix in X.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(raw_documents=gsearch_jobs['description_clean'])

#### Step 5.2: Polynomial Features and Min-Max Scaling

- **Polynomial Features**: Increase the complexity of our features by generating interaction terms, which can improve model performance when relationships between features are non-linear.
- **Min-Max Scaling**: Rescales each feature to the [0, 1] range. Unlike standard scaling (zero mean, unit variance), this keeps every value non-negative, which Naïve Bayes models such as `ComplementNB` require.


In [None]:
from sklearn.naive_bayes import ComplementNB

In [None]:
# Expand the dense TF-IDF matrix with PolynomialFeatures. The degree is kept at 1
# (interaction terms only, no bias column) to prevent memory issues.
poly = PolynomialFeatures(include_bias=False, interaction_only=True, degree=1)
X_poly = poly.fit_transform(X.toarray())

# Rescale with Min-Max so that all features stay non-negative.
scaler = MinMaxScaler()
X_scaled = scaler.fit(X_poly).transform(X_poly)

#### Step 5.3: Dimensionality Reduction

We use **Truncated SVD** to reduce the dimensionality of the TF-IDF matrix. This helps reduce computational cost and overfitting while preserving essential information.

In [None]:
# Project the TF-IDF matrix onto 50 SVD components (seeded for repeatability);
# the small component count is meant to limit overfitting.
svd = TruncatedSVD(random_state=42, n_components=50)
X_reduced = svd.fit_transform(X)


### Step 6: Introducing State Machine for Conversation Flow Management

We use a **State Machine** to define the conversation flow for guiding managers through the recruitment question generation process.

#### Step 6.1: Defining States and Transitions

- **States**: Represent parts of the conversation (e.g., Role Requirements, Company Environment, Compensation & Benefits).
- **Transitions**: Define how the flow moves from one state to another based on the manager's response.


In [None]:
from transitions import Machine

# Stages of the recruitment conversation, in the order they are visited
states = [
    'role_requirements',
    'company_environment',
    'compensation_benefits',
    'role_nuances',
    'final_summary',
]


class RecruitmentAssistant:
    """Model object for the state machine; it only stores a display name."""

    def __init__(self, name):
        self.name = name


# The model instance that the state machine is attached to
recruitment_assistant = RecruitmentAssistant("Assistant")

# State machine bound to the assistant, starting in the first stage
machine = Machine(model=recruitment_assistant, states=states, initial='role_requirements')

# Each trigger moves the conversation forward by exactly one stage
machine.add_transitions([
    {'trigger': 'ask_company_environment', 'source': 'role_requirements', 'dest': 'company_environment'},
    {'trigger': 'ask_compensation', 'source': 'company_environment', 'dest': 'compensation_benefits'},
    {'trigger': 'ask_role_nuances', 'source': 'compensation_benefits', 'dest': 'role_nuances'},
    {'trigger': 'summarize', 'source': 'role_nuances', 'dest': 'final_summary'},
])

# Demonstration: fire the first trigger and show the state it leads to
recruitment_assistant.ask_company_environment()
print(recruitment_assistant.state)  # Output: company_environment

### Step 7: Dynamic Question Generation with Decision Trees

We use **Decision Trees** for dynamic questioning, where each node represents a question and each branch represents possible answers leading to different follow-up questions.


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Toy training set: each row is a hypothetical feature vector, and each target is
# the ID of the follow-up question to ask for that row.
X_sample = [
    [1, 0, 1],
    [0, 1, 1],
    [1, 1, 0],
    [0, 0, 1],
]
y_sample = [0, 1, 2, 3]

# `fit` returns the fitted estimator, so construction and training can be chained.
decision_tree = DecisionTreeClassifier().fit(X_sample, y_sample)

# Ask the tree which follow-up question belongs to a new answer pattern
sample_input = [1, 0, 1]
next_question = decision_tree.predict([sample_input])
print(f"Next question ID: {next_question}")

### Step 8: Introducing Personalization and Rule-Based Logic

We use **NLP models** for evaluating sentiment, determining the conversation tone, and dynamically adjusting questions to improve personalization.

#### Step 8.1: Sentiment Analysis and Adaptive Questioning

We use **Naïve Bayes** or **Logistic Regression** models for text classification to evaluate the sentiment of responses and determine the conversation's engagement level.


In [None]:
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression

# Train a simple sentiment model (example data).
# The model is fitted on the TF-IDF matrix X rather than on the SVD output X_reduced:
#   * ComplementNB rejects negative feature values, and SVD components can be negative;
#   * `vectorizer.transform` below yields TF-IDF vectors, so the classifier must be
#     trained in that same feature space for `predict` to accept them.
text_clf = ComplementNB()
# Randomly generated 0/1 labels for demonstration only; seeded so re-runs match.
label_rng = np.random.default_rng(42)
y_labels = label_rng.integers(0, 2, size=X.shape[0])
text_clf.fit(X, y_labels)

# Analyze sentiment and adjust follow-up
sample_response = "I would prefer a remote work setting."
sample_vector = vectorizer.transform([sample_response])
sentiment = text_clf.predict(sample_vector)
# `predict` returns an array with one entry per input row; inspect the single prediction.
if sentiment[0] == 1:
    print("Positive sentiment detected, proceeding with follow-up questions about remote tools and flexibility.")

#### Step 8.2: Rule-Based Logic Stored in JSON

To make the decision flow configurable and easier to maintain, we define the conversation rules as a JSON-compatible dictionary that can be saved to, or loaded from, a JSON file.


In [None]:
# Conversation rules keyed by state: each entry names the state that follows
# ("next") and the questions to ask while in that state ("questions").
rules = dict(
    role_requirements=dict(
        next="company_environment",
        questions=[
            "What are the must-have skills for this role?",
            "Are there any certifications required?",
        ],
    ),
    company_environment=dict(
        next="compensation_benefits",
        questions=[
            "How many people are in the team?",
            "Can you describe the company culture?",
        ],
    ),
)

# Demonstration: print every question configured for the current state
current_state = "role_requirements"
for question in rules[current_state]["questions"]:
    print(question)

### Step 9: Combining Predefined Question Templates and Dynamic Elements

We blend **predefined question templates** with dynamically generated content to ensure the conversation is both personalized and comprehensive.

- Start with a core set of questions (e.g., role-specific skills).
- Adaptively generate follow-up prompts based on previous answers.


In [None]:
# Start from the fixed core question; a follow-up is appended further down when the
# manager's answer calls for one.
core_questions = ["What are the must-have skills for this role?"]
response = "This role is temporary."

# A mention of "temporary" (in any letter case) triggers the contract-related follow-up
if response.lower().find("temporary") >= 0:
    follow_up = "Considering that the role is temporary, would you like to discuss the option for contract renewal and team integration procedures?"
    core_questions += [follow_up]

# Print the final question list, one question per line
for question in core_questions:
    print(question)

### Step 10: Splitting Data for Training and Evaluation

We split our dataset into training and testing sets to evaluate our model's performance accurately.

In [None]:
# Hold out 20% of the rows for testing (seeded split). The features are the
# SVD-reduced vectors and the target is the job title column.
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced,
    gsearch_jobs['title'],
    random_state=42,
    test_size=0.2,
)

### Step 11: Model Training

We use an **SGDClassifier**, a linear model with stochastic gradient descent learning, which is efficient for large datasets.

- **Why SGD**: It works well with high-dimensional data and supports various loss functions suitable for classification.


In [None]:
# Initialize and train SGD Classifier.
# A fixed seed makes the training run repeatable, in line with the random_state=42
# already used for the SVD and the train/test split.
model = SGDClassifier(random_state=42)
model.fit(X_train, y_train)

# Display training progress
print("Model training complete. Now proceeding to evaluation...")

### Step 12: Model Evaluation, Sentiment, Interview Response Analysis, and Explainability

#### Step 12.1: Sentiment and Interview Response Analysis

We add **sentiment analysis** to understand the overall sentiment behind the 'pros' and 'cons' sections, and the candidate interview responses. This helps gauge candidates' attitudes and alignment with company culture.


In [None]:
# Create the sentiment analyzer.
# NOTE(review): `SentimentIntensityAnalyzer` is not imported in the notebook cells shown.
analyzer = SentimentIntensityAnalyzer()

# Store the compound polarity score of each row's `description_tokens` value
# (cast to str) in the `pros_sentiment` column.
gsearch_jobs['pros_sentiment'] = gsearch_jobs['description_tokens'].map(
    lambda tokens: analyzer.polarity_scores(str(tokens))['compound']
)

### Step 12.2: Enhanced Analysis

We use various metrics to assess content quality and coverage, ensuring that the model-generated questions align with the hiring requirements and cover essential job aspects.

### Step 13: Explainability with SHAP

We use **SHAP (SHapley Additive exPlanations)** to explain the output of our model, providing transparency in decision-making and helping us understand which features are most influential in predictions.


In [None]:
# Use a small subset of the training data to fit the SHAP explainer
# NOTE(review): `shap` is not imported in the notebook cells shown.
explainer = shap.Explainer(model, X_train[:100])
shap_values = explainer(X_test[:10])

# The model was trained on the SVD-reduced matrix, so its features are SVD components,
# not TF-IDF vocabulary terms. Label them by component index so the number of names
# matches the number of columns being plotted.
component_names = [f"svd_component_{i}" for i in range(X_test.shape[1])]

# Plot summary of the SHAP values
shap.summary_plot(shap_values, X_test[:10], feature_names=component_names)

### Step 14: Combining Graph Database for Conversation Flexibility

Using a **Graph Database** like Neo4j to store and navigate through your conversation flow provides flexibility to adapt questions based on user interaction.


In [None]:
# Establish a connection to Neo4j database.
# Connection settings can be overridden through the NEO4J_URI, NEO4J_USER and
# NEO4J_PASSWORD environment variables, so real credentials need not be written
# into the notebook; the defaults are the previous hard-coded local values.
# NOTE(review): `GraphDatabase` is not imported in the notebook cells shown.
driver = GraphDatabase.driver(
    os.getenv("NEO4J_URI", "bolt://localhost:7687"),
    auth=(os.getenv("NEO4J_USER", "neo4j"), os.getenv("NEO4J_PASSWORD", "password")),
)

# Define a function to add nodes and relationships
def add_question(tx, question, answer_options):
    """Create a Question node and attach one Option node per answer option.

    Args:
        tx: Object exposing ``run(query, **params)``; presumably a Neo4j
            transaction — confirm against the caller.
        question: Text stored in the ``text`` property of the new Question node.
        answer_options: Iterable of option texts; one Option node is created for each.
    """
    # Values are passed as query parameters ($question) rather than formatted into the Cypher text.
    tx.run("CREATE (q:Question {text: $question})", question=question)
    for option in answer_options:
        # The MATCH selects by text only, so every Question node with this text gets the new option.
        tx.run("MATCH (q:Question {text: $question}) CREATE (q)-[:HAS_OPTION]->(:Option {text: $option})", question=question, option=option)