In [1]:
# Import necessary libraries
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from torch.quantization import quantize_dynamic
import os


from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the three preprocessed CSV inputs; all of them live in the local 'data' folder
skills, job_summary, gd_rev = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/skills_preprocessed.csv',
        'data/job_summary_preprocessed.csv',
        'data/gd_rev_preprocessed.csv',
    )
)

In [3]:
# Left-join the job summaries onto the skills table, keyed on 'job_link'.
# how='left' keeps every row of `skills`; summary columns are NaN where no link matches.
merged_df = pd.merge(skills, job_summary, on='job_link', how='left')


In [4]:
# Print the merged frame's columns, non-null counts, dtypes and memory usage
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296381 entries, 0 to 1296380
Data columns (total 5 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   job_link           1296381 non-null  object
 1   job_skills         1294295 non-null  object
 2   job_skills_clean   1294294 non-null  object
 3   job_summary        1296376 non-null  object
 4   job_summary_clean  1296365 non-null  object
dtypes: object(5)
memory usage: 49.5+ MB


In [5]:
import nltk
# Show the directories NLTK currently searches for downloaded corpora and models
print(nltk.data.path)


['C:\\Users\\gerri/nltk_data', 'f:\\Capstone\\your_envs_directory\\mlenv\\nltk_data', 'f:\\Capstone\\your_envs_directory\\mlenv\\share\\nltk_data', 'f:\\Capstone\\your_envs_directory\\mlenv\\lib\\nltk_data', 'C:\\Users\\gerri\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


In [6]:
import nltk

# Directory that receives the NLTK downloads below; it is also added to the search path.
# NOTE(review): this and the paths below are machine-specific absolute paths — consider
# moving them to a single configurable setting before sharing the notebook.
NLTK_DOWNLOAD_DIR = 'F:\\Capstone\\your_envs_directory\\mlenv\\Lib\\site-packages\\nltk_data'

# Set up the paths for NLTK to search
EXTRA_NLTK_PATHS = [
    'C:\\Users\\gerri/nltk_data',
    'f:\\Capstone\\your_envs_directory\\mlenv\\nltk_data',
    'f:\\Capstone\\your_envs_directory\\mlenv\\share\\nltk_data',
    'f:\\Capstone\\your_envs_directory\\mlenv\\lib\\nltk_data',
    'C:\\Users\\gerri\\AppData\\Roaming\\nltk_data',
    'C:\\nltk_data',
    'D:\\nltk_data',
    'E:\\nltk_data',
    NLTK_DOWNLOAD_DIR,
]

# Append only the directories that are not on the search path yet. Several of these are
# NLTK defaults on this machine, and an unconditional extend() would also add another
# copy of every entry each time the cell is re-run.
for nltk_dir in EXTRA_NLTK_PATHS:
    if nltk_dir not in nltk.data.path:
        nltk.data.path.append(nltk_dir)

# Download all NLTK data to make sure nothing is missing, especially the punkt_tab resource
# NOTE(review): the code in this cell only uses word_tokenize, stopwords and
# WordNetLemmatizer; a narrower download may be sufficient — confirm against later cells.
nltk.download('all', download_dir=NLTK_DOWNLOAD_DIR)

# Your processing code
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Reference version of the text-normalisation helper
def tokenize_and_lemmatize(text):
    """Normalise `text` into a space-separated string of lemmas.

    The text is lowercased and tokenized, tokens that are not purely alphabetic
    or that are English stopwords are dropped, and the remaining tokens are
    lemmatized with WordNet.

    Note: a later cell defines a function with the same name that does not
    apply the alphabetic filter; that later definition replaces this one.
    """
    wordnet = WordNetLemmatizer()
    raw_tokens = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))

    lemmas = []
    for token in raw_tokens:
        if not token.isalpha():
            continue  # punctuation, numbers and mixed tokens are discarded
        if token in english_stopwords:
            continue
        lemmas.append(wordnet.lemmatize(token))

    # A single string (not a list) is returned so it can go straight into a vectorizer
    return ' '.join(lemmas)


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     F:\Capstone\your_envs_directory\mlenv\Lib\site-
[nltk_data]    |     packages\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     F:\Capstone\your_envs_directory\mlenv\Lib\site-
[nltk_data]    |     packages\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     F:\Capstone\your_envs_directory\mlenv\Lib\site-
[nltk_data]    |     packages\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     F:\Capstone\your_envs_directory\mlenv\Lib\site-
[nltk_data]    |     packages\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagge

In [7]:
# Step 1: Improving Data Quality
SAMPLE_FRACTION = 0.4  # share of the de-duplicated rows kept to speed up computation
SAMPLE_SEED = 42       # fixed seed so the same rows are drawn on every run

# The whole clean-up is one chained expression: `merged_df` is rebound only after every
# step has succeeded, instead of being mutated in place step by step.
# Note: the cell still rebinds `merged_df`, so running it twice samples the sample;
# re-run from the merge cell to start over.
merged_df = (
    merged_df
    # Handling missing values by filling with 'not available' (applies to every column)
    .fillna('not available')
    # Consistent formatting: lowercasing and trimming leading/trailing whitespace
    .assign(
        job_summary_clean=lambda frame: frame['job_summary_clean'].str.lower().str.strip(),
        job_skills_clean=lambda frame: frame['job_skills_clean'].str.lower().str.strip(),
    )
    # Removing duplicates: rows sharing the same cleaned summary AND cleaned skills
    .drop_duplicates(subset=['job_summary_clean', 'job_skills_clean'])
    # Reducing the dataset size to speed up computation
    .sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
)

# Tokenization and Lemmatization Function
# The lemmatizer and the stopword set do not depend on the input text, so they are built
# once here instead of on every call (the function is applied to every row of two columns).
_LEMMATIZER = WordNetLemmatizer()
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def tokenize_and_lemmatize(text):
    """Lowercase and tokenize `text`, drop English stopwords, and lemmatize the rest.

    Args:
        text: The string to normalise.

    Returns:
        The lemmatized tokens joined by single spaces, ready for vectorization.

    NOTE(review): unlike the earlier definition of this function in the notebook,
    non-alphabetic tokens (punctuation, numbers) are kept here — confirm that is intended.
    """
    tokens = word_tokenize(text.lower())  # Tokenize and lowercase
    return ' '.join(
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token not in _ENGLISH_STOP_WORDS
    )

# Apply tokenization and lemmatization to the DataFrame with tqdm for progress tracking.
# tqdm.pandas() registers `progress_apply` on pandas objects; it returns the same result
# as `apply` while displaying a progress bar for these long-running passes.
tqdm.pandas(desc="Tokenizing and lemmatizing")
merged_df['job_summary_processed'] = merged_df['job_summary_clean'].progress_apply(tokenize_and_lemmatize)
merged_df['job_skills_processed'] = merged_df['job_skills_clean'].progress_apply(tokenize_and_lemmatize)

In [8]:
# Step 2: Word2Vec Embeddings for Simple Similarity Tasks
# Split every processed summary back into a list of tokens, one list per row
tokenized_summaries = merged_df['job_summary_processed'].apply(word_tokenize)
merged_df['job_summary_tokenized'] = tokenized_summaries

# Train the Word2Vec model on the token lists; the corpus is wrapped in tqdm to show progress
word2vec_model = Word2Vec(
    sentences=tqdm(merged_df['job_summary_tokenized']),
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Function to find similar words or roles using Word2Vec
def find_similar_roles(word, top_n=5):
    """Return the `top_n` vocabulary entries most similar to `word`.

    Args:
        word: A single word or a short phrase. A string is lowercased (the model is
            trained on lowercased text) and split on whitespace, so a multi-word
            query is looked up by its individual tokens rather than as one key
            that can never be in the vocabulary. Any other value is passed to
            `most_similar` unchanged.
        top_n: Number of neighbours to return.

    Returns:
        A list of (token, similarity) tuples, or the string
        "Word not in vocabulary." when none of the query tokens is known.
    """
    if isinstance(word, str):
        # Keep only the tokens the model knows; unknown tokens would raise KeyError
        positive = [token for token in word.lower().split() if token in word2vec_model.wv]
        if not positive:
            return "Word not in vocabulary."
    else:
        positive = word

    try:
        return word2vec_model.wv.most_similar(positive=positive, topn=top_n)
    except KeyError:
        return "Word not in vocabulary."

# Example usage: nearest neighbours of a single job-title word
similar_words = find_similar_roles('developer', top_n=5)
print("Similar words to 'developer':", similar_words)

100%|██████████| 516942/516942 [01:54<00:00, 4532.19it/s] 


Similar words to 'developer': [('architect', 0.7253365516662598), ('sdet', 0.7210280895233154), ('engineer', 0.683315098285675), ('sde', 0.6705234050750732), ('designer', 0.6669469475746155)]


In [9]:

# Step 3: DistilBERT Embeddings for Nuanced Contextual Understanding
# Tokenizer and model are loaded from the same pre-trained checkpoint
BERT_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = DistilBertModel.from_pretrained(BERT_CHECKPOINT)

# Dynamically quantize the model's Linear layers to int8 for faster inference
model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

# Function to get the DistilBERT embedding of a text
def get_bert_embeddings(text, max_length=128):
    """Embed `text` with the module-level DistilBERT `tokenizer` and `model`.

    Args:
        text: The text to embed.
        max_length: Maximum number of tokens; longer inputs are truncated.

    Returns:
        A NumPy array holding the model's last hidden state averaged over the
        token dimension (dim=1), with size-1 dimensions squeezed out.
    """
    inputs = tokenizer(text, return_tensors='pt', max_length=max_length, truncation=True)
    # Inference only: disabling gradient tracking avoids building an autograd graph
    # for every call, which saves memory and time without changing the output values.
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state
    # Taking the average of the token embeddings
    return torch.mean(token_embeddings, dim=1).squeeze().detach().numpy()

# Embed every processed job summary, walking the column in chunks of `batch_size` rows.
# The progress bar advances once per chunk; within a chunk each text is still embedded
# by its own get_bert_embeddings call.
batch_size = 16
bert_embeddings = []
summary_texts = merged_df['job_summary_processed']
batch_starts = range(0, len(merged_df), batch_size)
for start in tqdm(batch_starts, desc="Generating BERT Embeddings in Batches"):
    for text in summary_texts.iloc[start:start + batch_size].tolist():
        bert_embeddings.append(get_bert_embeddings(text))

# Adding the embeddings back to the DataFrame (one array per row, in row order)
merged_df['job_summary_bert'] = bert_embeddings


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Generating BERT Embeddings in Batches: 100%|██████████| 32309/32309 [11:07:12<00:00,  1.24s/it]   


In [10]:
# Step 4: TF-IDF Vectorization
# Create TF-IDF vectors for both job summaries and skills.
# Each column gets its own vectorizer: fitting one shared instance twice discards the
# vocabulary learned from the summaries, so new summaries could no longer be transformed
# into the feature space of `summary_tfidf`.
TFIDF_MAX_FEATURES = 500
summary_vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
skills_vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)

# Vectorizing both columns with progress tracking
summary_tfidf = summary_vectorizer.fit_transform(tqdm(merged_df['job_summary_processed']))
skills_tfidf = skills_vectorizer.fit_transform(tqdm(merged_df['job_skills_processed']))

# Backward-compatible alias: `vectorizer` previously ended up fitted on the skills column
vectorizer = skills_vectorizer


100%|██████████| 516942/516942 [03:13<00:00, 2668.79it/s]
100%|██████████| 516942/516942 [00:30<00:00, 17171.83it/s]


In [24]:
pip install neo4j

Collecting neo4j
  Using cached neo4j-5.26.0-py3-none-any.whl.metadata (5.9 kB)
Using cached neo4j-5.26.0-py3-none-any.whl (302 kB)
Installing collected packages: neo4j
Successfully installed neo4j-5.26.0
Note: you may need to restart the kernel to use updated packages.


In [25]:
# Import necessary libraries
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from tqdm import tqdm
import shap
from textstat import flesch_reading_ease
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transitions import Machine  # State machine for managing question flow
from neo4j import GraphDatabase  # Graph database for flexible conversation flow
import json  # For storing rule-based logic in a JSON format

#### Step 5.2: Polynomial Features and Standard Scaling

- **Polynomial Features**: Increases the complexity of our features by generating polynomial and interaction terms, which can improve model performance when relationships between features are non-linear.
- **Standard Scaling**: Standardizes the features by removing the mean and scaling to unit variance, which is especially important for linear models.


In [11]:

# Step 5: Splitting Data for Modeling
# 80/20 split with a fixed seed. The summary TF-IDF matrix is the feature side and the
# skills TF-IDF matrix is the target side (e.g. when predicting skills from summaries).
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(
    summary_tfidf,
    skills_tfidf,
    **split_options,
)


In [14]:

# Step 6: Hybrid Approach for Chatbot Integration
# In-memory cache (text -> embedding) so repeated queries are not embedded again
embedding_cache = {}

def get_bert_embeddings_with_cache(text):
    """Return the embedding for `text`, computing and storing it on first use.

    A cache miss calls get_bert_embeddings(text) and saves the result in
    `embedding_cache`; every later call with the same text returns the stored object.
    """
    if text not in embedding_cache:
        embedding_cache[text] = get_bert_embeddings(text)
    return embedding_cache[text]

# Route a user query to the cheap or the contextual backend
def handle_query(query):
    """Answer `query` with Word2Vec or DistilBERT depending on its length.

    Queries of at most three whitespace-separated words are treated as simple and
    go to find_similar_roles; longer queries return their (cached) DistilBERT
    embedding from get_bert_embeddings_with_cache.
    """
    word_count = len(query.split())
    if word_count > 3:  # Example heuristic: more than three words counts as complex
        return get_bert_embeddings_with_cache(query)
    return find_similar_roles(query)

# Example hybrid chatbot usage: a one-word query takes the Word2Vec route
user_query = "developer"
response = handle_query(user_query)
print("Response to query:", response)

# Show the processed text/embedding columns and the shapes of the two TF-IDF matrices
processed_columns = ['job_summary_processed', 'job_skills_processed', 'job_summary_bert']
print("Processed DataFrame:")
print(merged_df[processed_columns])
print("\nSummary TF-IDF Shape:", summary_tfidf.shape)
print("Skills TF-IDF Shape:", skills_tfidf.shape)

# Summary
# 1. Data Merging: Combine different DataFrames to create a unified dataset.
# 2. Data Quality Improvement: Handle missing values, ensure consistent formatting, and remove duplicates.
# 3. Tokenization and Lemmatization: Convert raw text into cleaned tokens.
# 4. Word2Vec: Train word embeddings for simple similarity tasks, such as finding related skills or job titles.
# 5. DistilBERT: Use contextual embeddings for nuanced understanding and question-answer tasks, optimized with quantization.
# 6. TF-IDF Vectorization: Convert cleaned text into numerical vectors for ML applications.
# 7. Split Data: Separate training and testing data for modeling purposes.
# 8. Hybrid Chatbot Logic: Use Word2Vec for simple queries and DistilBERT for more complex, context-aware responses, using caching to speed up repeated queries.
# 9. Progress Tracking: Use tqdm for progress tracking of computationally heavy steps.

# Tips on How to Proceed Further
# - Use Word2Vec for similarity searches in the chatbot when quick responses are needed (e.g., finding related roles).
# - Use DistilBERT embeddings for more complex responses where understanding the full context is essential.
# - Combine outputs from both approaches to create a hybrid chatbot capable of answering simple and complex queries efficiently.
# - Monitor data quality regularly and incorporate user feedback to improve both data and model performance.


Response to query: [('architect', 0.7253365516662598), ('sdet', 0.7210280895233154), ('engineer', 0.683315098285675), ('sde', 0.6705234050750732), ('designer', 0.6669469475746155)]
Processed DataFrame:
                                     job_summary_processed  \
149839   occupational health manager leading client oxf...   
556166   hybrid remote opportunity base charleston sc o...   
846189   overviewst john regional medical center locate...   
137368   time time general dentist need sylvania oh cor...   
151108   description summarywork collaboratively physic...   
...                                                    ...   
553871   newsradio wtmj marketing consultant account ex...   
1036555  senior level engineer keen embrace new excitin...   
635577   description position brightview landscape serv...   
914059   company overview genie ai lead artificial inte...   
1036186  description integrate power service ip lead no...   

                                      job_skills_proc