In [None]:
##################
### FOR　COLAB ###
##################
!pip install --upgrade scipy gensim spacy pyLDAvis pandas nltk
!python -m spacy download en_core_web_sm

# restart kernel
import os
os.kill(os.getpid(), 9)

Collecting scipy
  Downloading scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK resources this notebook relies on: the tokenizer models
# ('punkt', 'punkt_tab'), the stop-word lists and the WordNet corpus.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Step 1: Load the Data
def load_data(file_path):
    """Load the job-postings CSV and clean its description column.

    Parameters
    ----------
    file_path : str or path-like
        Location of a CSV file that contains a 'jobs-desc' column.

    Returns
    -------
    pandas.DataFrame
        The CSV contents. 'jobs-desc' is guaranteed to hold strings: missing
        descriptions become "" and the heading "關於該職缺" ("About this job")
        is stripped out.

    Raises
    ------
    KeyError
        If the file has no 'jobs-desc' column.
    """
    df = pd.read_csv(file_path)
    # Empty CSV cells are read as NaN (a float). Left as-is they crash the
    # later text-processing steps, which expect every description to be a str.
    descriptions = df['jobs-desc'].fillna("").astype(str)
    # Remove "關於該職缺" from job descriptions
    df['jobs-desc'] = descriptions.str.replace("關於該職缺", "", regex=False)
    return df

# Keyword Extraction with spaCy
def setup_spacy_ner():
    """Create the spaCy pipeline used for technical-skill extraction.

    Loads the "en_core_web_sm" model and inserts an entity ruler ahead of the
    statistical "ner" component. The ruler tags each of the tools, libraries
    and methodologies listed below with the label "TECH_ENTITY".

    Returns
    -------
    The loaded spaCy pipeline with the extra ruler installed.
    """
    # NOTE(review): these are plain string patterns, so presumably only the
    # exact spelling below is matched (e.g. "docker" would be missed) - confirm.
    tech_terms = (
        "Scikit-learn",
        "PyTorch",
        "TensorFlow",
        "A/B Testing",
        "MLOps",
        "Docker",
        "Kubernetes",
    )

    pipeline = spacy.load("en_core_web_sm")
    tech_ruler = pipeline.add_pipe("entity_ruler", before="ner")
    tech_ruler.add_patterns(
        [{"label": "TECH_ENTITY", "pattern": term} for term in tech_terms]
    )
    return pipeline

def extract_technical_entities(nlp, text):
    """Return the text of every "TECH_ENTITY" entity found in ``text``.

    Parameters
    ----------
    nlp : callable
        Pipeline that turns ``text`` into a document exposing ``.ents``.
    text : str
        Text to scan.

    Returns
    -------
    list of str
        Entity texts in document order; repeated mentions are kept.
    """
    found = []
    for span in nlp(text).ents:
        # Entities with any other label are ignored.
        if span.label_ != "TECH_ENTITY":
            continue
        found.append(span.text)
    return found

def compare_skills(df, nlp):
    """List, per job posting, the skills mentioned in the description but not in the title.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'title' and 'jobs-desc'.
    nlp : callable
        Pipeline handed to ``extract_technical_entities``.

    Returns
    -------
    list of dict
        One ``{'title': ..., 'hidden_requirements': [...]}`` entry per row, in
        row order. The skill list is sorted alphabetically so the result is
        the same on every run.
    """
    def _text_or_empty(value):
        # Missing CSV cells arrive as NaN/None; treat them as "no text"
        # instead of passing a non-string to the pipeline.
        return value if isinstance(value, str) else ""

    hidden_requirements = []
    for title, description in zip(df['title'], df['jobs-desc']):
        title_skills = set(extract_technical_entities(nlp, _text_or_empty(title)))
        desc_skills = set(extract_technical_entities(nlp, _text_or_empty(description)))
        hidden_requirements.append({
            'title': title,
            # Skills in description but not in title. sorted() gives a stable
            # order; converting the set with list() does not.
            'hidden_requirements': sorted(desc_skills - title_skills),
        })
    return hidden_requirements

# Topic Modeling with LDA
# Stop-word set and lemmatizer shared by every preprocess_text call.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return (stop_words, lemmatizer), building them on first use only."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _TEXT_RESOURCES["stop_words"], _TEXT_RESOURCES["lemmatizer"]


def preprocess_text(text):
    """Turn one raw document into a list of cleaned tokens for topic modeling.

    The text is lower-cased and tokenized; tokens that are not purely
    alphanumeric or that are English stop words are dropped, and the remaining
    tokens are lemmatized.

    Parameters
    ----------
    text : str
        Raw document. Anything that is not a string (e.g. NaN from a missing
        CSV cell) is treated as an empty document.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    if not isinstance(text, str):
        return []
    # Loading the stop-word list and creating the lemmatizer once, instead of
    # for every document, keeps this cheap when mapped over a whole corpus.
    stop_words, lemmatizer = _text_resources()
    tokens = word_tokenize(text.lower())
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]

def apply_lda(texts, num_topics_range):
    """Fit one LDA model per candidate topic count and keep the most coherent.

    Parameters
    ----------
    texts : iterable of str
        Raw documents; each one is tokenized with ``preprocess_text``.
    num_topics_range : iterable of int
        Candidate numbers of topics. Must contain at least one value.

    Returns
    -------
    tuple
        ``(optimal_model, corpus, dictionary)``: the model with the highest
        'c_v' coherence score (the earliest candidate wins a tie), the
        bag-of-words corpus and the gensim dictionary it was built from.

    Raises
    ------
    ValueError
        If ``num_topics_range`` is empty.
    """
    # Validate before any expensive work; materializing also lets a one-shot
    # iterator be accepted.
    candidate_topic_counts = list(num_topics_range)
    if not candidate_topic_counts:
        raise ValueError("num_topics_range must contain at least one topic count")

    processed_texts = [preprocess_text(text) for text in texts]
    dictionary = corpora.Dictionary(processed_texts)
    corpus = [dictionary.doc2bow(text) for text in processed_texts]

    # Find optimal number of topics using coherence score. Only the best model
    # so far is kept, so the losing candidates can be garbage-collected.
    optimal_model = None
    best_score = None
    for num_topics in candidate_topic_counts:
        # Fixed random_state so repeated runs produce the same models.
        lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42)
        coherence_model = CoherenceModel(model=lda_model, texts=processed_texts, dictionary=dictionary, coherence='c_v')
        score = coherence_model.get_coherence()
        # Strict '>' keeps the first candidate when scores tie.
        if optimal_model is None or score > best_score:
            optimal_model, best_score = lda_model, score

    return optimal_model, corpus, dictionary

def interpret_topics(lda_model, num_words=30):
    """Attach a human-readable label to each topic of a fitted model.

    Parameters
    ----------
    lda_model
        Fitted topic model exposing ``show_topics``.
    num_words : int, optional
        Number of top words to retrieve per topic (default 30).

    Returns
    -------
    list of tuple
        ``(label, top_words)`` per topic. The label is chosen by the first
        rule below whose keywords appear among the top words, otherwise it is
        ``"Topic <id>"``.
    """
    # Rules are checked in order; the first match wins.
    label_rules = (
        ("Data Modeling", ("feature", "model")),
        ("Engineering Deployment", ("api", "container")),
        ("Business Analysis", ("metric", "testing")),
    )

    # Ask for structured (word, weight) pairs rather than parsing the
    # '0.012*"word" + ...' display string, which breaks on unusual tokens.
    # num_topics=20 and log=True mirror what print_topics() does by default.
    topics = lda_model.show_topics(num_topics=20, num_words=num_words, log=True, formatted=False)

    topic_labels = []
    for topic_id, word_weights in topics:
        top_words = [word for word, _weight in word_weights]
        label = next(
            (
                name
                for name, keywords in label_rules
                if any(keyword in top_words for keyword in keywords)
            ),
            f"Topic {topic_id}",
        )
        topic_labels.append((label, top_words))
    return topic_labels

# Visualization
def visualize_lda(lda_model, corpus, dictionary, output_file="lda_visualization.html"):
    """Export a pyLDAvis view of the model to an HTML file.

    Parameters
    ----------
    lda_model, corpus, dictionary
        The fitted model together with the corpus and dictionary it was
        trained on; all three are handed to pyLDAvis unchanged.
    output_file : str, optional
        Path of the HTML file to write (default "lda_visualization.html").
    """
    prepared_view = pyLDAvis.gensim_models.prepare(
        lda_model,
        corpus,
        dictionary,
    )
    pyLDAvis.save_html(prepared_view, output_file)
    # Tell the user where the page ended up.
    print(f"LDA visualization saved to {output_file}")

# Main Execution
def main(file_path="preprocessed_linkedin_data.csv", num_topics_range=range(2, 6)):
    """Run the whole analysis: hidden-skill report, topic model, visualization.

    Parameters
    ----------
    file_path : str, optional
        CSV file with the job postings (default "preprocessed_linkedin_data.csv").
    num_topics_range : iterable of int, optional
        Candidate topic counts to evaluate (default ``range(2, 6)``, i.e. 2 to 5).

    Prints the postings that have at least one hidden requirement and the
    labelled topics, then writes the LDA visualization via ``visualize_lda``.
    """
    # Load data
    df = load_data(file_path)

    # Keyword Extraction
    nlp = setup_spacy_ner()
    hidden_requirements = compare_skills(df, nlp)
    print("Hidden Requirements:")
    for item in hidden_requirements:
        # Skip postings whose description adds nothing beyond the title.
        if item['hidden_requirements']:
            print(f"Job Title: {item['title']}")
            print(f"Hidden Requirements: {item['hidden_requirements']}\n")

    # Topic Modeling
    lda_model, corpus, dictionary = apply_lda(df['jobs-desc'].tolist(), num_topics_range)
    topics = interpret_topics(lda_model)
    print("LDA Topics:")
    for label, words in topics:
        print(f"{label}: {words}\n")

    # Visualization
    visualize_lda(lda_model, corpus, dictionary)

# Entry point: a notebook cell executes with __name__ equal to "__main__", so
# running this cell runs the full pipeline. The guard keeps main() from firing
# if this code is ever imported as a module instead.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Hidden Requirements:
Job Title: Data Engineer
Hidden Requirements: ['Docker']

Job Title: Software Intern, Machine Learning (6-month placement)
Hidden Requirements: ['MLOps', 'Docker', 'Kubernetes']

Job Title: Data Engineer
Hidden Requirements: ['Docker']

Job Title: Localization Technical Specialist/ Localization Engineer
Hidden Requirements: ['Docker']

Job Title: Analyst, IT Security Operations & Engineering
Hidden Requirements: ['Kubernetes']

Job Title: System Analyst
Hidden Requirements: ['Kubernetes']

Job Title: AI Engineer Lead (GenAI) | HKD 60K - HKD 80K per month
Hidden Requirements: ['TensorFlow', 'PyTorch']

Job Title: IT Specialist (2 positions) (Job ID: 10728)
Hidden Requirements: ['Kubernetes', 'Docker', 'PyTorch']

Job Title: DevOps Database Administrator (Exchange)
Hidden Requirements: ['Kubernetes']

Job Title: Cloud Security- 6 months contract- FS- 50k P/M
Hidden Requirements: ['Kubernetes']

Job Title: MongoDB Specialist
Hidden Requirements: ['Docker', 'Kubernetes



LDA Topics:
Topic 0: ['experience', 'data', 'work', 'team', 'skill', 'system', 'management', 'application', 'business', 'support', 'project', 'related', 'opportunity', 'ability', 'hong', 'development', 'technology', 'knowledge', 'year', 'requirement', 'process', 'including', 'kong', 'role', 'communication', 'strong', 'job', 'english', 'client', 'information']

Topic 1: ['experience', 'data', 'team', 'business', 'skill', 'support', 'work', 'system', 'management', 'strong', 'project', 'application', 'service', 'kong', 'requirement', 'development', 'customer', 'hong', 'information', 'year', 'client', 'communication', 'ability', 'including', 'opportunity', 'digital', 'role', 'product', 'technology', 'provide']

Topic 2: ['team', 'experience', 'data', 'support', 'service', 'client', 'application', 'skill', 'business', 'work', 'opportunity', 'project', 'strong', 'technology', 'role', 'requirement', 'management', 'including', 'system', 'knowledge', 'communication', 'year', 'security', 'respon

In [None]:
from IPython.display import HTML, display

# Show the saved pyLDAvis page inline. Passing the path through `filename=`
# makes IPython read the file (and fail loudly if it is missing) instead of
# leaving it to guess whether the string is a path or literal markup.
display(HTML(filename="lda_visualization.html"))