# **Topic Modeling**

In [67]:
import pandas as pd

In [68]:
df = pd.read_csv('linkedin_scraped_job_details_1600.csv')

In [69]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data, including 'punkt_tab'
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt_tab', quiet=True) # Download punkt_tab resource


# 1. Handle missing values
df['description'] = df['description'].fillna('')

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    # 2. Tokenize
    tokens = word_tokenize(text.lower())

    # 3. Remove stop words and 4. Lemmatize
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]

    return processed_tokens

# Apply preprocessing to the description column
df['processed_description'] = df['description'].apply(preprocess_text)

display(df[['description', 'processed_description']].head())

Unnamed: 0,description,processed_description
0,Position: Engineer / AI-ML Full Stack Develope...,"[position, engineer, full, stack, developer, n..."
1,,[]
2,Responsibilities\n\nResponsibilities & Qualifi...,"[responsibility, responsibility, qualification..."
3,I‚Äôm helping Digidzign find a top candidate to ...,"[helping, digidzign, find, top, candidate, joi..."
4,üåü Join our Amazing Tech Team! üåü\n\nWe are look...,"[join, amazing, tech, team, looking, enthusias..."


Import the CountVectorizer class, join the processed tokens into strings, instantiate CountVectorizer, and create the document-term matrix.



In [70]:
from sklearn.feature_extraction.text import CountVectorizer

# Join the tokens into strings for CountVectorizer
df['processed_description_str'] = df['processed_description'].apply(lambda tokens: ' '.join(tokens))

# Instantiate CountVectorizer
vectorizer = CountVectorizer(max_df=0.95, min_df=2)

# Create the document-term matrix
dtm = vectorizer.fit_transform(df['processed_description_str'])

print("Shape of the Document-Term Matrix:", dtm.shape)

Shape of the Document-Term Matrix: (1600, 3153)


In [71]:
%pip install gensim



In [72]:
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary
import numpy as np
import gensim

# Convert DTM to a list of lists for Gensim
corpus_list = [[(vectorizer.vocabulary_[word], count) for word, count in zip(vectorizer.get_feature_names_out(), dtm[i, :].toarray().flatten()) if count > 0] for i in range(dtm.shape[0])]

# Create Gensim Dictionary and Corpus
dictionary = Dictionary(df['processed_description'])
corpus = [dictionary.doc2bow(text) for text in df['processed_description']]

**Reasoning**:
Determine the optimal number of topics by training and evaluating LDA models with varying numbers of topics and store the coherence scores.



In [73]:
coherence_scores = []
model_list = []
topics_range = range(5, 21)

for num_topics in topics_range:
    model = LdaMulticore(corpus=corpus,
                         id2word=dictionary,
                         num_topics=num_topics,
                         random_state=100,
                         passes=10,
                         workers=2)
    model_list.append(model)
    coherencemodel = CoherenceModel(model=model, texts=df['processed_description'], dictionary=dictionary, coherence='c_v')
    coherence_scores.append(coherencemodel.get_coherence())

# Find the optimal number of topics
optimal_num_topics_index = np.argmax(coherence_scores)
optimal_num_topics = topics_range[optimal_num_topics_index]

print(f"Optimal number of topics: {optimal_num_topics}")

Optimal number of topics: 6


**Reasoning**:
Train the final LDA model using the optimal number of topics and store the model.



In [74]:
# Train the final LDA model with the optimal number of topics
final_lda_model = LdaMulticore(corpus=corpus,
                               id2word=dictionary,
                               num_topics=optimal_num_topics,
                               random_state=100,
                               passes=10,
                               workers=2)

# Store the trained LDA model (optional, but good practice)
# final_lda_model.save("final_lda_model")

print("Final LDA model trained with optimal number of topics.")

Final LDA model trained with optimal number of topics.


**Reasoning**:
Calculate the coherence score for the final LDA model using the corpus, dictionary, and processed texts, then print the score.



In [75]:
from gensim.models.coherencemodel import CoherenceModel

# Calculate the coherence score of the final_lda_model
coherence_model_lda = CoherenceModel(model=final_lda_model, texts=df['processed_description'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

# Print the coherence score
print(f'Coherence Score: {coherence_lda}')

Coherence Score: 0.3992130248711314


In [76]:
# Get the topic terms and their probabilities from the final LDA model
topic_terms = final_lda_model.print_topics(num_words=10)

# Print the top terms for each topic
print("Top terms for each topic:")
for i, topic in enumerate(topic_terms):
    print(f"Topic {i}: {topic[1]}")


labor_market_demands = [
    "Business operations and accessibility",
    "Web and software development",
    "AI-driven hiring and HR tech",
    "Digital business and customer engagement",
    "Data and analytics",
    "Software engineering and development teams"
]

print("\nIdentified Labor Market Demands (based on topic interpretation):")
for demand in labor_market_demands:
    print(f"- {demand}")

Top terms for each topic:
Topic 0: 0.014*"business" + 0.009*"team" + 0.009*"service" + 0.008*"work" + 0.008*"experience" + 0.007*"status" + 0.006*"application" + 0.006*"technology" + 0.006*"disability" + 0.006*"u"
Topic 1: 0.012*"developer" + 0.012*"work" + 0.011*"team" + 0.010*"web" + 0.009*"application" + 0.008*"role" + 0.008*"experience" + 0.007*"technology" + 0.006*"financial" + 0.006*"system"
Topic 2: 0.014*"application" + 0.013*"job" + 0.013*"employee" + 0.010*"mci" + 0.009*"ai" + 0.009*"employment" + 0.008*"company" + 0.007*"environment" + 0.007*"responsibility" + 0.007*"required"
Topic 3: 0.011*"team" + 0.010*"company" + 0.009*"business" + 0.009*"employee" + 0.009*"people" + 0.009*"digital" + 0.009*"customer" + 0.008*"make" + 0.008*"please" + 0.008*"u"
Topic 4: 0.026*"data" + 0.020*"team" + 0.012*"work" + 0.010*"solution" + 0.008*"experience" + 0.008*"role" + 0.007*"business" + 0.007*"client" + 0.007*"join" + 0.006*"model"
Topic 5: 0.019*"team" + 0.014*"role" + 0.010*"software"

In [77]:
%pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.6/2.6 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


**Reasoning**:
Import pyLDAvis and its gensim module and prepare the data for visualization.



In [78]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Enable notebook mode for pyLDAvis
pyLDAvis.enable_notebook()

# Prepare the visualization data
vis_data = gensimvis.prepare(final_lda_model, corpus, dictionary)

# Display the visualization
display(vis_data)

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


# **Classification Model**

**Reasoning**:
Load the dataset into a pandas DataFrame for analysis.

In [79]:
import pandas as pd

df = pd.read_csv('/content/linkedin_scraped_job_details_1600.csv')
display(df.head())

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0,url,title,company,location,description
0,https://ph.linkedin.com/jobs/view/engineer-ai-...,Engineer / AI-ML Full Stack Developer (LB - 09...,RippedBoxStation,Manila,Position: Engineer / AI-ML Full Stack Develope...
1,https://ph.linkedin.com/jobs/view/investor-rel...,Investor Relations and Corporate Planning- Dat...,BDO Unibank,Metro Manila,
2,https://ph.linkedin.com/jobs/view/data-scienti...,Data Scientist,TekSynap,"National Capital Region, Philippines",Responsibilities\n\nResponsibilities & Qualifi...
3,https://ph.linkedin.com/jobs/view/ai-ml-engine...,AI / ML Engineer,Emma of Torre.ai,Philippines,I‚Äôm helping Digidzign find a top candidate to ...
4,https://ph.linkedin.com/jobs/view/join-our-ama...,Join our Amazing Tech Team | Get 26K salary + ...,Tap Growth ai,Quezon City,üåü Join our Amazing Tech Team! üåü\n\nWe are look...


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [80]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data, including 'punkt_tab'
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt_tab', quiet=True) # Download punkt_tab resource


# 1. Handle missing values
df['description'] = df['description'].fillna('')

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    # 2. Tokenize
    tokens = word_tokenize(text.lower())

    # 3. Remove stop words and 4. Lemmatize
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]

    return processed_tokens

# Apply preprocessing to the description column
df['processed_description'] = df['description'].apply(preprocess_text)

display(df[['description', 'processed_description']].head())

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

Unnamed: 0,description,processed_description
0,Position: Engineer / AI-ML Full Stack Develope...,"[position, engineer, full, stack, developer, n..."
1,,[]
2,Responsibilities\n\nResponsibilities & Qualifi...,"[responsibility, responsibility, qualification..."
3,I‚Äôm helping Digidzign find a top candidate to ...,"[helping, digidzign, find, top, candidate, joi..."
4,üåü Join our Amazing Tech Team! üåü\n\nWe are look...,"[join, amazing, tech, team, looking, enthusias..."


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Perform TF-IDF vectorization using the processed_description
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
X = tfidf_vectorizer.fit_transform(df['processed_description'].apply(lambda x: ' '.join(x)))

print("Shape of the TF-IDF matrix:", X.shape)

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


Shape of the TF-IDF matrix: (1600, 3153)


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [83]:
# Assign topics to documents
topic_assignments = final_lda_model[corpus]

# Get the dominant topic for each document
dominant_topics = [max(doc, key=lambda x: x[1])[0] for doc in topic_assignments]

# Map numerical topic labels to job categories
df['job_category'] = [topic_to_category.get(topic_id) for topic_id in dominant_topics]

# Drop rows where job_category is None (if any topics didn't map)
df.dropna(subset=['job_category'], inplace=True)

# Map the 'job_category' column in the DataFrame df to numerical labels using the topic_to_category dictionary
category_to_topic = {v: k for k, v in topic_to_category.items()}
df['job_category_encoded'] = df['job_category'].map(category_to_topic)

# Split the data into training and testing sets
X = X[df.index] # Align X with the filtered df
X_train, X_test, y_train, y_test = train_test_split(X, df['job_category_encoded'], test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

Shape of X_train: (1280, 3153)
Shape of X_test: (320, 3153)
Shape of y_train: (1280,)
Shape of y_test: (320,)


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [84]:
from sklearn.naive_bayes import MultinomialNB

# Instantiate the Multinomial Naive Bayes model
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

print("Multinomial Naive Bayes model trained successfully.")

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


Multinomial Naive Bayes model trained successfully.


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [85]:
from sklearn.metrics import accuracy_score, classification_report

# 1. Use the trained model to make predictions on the test set X_test
y_pred = model.predict(X_test)

# 2. Calculate the accuracy of the model's predictions
accuracy = accuracy_score(y_test, y_pred)

# 3. Print the accuracy score
print(f"Accuracy: {accuracy:.2f}")

# 4. Generate a classification report
report = classification_report(y_test, y_pred, target_names=category_to_topic.keys())

# 5. Print the classification report
print("Classification Report:")
print(report)

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

Accuracy: 0.95
Classification Report:
                                              precision    recall  f1-score   support

    Business operations & service management       0.96      1.00      0.98       114
                Software and web development       1.00      0.96      0.98        51
   AI-assisted hiring & workforce management       1.00      0.93      0.97        30
      Digital business & customer engagement       1.00      0.79      0.88        19
Data analytics & data-driven decision-making       0.86      1.00      0.92        72
   Software engineering & system development       1.00      0.74      0.85        34

                                    accuracy                           0.95       320
                                   macro avg       0.97      0.90      0.93       320
                                weighted avg       0.95      0.95      0.95       320



  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
