In [None]:
##################
### FOR COLAB ###
##################
# Setup cell intended for Google Colab: upgrades the listed libraries with pip
# and downloads the small English spaCy model through shell escapes (`!`).
# NOTE(review): the installs are unpinned, so results may differ between runs
# as new releases appear -- consider pinning versions.
!pip install --upgrade scipy gensim pyLDAvis pandas nltk
!python -m spacy download en_core_web_sm

# restart kernel
# Sends signal 9 to the kernel's own process, which terminates it immediately;
# presumably this is done so the freshly upgraded packages are the ones imported
# by the next cell. Nothing after this line runs in the same kernel session, so
# the remaining cells have to be executed again after the restart.
import os
os.kill(os.getpid(), 9)

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting pandas
  Downloading pandas-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting numpy<2.5,>=1.23.5 (from scipy)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downlo

In [None]:
import html
import json

import gensim
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import pyLDAvis
import pyLDAvis.gensim_models
from gensim import corpora
from gensim.models import LdaModel
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

class TechStackTopicModeler:
    """Topic-model company tech stacks and render them as an HTML dashboard.

    The constructor reads a CSV that must provide a ``company`` column and a
    comma-separated ``skill`` column, then runs the whole pipeline: skill
    tokenisation, per-company skill counts, TF-IDF + cosine similarity between
    companies, a Gensim LDA model where each company is one document, and the
    pyLDAvis data used by :meth:`generate_dashboard`.
    """

    # The "plotly-latest" CDN alias is frozen at v1.58.5, so an explicit
    # version is requested instead.
    PLOTLY_CDN_URL = "https://cdn.plot.ly/plotly-2.35.2.min.js"

    def __init__(self, file_path, n_topics=10):
        """Load the CSV at ``file_path`` and run every pipeline stage.

        Args:
            file_path: Path of a UTF-8 CSV with ``company`` and ``skill`` columns.
            n_topics: Number of LDA topics to fit.
        """
        self.df = pd.read_csv(file_path, encoding='utf-8')
        self.n_topics = n_topics
        self.preprocess_skills()
        self.compute_skill_counts()
        self.compute_tfidf()
        self.compute_similarity()
        self.create_document_term_matrix()
        self.run_lda()
        self.prepare_visualization()

    def preprocess_skills(self):
        """Normalise the ``skill`` column and split it into skill tokens.

        Adds ``processed_skills`` (cleaned text) and ``tokenized_skills``
        (list of stripped, non-empty tokens) to ``self.df``.
        """
        cleaned = (
            self.df['skill']
            # Missing skills become an empty string so they tokenise to [] rather
            # than crashing on ``float.split``.
            .fillna('')
            .astype(str)
            .str.lower()
            # The ideographic comma is treated as a regular separator.
            .str.replace('、', ',', regex=False)
            # '+' and '#' are kept so that "c", "c++" and "c#" stay distinct skills.
            .str.replace(r'[^\w\s,+#]', '', regex=True)
        )
        self.df['processed_skills'] = cleaned
        self.df['tokenized_skills'] = cleaned.apply(
            lambda text: [token.strip() for token in text.split(',') if token.strip()]
        )

    def compute_skill_counts(self):
        """Build the company x skill count matrix and per-company posting counts.

        Sets ``self.company_freq`` (number of rows per company) and
        ``self.skill_counts`` (rows ordered like ``company_freq``, restricted to
        companies that have at least one skill token).

        Raises:
            ValueError: If no row yields a (company, skill) pair.
        """
        exploded = self.df.explode('tokenized_skills').dropna(
            subset=['company', 'tokenized_skills']
        )
        if exploded.empty:
            raise ValueError("No skills found: cannot build a company/skill matrix.")

        counts = (
            exploded.groupby(['company', 'tokenized_skills'])
            .size()
            .unstack(fill_value=0)
        )
        counts = counts[counts.sum(axis=1) > 0]

        self.company_freq = self.df['company'].value_counts()
        # Companies whose postings list no skills are absent from ``counts``;
        # selecting them with .loc would raise KeyError, so they are filtered out.
        ordered = self.company_freq.index[self.company_freq.index.isin(counts.index)]
        self.skill_counts = counts.loc[ordered]

    def compute_tfidf(self):
        """Compute TF-IDF vectors from the company x skill count matrix."""
        tfidf = TfidfTransformer()
        self.skill_tfidf = tfidf.fit_transform(self.skill_counts)

    def compute_similarity(self):
        """Compute the pairwise cosine similarity between company TF-IDF vectors."""
        self.similarity_matrix = cosine_similarity(self.skill_tfidf).astype(np.float64)

    def create_document_term_matrix(self):
        """Build the Gensim dictionary and bag-of-words corpus (one doc per company).

        Sets ``self.token_lists``, ``self.company_names``, ``self.dictionary`` and
        ``self.corpus``; documents follow the row order of ``self.skill_counts``.
        """
        valid_companies = self.skill_counts.index

        company_skills = (
            self.df.groupby('company')['tokenized_skills']
            .apply(lambda lists: [item for sublist in lists for item in sublist])
            .loc[valid_companies]
        )

        self.token_lists = company_skills.tolist()
        self.company_names = company_skills.index.tolist()

        self.dictionary = corpora.Dictionary(self.token_lists)
        self.corpus = [self.dictionary.doc2bow(tokens) for tokens in self.token_lists]

    def run_lda(self):
        """Train the Gensim LDA model on the company corpus (fixed random seed)."""
        self.lda_model = LdaModel(
            corpus=self.corpus,
            id2word=self.dictionary,
            num_topics=self.n_topics,
            random_state=42,
            passes=10,
            alpha='auto'
        )

    def prepare_visualization(self):
        """Prepare the pyLDAvis data for the trained model.

        ``sort_topics=False`` is passed so pyLDAvis keeps the model's topic order.
        """
        self.vis_data = pyLDAvis.gensim_models.prepare(
            self.lda_model,
            self.corpus,
            self.dictionary,
            sort_topics=False
        )

    def get_similar_companies(self, company, top_k=5):
        """Return the ``top_k`` companies most similar to ``company``.

        Args:
            company: Company label as it appears in ``self.skill_counts.index``.
            top_k: Maximum number of neighbours to return.

        Returns:
            A list of ``(company_name, similarity)`` tuples sorted by decreasing
            similarity; empty when ``company`` is unknown.
        """
        if company not in self.skill_counts.index:
            return []

        idx = self.skill_counts.index.get_loc(company)
        sim_scores = np.asarray(self.similarity_matrix[idx])

        # Stable descending order. The company itself is removed by position
        # rather than by assuming it sorts first: another company with an
        # identical stack also scores 1.0 and could otherwise displace it.
        ranked = np.argsort(-sim_scores, kind='stable')
        ranked = ranked[ranked != idx][:max(top_k, 0)]

        top_companies = self.skill_counts.index[ranked].tolist()
        return [(comp, float(sim_scores[pos])) for comp, pos in zip(top_companies, ranked)]

    def _build_company_data(self):
        """Collect topic distribution, neighbours and posting count per company."""
        company_data = {}
        for idx, company in enumerate(self.company_names):
            doc = self.corpus[idx]
            topic_dist = dict(self.lda_model.get_document_topics(doc, minimum_probability=0))
            company_data[company] = {
                "topics": [float(topic_dist.get(i, 0.0)) for i in range(self.n_topics)],
                "similar": self.get_similar_companies(company),
                "job_count": int(self.company_freq.get(company, 0)),
            }
        return company_data

    def _render_company_options(self):
        """Render the ``<option>`` elements of the company selector, HTML-escaped."""
        options = []
        for company in self.company_names:
            label = html.escape(str(company), quote=True)
            job_count = int(self.company_freq.get(company, 0))
            options.append(
                f'<option value="{label}">{label} ({job_count} job postings)</option>'
            )
        return ''.join(options)

    @staticmethod
    def _serialize_for_script(payload):
        """Serialise ``payload`` as JSON that is safe inside an inline ``<script>``."""
        # '<', '>' and '&' can only occur inside JSON strings, where the \uXXXX
        # form is equivalent; escaping them prevents a value such as "</script>"
        # from terminating the script element.
        return (
            json.dumps(payload)
            .replace('<', '\\u003c')
            .replace('>', '\\u003e')
            .replace('&', '\\u0026')
        )

    def generate_dashboard(self, output_file='tech_dashboard.html'):
        """Write the HTML dashboard (pyLDAvis view + per-company analysis).

        Args:
            output_file: Path of the HTML file to create (written as UTF-8).
        """
        lda_html = pyLDAvis.prepared_data_to_html(self.vis_data)
        # Computed outside the f-string: these expressions contain backslashes
        # and quotes that are awkward inside a replacement field.
        company_json = self._serialize_for_script(self._build_company_data())
        options_html = self._render_company_options()

        # create html content
        html_content = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset="utf-8">
            <title>Company Tech Stack Analysis</title>
            <script src="{self.PLOTLY_CDN_URL}"></script>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 20px; }}
                .container {{ display: flex; flex-direction: column; }}
                .header {{ text-align: center; margin-bottom: 20px; }}
                .section {{ margin-bottom: 30px; border: 1px solid #ddd; padding: 15px; border-radius: 5px; }}
                .chart-container {{ height: 500px; margin-bottom: 20px; }}
                #topic-viz {{ height: 700px; }}
                .controls {{ margin: 15px 0; }}
                .similar-list {{ margin-top: 10px; }}
            </style>
        </head>
        <body>
            <div class="container">
                <div class="header">
                    <h1>Company Tech Stack Analysis</h1>
                    <p>Interactive visualization of technology stacks using topic modeling</p>
                </div>

                <div class="section">
                    <h2>Topic Model Visualization</h2>
                    <div id="topic-viz">{lda_html}</div>
                </div>

                <div class="section">
                    <h2>Company Analysis</h2>
                    <div class="controls">
                        <label for="company-select">Select Company:</label>
                        <select id="company-select" onchange="updateCompany()">
                            {options_html}
                        </select>
                    </div>

                    <div id="topic-distribution" class="chart-container"></div>

                    <div>
                        <h3>Similar Companies</h3>
                        <div id="similar-companies" class="similar-list"></div>
                    </div>
                </div>
            </div>

            <script>
                // Precomputed company data
                const companyData = {company_json};

                // Initialize with first company; if the document has already
                // finished loading, DOMContentLoaded will not fire again.
                if (document.readyState === 'loading') {{
                    document.addEventListener('DOMContentLoaded', updateCompany);
                }} else {{
                    updateCompany();
                }}

                function updateCompany() {{
                    const company = document.getElementById('company-select').value;
                    const data = companyData[company];
                    if (!data) {{
                        return;
                    }}

                    // Update topic distribution chart
                    const topicDist = {{
                        x: data.topics.map((_, i) => `Topic ${{i+1}}`),
                        y: data.topics,
                        type: 'bar',
                        marker: {{ color: '#1f77b4' }}
                    }};

                    const layout = {{
                        title: `Topic Distribution for ${{company}} (${{data.job_count}} job postings)`,
                        xaxis: {{ title: 'Topics' }},
                        yaxis: {{ title: 'Probability' }}
                    }};

                    Plotly.newPlot('topic-distribution', [topicDist], layout);

                    // Update similar companies list (textContent keeps names as plain text)
                    const list = document.createElement('ul');
                    data.similar.forEach(([comp, score]) => {{
                        const item = document.createElement('li');
                        item.textContent = `${{comp}} (Similarity: ${{score.toFixed(3)}})`;
                        list.appendChild(item);
                    }});

                    const container = document.getElementById('similar-companies');
                    container.innerHTML = '';
                    container.appendChild(list);
                }}
            </script>
        </body>
        </html>
        """

        # save the dashboard
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(html_content)

if __name__ == "__main__":
    # Fit the topic model on the preprocessed postings, then write the dashboard.
    input_csv = 'preprocessed_linkedin_data.csv'
    dashboard_path = 'tech_stack_dashboard.html'

    modeler = TechStackTopicModeler(input_csv, n_topics=5)
    modeler.generate_dashboard(dashboard_path)
    print(f"Dashboard generated: {dashboard_path}")

Dashboard generated: tech_stack_dashboard.html


In [None]:
from IPython.display import HTML, display

# A positional string is interpreted by HTML(...) as the markup itself, so the
# path has to be given through `filename=` for the saved dashboard file to be
# loaded and rendered.
display(HTML(filename="tech_stack_dashboard.html"))