In [1]:
from collections import Counter

In [2]:
import pandas as pd
from scipy.sparse import dok_matrix, save_npz
import numpy as np
import pickle
from tqdm import tqdm
import os

def _split_skills(cell, prefix=''):
    """Split one ';'-separated skills cell into clean skill names.

    Args:
        cell: Raw cell value (a string, or NaN/None when the field is empty).
        prefix: Text prepended to every returned name.

    Returns:
        List of whitespace-stripped, non-empty names with ``prefix``
        prepended; an empty list when the cell is missing.
    """
    if pd.isna(cell):
        return []
    return [prefix + name for name in map(str.strip, str(cell).split(';')) if name]


def _clean_profession(cell):
    """Return the stripped profession name, or None if missing or blank."""
    if pd.isna(cell):
        return None
    return str(cell).strip() or None


def _read_vacancy_chunks(file_path, chunk_size):
    """Open the '|'-separated vacancy file as an iterator of DataFrame chunks.

    Both passes share this reader so they always see the same four columns.
    ``dtype=str`` keeps every field as text, so a chunk whose skill column is
    entirely empty or numeric is still handled as strings.
    """
    return pd.read_csv(
        file_path,
        sep='|',
        header=None,
        names=['id', 'profession', 'hard_skills', 'soft_skills'],
        dtype=str,
        chunksize=chunk_size,
        low_memory=False,
    )


def build_profession_skill_matrix(file_path, output_dir, chunk_size=50000):
    """Build and save a profession x skill count matrix from a large file.

    The input is read twice in chunks: the first pass collects the unique
    professions and skills, the second pass counts how often each skill is
    listed for each profession. Soft skills are stored with a "SOFT_" prefix
    so they cannot collide with hard skills of the same name.

    Files written into ``output_dir``:
        profession_skills_matrix.npz  - CSR matrix of int32 counts
        profession_to_idx.pkl / idx_to_profession.pkl
        skill_to_idx.pkl / idx_to_skill.pkl

    Args:
        file_path: Path to a headerless '|'-separated file with the columns
            id, profession, hard_skills, soft_skills (skills ';'-separated).
        output_dir: Directory for the results; created if it does not exist.
        chunk_size: Number of rows read per chunk.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    from collections import Counter
    from scipy.sparse import coo_matrix

    os.makedirs(output_dir, exist_ok=True)

    # 1. First pass: collect the unique professions and skills.
    unique_professions = set()
    unique_skills = set()

    print("üîç –ü–µ—Ä–≤—ã–π –ø—Ä–æ—Ö–æ–¥: —Å–±–æ—Ä —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö –ø—Ä–æ—Ñ–µ—Å—Å–∏–π –∏ –Ω–∞–≤—ã–∫–æ–≤...")
    for chunk in tqdm(_read_vacancy_chunks(file_path, chunk_size)):
        for prof_cell, hard_cell, soft_cell in zip(
                chunk['profession'], chunk['hard_skills'], chunk['soft_skills']):
            profession = _clean_profession(prof_cell)
            # Blank professions are skipped here as well as in the second
            # pass, so the matrix never gets an all-zero row for ''.
            if profession is not None:
                unique_professions.add(profession)
            # Skills are collected even from rows without a profession.
            unique_skills.update(_split_skills(hard_cell))
            unique_skills.update(_split_skills(soft_cell, "SOFT_"))

    # 2. Index dictionaries. Sorting makes the numbering reproducible
    # between runs (plain set iteration order of strings is not).
    profession_to_idx = {prof: idx for idx, prof in enumerate(sorted(unique_professions))}
    skill_to_idx = {skill: idx for idx, skill in enumerate(sorted(unique_skills))}

    # 3. Second pass: count (profession, skill) occurrences.
    print("\nüîß –í—Ç–æ—Ä–æ–π –ø—Ä–æ—Ö–æ–¥: –ø–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ –º–∞—Ç—Ä–∏—Ü—ã...")
    pair_counts = Counter()
    # Can only become non-empty if the file changes between the two passes.
    skipped_skills = set()
    for chunk in tqdm(_read_vacancy_chunks(file_path, chunk_size)):
        for prof_cell, hard_cell, soft_cell in zip(
                chunk['profession'], chunk['hard_skills'], chunk['soft_skills']):
            p_idx = profession_to_idx.get(_clean_profession(prof_cell))
            if p_idx is None:
                continue
            row_skills = _split_skills(hard_cell) + _split_skills(soft_cell, "SOFT_")
            for skill in row_skills:
                s_idx = skill_to_idx.get(skill)
                if s_idx is None:
                    skipped_skills.add(skill)
                else:
                    pair_counts[p_idx, s_idx] += 1

    # Report skills that were not present in the dictionary.
    if skipped_skills:
        print(f"\n‚ö†Ô∏è –ü—Ä–æ–ø—É—â–µ–Ω–æ {len(skipped_skills)} –Ω–∞–≤—ã–∫–æ–≤, –æ—Ç—Å—É—Ç—Å—Ç–≤—É—é—â–∏—Ö –≤ —Å–ª–æ–≤–∞—Ä–µ")
        print("–ü—Ä–∏–º–µ—Ä—ã –ø—Ä–æ–ø—É—â–µ–Ω–Ω—ã—Ö –Ω–∞–≤—ã–∫–æ–≤:", list(skipped_skills)[:5])

    # Assemble the sparse matrix in one step from the aggregated counts
    # instead of updating a DOK matrix cell by cell.
    n_pairs = len(pair_counts)
    row_idx = np.fromiter((p for p, _ in pair_counts), dtype=np.int64, count=n_pairs)
    col_idx = np.fromiter((s for _, s in pair_counts), dtype=np.int64, count=n_pairs)
    values = np.fromiter(pair_counts.values(), dtype=np.int32, count=n_pairs)

    # 4. Save the results.
    print("\nüíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤...")
    csr_matrix = coo_matrix(
        (values, (row_idx, col_idx)),
        shape=(len(profession_to_idx), len(skill_to_idx)),
        dtype=np.int32,
    ).tocsr()
    save_npz(os.path.join(output_dir, "profession_skills_matrix.npz"), csr_matrix)

    # Forward and reverse index dictionaries.
    mappings = {
        "profession_to_idx.pkl": profession_to_idx,
        "skill_to_idx.pkl": skill_to_idx,
        "idx_to_profession.pkl": {idx: prof for prof, idx in profession_to_idx.items()},
        "idx_to_skill.pkl": {idx: skill for skill, idx in skill_to_idx.items()},
    }
    for filename, mapping in mappings.items():
        with open(os.path.join(output_dir, filename), 'wb') as f:
            pickle.dump(mapping, f)

    print(f"‚úÖ –ì–æ—Ç–æ–≤–æ! –ú–∞—Ç—Ä–∏—Ü–∞ —Ä–∞–∑–º–µ—Ä–æ–º {csr_matrix.shape[0]} –ø—Ä–æ—Ñ–µ—Å—Å–∏–π √ó {csr_matrix.shape[1]} –Ω–∞–≤—ã–∫–æ–≤")
    print(f"üìÅ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤: {output_dir}")

# Example usage: build the matrix from the extracted-skills dump.
if __name__ == "__main__":
    # Rows are read 100000 at a time (overrides the default chunk size).
    build_profession_skill_matrix(
        "/data/extracted_skills11.txt",
        "output_matrix",
        chunk_size=100000,
    )

üîç –ü–µ—Ä–≤—ã–π –ø—Ä–æ—Ö–æ–¥: —Å–±–æ—Ä —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö –ø—Ä–æ—Ñ–µ—Å—Å–∏–π –∏ –Ω–∞–≤—ã–∫–æ–≤...


1it [00:01,  1.37s/it]



üîß –í—Ç–æ—Ä–æ–π –ø—Ä–æ—Ö–æ–¥: –ø–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ –º–∞—Ç—Ä–∏—Ü—ã...


1it [00:20, 20.78s/it]



üíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤...
‚úÖ –ì–æ—Ç–æ–≤–æ! –ú–∞—Ç—Ä–∏—Ü–∞ —Ä–∞–∑–º–µ—Ä–æ–º 91 –ø—Ä–æ—Ñ–µ—Å—Å–∏–π √ó 118120 –Ω–∞–≤—ã–∫–æ–≤
üìÅ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤: output_matrix


In [3]:
!pip install graphistry

Collecting graphistry
  Downloading graphistry-0.41.0-py3-none-any.whl.metadata (23 kB)
Collecting palettable>=3.0 (from graphistry)
  Downloading palettable-3.3.3-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting squarify (from graphistry)
  Downloading squarify-0.4.4-py3-none-any.whl.metadata (600 bytes)
Downloading graphistry-0.41.0-py3-none-any.whl (332 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m332.4/332.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading palettable-3.3.3-py2.py3-none-any.whl (332 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m332.3/332.3 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading squarify-0.4.4-py3-none-any.whl (4.1 kB)
Installing collected packages: squarify, palettable, graphistry
Successfully installed graphistry-0.41.0 palett

In [4]:
import numpy as np
from scipy.sparse import load_npz
import pandas as pd
import graphistry
import pickle
from tqdm import tqdm

# Load the saved count matrix and the skill index dictionaries.
matrix = load_npz("output_matrix/profession_skills_matrix.npz")
with open("output_matrix/skill_to_idx.pkl", "rb") as fh:
    skill_to_idx = pickle.load(fh)
with open("output_matrix/idx_to_skill.pkl", "rb") as fh:
    idx_to_skill = pickle.load(fh)

# CSC layout gives efficient access by column, i.e. by skill.
csc_matrix = matrix.tocsc()

# Overall statistics: number of rows and the per-column sum of counts.
total_professions = matrix.shape[0]
skill_frequencies = np.asarray(csc_matrix.sum(axis=0)).ravel()

# NPMI between two skills, measured over professions.
def calculate_npmi(skill_i, skill_j):
    """Return the normalized PMI of two skills across professions.

    A skill counts as present in a profession when its cell in the global
    ``csc_matrix`` is non-zero. Probabilities are therefore fractions of
    professions and stay within [0, 1]. Using the raw count sums instead
    would let them exceed 1 and make ``-log(p_ij)`` zero or negative.

    Args:
        skill_i: Column index of the first skill in ``csc_matrix``.
        skill_j: Column index of the second skill in ``csc_matrix``.

    Returns:
        float: 0.0 when the skills never share a profession (or there are no
        professions), 1.0 when both occur in every profession (the limit
        of NPMI for complete co-occurrence), otherwise
        ``log(p_ij / (p_i * p_j)) / -log(p_ij)``.
    """
    # Row (profession) indices in which each skill is present.
    rows_i = csc_matrix[:, skill_i].nonzero()[0]
    rows_j = csc_matrix[:, skill_j].nonzero()[0]

    # Joint occurrence: professions containing both skills.
    co_occurrence = np.intersect1d(rows_i, rows_j).size
    if co_occurrence == 0 or total_professions == 0:
        return 0.0

    # Probabilities
    p_i = rows_i.size / total_professions
    p_j = rows_j.size / total_professions
    p_ij = co_occurrence / total_professions

    # -log(1) == 0 would make the normalization divide by zero.
    if p_ij >= 1.0:
        return 1.0

    # PMI and NPMI
    pmi = np.log(p_ij / (p_i * p_j))
    return float(pmi / (-np.log(p_ij)))



In [5]:
import pandas as pd
import numpy as np
from collections import defaultdict
import pickle
from tqdm import tqdm
import os
import math
from scipy.sparse import load_npz

# Load the precomputed matrix and index dictionaries.
matrix = load_npz("output_matrix/profession_skills_matrix.npz")
with open("output_matrix/skill_to_idx.pkl", "rb") as fh:
    skill_to_idx = pickle.load(fh)
with open("output_matrix/idx_to_skill.pkl", "rb") as fh:
    idx_to_skill = pickle.load(fh)
with open("output_matrix/profession_to_idx.pkl", "rb") as fh:
    profession_to_idx = pickle.load(fh)

# Frequencies: row count, per-column sums and per-row sums of the matrix.
total_professions = matrix.shape[0]
skill_frequencies = np.asarray(matrix.sum(axis=0)).ravel()
profession_frequencies = np.asarray(matrix.sum(axis=1)).ravel()
# NPMI from raw occurrence counts.
def calculate_npmi(freq_i, freq_j, co_occurrence, total):
    """Return the normalized pointwise mutual information of two items.

    Args:
        freq_i: Number of observations containing the first item.
        freq_j: Number of observations containing the second item.
        co_occurrence: Number of observations containing both items.
        total: Total number of observations.

    Returns:
        float: ``log(p_ij / (p_i * p_j)) / -log(p_ij)``, with these
        degenerate cases handled instead of raising ZeroDivisionError:
        0.0 when ``total`` or any probability is not positive, and 1.0 when
        ``p_ij >= 1`` (both items in every observation, where ``-log(p_ij)``
        is zero; 1.0 is the limit of NPMI for complete co-occurrence).
    """
    if total <= 0 or co_occurrence == 0:
        return 0.0

    p_i = freq_i / total
    p_j = freq_j / total
    p_ij = co_occurrence / total

    # Avoid taking the logarithm of zero or negative probabilities.
    if p_i <= 0 or p_j <= 0 or p_ij <= 0:
        return 0.0

    # log(1) == 0 would make the normalization divide by zero.
    if p_ij >= 1:
        return 1.0

    pmi = math.log(p_ij / (p_i * p_j))
    return pmi / (-math.log(p_ij))

# Settings shared by the cells below.
min_cooccurrence = 10  # minimum joint occurrence for an edge to be kept
chunk_size = 100_000
file_path = "/data/extracted_skills11.txt"

print("‚öôÔ∏è –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –∑–∞–≤–µ—Ä—à–µ–Ω–∞")

‚öôÔ∏è –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –∑–∞–≤–µ—Ä—à–µ–Ω–∞


In [6]:
# Collect the statistics needed for NPMI.
from itertools import combinations, product


def _unique_skills(cell, prefix=""):
    """Return the distinct, stripped, non-empty skills of one cell.

    ``cell`` is a ';'-separated string or a missing value. Duplicates inside
    one vacancy are dropped (first occurrence kept) so that a repeated skill
    neither inflates its frequency nor pairs with itself.
    """
    if pd.isna(cell):
        return []
    names = (part.strip() for part in str(cell).split(';'))
    return list(dict.fromkeys(prefix + name for name in names if name))


soft_soft_edges = Counter()
soft_hard_edges = Counter()
profession_soft_edges = Counter()

soft_freq = Counter()
hard_freq = Counter()
profession_freq = Counter()
total_vacancies = 0

# Single pass over the file: frequencies and co-occurrence counts.
print("üîç –ü–µ—Ä–≤—ã–π –ø—Ä–æ—Ö–æ–¥: —Å–±–æ—Ä —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∏...")
reader = pd.read_csv(
    file_path,
    sep='|',
    header=None,
    names=['id', 'profession', 'hard_skills', 'soft_skills'],
    chunksize=chunk_size,
    low_memory=False
)

for chunk in tqdm(reader):
    # zip over the columns avoids building a Series per row (iterrows).
    for prof_cell, hard_cell, soft_cell in zip(
            chunk['profession'], chunk['hard_skills'], chunk['soft_skills']):
        total_vacancies += 1

        profession = str(prof_cell).strip() if pd.notna(prof_cell) else None
        if profession:
            profession_freq[profession] += 1

        # Each skill is counted at most once per vacancy.
        hard_skills = _unique_skills(hard_cell)
        soft_skills = _unique_skills(soft_cell, "SOFT_")
        hard_freq.update(hard_skills)
        soft_freq.update(soft_skills)

        # Soft-soft edges: unordered pairs, stored as sorted tuples.
        soft_soft_edges.update(combinations(sorted(soft_skills), 2))

        # Soft-hard edges: every (soft, hard) combination of the vacancy.
        soft_hard_edges.update(product(soft_skills, hard_skills))

        # Profession-soft edges.
        if profession:
            profession_soft_edges.update((profession, soft) for soft in soft_skills)

# Drop rare edges: keep only pairs seen at least `min_cooccurrence` times.
# The three counters are replaced by plain dicts, as before.
soft_soft_edges, soft_hard_edges, profession_soft_edges = (
    {pair: count for pair, count in edges.items() if count >= min_cooccurrence}
    for edges in (soft_soft_edges, soft_hard_edges, profession_soft_edges)
)

# –†–∞—Å—á–µ—Ç NPMI –∏ —Å–æ–∑–¥–∞–Ω–∏–µ DataFrame
def create_npmi_df(edges_dict, freq_x, freq_y):
    rows = []
    for (x, y), co_occur in edges_dict.items():
        npmi = calculate_npmi(freq_x[x], freq_y[y], co_occur, total_vacancies)
        rows.append({
            "source": x,
            "target": y,
            "co_occurrence": co_occur,
            "npmi": npmi
        })
    return pd.DataFrame(rows)

# Build the three NPMI edge tables.
print("\nüìä –†–∞—Å—á–µ—Ç NPMI –¥–ª—è –≥—Ä–∞—Ñ–æ–≤...")
df_soft_soft = create_npmi_df(soft_soft_edges, soft_freq, soft_freq)
df_soft_hard = create_npmi_df(soft_hard_edges, soft_freq, hard_freq)
df_profession_soft = create_npmi_df(profession_soft_edges, profession_freq, soft_freq)

# Report the number of edges in each graph.
print("‚úÖ –ì—Ä–∞—Ñ—ã –ø–æ—Å—Ç—Ä–æ–µ–Ω—ã:")
for label, frame in (
    ("Soft-Soft", df_soft_soft),
    ("Soft-Hard", df_soft_hard),
    ("Profession-Soft", df_profession_soft),
):
    print(f"{label}: {len(frame)} —Ä–µ–±–µ—Ä")

üîç –ü–µ—Ä–≤—ã–π –ø—Ä–æ—Ö–æ–¥: —Å–±–æ—Ä —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∏...


1it [00:14, 14.57s/it]


üìä –†–∞—Å—á–µ—Ç NPMI –¥–ª—è –≥—Ä–∞—Ñ–æ–≤...
‚úÖ –ì—Ä–∞—Ñ—ã –ø–æ—Å—Ç—Ä–æ–µ–Ω—ã:
Soft-Soft: 4323 —Ä–µ–±–µ—Ä
Soft-Hard: 16802 —Ä–µ–±–µ—Ä
Profession-Soft: 1800 —Ä–µ–±–µ—Ä





In [13]:
def write_to_file(soft_soft, soft_hard, profession_soft):
    """Write the three edge tables as UTF-8 CSV files in the working directory.

    Args:
        soft_soft: DataFrame saved to the file ``df_soft_soft``.
        soft_hard: DataFrame saved to the file ``df_soft_hard``.
        profession_soft: DataFrame saved to the file ``df_profession_soft``.

    A confirmation line is printed after each file is written.
    """
    targets = (
        ("df_soft_soft", soft_soft),
        ("df_soft_hard", soft_hard),
        ("df_profession_soft", profession_soft),
    )
    for filename, frame in targets:
        frame.to_csv(filename, encoding='utf-8')
        print(f"–ó–∞–ø–∏—Å—å –≤ —Ñ–∞–π–ª {filename} —É—Å–ø–µ—à–Ω–∞!")
# Persist the three edge tables computed above.
write_to_file(
    soft_soft=df_soft_soft,
    soft_hard=df_soft_hard,
    profession_soft=df_profession_soft,
)

–ó–∞–ø–∏—Å—å –≤ —Ñ–∞–π–ª df_soft_soft —É—Å–ø–µ—à–Ω–∞!
–ó–∞–ø–∏—Å—å –≤ —Ñ–∞–π–ª df_soft_hard —É—Å–ø–µ—à–Ω–∞!
–ó–∞–ø–∏—Å—å –≤ —Ñ–∞–π–ª df_profession_soft —É—Å–ø–µ—à–Ω–∞!


In [8]:
# Maximum number of top-NPMI edges kept in each graph below.
number_of_edges = 100

Графики ниже иногда некорректно отображают цвет узлов и вес рёбер. Чтобы всё отображалось правильно, можно открыть график в отдельном окне (верхний правый угол, третья кнопка справа).

In [9]:

# Prepare the node table for the Soft-Soft graph.
import os

nodes_set = set(df_soft_soft['source']).union(set(df_soft_soft['target']))
nodes_df = pd.DataFrame({"node": list(nodes_set)})
nodes_df["frequency"] = nodes_df["node"].map(soft_freq)
nodes_df["type"] = "soft"

# Keep only strong links: NPMI above 0.2, then the top `number_of_edges`.
filtered_edges = df_soft_soft[df_soft_soft["npmi"] > 0.2] \
    .sort_values("npmi", ascending=False) \
    .head(number_of_edges)

# Restrict the node table to nodes that still have an edge.
top_nodes = set(filtered_edges['source']).union(set(filtered_edges['target']))
nodes_df = nodes_df[nodes_df['node'].isin(top_nodes)]

# Build the graph.
# SECURITY: the Graphistry personal key must not live in the notebook.
# It is read from the environment instead; a key pair that was ever
# committed in source should be treated as leaked and rotated.
graphistry_key_id = os.environ.get("GRAPHISTRY_PERSONAL_KEY_ID")
graphistry_key_secret = os.environ.get("GRAPHISTRY_PERSONAL_KEY_SECRET")
if not (graphistry_key_id and graphistry_key_secret):
    raise RuntimeError(
        "Set GRAPHISTRY_PERSONAL_KEY_ID and GRAPHISTRY_PERSONAL_KEY_SECRET "
        "in the environment before plotting"
    )
graphistry.register(
    api=3,
    protocol="https",
    server="hub.graphistry.com",
    personal_key_id=graphistry_key_id,
    personal_key_secret=graphistry_key_secret,
)
g = graphistry.nodes(nodes_df, "node") \
    .edges(filtered_edges, "source", "target") \
    .bind(
        point_size="frequency",
        edge_weight="npmi",
        edge_title="co_occurrence"
    )
print("–í–ò–ó–£–ê–õ–ò–ó–ê–¶–ò–Ø Soft-Soft –≥—Ä–∞—Ñ–∞ –ø–æ –¢–û–ü-"+str(number_of_edges)+ " —Å–≤—è–∑—è–º –ø–æ NPMI")

g.plot()

–í–ò–ó–£–ê–õ–ò–ó–ê–¶–ò–Ø Soft-Soft –≥—Ä–∞—Ñ–∞ –ø–æ –¢–û–ü-100 —Å–≤—è–∑—è–º –ø–æ NPMI


In [10]:
# Prepare the node table: every endpoint of a Soft-Hard edge is a node.
nodes_set = set(df_soft_hard['source']) | set(df_soft_hard['target'])
nodes_df = pd.DataFrame({"node": list(nodes_set)})

# Node type and frequency lookup for the Soft-Hard graph.
def get_node_type(node):
    """Return ``(type, frequency)`` for a node name.

    Names starting with "SOFT_" are soft skills looked up in ``soft_freq``;
    all other names are hard skills looked up in ``hard_freq``. A name missing
    from its counter gets frequency 1.
    """
    is_soft = node.startswith("SOFT_")
    counts = soft_freq if is_soft else hard_freq
    return ("soft" if is_soft else "hard"), counts.get(node, 1)

# Attach the type and frequency of every node.
type_and_frequency = nodes_df["node"].apply(
    lambda name: pd.Series(get_node_type(name))
)
nodes_df[["type", "frequency"]] = type_and_frequency

# Keep edges with NPMI above 0.2, strongest first, at most `number_of_edges`.
strong_edges = df_soft_hard.loc[df_soft_hard["npmi"] > 0.2]
filtered_edges = strong_edges.sort_values("npmi", ascending=False).head(number_of_edges)

# Keep only the nodes that still have an edge.
top_nodes = set(filtered_edges['source']) | set(filtered_edges['target'])
nodes_df = nodes_df.loc[nodes_df['node'].isin(top_nodes)]

# Build the graph: soft skills in blue, hard skills in orange.
g = (
    graphistry.nodes(nodes_df, "node")
    .edges(filtered_edges, "source", "target")
    .bind(
        point_color="type",
        point_size="frequency",
        point_title="node",
        edge_weight="npmi",
        edge_title="co_occurrence",
    )
    .encode_point_color(
        "type",
        categorical_mapping={'soft': 'blue', 'hard': 'orange'},
        default_mapping='gray',
    )
)

print("–í–ò–ó–£–ê–õ–ò–ó–ê–¶–ò–Ø Soft-Hard –≥—Ä–∞—Ñ–∞ –ø–æ –¢–û–ü-"+str(number_of_edges)+ " —Å–≤—è–∑—è–º –ø–æ NPMI")
g.plot()

–í–ò–ó–£–ê–õ–ò–ó–ê–¶–ò–Ø Soft-Hard –≥—Ä–∞—Ñ–∞ –ø–æ –¢–û–ü-100 —Å–≤—è–∑—è–º –ø–æ NPMI


In [11]:
# Prepare the node table: every endpoint of a Profession-Soft edge is a node.
nodes_set = set(df_profession_soft['source']) | set(df_profession_soft['target'])
nodes_df = pd.DataFrame({"node": list(nodes_set)})

# Node type and frequency lookup for the Profession-Soft graph.
def get_node_type(node):
    """Return ``(type, frequency)`` for a node name.

    A name found in ``profession_freq`` is a profession; any other name is
    treated as a soft skill and looked up in ``soft_freq`` (frequency 1 when
    it is missing there).
    """
    if node in profession_freq:
        kind, counts = "profession", profession_freq
    else:
        kind, counts = "soft", soft_freq
    return kind, counts.get(node, 1)

# Attach the type and frequency of every node.
type_and_frequency = nodes_df["node"].apply(
    lambda name: pd.Series(get_node_type(name))
)
nodes_df[["type", "frequency"]] = type_and_frequency

# Keep edges with NPMI above 0.2, strongest first, at most `number_of_edges`.
strong_edges = df_profession_soft.loc[df_profession_soft["npmi"] > 0.2]
filtered_edges = strong_edges.sort_values("npmi", ascending=False).head(number_of_edges)

# Keep only the nodes that still have an edge.
top_nodes = set(filtered_edges['source']) | set(filtered_edges['target'])
nodes_df = nodes_df.loc[nodes_df['node'].isin(top_nodes)]

# Build the graph: professions in red, soft skills in blue.
g = (
    graphistry.nodes(nodes_df, "node")
    .edges(filtered_edges, "source", "target")
    .bind(
        point_color="type",
        point_size="frequency",
        point_title="node",
        edge_weight="npmi",
        edge_title="co_occurrence",
    )
    .encode_point_color(
        "type",
        categorical_mapping={'profession': 'red', 'soft': 'blue'},
        default_mapping='gray',
    )
)

print("–í–ò–ó–£–ê–õ–ò–ó–ê–¶–ò–Ø Profession-Soft –≥—Ä–∞—Ñ–∞ –ø–æ –¢–û–ü-"+str(number_of_edges)+ " —Å–≤—è–∑—è–º –ø–æ NPMI")
g.plot()

–í–ò–ó–£–ê–õ–ò–ó–ê–¶–ò–Ø Profession-Soft –≥—Ä–∞—Ñ–∞ –ø–æ –¢–û–ü-100 —Å–≤—è–∑—è–º –ø–æ NPMI
