In [4]:
import pandas as pd
import re
from tqdm import tqdm


df = pd.read_csv("train.csv")


def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

# ----------------------------
# 3. Extract skills from text
# ----------------------------
def extract_skills(text):
    # لو عندك عمود جاهز اسمه skills، ممكن نستخدمه بدلاً من دي
    if pd.isna(text):
        return []
    
    text = clean_text(text)
    tokens = text.split()
    
    # مثال بسيط لقائمة skills keywords (ممكن نحدثها حسب الداتا بتاعتك)
    skill_keywords = [
        'python', 'sql', 'excel', 'powerbi', 'tableau', 'machine', 'learning',
        'pandas', 'numpy', 'communication', 'statistics', 'ai', 'deep', 'data', 'analysis'
    ]
    
    found = [w for w in tokens if w in skill_keywords]
    return list(set(found))

# ----------------------------
# 4. Apply skill extraction
# ----------------------------
df['cv_skills'] = df['resume_text'].apply(extract_skills)
df['jd_skills'] = df['job_description_text'].apply(extract_skills)

# ----------------------------
# 5. Calculate overlap and missing
# ----------------------------
def skill_overlap(cv_skills, jd_skills):
    cv_set = set(cv_skills)
    jd_set = set(jd_skills)
    if not jd_set:
        return 0, [], []
    
    overlap = cv_set.intersection(jd_set)
    missing = jd_set - cv_set
    
    overlap_ratio = len(overlap) / len(jd_set)
    return overlap_ratio, list(overlap), list(missing)

tqdm.pandas()
df['skill_overlap_ratio'], df['common_skills'], df['missing_skills'] = zip(*df.progress_apply(
    lambda row: skill_overlap(row['cv_skills'], row['jd_skills']), axis=1
))

# ----------------------------
# 6. Display results
# ----------------------------
pd.set_option('display.max_colwidth', 120)
print(df[['cv_skills', 'jd_skills', 'skill_overlap_ratio', 'common_skills', 'missing_skills']].head(10))


100%|████████████████████████████████████| 6241/6241 [00:00<00:00, 28601.14it/s]

                                           cv_skills  \
0                                            [excel]   
1                 [analysis, communication, machine]   
2                                                 []   
3                       [communication, data, excel]   
4  [communication, analysis, data, excel, sql, deep]   
5                    [analysis, communication, data]   
6                                  [analysis, excel]   
7                    [communication, learning, data]   
8             [analysis, deep, communication, excel]   
9                         [sql, communication, data]   

                      jd_skills  skill_overlap_ratio    common_skills  \
0                  [sql, excel]             0.500000          [excel]   
1  [communication, data, excel]             0.333333  [communication]   
2       [python, communication]             0.000000               []   
3               [communication]             1.000000  [communication]   
4    [ai, deep, da




In [5]:
import pandas as pd
import re
from tqdm import tqdm

# ----------------------------
# 1. Load dataset
# ----------------------------
df = pd.read_csv("train.csv")

# ----------------------------
# 2. Clean text helper
# ----------------------------
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

# ----------------------------
# 3. Parse skills column safely
# ----------------------------
def parse_skills(skills):
    """
    يحاول يقرأ عمود skills سواء مكتوب كـ list أو نص
    """
    if isinstance(skills, str):
        # لو مكتوبة كـ string list زي ['python','sql']
        skills = re.findall(r"[a-zA-Z+#]+", skills.lower())
    elif isinstance(skills, list):
        skills = [s.lower() for s in skills]
    else:
        return []
    return list(set(skills))

# ----------------------------
# 4. Clean skills from dataset
# ----------------------------
df['cv_skills'] = df['cv_skills'].apply(parse_skills) if 'cv_skills' in df.columns else df['resume_text'].apply(lambda x: re.findall(r"[a-zA-Z+#]+", str(x).lower()))
df['jd_skills'] = df['jd_skills'].apply(parse_skills) if 'jd_skills' in df.columns else df['job_description_text'].apply(lambda x: re.findall(r"[a-zA-Z+#]+", str(x).lower()))

# ----------------------------
# 5. Skill overlap function
# ----------------------------
def skill_overlap(cv_skills, jd_skills):
    cv_set = set(cv_skills)
    jd_set = set(jd_skills)
    if not jd_set:
        return 0, [], []
    
    overlap = cv_set.intersection(jd_set)
    missing = jd_set - cv_set
    
    overlap_ratio = len(overlap) / len(jd_set)
    return overlap_ratio, list(overlap), list(missing)

# ----------------------------
# 6. Apply to dataset
# ----------------------------
tqdm.pandas()
df['skill_overlap_ratio'], df['common_skills'], df['missing_skills'] = zip(*df.progress_apply(
    lambda row: skill_overlap(row['cv_skills'], row['jd_skills']), axis=1
))

# ----------------------------
# 7. Display results
# ----------------------------
pd.set_option('display.max_colwidth', 120)
print(df[['cv_skills', 'jd_skills', 'skill_overlap_ratio', 'common_skills', 'missing_skills']].head(10))


100%|█████████████████████████████████████| 6241/6241 [00:00<00:00, 6258.30it/s]


                                                                                                                 cv_skills  \
0  [summaryhighly, motivated, sales, associate, with, extensive, customer, service, and, sales, experience, outgoing, s...   
1  [professional, summarycurrently, working, with, caterpillar, as, an, contract, employee, active, in, several, npi, a...   
2  [summaryi, started, my, construction, career, in, june, of, in, jacksonville, florida, as, an, electrical, apprentic...   
3  [summarycertified, electrical, foremanwith, thirteen, years, of, experience, in, the, implementation, of, electrical...   
4  [summarywith, extensive, experience, in, business, requirement, analysis, with, sdlc, processing, experience, includ...   
5  [summarysolution, oriented, results, driven, strategic, sales, executive, with, a, proven, track, record, in, genera...   
6  [summarya, position, in, a, company, that, will, utilize, my, abilities, in, accounting, and, finance, while, provi