In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re, warnings
warnings.filterwarnings("ignore")

# ======================================================
# Model: Babelscape/wikineural-multilingual-ner
#   - Based on XLM-RoBERTa-large
#   - Trained on WikiNEuRal (9 languages incl. FR + EN)
#   - Significantly better cross-lingual NER than CoNLL03
# ======================================================
MODEL_NAME = "Babelscape/wikineural-multilingual-ner"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model     = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

nlp = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="first",  # cleaner span merging than "simple"
    stride=64,                     # 64-token overlap between windows ‚Äî no missed entities
)
print(f"Model loaded : {MODEL_NAME}")
print(f"Labels       : {model.config.id2label}")


Device set to use cpu


Model loaded : Babelscape/wikineural-multilingual-ner
Labels       : {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}


In [4]:

# ======================================================
# CV text  (French or English ‚Äî model handles both)
# ======================================================
text_fr = """
Mohammed JADIR

√âtudiant Ing√©nieur en Data Science & Intelligence Artificielle

+212 6 82 84 65 80
mohamad.jadir2018@gmail.com
GitHub : github.com/Jadir99
LinkedIn : mohammed-jadir
Portfolio : jadir-mohammed.com

FORMATION

Diplome d'Ing√©nieur - Data Science, Big Data & Intelligence Artificielle
√âcole Nationale Sup√©rieure de l'Intelligence Artificielle et Sciences des Donn√©es (ENSIASD), Taroudant
2023 - Pr√©sent

Diplome Universitaire de Technologie (DUT) - G√©nie Informatique
√âcole Sup√©rieure de Technologie (EST), Universit√© Cadi Ayyad, Safi
2021 - 2023

EXP√âRIENCE PROFESSIONNELLE

Stagiaire Data Science - Banque Centrale Populaire (Stage PFA)
Juillet 2025 - Septembre 2025
D√©veloppement d'un mod√®le hybride CamemBERT + Random Forest pour la classification automatique des r√©clamations clients.
Conception de dashboards interactifs avec Power BI pour l'aide √† la d√©cision.
Analyse et traitement des donn√©es avec Pandas et PostgreSQL.
D√©veloppement et d√©ploiement d'API avec Flask.

Stagiaire Data Science - YaneCode
Juillet 2024 - Ao√ªt 2024
D√©veloppement d'une plateforme e-learning intelligente int√©grant un syst√®me de recommandation bas√© sur l'IA.
Utilisation de Laravel, Flask, TensorFlow, Gemini LLM, Scikit-learn, NumPy et Pandas.
Conception de l'architecture backend et entra√Ænement des mod√®les de recommandation.
Int√©gration des algorithmes IA dans l'application web.

PROJETS

SmartCourseQA - Chatbot Intelligent & G√©n√©rateur de Quiz
github.com/Jadir99/SmartCourseQA
D√©veloppement d'une application RAG hybride (FAISS, BM25, GPT-4o-mini, multilingual-e5-large).
Chatbot acad√©mique, g√©n√©ration automatique de QCM et syst√®me d'√©valuation.
Backend d√©velopp√© avec Flask.

Syst√®me de Questions-R√©ponses sur Documents PDF
github.com/Jadir99/RAG-Based-Document-Question-Answering-System
Impl√©mentation d'un syst√®me RAG bas√© sur SentenceTransformers, FAISS et GPT-4o-mini.
Interface interactive avec affichage des r√©f√©rences de pages et score de confiance.

AZUL-AI - Plateforme Touristique IA
azulaimaroc.com
Transformation de l'exp√©rience touristique au Maroc via l'IA g√©n√©rative.
Int√©gration de Gemini LLM, backend en Flask et API consomm√©e en JavaScript.

Application de Visualisation - League of Legends
github.com/Jadir99/league-of-legend-prediction-win
Analyse et visualisation des donn√©es joueurs avec SQL, Django, Pandas, NumPy et Chart.js.

Amazon Product Sentiment Analysis
github.com/Jadir99/Sentiment-analysis-from-product-reviews
Scraping automatis√© via Selenium.
Analyse de sentiments avec RoBERTa.
Visualisations avec Matplotlib, Seaborn et WordCloud.

COMP√âTENCES TECHNIQUES

Machine Learning / IA
Python, TensorFlow, PyTorch, Scikit-learn, Pandas, NumPy, Matplotlib, Seaborn, NLP, Power BI

D√©veloppement & Outils
Laravel, Flask, Django, Hadoop, MapReduce, Java, Git, GitHub, Docker, Linux, n8n

Bases de donn√©es
MySQL, SQLite, PostgreSQL, MongoDB

CERTIFICATIONS

365 Data Science - Deep Learning with TensorFlow 2
Oracle Cloud Infrastructure 2025 Certified Generative AI Professional
CCNAv7 - Introduction to Networks

LANGUES

Arabe : Langue maternelle
Fran√ßais : Interm√©diaire
Anglais : Interm√©diaire

ACTIVIT√âS PARASCOLAIRES

Membre principal - Google Developer Group (GDG), Campus Universiapolis (2024-2025)
Pr√©sident - Club Nakama (Culture asiatique) (2023-2024)
Pr√©sident - Chess Club (2022-2023)
"""

text_en = """
Mohammed JADIR
Engineering Student in Data Science & Artificial Intelligence
+212 6 82 84 65 80 | mohamad.jadir2018@gmail.com
GitHub: github.com/Jadir99 | LinkedIn: mohammed-jadir | Portfolio: jadir-mohammed.com

EDUCATION
Engineering Degree in Data Science, Big Data & Artificial Intelligence
National School of Artificial Intelligence and Data Science (ENSIASD), Taroudant
2023 - Present

University Diploma of Technology (DUT) in Computer Engineering
Higher School of Technology (EST), Cadi Ayyad University, Safi
2021 - 2023

PROFESSIONAL EXPERIENCE
Data Science Intern - Banque Centrale Populaire (PFA Internship)
July 2025 - September 2025
Developed a hybrid CamemBERT + Random Forest model for automatic customer complaint classification.
Designed interactive dashboards using Power BI to support decision-making.
Processed and analyzed data using Pandas and PostgreSQL.
Built and deployed APIs using Flask.

Data Science Intern - YaneCode
July 2024 - August 2024
Developed an intelligent e-learning platform integrating an AI-based recommendation system.
Used Laravel, Flask, TensorFlow, Gemini LLM, Scikit-learn, NumPy, and Pandas.
Designed backend architecture and trained recommendation models.
Integrated AI algorithms into the web application.

PROJECTS
SmartCourseQA - Intelligent Chatbot & Quiz Generator
github.com/Jadir99/SmartCourseQA
Developed a hybrid RAG application (FAISS, BM25, GPT-4o-mini, multilingual-e5-large).
Academic chatbot with automatic MCQ generation and evaluation system.

RAG-Based Document Question Answering System
github.com/Jadir99/RAG-Based-Document-Question-Answering-System
Implemented a RAG-based QA system using SentenceTransformers, FAISS, and GPT-4o-mini.

AZUL-AI - AI-Powered Tourism Platform
azulaimaroc.com
Enhanced the tourism experience in Morocco using generative AI.
Integrated Gemini LLM, Flask backend, and JavaScript API consumption.

League of Legends Data Visualization Application
github.com/Jadir99/league-of-legend-prediction-win
Data analysis and visualization using SQL, Django, Pandas, NumPy, and Chart.js.

Amazon Product Sentiment Analysis
github.com/Jadir99/Sentiment-analysis-from-product-reviews
Automated product review extraction using Selenium.
Sentiment analysis using RoBERTa.
Data visualization with Matplotlib, Seaborn, and WordCloud.

TECHNICAL SKILLS
Machine Learning & AI
Python, TensorFlow, PyTorch, Scikit-learn, Pandas, NumPy, Matplotlib, Seaborn, NLP, Power BI

Development & Tools
Laravel, Flask, Django, Hadoop, MapReduce, Java, Git, GitHub, Docker, Linux, n8n

Databases
MySQL, SQLite, PostgreSQL, MongoDB

CERTIFICATIONS
365 Data Science - Deep Learning with TensorFlow 2
Oracle Cloud Infrastructure 2025 - Certified Generative AI Professional
CCNAv7 - Introduction to Networks

LANGUAGES
Arabic: Native
French: Intermediate
English: Intermediate

EXTRACURRICULAR ACTIVITIES
Core Member - Google Developer Group (GDG), Universiapolis Campus (2024-2025)
President - Nakama Club (Asian Culture Club) (2023-2024)
President - Chess Club (2022-2023)
"""

print("Texts loaded. FR words:", len(text_fr.split()), "| EN words:", len(text_en.split()))


Texts loaded. FR words: 406 | EN words: 365


In [5]:

# ======================================================
# Preprocessing: clean text for the transformer
# - Remove emojis & special bullets
# - Normalize dashes and whitespace
# - Keep structure (newlines) so sentence boundaries are clear
# ======================================================
def preprocess(text):
    EMOJI = re.compile(
        "[" + u"\U0001F300-\U0001FAFF" + u"\U00002300-\U000027BF" + "]+",
        flags=re.UNICODE,
    )
    text = EMOJI.sub(" ", text)
    text = re.sub(r"[üîπüî∏‚óè‚ñ™‚ñ∏‚ñ∫‚Ä¢‚ó¶]", " ", text)
    text = text.replace("‚Äì", "-").replace("‚Äî", "-")
    text = re.sub(r"[ \t]{2,}", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


# ======================================================
# Chunked NER:  split into ~400-word chunks with 40-word
# overlap so long CVs are processed without truncation.
# ======================================================
def extract_ner(text, pipeline_fn, chunk_words=300, overlap=40):
    words = text.split()
    chunks, i = [], 0
    while i < len(words):
        chunks.append(" ".join(words[i : i + chunk_words]))
        i += chunk_words - overlap

    all_ents = []
    for chunk in chunks:
        if chunk.strip():
            all_ents.extend(pipeline_fn(chunk))
    return all_ents


# ======================================================
# CV-aware label mapping
# XLM-R Wikineural labels: PER, ORG, LOC, MISC -> richer CV categories
# ======================================================
NER_TO_CV = {
    "PER":  "Person / Candidate",
    "ORG":  "Company / Organization / University",
    "LOC":  "Location / City / Country",
    "MISC": "Miscellaneous",
}

def map_to_cv(entities):
    out = {v: [] for v in NER_TO_CV.values()}
    for e in entities:
        label = e["entity_group"]
        cv_label = NER_TO_CV.get(label, "Miscellaneous")
        out[cv_label].append((e["word"].strip(), round(e["score"], 3)))
    return out


# ======================================================
# Merge: case-insensitive dedup, keep highest score
# ======================================================
def dedup(entities_by_label):
    result = {}
    for label, items in entities_by_label.items():
        seen = {}
        for word, score in items:
            key = word.lower()
            if key not in seen or score > seen[key][1]:
                seen[key] = (word, score)
        result[label] = sorted(seen.values(), key=lambda x: -x[1])
    return result


print("Preprocessing & helper functions defined.")


Preprocessing & helper functions defined.


In [9]:

# ======================================================
# Rule-based extractor  (supplements transformer NER)
# ======================================================

TECH_SKILLS = [
    "Python","TensorFlow","PyTorch","Scikit-learn","Pandas","NumPy","Matplotlib",
    "Seaborn","NLP","Power BI","Flask","Django","Laravel","Git","GitHub","Docker",
    "Linux","MySQL","SQLite","PostgreSQL","MongoDB","Hadoop","MapReduce","Java",
    "SQL","FAISS","BM25","SentenceTransformers","RoBERTa","CamemBERT","Selenium",
    "JavaScript","Chart.js","n8n","Gemini LLM","GPT-4o-mini","RAG","REST API",
    "Keras","XGBoost","Spark","FastAPI","Random Forest","Machine Learning",
    "Deep Learning","Generative AI","LLM","Jupyter","Power BI","WordCloud",
    "Matplotlib","TF-IDF","BERT","Transformer","Hugging Face","OpenAI",
]

SOFT_SKILLS = [
    "Leadership","Teamwork","Communication","Problem-solving","Adaptability",
    "Creativity","Critical thinking","Time management","Autonomy","Rigour",
]

CERTIFICATIONS = [
    "365 Data Science","Deep Learning with TensorFlow 2",
    "Oracle Cloud Infrastructure 2025 Certified Generative AI Professional",
    "Oracle Cloud Infrastructure 2025","CCNAv7","Introduction to Networks",
]

LANGUAGES_KNOWN = [
    "Arabic","French","English","Spanish","Amazigh",
    "Arabe","Fran√ßais","Anglais",
    "Native","Intermediate","Beginner","Fluent","Bilingue","Interm√©diaire",
    "Langue maternelle",
]

EDUCATION_KW = [
    "Engineering Degree","Master","Bachelor","DUT",
    "Diplome d'Ing√©nieur","Diplome Universitaire de Technologie",
    "Baccalaureate","ENSIASD","EST","Cadi Ayyad",
]

EXTRACURRICULAR_KW = [
    "Google Developer Group","GDG","Nakama Club","Chess Club",
    "President","Vice-President","Pr√©sident","Universiapolis",
]

# -------------------------------------------------------
# Regex patterns for personal information
# -------------------------------------------------------
REGEX_PATTERNS = {
    "Phone":    r'\+?[\d\s\-().]{9,}',
    "Email":    r'[\w.\-]+@[\w.\-]+\.\w{2,}',
    "GitHub":   r'github\.com/[\w\-/]+',
    "LinkedIn": r'linkedin\.com/in/[\w\-]+',
    "Website":  r'(?:https?://)?(?:www\.)?[\w\-]+\.(?:com|ma|io|dev|ai|org|net)/[\w\-/]*',
}

# Detect person name: first non-empty line that looks like "FIRSTNAME LASTNAME"
NAME_RE = re.compile(r'^[A-Z√Å√Ä√Ç√â√à√ä√ã√é√è√î√ô√õ√ú√á√ë][a-z√°√†√¢√©√®√™√´√Æ√Ø√¥√π√ª√º√ß√±]+(?:\s+[A-Z√Å√Ä√Ç√â√à√ä√ã√é√è√î√ô√õ√ú√á√ë][A-Z√Å√Ä√Ç√â√à√ä√ã√é√è√î√ô√õ√ú√á√ëa-z√°√†√¢√©√®√™√´√Æ√Ø√¥√π√ª√º√ß√±]+)+$')


def rule_based_extract(text):
    found = {
        "Person / Name":         [],
        "Contact Information":   [],
        "Technical Skills":      [],
        "Soft Skills":           [],
        "Certifications":        [],
        "Languages":             [],
        "Education Keywords":    [],
        "Extracurricular":       [],
    }
    t = text.lower()

    # --- Person name (first line matching pattern) ---
    for line in text.splitlines():
        line = line.strip()
        if line and NAME_RE.match(line):
            found["Person / Name"].append((line, 1.0))
            break   # only grab the first match (resume owner)

    # --- Contact info via regex ---
    for field, pattern in REGEX_PATTERNS.items():
        for m in re.finditer(pattern, text):
            val = m.group().strip()
            if len(val) > 4:
                found["Contact Information"].append((f"{field}: {val}", 1.0))

    # --- Tech skills ---
    for kw in TECH_SKILLS:
        if kw.lower() in t:
            found["Technical Skills"].append((kw, 1.0))

    # --- Soft skills ---
    for kw in SOFT_SKILLS:
        if kw.lower() in t:
            found["Soft Skills"].append((kw, 1.0))

    # --- Certifications ---
    for kw in CERTIFICATIONS:
        if kw.lower() in t:
            found["Certifications"].append((kw, 1.0))

    # --- Languages ---
    for kw in LANGUAGES_KNOWN:
        if kw.lower() in t:
            found["Languages"].append((kw, 1.0))

    # --- Education keywords ---
    for kw in EDUCATION_KW:
        if kw.lower() in t:
            found["Education Keywords"].append((kw, 1.0))

    # --- Extracurricular ---
    for kw in EXTRACURRICULAR_KW:
        if kw.lower() in t:
            found["Extracurricular"].append((kw, 1.0))

    # Dedup each category
    for key in found:
        seen = {}
        for word, score in found[key]:
            k = word.lower()
            if k not in seen:
                seen[k] = (word, score)
        found[key] = list(seen.values())

    return found


print("Rule-based extractor (v2 ‚Äî with regex personal info) defined.")


Rule-based extractor (v2 ‚Äî with regex personal info) defined.


In [10]:

# ======================================================
# Display helper
# ======================================================
def display_cv_results(ner_results, rule_results, lang="FR"):
    W = 68
    print(f"\n{'='*W}")
    print(f"  CV EXTRACTION RESULTS ‚Äî {lang}")
    print(f"{'='*W}")

    # --- Transformer NER ---
    print(f"\n{'‚îÄ'*W}")
    print("  [XLM-RoBERTa NER ‚Äî transformer entities]")
    print(f"{'‚îÄ'*W}")
    total_ner = 0
    for category, items in ner_results.items():
        if items:
            print(f"\n  [{category}]  ({len(items)} entities)")
            for word, score in items[:20]:   # cap at 20 to keep output readable
                bar = "‚ñà" * int(score * 10) + "‚ñë" * (10 - int(score * 10))
                print(f"    ‚Ä¢ {word:<42} {bar} {score:.3f}")
            if len(items) > 20:
                print(f"    ... (+{len(items)-20} more)")
            total_ner += len(items)
    print(f"\n  NER entities total: {total_ner}")

    # --- Rule-based ---
    print(f"\n{'‚îÄ'*W}")
    print("  [Rule-based extraction ‚Äî keywords + regex]")
    print(f"{'‚îÄ'*W}")
    total_rule = 0
    for category, items in rule_results.items():
        if items:
            print(f"\n  [{category}]  ({len(items)} entities)")
            for word, _ in items:
                print(f"    ‚Ä¢ {word}")
            total_rule += len(items)
    print(f"\n  Rule-based entities total: {total_rule}")

    print(f"\n{'='*W}")
    print(f"  Grand total ({lang}): {total_ner + total_rule} entities detected")
    print(f"{'='*W}\n")


# ======================================================
# Run full pipeline
# ======================================================
for lang, raw_text in [("FR", text_fr), ("EN", text_en)]:
    clean     = preprocess(raw_text)
    ner_cvmap = dedup(map_to_cv(extract_ner(clean, nlp)))
    rule_ents = rule_based_extract(clean)
    display_cv_results(ner_cvmap, rule_ents, lang=lang)



  CV EXTRACTION RESULTS ‚Äî FR

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
  [XLM-RoBERTa NER ‚Äî transformer entities]
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

  [Company / Organization / University]  (12 entities)
    ‚Ä¢ √âcole Nationale Sup√©rieure de l ' Intelligence Artificielle et Sciences des Donn√©es ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë 0.914
    ‚Ä¢ Big Data & Intelligence Artificielle       ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë 0.767
    ‚Ä¢ Centrale Populaire                         ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë 0.702
    ‚Ä¢ Google Developer Group                     ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë 0.700
    ‚Ä¢ √âcole Sup√©rieure de Technologie            ‚ñà‚ñà‚

In [11]:

# ======================================================
# Summary table: entity counts per source + category
# ======================================================
import pandas as pd

rows = []
for lang, raw_text in [("FR", text_fr), ("EN", text_en)]:
    clean   = preprocess(raw_text)
    ner_map = dedup(map_to_cv(extract_ner(clean, nlp)))
    rule    = rule_based_extract(clean)

    for cat, items in ner_map.items():
        rows.append({"Lang": lang, "Source": "XLM-R NER",  "Category": cat, "Count": len(items)})
    for cat, items in rule.items():
        rows.append({"Lang": lang, "Source": "Rule-based", "Category": cat, "Count": len(items)})

df    = pd.DataFrame(rows)
pivot = (
    df.pivot_table(index=["Source", "Category"], columns="Lang", values="Count", aggfunc="sum")
      .fillna(0).astype(int)
)
print(pivot.to_string())
ner_total  = df[df.Source == "XLM-R NER"]["Count"].sum()
rule_total = df[df.Source == "Rule-based"]["Count"].sum()
print(f"\nXLM-R NER   total : {ner_total}")
print(f"Rule-based  total : {rule_total}")
print(f"Grand total        : {ner_total + rule_total}")


Lang                                            EN  FR
Source     Category                                   
Rule-based Certifications                        5   6
           Contact Information                  16  16
           Education Keywords                    5   6
           Extracurricular                       6   5
           Languages                             5   5
           Person / Name                         1   1
           Soft Skills                           0   0
           Technical Skills                     45  45
XLM-R NER  Company / Organization / University  17  12
           Location / City / Country             4   3
           Miscellaneous                        72  75
           Person / Candidate                    0   0

XLM-R NER   total : 183
Rule-based  total : 167
Grand total        : 350
