In [3]:
import spacy
from spacy.training.example import Example
import csv
import random
import os

def train_and_save_spacy_model(output_dir="TrainedModel/test2", iterations=20):
    nlp = spacy.blank("en")  # Create a blank English model

    # Add NER pipeline
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label("SKILL")

    # Load training data from CSV
    TRAIN_DATA = []
    with open('newSkills.csv', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            skill_text = row[0].strip()
            if skill_text:
                doc = nlp.make_doc(skill_text)
                entities = [(0, len(skill_text), "SKILL")]
                TRAIN_DATA.append((doc, {"entities": entities}))

    # Begin training
    optimizer = nlp.begin_training()

    for itn in range(iterations):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            example = Example.from_dict(text, annotations)
            nlp.update([example], drop=0.5, losses=losses, sgd=optimizer)
        print("Iteration:", itn+1, "Loss:", losses)

    # Save model
    os.makedirs(output_dir, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Trained model saved to:", output_dir)

    return nlp

# Train the model
trained_model = train_and_save_spacy_model()

Iteration: 1 Loss: {'ner': 599.632755397381}
Iteration: 2 Loss: {'ner': 26.03317319958548}
Iteration: 3 Loss: {'ner': 27.755264260488303}
Iteration: 4 Loss: {'ner': 0.21524103076839743}
Iteration: 5 Loss: {'ner': 13.49837215350828}
Iteration: 6 Loss: {'ner': 27.36020478327103}
Iteration: 7 Loss: {'ner': 16.683412254806782}
Iteration: 8 Loss: {'ner': 12.565058419044357}
Iteration: 9 Loss: {'ner': 1.4472368077383332e-08}
Iteration: 10 Loss: {'ner': 2.1265517465164298e-09}
Iteration: 11 Loss: {'ner': 6.2675624934271495e-06}
Iteration: 12 Loss: {'ner': 3.4346597747373585e-07}
Iteration: 13 Loss: {'ner': 4.2936852514971005}
Iteration: 14 Loss: {'ner': 24.168548929549946}
Iteration: 15 Loss: {'ner': 5.852141195661485}
Iteration: 16 Loss: {'ner': 21.30412135407271}
Iteration: 17 Loss: {'ner': 6.235532081594258}
Iteration: 18 Loss: {'ner': 6.523334224614372e-08}
Iteration: 19 Loss: {'ner': 2.0726303317001156e-07}
Iteration: 20 Loss: {'ner': 6.616554983219406e-09}
Trained model saved to: Traine

In [5]:
import spacy
import random
import os
from spacy.training.example import Example

# Create blank English model
nlp = spacy.blank("en")

# Add NER to the pipeline
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")


UPDATED_TRAIN_DATA = [
    ("Proficient in Python, Java, and C++", {"entities": [(13, 19, "SKILL"), (21, 25, "SKILL"), (30, 33, "SKILL")]}),
    ("Experience with machine learning algorithms", {"entities": [(12, 28, "SKILL")]}),
    ("Familiar with TensorFlow and PyTorch", {"entities": [(13, 24, "SKILL"), (29, 36, "SKILL")]}),
    ("Strong understanding of HTML, CSS, and JavaScript", {"entities": [(19, 22, "SKILL"), (24, 27, "SKILL"), (32, 41, "SKILL")]}),
    ("Expertise in data analysis and visualization using Tableau", {"entities": [(20, 30, "SKILL"), (35, 46, "SKILL")]}),
    ("Skilled in SQL database management", {"entities": [(10, 13, "SKILL"), (24, 35, "SKILL")]}),
    ("Knowledge of React and Angular frameworks", {"entities": [(12, 17, "SKILL"), (22, 29, "SKILL")]}),
    ("Proficiency in MATLAB for numerical computing", {"entities": [(14, 19, "SKILL"), (28, 44, "SKILL")]}),
    ("Experience with cloud technologies like AWS and Azure", {"entities": [(17, 20, "SKILL"), (25, 30, "SKILL")]}),
    ("Expertise in natural language processing and NLP techniques", {"entities": [(20, 46, "SKILL"), (51, 54, "SKILL")]}),
    ("Familiarity with Git version control system", {"entities": [(16, 18, "SKILL"), (30, 36, "SKILL")]}),
    ("Knowledgeable in DevOps practices and CI/CD pipelines", {"entities": [(14, 20, "SKILL"), (37, 41, "SKILL")]}),
    ("Proficient in Java Spring and Hibernate frameworks", {"entities": [(14, 24, "SKILL"), (29, 38, "SKILL")]}),
    ("Expert in Linux kernel and system administration", {"entities": [(5, 10, "SKILL"), (17, 26, "SKILL"), (33, 53, "SKILL")]}),
    ("Advanced networking skills - configuring routers, switches, firewalls", {"entities": [(8, 22, "SKILL"), (25, 41, "SKILL"), (44, 54, "SKILL"), (57, 66, "SKILL")]}), 
    ("Experience setting up Kubernetes clusters on AWS and GCP", {"entities": [(17, 28, "SKILL"), (37, 57, "SKILL"), (61, 64, "SKILL")]}),
    ("Skilled software developer with 5 years building scalable web apps", {"entities": [(10, 26, "SKILL"), (52, 68, "SKILL"), (73, 83, "SKILL")]}),
    ("Proficiency in Java, Python, C++, JavaScript, and Golang", {"entities": [(14, 19, "SKILL"), (22, 28, "SKILL"), (31, 34, "SKILL"), (37, 47, "SKILL"), (52, 58, "SKILL")]}),
    ("Expertise in full stack development using MongoDB, Express, React, Node", {"entities": [(20, 33, "SKILL"), (44, 51, "SKILL"), (55, 61, "SKILL"), (65, 70, "SKILL"), (74, 78, "SKILL")]}),
    ("Experience with machine learning libraries PyTorch, TensorFlow, Keras", {"entities": [(17, 45, "SKILL"), (50, 60, "SKILL"), (65, 75, "SKILL"), (80, 85, "SKILL")]}),
    ("Skilled in CI/CD pipelines, GitLab, Jenkins, Bamboo, CircleCI", {"entities": [[10, 24, "SKILL"], [29, 35, "SKILL"], [40, 47, "SKILL"], [52, 59, "SKILL"], [64, 72, "SKILL"]]}),
    # Additional examples with skill-related keywords
    ("SQL", {"entities": [(0, 3, "SKILL")]}),
    ("JavaScript", {"entities": [(0, 10, "SKILL")]}),
    ("Machine Learning", {"entities": [(0, 16, "SKILL")]}),
    ("Data Analysis", {"entities": [(0, 13, "SKILL")]}),
    ("React.js", {"entities": [(0, 8, "SKILL")]}),
    ("AngularJS", {"entities": [(0, 9, "SKILL")]}),
    ("Node.js", {"entities": [(0, 7, "SKILL")]}),
    ("MongoDB", {"entities": [(0, 7, "SKILL")]}),
    ("AWS Cloud", {"entities": [(0, 8, "SKILL")]}),
    ("Azure Cloud", {"entities": [(0, 11, "SKILL")]}),
    ("Statistical Modeling", {"entities": [(0, 20, "SKILL")]}),
    ("Linux operating system", {"entities": [(0, 5, "SKILL")]}),
    ("Windows Server administration", {"entities": [(0, 6, "SKILL"), (17, 28, "SKILL")]}),
    ("Network configuration and troubleshooting", {"entities": [(0, 8, "SKILL"), (25, 42, "SKILL")]}),
    ("TCP/IP, OSI model", {"entities": [(0, 7, "SKILL"), (12, 17, "SKILL")]}), 
    ("Routing protocols like OSPF, BGP", {"entities": [(0, 15, "SKILL"), (22, 25, "SKILL"), (30, 33, "SKILL")]}),
    ("Cisco switching and routing", {"entities": [(0, 5, "SKILL"), (13, 19, "SKILL")]}),
    ("VPN configuration", {"entities": [(0, 13, "SKILL")]}),
    ("Firewall administration", {"entities": [(0, 9, "SKILL"), (17, 28, "SKILL")]}),
    ("Network security", {"entities": [(0, 14, "SKILL")]}),
    ("Penetration testing", {"entities": [(0, 19, "SKILL")]}),
    ("Burp Suite", {"entities": [(0, 9, "SKILL")]}),
    ("Wireshark network analysis", {"entities": [(0, 9, "SKILL"), (17, 32, "SKILL")]}),
    ("CCNA certification", {"entities": [(0, 10, "SKILL")]}), 
    ("VMware administration", {"entities": [(0, 6, "SKILL"), (17, 28, "SKILL")]}),
    ("SAN storage configuration", {"entities": [(0, 3, "SKILL"), (14, 32, "SKILL")]}),
    ("NAS storage administration", {"entities": [(0, 3, "SKILL"), (15, 33, "SKILL")]}),
    ("RAID arrays", {"entities": [(0, 7, "SKILL")]}),
    ("Docker containerization", {"entities": [(0, 6, "SKILL"), (17, 32, "SKILL")]}),
    ("Kubernetes", {"entities": [(0, 10, "SKILL")]}),
    ("Jenkins CI/CD pipelines", {"entities": [(0, 7, "SKILL"), (17, 30, "SKILL")]}), 
    ("Ansible automation", {"entities": [(0, 6, "SKILL"), (17, 27, "SKILL")]}),
    ("Terraform infrastructure-as-code", {"entities": [(0, 9, "SKILL"), (18, 38, "SKILL")]}),
    ("Azure administration", {"entities": [(0, 6, "SKILL"), (17, 29, "SKILL")]}),
    ("AWS cloud architecture", {"entities": [(0, 3, "SKILL"), (13, 29, "SKILL")]}),
    ("Google Cloud Platform", {"entities": [(0, 22, "SKILL")]}),
    ("DevOps culture and practices ", {"entities": [(0, 6, "SKILL"), (18, 34, "SKILL")]}),
    ("Agile development methodologies", {"entities": [(0, 5, "SKILL"), (20, 42, "SKILL")]}),
    ("Waterfall SDLC", {"entities": [(0, 8, "SKILL"), (13, 16, "SKILL")]}),
    ("Object-oriented analysis and design", {"entities": [(0, 25, "SKILL"), (31, 36, "SKILL")]}), 
    ("SQL database programming", {"entities": [(0, 3, "SKILL"), (14, 28, "SKILL")]}),
    ("Oracle database administration", {"entities": [(0, 6, "SKILL"), (17, 35, "SKILL")]}),
    ("MongoDB NoSQL databases", {"entities": [(0, 7, "SKILL"), (15, 29, "SKILL")]}),
    ("Redis in-memory caching", {"entities": [(0, 5, "SKILL"), (17, 29, "SKILL")]}),
    ("Data modeling and warehousing", {"entities": [(0, 15, "SKILL"), (21, 32, "SKILL")]}), 
    ("ETL processing pipelines", {"entities": [(0, 3, "SKILL"), (15, 31, "SKILL")]}),
    ("Hadoop cluster configuration", {"entities": [(0, 6, "SKILL"), (20, 38, "SKILL")]}),
    ("Spark big data processing", {"entities": [(0, 5, "SKILL"), (16, 30, "SKILL")]}),
    ("Tableau data visualization", {"entities": [(0, 7, "SKILL"), (17, 33, "SKILL")]}), 
    ("Power BI business analytics", {"entities": [(0, 8, "SKILL"), (17, 33, "SKILL")]}),
    ("Python programming", {"entities": [(0, 6, "SKILL"), (17, 28, "SKILL")]}), 
    ("Java Spring Boot framework", {"entities": [(0, 4, "SKILL"), (10, 26, "SKILL")]}),  
    ("PHP web application development", {"entities": [(0, 3, "SKILL"), (14, 37, "SKILL")]}),
    ("Ruby on Rails web framework", {"entities": [(0, 3, "SKILL"), (10, 27, "SKILL")]}),
    ("JavaScript front-end development", {"entities": [(0, 10, "SKILL"), (22, 40, "SKILL")]}),
    ("React web applications", {"entities": [(0, 5, "SKILL"), (16, 30, "SKILL")]}),
    ("Angular single page applications", {"entities": [(0, 7, "SKILL"), (17, 37, "SKILL")]}),
    ("Node.js back-end services", {"entities": [(0, 7, "SKILL"), (17, 29, "SKILL")]}),
    ("REST API design and development", {"entities": [(0, 10, "SKILL"), (28, 47, "SKILL")]}),
    ("GraphQL API development", {"entities": [(0, 7, "SKILL"), (17, 31, "SKILL")]}),
    ("Unit testing frameworks like JUnit", {"entities": [(0, 15, "SKILL"), (24, 29, "SKILL")]}),
    ("UX design and usability", {"entities": [(0, 7, "SKILL"), (16, 27, "SKILL")]}),
    ("Git version control system", {"entities": [(0, 18, "SKILL"), (30, 36, "SKILL")]}),
    ("Continuous integration and delivery", {"entities": [(0, 28, "SKILL"), (36, 44, "SKILL")]}), 
    ("R language data analysis", {"entities": [(0, 11, "SKILL"), (20, 33, "SKILL")]}), 
    ("MATLAB numerical computing", {"entities": [(0, 6, "SKILL"), (17, 31, "SKILL")]}),
    ("C++ high performance programming", {"entities": [(0, 2, "SKILL"), (18, 37, "SKILL")]}),
    ("Multithreading and concurrency", {"entities": [(0, 15, "SKILL"), (22, 34, "SKILL")]}),
    ("Cryptography and encryption algorithms", {"entities": [(0, 13, "SKILL"), (24, 44, "SKILL")]}),
    ("Cybersecurity awareness ", {"entities": [(0, 14, "SKILL"), (26, 40, "SKILL")]}),
    ("Penetration testing and ethical hacking", {"entities": [(0, 23, "SKILL"), (31, 47, "SKILL")]}),
    ("Artificial intelligence and machine learning", {"entities": [(0, 25, "SKILL"), (35, 52, "SKILL")]}),
    ("Neural networks and deep learning", {"entities": [(0, 15, "SKILL"), (25, 39, "SKILL")]}), 
    ("Computer vision with OpenCV", {"entities": [(0, 16, "SKILL"), (26, 32, "SKILL")]}),
    ("Natural language processing techniques", {"entities": [(0, 33, "SKILL"), (46, 58, "SKILL")]}),
    ("Recommender systems algorithms", {"entities": [(0, 23, "SKILL"), (30, 42, "SKILL")]}),
    ("Python", {"entities": [(0, 6, "SKILL")]}),
    ("Java", {"entities": [(0, 4, "SKILL")]}),
    ("JavaScript", {"entities": [(0, 10, "SKILL")]}),
    ("TypeScript", {"entities": [(0, 10, "SKILL")]}),
    ("C++", {"entities": [(0, 3, "SKILL")]}),
    ("C#", {"entities": [(0, 2, "SKILL")]}),
    ("Go", {"entities": [(0, 2, "SKILL")]}),
    ("Ruby", {"entities": [(0, 4, "SKILL")]}),
    ("PHP", {"entities": [(0, 3, "SKILL")]}),
    ("Swift", {"entities": [(0, 5, "SKILL")]}),
    ("Rust", {"entities": [(0, 4, "SKILL")]}),
    ("Dart", {"entities": [(0, 4, "SKILL")]}),
    ("Kotlin", {"entities": [(0, 6, "SKILL")]}),
    ("SQL", {"entities": [(0, 3, "SKILL")]}),
    ("NoSQL", {"entities": [(0, 5, "SKILL")]}),
    ("C", {"entities": [(0, 1, "SKILL")]}),
    ("Scala", {"entities": [(0, 5, "SKILL")]}),
    ("Perl", {"entities": [(0, 4, "SKILL")]}),
    ("Haskell", {"entities": [(0, 7, "SKILL")]}),
    ("Bash", {"entities": [(0, 4, "SKILL")]}),
    ("Shell", {"entities": [(0, 5, "SKILL")]}),
    ("Cobol", {"entities": [(0,5, "SKILL")]}),
    ("Fortran", {"entities": [(0,7, "SKILL")]}),
    ("Visual Basic", {"entities": [(0,13, "SKILL")]}),
    ("Assembly", {"entities": [(0,9, "SKILL")]}),
    ("Pascal", {"entities": [(0,6, "SKILL")]}),
    ("Ada", {"entities": [(0,3, "SKILL")]}),
    ("ABAP", {"entities": [(0,4, "SKILL")]}), 
    ("RPG", {"entities": [(0,3, "SKILL")]}),
    ("Lisp", {"entities": [(0,4, "SKILL")]}),
    ("Prolog", {"entities": [(0,6, "SKILL")]}),
    ("F#", {"entities": [(0,2, "SKILL")]}),
    ("Lua", {"entities": [(0,3, "SKILL")]}),
    ("MATLAB", {"entities": [(0,6, "SKILL")]}),
    ("SAS", {"entities": [(0,3, "SKILL")]}),
    ("SPSS", {"entities": [(0,4, "SKILL")]}),
    ("R", {"entities": [(0,1, "SKILL")]}),
    ("Julia", {"entities": [(0,5, "SKILL")]}),
    ("Mahout", {"entities": [(0,6, "SKILL")]}), 
    ("Solr", {"entities": [(0,4, "SKILL")]}),
    ("Lucene", {"entities": [(0,6, "SKILL")]}),
    ("HBase", {"entities": [(0,4, "SKILL")]}),
    ("Cassandra", {"entities": [(0,9, "SKILL")]}), 
    ("Neo4j", {"entities": [(0,5, "SKILL")]}),
    ("Unix", {"entities": [(0,4, "SKILL")]}),
    ("Linux", {"entities": [(0,5, "SKILL")]}),
    ("Windows", {"entities": [(0,7, "SKILL")]}),
    ("MacOS", {"entities": [(0,5, "SKILL")]}),
    ("Android", {"entities": [(0,7, "SKILL")]}),
]

# Add labels to the NER component
for _, annotations in UPDATED_TRAIN_DATA:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])

# Disable other pipeline components
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for iteration in range(30):
        random.shuffle(UPDATED_TRAIN_DATA)
        losses = {}
        for text, annotations in UPDATED_TRAIN_DATA:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], sgd=optimizer, losses=losses)
        print(f"Iteration {iteration + 1} - Losses: {losses}")

# Save the model to disk
output_dir = "TrainedModel/test"  # Replace with your desired output directory
os.makedirs(output_dir, exist_ok=True)
nlp.to_disk(output_dir)
print("Model saved to:", output_dir)

Iteration 1 - Losses: {'ner': 85.03980814815755}
Iteration 2 - Losses: {'ner': 29.35014350329278}
Iteration 3 - Losses: {'ner': 6.024684175243379}
Iteration 4 - Losses: {'ner': 11.49478102215916}
Iteration 5 - Losses: {'ner': 9.761210468435332}
Iteration 6 - Losses: {'ner': 5.101833671680879}
Iteration 7 - Losses: {'ner': 3.364990013620455}
Iteration 8 - Losses: {'ner': 7.241660664727357}
Iteration 9 - Losses: {'ner': 6.037677697462498}
Iteration 10 - Losses: {'ner': 7.959434991282007}
Iteration 11 - Losses: {'ner': 4.093117474465162}
Iteration 12 - Losses: {'ner': 14.475566689619606}
Iteration 13 - Losses: {'ner': 11.415569011486644}
Iteration 14 - Losses: {'ner': 11.907036756395186}
Iteration 15 - Losses: {'ner': 14.69548488303088}
Iteration 16 - Losses: {'ner': 3.4958934772343886}
Iteration 17 - Losses: {'ner': 9.553184905197543}
Iteration 18 - Losses: {'ner': 0.00409040797071807}
Iteration 19 - Losses: {'ner': 1.1784302383533616}
Iteration 20 - Losses: {'ner': 6.116161342075631e-06

In [1]:
import spacy
import random
import os
from spacy.training.example import Example

# Create blank English model
nlp = spacy.blank("en")

# Add NER to the pipeline
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Training data
UPDATED_TRAIN_DATA = [
    ("Proficient in Python, Java, and C++", {"entities": [(13, 19, "SKILL"), (21, 25, "SKILL"), (30, 33, "SKILL")]}),
    ("Experience with machine learning algorithms", {"entities": [(12, 28, "SKILL")]}),
    ("Familiar with TensorFlow and PyTorch", {"entities": [(13, 24, "SKILL"), (29, 36, "SKILL")]}),
    ("Strong understanding of HTML, CSS, and JavaScript", {"entities": [(19, 22, "SKILL"), (24, 27, "SKILL"), (32, 41, "SKILL")]}),
    ("Expertise in data analysis and visualization using Tableau", {"entities": [(20, 30, "SKILL"), (35, 46, "SKILL")]}),
    ("Skilled in SQL database management", {"entities": [(10, 13, "SKILL"), (24, 35, "SKILL")]}),
    ("Knowledge of React and Angular frameworks", {"entities": [(12, 17, "SKILL"), (22, 29, "SKILL")]}),
    ("Proficiency in MATLAB for numerical computing", {"entities": [(14, 19, "SKILL"), (28, 44, "SKILL")]}),
    ("Experience with cloud technologies like AWS and Azure", {"entities": [(17, 20, "SKILL"), (25, 30, "SKILL")]}),
    ("Expertise in natural language processing and NLP techniques", {"entities": [(20, 46, "SKILL"), (51, 54, "SKILL")]}),
    ("Familiarity with Git version control system", {"entities": [(16, 18, "SKILL"), (30, 36, "SKILL")]}),
    ("Knowledgeable in DevOps practices and CI/CD pipelines", {"entities": [(14, 20, "SKILL"), (37, 41, "SKILL")]}),
    ("Proficient in Java Spring and Hibernate frameworks", {"entities": [(14, 24, "SKILL"), (29, 38, "SKILL")]}),
    ("Expert in Linux kernel and system administration", {"entities": [(5, 10, "SKILL"), (17, 26, "SKILL"), (33, 53, "SKILL")]}),
    ("Advanced networking skills - configuring routers, switches, firewalls", {"entities": [(8, 22, "SKILL"), (25, 41, "SKILL"), (44, 54, "SKILL"), (57, 66, "SKILL")]}), 
    ("Experience setting up Kubernetes clusters on AWS and GCP", {"entities": [(17, 28, "SKILL"), (37, 57, "SKILL"), (61, 64, "SKILL")]}),
    ("Skilled software developer with 5 years building scalable web apps", {"entities": [(10, 26, "SKILL"), (52, 68, "SKILL"), (73, 83, "SKILL")]}),
    ("Proficiency in Java, Python, C++, JavaScript, and Golang", {"entities": [(14, 19, "SKILL"), (22, 28, "SKILL"), (31, 34, "SKILL"), (37, 47, "SKILL"), (52, 58, "SKILL")]}),
    ("Expertise in full stack development using MongoDB, Express, React, Node", {"entities": [(20, 33, "SKILL"), (44, 51, "SKILL"), (55, 61, "SKILL"), (65, 70, "SKILL"), (74, 78, "SKILL")]}),
    ("Experience with machine learning libraries PyTorch, TensorFlow, Keras", {"entities": [(17, 45, "SKILL"), (50, 60, "SKILL"), (65, 75, "SKILL"), (80, 85, "SKILL")]}),
    ("Skilled in CI/CD pipelines, GitLab, Jenkins, Bamboo, CircleCI", {"entities": [[10, 24, "SKILL"], [29, 35, "SKILL"], [40, 47, "SKILL"], [52, 59, "SKILL"], [64, 72, "SKILL"]]}),
    # Additional examples with skill-related keywords
    ("SQL", {"entities": [(0, 3, "SKILL")]}),
    ("JavaScript", {"entities": [(0, 10, "SKILL")]}),
    ("Machine Learning", {"entities": [(0, 16, "SKILL")]}),
    ("Data Analysis", {"entities": [(0, 13, "SKILL")]}),
    ("React.js", {"entities": [(0, 8, "SKILL")]}),
    ("AngularJS", {"entities": [(0, 9, "SKILL")]}),
    ("Node.js", {"entities": [(0, 7, "SKILL")]}),
    ("MongoDB", {"entities": [(0, 7, "SKILL")]}),
    ("AWS Cloud", {"entities": [(0, 8, "SKILL")]}),
    ("Azure Cloud", {"entities": [(0, 11, "SKILL")]}),
    ("Statistical Modeling", {"entities": [(0, 20, "SKILL")]}),
    ("Linux operating system", {"entities": [(0, 5, "SKILL")]}),
    ("Windows Server administration", {"entities": [(0, 6, "SKILL"), (17, 28, "SKILL")]}),
    ("Network configuration and troubleshooting", {"entities": [(0, 8, "SKILL"), (25, 42, "SKILL")]}),
    ("TCP/IP, OSI model", {"entities": [(0, 7, "SKILL"), (12, 17, "SKILL")]}), 
    ("Routing protocols like OSPF, BGP", {"entities": [(0, 15, "SKILL"), (22, 25, "SKILL"), (30, 33, "SKILL")]}),
    ("Cisco switching and routing", {"entities": [(0, 5, "SKILL"), (13, 19, "SKILL")]}),
    ("VPN configuration", {"entities": [(0, 13, "SKILL")]}),
    ("Firewall administration", {"entities": [(0, 9, "SKILL"), (17, 28, "SKILL")]}),
    ("Network security", {"entities": [(0, 14, "SKILL")]}),
    ("Penetration testing", {"entities": [(0, 19, "SKILL")]}),
    ("Burp Suite", {"entities": [(0, 9, "SKILL")]}),
    ("Wireshark network analysis", {"entities": [(0, 9, "SKILL"), (17, 32, "SKILL")]}),
    ("CCNA certification", {"entities": [(0, 10, "SKILL")]}), 
    ("VMware administration", {"entities": [(0, 6, "SKILL"), (17, 28, "SKILL")]}),
    ("SAN storage configuration", {"entities": [(0, 3, "SKILL"), (14, 32, "SKILL")]}),
    ("NAS storage administration", {"entities": [(0, 3, "SKILL"), (15, 33, "SKILL")]}),
    ("RAID arrays", {"entities": [(0, 7, "SKILL")]}),
    ("Docker containerization", {"entities": [(0, 6, "SKILL"), (17, 32, "SKILL")]}),
    ("Kubernetes", {"entities": [(0, 10, "SKILL")]}),
    ("Jenkins CI/CD pipelines", {"entities": [(0, 7, "SKILL"), (17, 30, "SKILL")]}), 
    ("Ansible automation", {"entities": [(0, 6, "SKILL"), (17, 27, "SKILL")]}),
    ("Terraform infrastructure-as-code", {"entities": [(0, 9, "SKILL"), (18, 38, "SKILL")]}),
    ("Azure administration", {"entities": [(0, 6, "SKILL"), (17, 29, "SKILL")]}),
    ("AWS cloud architecture", {"entities": [(0, 3, "SKILL"), (13, 29, "SKILL")]}),
    ("Google Cloud Platform", {"entities": [(0, 22, "SKILL")]}),
    ("DevOps culture and practices ", {"entities": [(0, 6, "SKILL"), (18, 34, "SKILL")]}),
    ("Agile development methodologies", {"entities": [(0, 5, "SKILL"), (20, 42, "SKILL")]}),
    ("Waterfall SDLC", {"entities": [(0, 8, "SKILL"), (13, 16, "SKILL")]}),
    ("Object-oriented analysis and design", {"entities": [(0, 25, "SKILL"), (31, 36, "SKILL")]}), 
    ("SQL database programming", {"entities": [(0, 3, "SKILL"), (14, 28, "SKILL")]}),
    ("Oracle database administration", {"entities": [(0, 6, "SKILL"), (17, 35, "SKILL")]}),
    ("MongoDB NoSQL databases", {"entities": [(0, 7, "SKILL"), (15, 29, "SKILL")]}),
    ("Redis in-memory caching", {"entities": [(0, 5, "SKILL"), (17, 29, "SKILL")]}),
    ("Data modeling and warehousing", {"entities": [(0, 15, "SKILL"), (21, 32, "SKILL")]}), 
    ("ETL processing pipelines", {"entities": [(0, 3, "SKILL"), (15, 31, "SKILL")]}),
    ("Hadoop cluster configuration", {"entities": [(0, 6, "SKILL"), (20, 38, "SKILL")]}),
    ("Spark big data processing", {"entities": [(0, 5, "SKILL"), (16, 30, "SKILL")]}),
    ("Tableau data visualization", {"entities": [(0, 7, "SKILL"), (17, 33, "SKILL")]}), 
    ("Power BI business analytics", {"entities": [(0, 8, "SKILL"), (17, 33, "SKILL")]}),
    ("Python programming", {"entities": [(0, 6, "SKILL"), (17, 28, "SKILL")]}), 
    ("Java Spring Boot framework", {"entities": [(0, 4, "SKILL"), (10, 26, "SKILL")]}),  
    ("PHP web application development", {"entities": [(0, 3, "SKILL"), (14, 37, "SKILL")]}),
    ("Ruby on Rails web framework", {"entities": [(0, 3, "SKILL"), (10, 27, "SKILL")]}),
    ("JavaScript front-end development", {"entities": [(0, 10, "SKILL"), (22, 40, "SKILL")]}),
    ("React web applications", {"entities": [(0, 5, "SKILL"), (16, 30, "SKILL")]}),
    ("Angular single page applications", {"entities": [(0, 7, "SKILL"), (17, 37, "SKILL")]}),
    ("Node.js back-end services", {"entities": [(0, 7, "SKILL"), (17, 29, "SKILL")]}),
    ("REST API design and development", {"entities": [(0, 10, "SKILL"), (28, 47, "SKILL")]}),
    ("GraphQL API development", {"entities": [(0, 7, "SKILL"), (17, 31, "SKILL")]}),
    ("Unit testing frameworks like JUnit", {"entities": [(0, 15, "SKILL"), (24, 29, "SKILL")]}),
    ("UX design and usability", {"entities": [(0, 7, "SKILL"), (16, 27, "SKILL")]}),
    ("Git version control system", {"entities": [(0, 18, "SKILL"), (30, 36, "SKILL")]}),
    ("Continuous integration and delivery", {"entities": [(0, 28, "SKILL"), (36, 44, "SKILL")]}), 
    ("R language data analysis", {"entities": [(0, 11, "SKILL"), (20, 33, "SKILL")]}), 
    ("MATLAB numerical computing", {"entities": [(0, 6, "SKILL"), (17, 31, "SKILL")]}),
    ("C++ high performance programming", {"entities": [(0, 2, "SKILL"), (18, 37, "SKILL")]}),
    ("Multithreading and concurrency", {"entities": [(0, 15, "SKILL"), (22, 34, "SKILL")]}),
    ("Cryptography and encryption algorithms", {"entities": [(0, 13, "SKILL"), (24, 44, "SKILL")]}),
    ("Cybersecurity awareness ", {"entities": [(0, 14, "SKILL"), (26, 40, "SKILL")]}),
    ("Penetration testing and ethical hacking", {"entities": [(0, 23, "SKILL"), (31, 47, "SKILL")]}),
    ("Artificial intelligence and machine learning", {"entities": [(0, 25, "SKILL"), (35, 52, "SKILL")]}),
    ("Neural networks and deep learning", {"entities": [(0, 15, "SKILL"), (25, 39, "SKILL")]}), 
    ("Computer vision with OpenCV", {"entities": [(0, 16, "SKILL"), (26, 32, "SKILL")]}),
    ("Natural language processing techniques", {"entities": [(0, 33, "SKILL"), (46, 58, "SKILL")]}),
    ("Recommender systems algorithms", {"entities": [(0, 23, "SKILL"), (30, 42, "SKILL")]}),
    ("Python", {"entities": [(0, 6, "SKILL")]}),
    ("Java", {"entities": [(0, 4, "SKILL")]}),
    ("JavaScript", {"entities": [(0, 10, "SKILL")]}),
    ("TypeScript", {"entities": [(0, 10, "SKILL")]}),
    ("C++", {"entities": [(0, 3, "SKILL")]}),
    ("C#", {"entities": [(0, 2, "SKILL")]}),
    ("Go", {"entities": [(0, 2, "SKILL")]}),
    ("Ruby", {"entities": [(0, 4, "SKILL")]}),
    ("PHP", {"entities": [(0, 3, "SKILL")]}),
    ("Swift", {"entities": [(0, 5, "SKILL")]}),
    ("Rust", {"entities": [(0, 4, "SKILL")]}),
    ("Dart", {"entities": [(0, 4, "SKILL")]}),
    ("Kotlin", {"entities": [(0, 6, "SKILL")]}),
    ("SQL", {"entities": [(0, 3, "SKILL")]}),
    ("NoSQL", {"entities": [(0, 5, "SKILL")]}),
    ("C", {"entities": [(0, 1, "SKILL")]}),
    ("Scala", {"entities": [(0, 5, "SKILL")]}),
    ("Perl", {"entities": [(0, 4, "SKILL")]}),
    ("Haskell", {"entities": [(0, 7, "SKILL")]}),
    ("Bash", {"entities": [(0, 4, "SKILL")]}),
    ("Shell", {"entities": [(0, 5, "SKILL")]}),
    ("Cobol", {"entities": [(0,5, "SKILL")]}),
    ("Fortran", {"entities": [(0,7, "SKILL")]}),
    ("Visual Basic", {"entities": [(0,13, "SKILL")]}),
    ("Assembly", {"entities": [(0,9, "SKILL")]}),
    ("Pascal", {"entities": [(0,6, "SKILL")]}),
    ("Ada", {"entities": [(0,3, "SKILL")]}),
    ("ABAP", {"entities": [(0,4, "SKILL")]}), 
    ("RPG", {"entities": [(0,3, "SKILL")]}),
    ("Lisp", {"entities": [(0,4, "SKILL")]}),
    ("Prolog", {"entities": [(0,6, "SKILL")]}),
    ("F#", {"entities": [(0,2, "SKILL")]}),
    ("Lua", {"entities": [(0,3, "SKILL")]}),
    ("MATLAB", {"entities": [(0,6, "SKILL")]}),
    ("SAS", {"entities": [(0,3, "SKILL")]}),
    ("SPSS", {"entities": [(0,4, "SKILL")]}),
    ("R", {"entities": [(0,1, "SKILL")]}),
    ("Julia", {"entities": [(0,5, "SKILL")]}),
    ("Mahout", {"entities": [(0,6, "SKILL")]}), 
    ("Solr", {"entities": [(0,4, "SKILL")]}),
    ("Lucene", {"entities": [(0,6, "SKILL")]}),
    ("HBase", {"entities": [(0,4, "SKILL")]}),
    ("Cassandra", {"entities": [(0,9, "SKILL")]}), 
    ("Neo4j", {"entities": [(0,5, "SKILL")]}),
    ("Unix", {"entities": [(0,4, "SKILL")]}),
    ("Linux", {"entities": [(0,5, "SKILL")]}),
    ("Windows", {"entities": [(0,7, "SKILL")]}),
    ("MacOS", {"entities": [(0,5, "SKILL")]}),
    ("Android", {"entities": [(0,7, "SKILL")]}),
]

# Add labels to the NER component
for _, annotations in UPDATED_TRAIN_DATA:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])

# Disable other pipeline components
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for iteration in range(30):
        random.shuffle(UPDATED_TRAIN_DATA)
        losses = {}
        for text, annotations in UPDATED_TRAIN_DATA:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], sgd=optimizer, losses=losses)
        print(f"Iteration {iteration + 1} - Losses: {losses}")

# Save the model to disk
output_dir = "./skill_ner_model"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")




Iteration 1 - Losses: {'ner': 84.96824044412855}
Iteration 2 - Losses: {'ner': 21.468073239118315}
Iteration 3 - Losses: {'ner': 15.868812313659031}
Iteration 4 - Losses: {'ner': 3.2747951893986476}
Iteration 5 - Losses: {'ner': 8.486852572880094}
Iteration 6 - Losses: {'ner': 8.823313766516556}
Iteration 7 - Losses: {'ner': 5.500560921390335}
Iteration 8 - Losses: {'ner': 8.269143143030805}
Iteration 9 - Losses: {'ner': 9.937797281824174}
Iteration 10 - Losses: {'ner': 8.726608790584832}
Iteration 11 - Losses: {'ner': 10.635228689722481}
Iteration 12 - Losses: {'ner': 11.670302940214933}
Iteration 13 - Losses: {'ner': 5.143954640930843}
Iteration 14 - Losses: {'ner': 5.174478770073166}
Iteration 15 - Losses: {'ner': 6.3727307285800965}
Iteration 16 - Losses: {'ner': 5.486863064142205}
Iteration 17 - Losses: {'ner': 1.3879695269016235}
Iteration 18 - Losses: {'ner': 2.864384559970838}
Iteration 19 - Losses: {'ner': 1.0108673480262669}
Iteration 20 - Losses: {'ner': 2.0625273401880437}
