In [None]:
from transformers import pipeline

# Load the pre-trained English CoNLL-03 NER model.
# NOTE(review): no `device` argument is passed here, so the pipeline runs on
# whatever device the library selects by default; pass device=0 explicitly if
# the first GPU is required -- confirm against the installed transformers version.
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

# Sample input: a comma-separated list of technology names followed by a few
# ordinary words.
text = """
Python, Ubuntu, Docker, Git, TensorFlow, MySQL, JavaScript, Kubernetes, PostgreSQL, React, and tools for the environment.
"""

# Run the NER pipeline on the sample text. The code below reads the 'entity'
# and 'word' keys of each returned item.
ner_results = ner_pipeline(text)
# Rebuild whole words from the token-level NER results.
#
# Each result is expected to carry 'entity' (the predicted tag) and 'word'
# (a word piece; continuation pieces are prefixed with '##'). When an 'index'
# key (token position) is present it is used to verify that a continuation
# piece really follows the previous piece: if the results skip a token, the
# tail of one word must not be glued onto an unrelated earlier word.
#
# Output: full_technical_terms, a list of {'word': str, 'entity': str} dicts,
# where 'entity' is the tag of the first piece of each rebuilt word.
full_technical_terms = []
current_word = ""
current_entity = None
previous_index = None  # token position of the most recently consumed piece

for result in ner_results:
    entity_label = result['entity']
    word_piece = result['word']
    token_index = result.get('index')

    is_continuation = word_piece.startswith("##")

    # A continuation may only extend the word being built if there is one and
    # the piece is adjacent to the previous piece. When position information
    # is unavailable, fall back to assuming adjacency.
    is_adjacent = (
        previous_index is None
        or token_index is None
        or token_index == previous_index + 1
    )

    if is_continuation and current_word and is_adjacent:
        current_word += word_piece[2:]  # Append without '##'
    else:
        if current_word:  # Flush the finished word before starting a new one
            full_technical_terms.append({'word': current_word, 'entity': current_entity})
        # An orphaned continuation (its head piece is missing from the
        # results) starts its own fragment, labelled with its own tag.
        current_word = word_piece[2:] if is_continuation else word_piece
        current_entity = entity_label

    previous_index = token_index

# Flush the last word after the loop
if current_word:
    full_technical_terms.append({'word': current_word, 'entity': current_entity})

# Keep only the rebuilt words whose tag is one of the labels treated as
# "technical" here (adjust the tuple after inspecting the model output).
wanted_labels = ('I-MISC', 'I-ORG')

technical_words = []
for term in full_technical_terms:
    if term['entity'] in wanted_labels:
        technical_words.append(term['word'])

# Show the selected words
print("Technical Words:", technical_words)


[{'entity': 'I-MISC',
  'score': 0.51515263,
  'index': 1,
  'word': 'Python',
  'start': 1,
  'end': 7},
 {'entity': 'I-MISC',
  'score': 0.59438086,
  'index': 3,
  'word': 'U',
  'start': 9,
  'end': 10},
 {'entity': 'I-ORG',
  'score': 0.72493184,
  'index': 4,
  'word': '##bu',
  'start': 10,
  'end': 12},
 {'entity': 'I-MISC',
  'score': 0.6969585,
  'index': 5,
  'word': '##nt',
  'start': 12,
  'end': 14},
 {'entity': 'I-ORG',
  'score': 0.75506896,
  'index': 6,
  'word': '##u',
  'start': 14,
  'end': 15},
 {'entity': 'I-ORG',
  'score': 0.6313323,
  'index': 8,
  'word': 'Dock',
  'start': 17,
  'end': 21},
 {'entity': 'I-ORG',
  'score': 0.68530065,
  'index': 9,
  'word': '##er',
  'start': 21,
  'end': 23},
 {'entity': 'I-ORG',
  'score': 0.61052924,
  'index': 11,
  'word': 'G',
  'start': 25,
  'end': 26},
 {'entity': 'I-ORG',
  'score': 0.49405947,
  'index': 12,
  'word': '##it',
  'start': 26,
  'end': 28},
 {'entity': 'I-MISC',
  'score': 0.68763834,
  'index': 14,
