In [7]:
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
import time

# The job description is split into separate sections so keywords can be
# extracted from each section independently.
# `overall`: company background, team context, and role overview text.
overall = """
A healthier future. It’s what drives us to innovate. To continuously advance science and ensure everyone has access to the healthcare they need today and for generations to come. Creating a world where we all have more time with the people we love. That’s what makes us Roche.

Advances in AI, data, and computational sciences are transforming drug discovery and development. Roche’s Research and Early Development organisations at Genentech (gRED) and Pharma (pRED) have demonstrated how these technologies accelerate R&D, leveraging data and novel computational models to drive impact. Seamless data sharing and access to models across gRED and pRED are essential to maximising these opportunities. The new computational sciences Center of Excellence (CoE) is a strategic, unified group whose goal is to harness this transformative power of data and Artificial Intelligence (AI) to assist our scientists in both pRED and gRED to deliver more innovative and transformative medicines for patients worldwide.

Within the CoE organisation, the Data and Digital Catalyst organisation drives the modernisation of our computational and data ecosystems and integration of digital technologies across Research and Early Development to enable our stakeholders, power data-driven science and accelerate decision-making.

This internship position is located in South San Francisco, on-site.

The Opportunity

Our platform Identity & Access Management (IAM) team has developed the foundational tools needed to build scientific software solutions using security best practices in AWS, but often the scientific computing community at Roche does not utilize them fully because advanced capabilities appear complex. We’d love to make these advanced capabilities easy to use so the whole organization can leverage AWS, and Roche’s data, to its fullest, securely.

We are looking for a junior engineer to help make these advanced concepts easy for the scientific community at Roche to understand and implement into their software solutions, through example solutions and architectures. This internship is to develop a suite of well-documented project templates, demonstrating solution architectures for advanced data access patterns in AWS, including Amazon Athena, Redshift, LakeFormation, and Bedrock.
"""

# `key_respon`: the "key responsibilities" section of the job description,
# one responsibility per line.
key_respon = """
Develop Solution Architectures: Create templates for individualized access to AWS Redshift (data warehouse) and AWS RDS (database).
Implement Fine-Grained Access Control: Design patterns for AWS Athena using LakeFormation.
Demonstrate Secure Agentic Patterns: Define state-of-the-art authentication & authorization for AI agents using AWS Bedrock and AgentCore.
Documentation & Knowledge Sharing: Produce clear, scientific and engineering-centric documentation that allows other teams to easily adopt these advanced technologies and patterns to their work.
"""

# `required`: the required-qualifications section (education criteria,
# accepted majors, and required skills).
required = """
You meet one of the following criteria:

Must be pursuing or have attained an Associate's Degree.
Must be pursuing a Bachelor's Degree (enrolled student).
Must have attained a Bachelor's Degree (not currently enrolled in a graduate program).

Required Majors: Computer Science, Information Systems, Data Engineering or a related field.

Required Skills: 

Basic exposure or experience with developing infrastructure-as-code projects, ideally for user-facing web applications and/or data engineering pipelines.
Must have experience with at least two programming languages, e.g.: JavaScript/TypeScript, Python, Rust, Go, SQL.
Familiarity with standard software engineering tools such as git, npm, pip.
Passion for best practices, architecture, software engineering and design.
"""

# `preferred`: the preferred (nice-to-have) qualifications section,
# one item per line.
preferred = """
Excellent communication, collaboration, and interpersonal skills.
Complements our culture and the standards that guide our daily behavior & decisions: Integrity, Courage, and Passion.
Experience working with serverless and cloud-native architectures, ideally AWS, helps.
Experience with biological data and processes, or working with scientists or in a research environment is a plus.
Experience with Amazon Web Services (AWS) is a bonus, but not required.
"""

In [8]:
# Benchmark KeyBERT keyword extraction backed by the default all-MiniLM-L6-v2
# sentence-embedding model (384 dimensional dense vector space).
embedded_model = SentenceTransformer('all-MiniLM-L6-v2')
miniLM_model = KeyBERT(model=embedded_model)

# Number of timed repetitions. Kept in one place so the loop count and the
# divisor used for the average cannot drift apart.
n_runs = 10

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-run.
start = time.perf_counter()
for _ in range(n_runs):
    # Each pass extracts unigram and bigram keyphrases from all four sections;
    # only the results of the last pass are kept for printing below.
    keywords_overall = miniLM_model.extract_keywords(overall, keyphrase_ngram_range=(1, 2))
    keywords_respon = miniLM_model.extract_keywords(key_respon, keyphrase_ngram_range=(1, 2))
    keywords_required = miniLM_model.extract_keywords(required, keyphrase_ngram_range=(1, 2))
    keywords_preferred = miniLM_model.extract_keywords(preferred, keyphrase_ngram_range=(1, 2))
end = time.perf_counter()
# Average seconds per pass, where one pass covers all four sections.
print("Average time for KeyBERT with all-MiniLM-L6-v2:", (end - start) / n_runs)
print()

print("Overall Keywords:", keywords_overall)
print()
print("Key Responsibilities Keywords:", keywords_respon)
print()
print("Required Skills Keywords:", keywords_required)
print()
print("Preferred Skills Keywords:", keywords_preferred)

Average time for KeyBERT with all-MiniLM-L6-v2: 0.25675647258758544

Overall Keywords: [('scientific computing', 0.5074), ('data novel', 0.4953), ('scientific software', 0.4668), ('data computational', 0.4572), ('data ecosystems', 0.4437)]

Key Responsibilities Keywords: [('aws rds', 0.5651), ('patterns aws', 0.5267), ('using aws', 0.5189), ('aws athena', 0.5167), ('access aws', 0.5117)]

Required Skills Keywords: [('software engineering', 0.4889), ('required majors', 0.4303), ('required skills', 0.4156), ('computer science', 0.4054), ('experience programming', 0.4037)]

Preferred Skills Keywords: [('services aws', 0.6037), ('aws', 0.5734), ('ideally aws', 0.5269), ('experience amazon', 0.5181), ('aws bonus', 0.5126)]


In [9]:
# Benchmark KeyBERT keyword extraction backed by the all-mpnet-base-v2
# sentence-embedding model (768 dimensional dense vector space).
# NOTE: `embedded_model` and the `keywords_*` names are also assigned by the
# all-MiniLM-L6-v2 cell, so after this cell runs they hold the MPNet results.
embedded_model = SentenceTransformer('all-mpnet-base-v2')
mpnet_model = KeyBERT(model=embedded_model)


# Number of timed repetitions. Kept in one place so the loop count and the
# divisor used for the average cannot drift apart.
n_runs = 10

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-run.
start = time.perf_counter()
for _ in range(n_runs):
    # Each pass extracts unigram and bigram keyphrases from all four sections;
    # only the results of the last pass are kept for printing below.
    keywords_overall = mpnet_model.extract_keywords(overall, keyphrase_ngram_range=(1, 2))
    keywords_respon = mpnet_model.extract_keywords(key_respon, keyphrase_ngram_range=(1, 2))
    keywords_required = mpnet_model.extract_keywords(required, keyphrase_ngram_range=(1, 2))
    keywords_preferred = mpnet_model.extract_keywords(preferred, keyphrase_ngram_range=(1, 2))
end = time.perf_counter()
# Average seconds per pass, where one pass covers all four sections.
print("Average time for KeyBERT with all-mpnet-base-v2:", (end - start) / n_runs)
print()

print("Overall Keywords:", keywords_overall)
print()
print("Key Responsibilities Keywords:", keywords_respon)
print()
print("Required Skills Keywords:", keywords_required)
print()
print("Preferred Skills Keywords:", keywords_preferred)

Average time for KeyBERT with all-mpnet-base-v2: 0.6158007144927978

Overall Keywords: [('aws scientific', 0.4947), ('data ecosystems', 0.4772), ('scientific software', 0.4636), ('data novel', 0.4572), ('assist scientists', 0.4488)]

Key Responsibilities Keywords: [('solution architectures', 0.5165), ('patterns aws', 0.483), ('warehouse aws', 0.4639), ('agentcore documentation', 0.4559), ('aws athena', 0.4385)]

Required Skills Keywords: [('required skills', 0.4854), ('required majors', 0.4468), ('pipelines experience', 0.4314), ('software engineering', 0.4169), ('infrastructure code', 0.4078)]

Preferred Skills Keywords: [('services aws', 0.4101), ('aws', 0.3936), ('skills', 0.3901), ('aws helps', 0.3847), ('architectures ideally', 0.3841)]
