In [1]:
from google.colab import files

# Opens the Colab upload dialog; pick the careers JSON file when it appears.
uploaded = files.upload()

# Echo the names of everything that was uploaded so the next cell's
# filename can be checked against it.
print("Uploaded files:", [name for name in uploaded.keys()])

Saving Untitled.json to Untitled.json
Uploaded files: ['Untitled.json']


In [2]:
# Step 2: Load JSON and show a sample
import json
import pprint

# Name of the uploaded file; change only if your file has a different name.
filename = "Untitled.json"

# Read the whole file as UTF-8 text and parse it into Python objects.
with open(filename, encoding="utf-8") as f:
    careers = json.loads(f.read())

print("Total careers loaded:", len(careers))

# Pretty-print the first entry so the record layout is visible.
print("\nSample entry (first career):")
pprint.pprint(careers[0])


Total careers loaded: 30

Sample entry (first career):
{'career': 'Artificial Intelligence Engineer',
 'interest_tags': ['AI', 'technology', 'automation', 'research', 'innovation'],
 'required_skills': ['Python',
                     'Machine Learning',
                     'Deep Learning',
                     'Neural Networks',
                     'TensorFlow/PyTorch'],
 'resources': ["Coursera - Andrew Ng's Machine Learning & Deep Learning "
               'Specialization',
               'Fast.ai - Practical Deep Learning for Coders',
               'Papers With Code - Latest AI Research',
               'YouTube - Sentdex & StatQuest'],
 'roadmap': {'Advanced': ['Build deep learning architectures and deploy models '
                          'in production',
                          'Apply MLOps practices: model versioning, '
                          'monitoring, CI/CD for AI',
                          'Specialize in NLP, Computer Vision, or Generative '
                      

In [10]:
# Step 3: Take input from user
# Each prompt reads one line of free text; items are expected to be separated
# by commas (e.g. "python, sql"). The raw strings are kept as typed here and
# are split/normalized further down in this cell.
user_skills_input = input("Enter your skills (comma-separated): ")
user_interests_input = input("Enter your interests (comma-separated): ")

# normalize into lists
def to_list(s):
    """Split a comma-separated string into a list of cleaned items.

    Each piece is stripped of surrounding whitespace and lowercased; pieces
    that are empty after stripping are dropped. Order is preserved.
    """
    cleaned = []
    for piece in s.split(","):
        token = piece.strip()
        if token:
            cleaned.append(token.lower())
    return cleaned

# Run both raw answers through the same normalizer, skills first.
user_skills, user_interests = map(
    to_list, (user_skills_input, user_interests_input)
)

# Echo the parsed lists back so the user can confirm what was understood.
print("User skills:", user_skills)
print("User interests:", user_interests)


Enter your skills (comma-separated): python,sql,excel
Enter your interests (comma-separated): data,ai
User skills: ['python', 'sql', 'excel']
User interests: ['data', 'ai']


In [4]:

!pip install -q sentence-transformers


In [5]:
# Step 4: Vectorize careers dataset
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model (small and fast)
model = SentenceTransformer('all-MiniLM-L6-v2')

# One text per career: its required skills followed by its interest tags,
# joined with single spaces.
career_texts = [
    " ".join([*c["required_skills"], *c["interest_tags"]])
    for c in careers
]

# Embed every career text; normalize_embeddings=True is requested from the model.
career_vectors = model.encode(career_texts, normalize_embeddings=True)

print("Vectorization done. Sample vector shape:", career_vectors[0].shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vectorization done. Sample vector shape: (384,)


In [6]:
# Step 5: Vectorize user input and rank careers by similarity
from sklearn.metrics.pairwise import cosine_similarity

# Combine user skills + interests into one string, mirroring how the
# career texts were built (space-joined terms).
user_text = " ".join(user_skills + user_interests)

# Encode user vector (a batch of one text)
user_vector = model.encode([user_text], normalize_embeddings=True)

# Compute cosine similarity with all career vectors; [0] selects the single
# row that corresponds to the one user text.
similarities = cosine_similarity(user_vector, career_vectors)[0]

# Rank careers by similarity
ranked_indices = np.argsort(similarities)[::-1]  # descending
top3_indices = ranked_indices[:3]

print("Top 3 career matches based on AI embedding:")
for i, idx in enumerate(top3_indices, 1):
    career = careers[idx]
    score = similarities[idx]
    # round() on a NumPy float32 still yields a float32, which prints with
    # float noise (e.g. 60.79999923706055). Convert to a Python float and
    # let the format spec produce exactly one decimal place.
    print(f"{i}. {career['career']} — Similarity: {float(score) * 100:.1f}%")


Top 3 career matches based on AI embedding:
1. Data Scientist — Similarity: 60.79999923706055%
2. Data Analyst — Similarity: 58.70000076293945%
3. Machine Learning Engineer — Similarity: 33.20000076293945%


In [7]:

# Sanity check: show the first three careers alongside the leading
# components of their embedding vectors.
for i in range(3):
    print("Career:", careers[i]['career'])
    # show only first 10 values
    print(f"Vector: {career_vectors[i][:10]} ...")


Career: Artificial Intelligence Engineer
Vector: [-0.14360023 -0.04212828  0.03225614  0.00049433 -0.01598395 -0.05851234
 -0.01650006 -0.0431514  -0.08835013 -0.08266564] ...
Career: Data Scientist
Vector: [-0.01331916 -0.01410186 -0.04274965  0.04462224 -0.05666999 -0.10681754
  0.02811587 -0.02133336 -0.14044338  0.00270154] ...
Career: Machine Learning Engineer
Vector: [-0.00566655 -0.05644878  0.03432629  0.01277831 -0.00540437 -0.10667866
 -0.00261072 -0.08023132 -0.09226695  0.00580543] ...


In [11]:
# Step 6: Personalized cumulative roadmap
def get_cumulative_phases(user_skills, career):
    """Pick which roadmap phases to show, based on skill overlap.

    The overlap ratio is the number of distinct required skills the user
    already has divided by the number of distinct required skills. Matching
    is exact after trimming whitespace and lowercasing both sides.

    Args:
        user_skills: iterable of skill names the user has (any casing,
            surrounding whitespace allowed). ``None`` is treated as empty.
        career: dict describing a career; its ``"required_skills"`` list is
            read if present (missing or ``None`` is treated as empty).

    Returns:
        A list of phase names, cumulative from the user's starting level:
        all three phases when the ratio is below 0.3, the last two when it
        is below 0.7, otherwise only ``"Advanced"``.
    """
    # Normalize both sides the same way so "Python" / " python " still match,
    # and use sets so duplicate entries cannot skew the ratio.
    required = {
        skill.strip().lower()
        for skill in (career.get("required_skills") or [])
        if skill.strip()
    }
    known = {
        skill.strip().lower()
        for skill in (user_skills or [])
        if skill.strip()
    }

    # max(1, ...) avoids division by zero for careers with no listed skills.
    match_ratio = len(known & required) / max(1, len(required))

    if match_ratio < 0.3:
        return ["Beginner", "Intermediate", "Advanced"]
    elif match_ratio < 0.7:
        return ["Intermediate", "Advanced"]
    else:
        return ["Advanced"]

# Display roadmap for top 3 careers
for i, idx in enumerate(top3_indices, start=1):
    career = careers[idx]
    phases = get_cumulative_phases(user_skills, career)

    # Header line: rank, career title, and the phases being shown.
    phase_summary = ", ".join(phases)
    print(f"\n{i}. {career['career']} — Showing phases: {phase_summary}")

    # Steps are listed phase by phase, in the order the phases were returned.
    print("Next steps:")
    for phase in phases:
        for step in career["roadmap"][phase]:
            print(f"- {step}")

    print("Resources:")
    for res in career["resources"]:
        print(f"- {res}")



1. Data Scientist — Showing phases: Intermediate, Advanced
Next steps:
- Build predictive models using scikit-learn (regression, classification, clustering)
- Master feature engineering, cross-validation, and model evaluation metrics
- Learn advanced visualization with Tableau/Power BI
- Work on 3-5 end-to-end projects from data collection to insights presentation
- Create a GitHub portfolio showcasing diverse data science projects
- Specialize in deep learning, time series forecasting, or causal inference
- Learn big data tools (Spark) and cloud platforms (AWS/GCP)
- Deploy ML models in production using Docker, APIs, and monitoring tools
- Lead business-critical projects and communicate insights to stakeholders
- Mentor junior data scientists and contribute to DS communities
Resources:
- DataCamp - Data Scientist with Python Track
- Kaggle - Competitions & Datasets
- YouTube - Ken Jee & StatQuest
- Book - 'Python for Data Analysis' by Wes McKinney

2. Data Analyst — Showing phases: I