In [None]:
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


processed_path = os.path.join('..', 'data', 'processed', 'jobs_processed.csv')
df = pd.read_csv(processed_path)

#convert real strings to python lists and then into plain text with spaces for TF-IDF
import ast
df['skills_list'] = df['skills_list'].apply(ast.literal_eval)
df['skills_text'] = df['skills_list'].apply(lambda skills: ' '.join(skills))
print("Data loaded and prepared.")
display(df.head())

#create and fitTF-IDF model
tfidf_vectorizer = TfidfVectorizer(token_pattern=r"\b[a-zA-Z0-9#+-]+\b")
tfidf_matrix = tfidf_vectorizer.fit_transform(df['skills_text'])
print(f"TF-IDF matrix created with shape: {tfidf_matrix.shape}")


#recommendation function
def get_recommendations(user_skills, top_n=20):
    user_skills_text = ' '.join(user_skills)
    
    user_vector = tfidf_vectorizer.transform([user_skills_text])
    
    cosine_similarities = cosine_similarity(user_vector, tfidf_matrix).flatten()
    
    top_job_indices = cosine_similarities.argsort()[:-top_n-1:-1] # Get the indices of the top N jobs
    
    recommendations = df.iloc[top_job_indices]
    
    return recommendations[['job_title', 'company', 'job_link', 'skills_list']]

#test the recommender
my_skills = ['C++', 'Python', 'PyTorch', 'CUDA']
print(f"\n--- Recommendations for a user with skills: {my_skills} ---")
recommended_jobs = get_recommendations(my_skills)
display(recommended_jobs)

Data loaded and prepared.


Unnamed: 0,job_link,job_title,company,skills_list,skills_text
0,https://www.linkedin.com/jobs/view/senior-mach...,Senior Machine Learning Engineer,Jobs for Humanity,"[Machine Learning, Programming, Python, Scala,...",Machine Learning Programming Python Scala Java...
1,https://www.linkedin.com/jobs/view/principal-s...,"Principal Software Engineer, ML Accelerators",Aurora,"[C++, Python, PyTorch, TensorFlow, MXNet, CUDA...",C++ Python PyTorch TensorFlow MXNet CUDA OpenC...
2,https://www.linkedin.com/jobs/view/senior-etl-...,Senior ETL Data Warehouse Specialist,Adame Services LLC,"[ETL, Data Integration, Data Transformation, D...",ETL Data Integration Data Transformation Data ...
3,https://www.linkedin.com/jobs/view/senior-data...,Senior Data Warehouse Developer / Architect,Morph Enterprise,"[Data Lakes, Data Bricks, Azure Data Factory P...",Data Lakes Data Bricks Azure Data Factory Pipe...
4,https://www.linkedin.com/jobs/view/lead-data-e...,Lead Data Engineer,Dice,"[Java, Scala, Python, RDBMS, NoSQL, Redshift, ...",Java Scala Python RDBMS NoSQL Redshift Snowfla...


TF-IDF matrix created with shape: (12217, 16220)

--- Recommendations for a user with skills: ['C++', 'Python', 'PyTorch', 'CUDA'] ---


Unnamed: 0,job_title,company,job_link,skills_list
4697,Machine Learning Engineer for Advanced Manufac...,Re:Build Manufacturing,https://www.linkedin.com/jobs/view/machine-lea...,"[Machine Learning, Artificial intelligence, Py..."
12147,"ML Infrastructure Engineer, Autopilot AI & Tes...",Tesla,https://www.linkedin.com/jobs/view/ml-infrastr...,"[Neural Networks, Deep Learning, GPU Computing..."
4592,Machine Learning Engineer (Austin),Optiver,https://www.linkedin.com/jobs/view/machine-lea...,"[Machine Learning, Deep Learning, Python, C/C+..."
4238,Machine Learning Engineer,DRW,https://ca.linkedin.com/jobs/view/machine-lear...,"[Machine Learning, Data Science, Python, Go, T..."
1851,"Senior Solutions Architect, Machine Learning",NVIDIA,https://www.linkedin.com/jobs/view/senior-solu...,"[Machine Learning, Deep Learning, TensorFlow, ..."
11375,Machine Learning Engineer-Model Training Infra...,ByteDance,https://www.linkedin.com/jobs/view/machine-lea...,"[C++, CUDA, Python, TensorFlow, PyTorch, ML in..."
8559,"Sr. ML Infrastructure Software Engineer, Autop...",Tesla,https://www.linkedin.com/jobs/view/sr-ml-infra...,"[Python, Deep learning frameworks, PyTorch, CP..."
7260,Senior Data Scientist,Kognition Inc,https://www.linkedin.com/jobs/view/senior-data...,"[Machine Learning, Deep Learning, Python, Bash..."
717,Staff Machine Learning Feature Engineer,XPENG,https://www.linkedin.com/jobs/view/staff-machi...,"[Machine Learning, Deep Learning, Object Detec..."
10270,Machine Learning Engineering Manager,AMD,https://www.linkedin.com/jobs/view/machine-lea...,"[Machine Learning Engineering, GPU Kernel Opti..."


In [None]:
def analyze_skill_gap(user_skills, job_row):

    #lowercase and convert to set
    user_skill_set = set([skill.lower() for skill in user_skills])
    required_skills = set([skill.lower() for skill in job_row['skills_list']])

    #matching and missing skills
    matching_skills = user_skill_set.intersection(required_skills)
    missing_skills = required_skills - user_skill_set
    
    print(f"--- Skill Gap Analysis for: '{job_row['job_title']}' ---")
    print(f"\nSkills You Have ({len(matching_skills)}):")
    print(sorted(list(matching_skills)))
    
    print(f"\nSkills You Are Missing ({len(missing_skills)}):")
    print(sorted(list(missing_skills)))
    
    return {
        "matching_skills": list(matching_skills),
        "missing_skills": list(missing_skills)
    }

#test the skill gap analysis
my_skills = ['C++', 'Python', 'PyTorch', 'CUDA']
recommended_jobs = get_recommendations(my_skills)
top_job = recommended_jobs.iloc[0]
analyze_skill_gap(my_skills, top_job)

--- Skill Gap Analysis for: 'Machine Learning Engineer for Advanced Manufacturing Artificial Intelligence' ---

Skills You Have (4):
['c++', 'cuda', 'python', 'pytorch']

Skills You Are Missing (12):
['artificial intelligence', 'c#', 'cad', 'cam', 'docker', 'git', 'java', 'linux', 'machine learning', 'statistics', 'tensorflow', 'windows']


{'matching_skills': ['pytorch', 'cuda', 'c++', 'python'],
 'missing_skills': ['windows',
  'cam',
  'cad',
  'linux',
  'artificial intelligence',
  'c#',
  'git',
  'machine learning',
  'tensorflow',
  'docker',
  'statistics',
  'java']}

In [3]:
# skills from the user
print("Enter your skills, separated by commas (e.g., python, sql, aws):")
user_input_skills = input()

#Process the input string into a list
my_skills = [skill.strip().lower() for skill in user_input_skills.split(',')]

#Get and display recommendations
print("\n" + "="*50)
print(f"Finding job recommendations for the skills: {my_skills}")
recommended_jobs = get_recommendations(my_skills)

recommended_jobs.reset_index(drop=True, inplace=True)

print("\n--- Top 10 Job Recommendations ---")
# Display a numbered list of job titles and companies
for i, row in recommended_jobs.iterrows():
    print(f"{i}: {row['job_title']} at {row['company']}")
print("="*50)


# 4. Ask the user to choose a job for gap analysis
chosen_index = -1
while chosen_index not in range(len(recommended_jobs)):
    try:
        print("\nEnter the number of the job you want to analyze for a skill gap (0-9):")
        chosen_index = int(input())
        if chosen_index not in range(len(recommended_jobs)):
            print("Invalid number. Please enter a number from the list above.")
    except ValueError:
        print("Invalid input. Please enter a number.")

# 5. Run the skill gap analysis for the chosen job
chosen_job = recommended_jobs.iloc[chosen_index]
analyze_skill_gap(my_skills, chosen_job)

Enter your skills, separated by commas (e.g., python, sql, aws):


 C++, python, CUDA, machine learning



Finding job recommendations for the skills: ['c++', 'python', 'cuda', 'machine learning']

--- Top 10 Job Recommendations ---
0: Machine Learning Engineer for Advanced Manufacturing Artificial Intelligence at Re:Build Manufacturing
1: Machine Learning Engineer-Model Training Infrastructure at ByteDance
2: Machine Learning Engineer-Model Training Infrastructure at ByteDance
3: Machine Learning Engineer (Austin) at Optiver
4: Senior Solutions Architect, Machine Learning at NVIDIA
5: Sr. ML Infrastructure Software Engineer, Autopilot AI & Tesla Bot at Tesla
6: ML Infrastructure Engineer, Autopilot AI & Tesla Bot at Tesla
7: Senior Data Scientist at Kognition Inc
8: Staff Machine Learning Feature Engineer at XPENG
9: Machine Learning Engineer at DRW
10: Machine Learning Engineer-Deep Model Infrastructure at ByteDance
11: Senior Data Science Engineer, Risk & Fraud at DraftKings Inc.
12: Head of Machine Learning at Glocomms
13: Machine Learning Engineer at Rockwell Automation
14: Business S

 7


--- Skill Gap Analysis for: 'Senior Data Scientist' ---

Skills You Have (3):
['cuda', 'machine learning', 'python']

Skills You Are Missing (14):
['aws', 'azure', 'bash', 'data science workflows', 'deep learning', 'etl', 'keras', 'linux', 'mlflow', 'nvidia gpu', 'pytorch', 'scikit', 'tensorflow', 'windows']


{'matching_skills': ['cuda', 'machine learning', 'python'],
 'missing_skills': ['windows',
  'azure',
  'etl',
  'linux',
  'mlflow',
  'keras',
  'scikit',
  'bash',
  'tensorflow',
  'data science workflows',
  'nvidia gpu',
  'aws',
  'pytorch',
  'deep learning']}