In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

# Download stopwords
nltk.download('stopwords')

# Initialize global variables
skills_pool = [
    'Python', 'Java', 'SQL', 'Machine Learning', 'AWS',
    'TensorFlow', 'PyTorch', 'Data Analysis', 'Spark', 'Hadoop',
    'React', 'Angular', 'Docker', 'Kubernetes', 'Git', 'CI/CD'
]
job_categories = {
    'Data Scientist': 'Data Science',
    'Machine Learning Engineer': 'Data Science',
    'Data Analyst': 'Data Analysis',
    'Software Engineer': 'Software Development',
    'DevOps Engineer': 'DevOps',
    'Data Engineer': 'Data Engineering'
}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
def generate_data(num_resumes=200, num_jobs=50):
    np.random.seed(42)

    # Possible values
    degrees = ['BS Computer Science', 'MS Data Science', 'MBA', 'BS Engineering',
               'PhD Machine Learning', 'BS Mathematics', 'MS Statistics']
    job_titles = list(job_categories.keys())

    # Generate resumes
    resumes = []
    for i in range(num_resumes):
        num_skills = np.random.randint(3, 8)
        skills = list(np.random.choice(skills_pool, num_skills, replace=False))
        education = list(np.random.choice(degrees, np.random.randint(1, 3), replace=False))
        experience = np.random.randint(0, 15)
        resume_text = f"Skills: {', '.join(skills)}. Education: {', '.join(education)}. Experience: {experience} years."
        resumes.append({
            'resume_id': i,
            'text': resume_text,
            'skills': skills,
            'education': education,
            'experience': experience
        })

    # Generate jobs
    jobs = []
    for i in range(num_jobs):
        req_skills = list(np.random.choice(skills_pool, np.random.randint(3, 6), replace=False))
        req_education = list(np.random.choice(degrees, np.random.randint(1, 2), replace=False))
        req_experience = np.random.randint(1, 10)
        job_text = f"Required Skills: {', '.join(req_skills)}. Required Education: {', '.join(req_education)}. Minimum Experience: {req_experience} years."
        jobs.append({
            'job_id': i,
            'title': np.random.choice(job_titles),
            'text': job_text,
            'req_skills': req_skills,
            'req_education': req_education,
            'req_experience': req_experience
        })

    # Create ground truth
    ground_truth = {}
    for job in jobs:
        qualified_resumes = []
        for resume in resumes:
            skill_match = len(set(resume['skills']) & set(job['req_skills'])) / len(job['req_skills'])
            edu_match = any(edu in job['req_education'] for edu in resume['education'])
            exp_match = resume['experience'] >= job['req_experience']
            if skill_match >= 0.6 and edu_match and exp_match:
                qualified_resumes.append(resume['resume_id'])
        ground_truth[job['job_id']] = qualified_resumes

    return pd.DataFrame(resumes), pd.DataFrame(jobs), ground_truth

# Generate data
resumes_df, jobs_df, ground_truth = generate_data()
print(jobs_df.head())
print(resumes_df.head())
print(ground_truth)

   job_id                      title  \
0       0               Data Analyst   
1       1          Software Engineer   
2       2  Machine Learning Engineer   
3       3  Machine Learning Engineer   
4       4               Data Analyst   

                                                text  \
0  Required Skills: Kubernetes, Docker, Spark. Re...   
1  Required Skills: Hadoop, PyTorch, Data Analysi...   
2  Required Skills: React, Python, Spark. Require...   
3  Required Skills: CI/CD, TensorFlow, Docker. Re...   
4  Required Skills: Angular, Java, Data Analysis,...   

                            req_skills           req_education  req_experience  
0          [Kubernetes, Docker, Spark]        [BS Mathematics]               5  
1     [Hadoop, PyTorch, Data Analysis]         [MS Statistics]               7  
2               [React, Python, Spark]                   [MBA]               6  
3          [CI/CD, TensorFlow, Docker]  [PhD Machine Learning]               8  
4  [Angular, Java

In [6]:
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    # Lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove stopwords
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

# Preprocess data
resumes_df['processed_text'] = resumes_df['text'].apply(preprocess_text)
jobs_df['processed_text'] = jobs_df['text'].apply(preprocess_text)

# Create combined corpus
corpus = list(resumes_df['processed_text']) + list(jobs_df['processed_text'])


In [8]:
vectorizer = TfidfVectorizer(max_features=500)
tfidf_matrix = vectorizer.fit_transform(corpus)

# Split back into resumes and jobs
resume_vectors = tfidf_matrix[:len(resumes_df)]
job_vectors = tfidf_matrix[len(resumes_df):]

In [7]:
def recommend_jobs(resume_text, top_n=5):
    # Preprocess input
    processed_text = preprocess_text(resume_text)
    # Vectorize
    input_vector = vectorizer.transform([processed_text])
    # Calculate similarities
    similarities = cosine_similarity(input_vector, job_vectors)
    # Get top jobs
    top_job_indices = similarities.argsort()[0][-top_n:][::-1]

    recommendations = []
    for idx in top_job_indices:
        job = jobs_df.iloc[idx]
        recommendations.append({
            'job_id': job['job_id'],
            'title': job['title'],
            'description': job['text'],
            'similarity_score': round(similarities[0, idx], 4)
        })

    return recommendations


In [9]:
jobs_df['category'] = jobs_df['title'].map(job_categories)

# Create features for classification
X = []
for _, row in resumes_df.iterrows():
    features = [row['experience']]
    features += [1 if skill in row['skills'] else 0 for skill in skills_pool]
    X.append(features)

# Create target labels
y = []
for _, resume in resumes_df.iterrows():
    skill_counts = {category: 0 for category in set(job_categories.values())}
    for skill in resume['skills']:
        for job_title, category in job_categories.items():
            if skill in jobs_df[jobs_df['title'] == job_title]['req_skills'].values[0]:
                skill_counts[category] += 1
    y.append(max(skill_counts, key=skill_counts.get))

# Train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Show training data
print("\n🧪 Training Data (X_train):")
for features, label in zip(X_train, y_train):
    print(f"Features: {features} → Label: {label}")

# Show testing data
print("\n🔬 Testing Data (X_test):")
for features, label in zip(X_test, y_test):
    print(f"Features: {features} → Label: {label}")

# Evaluate
y_pred = clf.predict(X_test)
print("Classification Model Performance:")
print(classification_report(y_test, y_pred))



🧪 Training Data (X_train):
Features: [12, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1] → Label: Data Science
Features: [5, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0] → Label: Data Engineering
Features: [9, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0] → Label: DevOps
Features: [3, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0] → Label: Data Science
Features: [6, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] → Label: DevOps
Features: [0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1] → Label: DevOps
Features: [12, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0] → Label: Data Science
Features: [6, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0] → Label: Data Science
Features: [1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0] → Label: DevOps
Features: [12, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0] → Label: DevOps
Features: [1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0] → Label: Data Science
Features: [8, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0] → Label: 

In [10]:
def calculate_matching_score(resume_data, job_id):
    job = jobs_df[jobs_df['job_id'] == job_id].iloc[0]

    # Extract features from resume
    skills = re.search(r'Skills: (.*?)\.', resume_data).group(1).split(', ')
    education = re.search(r'Education: (.*?)\.', resume_data).group(1).split(', ')
    experience = int(re.search(r'Experience: (\d+)', resume_data).group(1))

    # Skill similarity
    job_skills = job['req_skills']
    skill_match = len(set(skills) & set(job_skills)) / len(job_skills)

    # Education match
    edu_match = 1 if any(edu in job['req_education'] for edu in education) else 0

    # Experience match
    exp_match = min(1, experience / max(1, job['req_experience']))

    # Text similarity
    processed_resume = preprocess_text(resume_data)
    resume_vec = vectorizer.transform([processed_resume])
    job_vec = job_vectors[job_id]
    text_sim = cosine_similarity(resume_vec, job_vec)[0][0]

    # Combined score
    weights = [0.4, 0.2, 0.2, 0.2]  # skill, education, experience, text
    score = (skill_match * weights[0] +
             edu_match * weights[1] +
             exp_match * weights[2] +
             text_sim * weights[3])

    return round(score, 4)

In [None]:
def get_user_input():
    print("Enter your profile details:")
    skills = input("Skills (comma separated): ")
    education = input("Education (comma separated degrees): ")
    experience = input("Years of experience: ")

    profile_text = f"Skills: {skills}. Education: {education}. Experience: {experience} years."
    return profile_text

# Main recommendation function
def get_recommendations():
    profile = get_user_input()

    # Get recommendations
    recommendations = recommend_jobs(profile, top_n=5)

    print("\nTop 5 Job Recommendations:")
    for i, job in enumerate(recommendations, 1):
        print(f"\n{i}. {job['title']} (ID: {job['job_id']})")
        print(f"   Similarity Score: {job['similarity_score']:.4f}")
        print(f"   Description: {job['description']}")

        # Calculate matching score
        match_score = calculate_matching_score(profile, job['job_id'])
        print(f"   Matching Score: {match_score:.4f}")

    # Predict category
    experience_val = int(re.search(r'Experience: (\d+)', profile).group(1))
    skills_list = [s.strip() for s in re.search(r'Skills: (.*?)\.', profile).group(1).split(',')]
    features = [experience_val] + [1 if skill in skills_list else 0 for skill in skills_pool]
    category = clf.predict([features])[0]
    print(f"\nPredicted Job Category: {category}")

# Run the recommendation system
get_recommendations()

Enter your profile details:
Skills (comma separated): nodejs, git, sql, aws, python, machine learning
Education (comma separated degrees): MBA, BS Computer Science
Years of experience: 5

Top 5 Job Recommendations:

1. Data Analyst (ID: 46)
   Similarity Score: 0.6195
   Description: Required Skills: Hadoop, AWS, Machine Learning, Git, Python. Required Education: BS Computer Science. Minimum Experience: 1 years.
   Matching Score: 0.5239

2. Data Scientist (ID: 28)
   Similarity Score: 0.5397
   Description: Required Skills: CI/CD, AWS, Git, Python. Required Education: BS Computer Science. Minimum Experience: 6 years.
   Matching Score: 0.4746

3. Software Engineer (ID: 15)
   Similarity Score: 0.4779
   Description: Required Skills: Angular, Machine Learning, SQL. Required Education: BS Computer Science. Minimum Experience: 3 years.
   Matching Score: 0.4956

4. Data Scientist (ID: 35)
   Similarity Score: 0.3959
   Description: Required Skills: Machine Learning, AWS, TensorFlow, Dock