In [1]:
import pandas as pd

# Load the job postings dataset; the path is relative to the current working directory.
df = pd.read_csv('job_descriptions.csv')

In [2]:
# Drop columns that are not used in this notebook: 'Contact' and 'Job Id'
df = df.drop(columns=['Contact', 'Job Id'])

In [3]:
# Display the DataFrame's column labels.
df.columns

Index(['Experience', 'Qualifications', 'Salary Range', 'location', 'Country',
       'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Job Title', 'Role',
       'Job Portal', 'Job Description', 'Benefits', 'skills',
       'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')

In [17]:
# Count missing values per column.
# NOTE(review): the saved output for this cell lists columns such as 'Job Id', 'Contact'
# and 'Posting Year' that do not all exist at this point when the notebook is run top to
# bottom; it appears to come from an out-of-order run. Re-run after Restart & Run All.
df.isnull().sum()

Job Id                 0
Experience             0
Qualifications         0
Salary Range           0
location               0
Country                0
latitude               0
longitude              0
Work Type              0
Company Size           0
Job Posting Date       0
Preference             0
Contact Person         0
Contact                0
Job Title              0
Role                   0
Job Portal             0
Job Description        0
Benefits               0
skills                 0
Responsibilities       0
Company                0
Company Profile     5478
Posting Year           0
Posting Month          0
Min Salary             0
Max Salary             0
Avg Salary             0
dtype: int64

In [3]:
# Parse the posting date once, then derive the calendar features from the parsed values.
posting_dates = pd.to_datetime(df['Job Posting Date'])
df['Job Posting Date'] = posting_dates
df['Posting Year'] = posting_dates.dt.year
df['Posting Month'] = posting_dates.dt.month

In [3]:
# One row per posting year with the number of postings in that year
job_trends = (
    df.groupby('Posting Year')
    .size()
    .reset_index(name='Job Postings')
)

# Model inputs: the year is the only feature, the yearly count is the target
X = job_trends.loc[:, ['Posting Year']]
y = job_trends.loc[:, 'Job Postings']

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
# NOTE(review): job_trends has one row per posting year, so this split is tiny. The saved
# output of `y_test.shape` is (1,), i.e. the evaluation rests on a single data point.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
# Check how many rows ended up in the hold-out target.
print(y_test.shape)

(1,)


In [5]:
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# Fit a gradient-boosted tree regressor on the training split (seeded for repeatability)
model = XGBRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Score the held-out split with mean squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

ModuleNotFoundError: No module named 'xgboost'

In [7]:
import matplotlib.pyplot as plt

# Predict job postings for every year in job_trends.
# Separate names are used here so the hold-out split (X_test / y_pred) created by the
# evaluation cells is not overwritten, which keeps those cells re-runnable.
# NOTE(review): these predictions include years the model was trained on, so the chart
# is not an out-of-sample comparison.
X_all_years = job_trends[['Posting Year']]

# Store the predictions next to the actual counts
job_trends['Predicted Job Postings'] = model.predict(X_all_years)

# Keep only 2021 through 2023 (both ends inclusive)
filtered_data = job_trends[job_trends['Posting Year'].between(2021, 2023)]

# Plot actual and predicted counts on one set of axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(filtered_data['Posting Year'], filtered_data['Job Postings'],
        label='Actual Job Postings', marker='o')
ax.plot(filtered_data['Posting Year'], filtered_data['Predicted Job Postings'],
        label='Predicted Job Postings', linestyle='--', marker='x')

# Labels, title and whole-year ticks (avoids fractional years on the x axis)
ax.set_xlabel('Posting Year')
ax.set_ylabel('Job Postings')
ax.set_title('Actual vs Predicted Job Market Trends (2021-2023)')
ax.set_xticks(filtered_data['Posting Year'])
ax.legend()
ax.grid(True)

# Render the figure
plt.show()


NameError: name 'model' is not defined

In [4]:
def clean_salary_range(salary_range):
    """Convert a salary range string such as '$59K-$99K' into numeric bounds.

    Dollar signs and the thousands marker ('K' or 'k') are removed, the text is
    split on '-', and each side is multiplied by 1000. Whitespace around either
    bound is tolerated.

    Args:
        salary_range: Range text, e.g. '$59K-$99K'.

    Returns:
        A ``(salary_min, salary_max)`` tuple of floats.

    Raises:
        ValueError: If the text does not split into exactly two bounds on '-',
            or if a bound is not numeric.
    """
    # Strip the currency symbol and the thousands marker in either case
    cleaned = salary_range.replace('$', '').replace('K', '').replace('k', '')

    # Split the range, e.g. '59-99' -> ['59', '99']
    bounds = cleaned.split('-')
    if len(bounds) != 2:
        raise ValueError(
            f"Expected a salary range like '$59K-$99K', got {salary_range!r}"
        )

    # float() ignores surrounding whitespace; values are given in thousands
    salary_min, salary_max = (float(bound) * 1000 for bound in bounds)

    return salary_min, salary_max
# Normalize the 'skills', 'Qualifications' and 'Job Description' text columns:
# missing values become empty strings and all text is lower-cased.
df['skills'] = df['skills'].fillna('').str.lower()
df['Qualifications'] = df['Qualifications'].fillna('').str.lower()
df['Job Description'] = df['Job Description'].fillna('').str.lower()

# Parse 'Salary Range' into numeric 'Min Salary' / 'Max Salary' columns.
# The (min, max) tuples are collected first and turned into one DataFrame, which avoids
# constructing a pd.Series per row (very slow on large frames).
salary_bounds = df['Salary Range'].map(clean_salary_range)
df[['Min Salary', 'Max Salary']] = pd.DataFrame(
    salary_bounds.tolist(), index=df.index, columns=['Min Salary', 'Max Salary']
)


In [5]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit a TF-IDF vectorizer (English stop words removed) on the 'skills' column.
# `tfidf` and `tfidf_matrix` are read as globals by recommend_jobs and
# recommend_jobs_combined, so they must stay in sync with the rows of df.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['skills'])

# Function to recommend jobs based on skills
def recommend_jobs(job_seeker_skills, top_n=5):
    """Return the postings whose 'skills' text is most similar to the seeker's skills.

    Uses the module-level ``tfidf`` vectorizer, ``tfidf_matrix`` and ``df``.

    Args:
        job_seeker_skills: Free-text skills; lower-cased before vectorizing.
        top_n: Maximum number of postings to return (default 5). ``0`` returns
            an empty frame.

    Returns:
        DataFrame with 'Job Title', 'Company', 'skills' and 'Salary Range',
        ordered from most to least similar.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Convert job seeker's skills to a vector in the fitted TF-IDF space
    seeker_skills_vector = tfidf.transform([job_seeker_skills.lower()])

    # Cosine similarity between the seeker and every posting
    cosine_sim = cosine_similarity(seeker_skills_vector, tfidf_matrix).flatten()

    # Reverse the ascending argsort first, then slice: this matches the usual
    # "[-top_n:][::-1]" for top_n >= 1 but yields nothing (not everything) for 0.
    top_jobs_indices = cosine_sim.argsort()[::-1][:top_n]

    # Return the recommended jobs
    return df.iloc[top_jobs_indices][['Job Title', 'Company', 'skills', 'Salary Range']]

In [6]:
# Example usage: recommend jobs based on a job seeker's skills.
# The default top_n=5 applies, so five postings are printed.
job_seeker_skills = "python, machine learning, data analysis"
recommended_jobs = recommend_jobs(job_seeker_skills)
print(recommended_jobs)

              Job Title                   Company  \
630408   Data Scientist      Bristol-Myers Squibb   
137643   Data Scientist                      Olin   
655749   Data Scientist  TravelCenters of America   
1069428  Data Scientist     L3Harris Technologies   
857581   Data Scientist              W.R. Berkley   

                                                    skills Salary Range  
630408   machine learning algorithms python programming...   $58K-$125K  
137643   machine learning algorithms python programming...   $58K-$112K  
655749   machine learning algorithms python programming...   $60K-$114K  
1069428  machine learning algorithms python programming...    $58K-$85K  
857581   machine learning algorithms python programming...   $64K-$120K  


In [9]:
# Salary window the job seeker is targeting, in dollars (e.g., $60,000 - $100,000)
desired_min_salary = 60000
desired_max_salary = 100000

# Keep postings whose whole advertised range lies inside that window
pays_at_least_min = df['Min Salary'].ge(desired_min_salary)
pays_at_most_max = df['Max Salary'].le(desired_max_salary)
salary_matched_jobs = df.loc[pays_at_least_min & pays_at_most_max]

In [7]:
# Combine criteria: skill similarity and salary range
def recommend_jobs_combined(job_seeker_skills, desired_min_salary, desired_max_salary, top_n=5):
    """Recommend postings inside a salary window, ranked by skill similarity.

    Uses the module-level ``tfidf`` vectorizer, ``tfidf_matrix`` and ``df``.
    ``df`` is only read: the similarity scores are attached to the filtered
    result, not written back into the shared DataFrame.

    Args:
        job_seeker_skills: Free-text skills; lower-cased before vectorizing.
        desired_min_salary: Lowest acceptable 'Min Salary' (inclusive).
        desired_max_salary: Highest acceptable 'Max Salary' (inclusive).
        top_n: Maximum number of postings to return (default 5).

    Returns:
        DataFrame with 'Job Title', 'Company', 'skills', 'Salary Range' and
        'Skill Similarity', sorted by similarity in descending order.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Step 1: similarity between the seeker's skills and every posting
    seeker_skills_vector = tfidf.transform([job_seeker_skills.lower()])
    cosine_sim = cosine_similarity(seeker_skills_vector, tfidf_matrix).flatten()

    # Step 2: keep postings whose whole salary range lies inside the desired window
    salary_mask = ((df['Min Salary'] >= desired_min_salary) &
                   (df['Max Salary'] <= desired_max_salary))

    # Attach scores to the filtered copy only; cosine_sim is positionally aligned with df
    filtered_jobs = df.loc[
        salary_mask, ['Job Title', 'Company', 'skills', 'Salary Range']
    ].assign(**{'Skill Similarity': cosine_sim[salary_mask.to_numpy()]})

    # Step 3: sort by skill similarity and keep the top N jobs
    return filtered_jobs.sort_values(by='Skill Similarity', ascending=False).head(top_n)

# Example: recommend jobs for these skills within a $60,000 - $100,000 salary window
recommended_jobs_combined = recommend_jobs_combined("python, machine learning, data analysis", 60000, 100000)
print(recommended_jobs_combined)


              Job Title               Company  \
1182591  Data Scientist                  AGCO   
88352    Data Scientist           Wells Fargo   
659611   Data Scientist                Sempra   
939425   Data Scientist  State Farm Insurance   
134294   Data Scientist          Ryder System   

                                                    skills Salary Range  \
1182591  machine learning algorithms python programming...    $63K-$95K   
88352    machine learning algorithms python programming...    $60K-$84K   
659611   machine learning algorithms python programming...    $61K-$86K   
939425   machine learning algorithms python programming...    $62K-$98K   
134294   machine learning algorithms python programming...    $64K-$91K   

         Skill Similarity  
1182591          0.609868  
88352            0.609868  
659611           0.609868  
939425           0.609868  
134294           0.609868  


In [8]:
# Example: a single-keyword query with a wide salary window ($10,000 - $120,000).
# The bare last expression shows the result with the notebook's rich display.
recommended_jobs_combined = recommend_jobs_combined("electrical", 10000, 120000)
recommended_jobs_combined

Unnamed: 0,Job Title,Company,skills,Salary Range,Skill Similarity
1160895,Electrical Designer,Constellation Energy,electrical engineering circuit design electron...,$55K-$112K,0.467321
348030,Electrical Designer,AutoZone,electrical engineering circuit design electron...,$58K-$88K,0.467321
335545,Electrical Designer,Cisco Systems,electrical engineering circuit design electron...,$60K-$97K,0.467321
762816,Electrical Designer,Global Partners,electrical engineering circuit design electron...,$58K-$91K,0.467321
1068505,Electrical Designer,Pacific Life,electrical engineering circuit design electron...,$65K-$117K,0.467321


In [10]:
import spacy
import re

# Load pre-trained NLP model (spaCy's en_core_web_sm); it must be installed in the environment
nlp = spacy.load('en_core_web_sm')

# Example resume text. The "Skills:", "Experience:" and "Education:" labels are what
# the regex extraction in this notebook looks for.
resume_text = """
John Doe
Python Developer with 5 years of experience in machine learning and web development.
Skills: Python, Django, TensorFlow, Docker, AWS
Experience: Python Developer at ABC Corp (2018-2023)
Education: BSc in Computer Science, XYZ University
Contact: johndoe@example.com, +1234567890
"""

# Preprocess the resume text
def clean_resume(text):
    """Strip contact details from resume text.

    Removes, in order: anything shaped like an email address (a run of
    non-whitespace containing '@') and phone numbers written as '+' followed
    by at least nine digits. Everything else is returned unchanged.
    """
    contact_patterns = (
        r'\S+@\S+',    # email addresses
        r'\+\d{9,}',   # phone numbers
    )
    for pattern in contact_patterns:
        text = re.sub(pattern, '', text)
    return text

# Parse the cleaned resume (emails and phone numbers removed) using spaCy
doc = nlp(clean_resume(resume_text))

# Print every named entity spaCy found as "LABEL: text"
for ent in doc.ents:
    print(f"{ent.label_}: {ent.text}")

# Extract skills, experience, and education with regexes on the raw resume text.
# Each result is a list of captured strings (the rest of the line after the label);
# the list is empty when the label is absent.
skills = re.findall(r"Skills: (.+)", resume_text)
experience = re.findall(r"Experience: (.+)", resume_text)
education = re.findall(r"Education: (.+)", resume_text)

print("Skills:", skills)
print("Experience:", experience)
print("Education:", education)


PERSON: John Doe
Python
DATE: 5 years
PERSON: Django
ORG: TensorFlow
PERSON: Docker
ORG: ABC Corp
DATE: 2018-2023
ORG: Computer Science
ORG: XYZ University
Skills: ['Python, Django, TensorFlow, Docker, AWS']
Experience: ['Python Developer at ABC Corp (2018-2023)']
Education: ['BSc in Computer Science, XYZ University']


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Job descriptions dataset (simplified example)
job_descriptions = df['Job Description'].fillna('').tolist()

# Resume skills extracted earlier: `skills` is the list returned by re.findall,
# so take the first match, or fall back to '' when nothing matched.
resume_skills = skills[0] if skills else ''

# Create TF-IDF vectors for the job descriptions with the resume appended as the last
# row, so both share one vocabulary. The matrix gets its own name: the skills-based
# recommenders read the global `tfidf_matrix`, which must keep matching `tfidf`.
vectorizer = TfidfVectorizer(stop_words='english')
resume_tfidf_matrix = vectorizer.fit_transform(job_descriptions + [resume_skills])

# Calculate cosine similarity between the resume (last row) and all job descriptions
cosine_sim = cosine_similarity(resume_tfidf_matrix[-1], resume_tfidf_matrix[:-1]).flatten()

# Get top 5 job matches based on similarity, most similar first
top_matches = cosine_sim.argsort()[-5:][::-1]
recommended_jobs = df.iloc[top_matches][['Job Title', 'Company', 'Job Description', 'skills']]

print("Recommended Jobs based on resume:")
print(recommended_jobs)


Recommended Jobs based on resume:
               Job Title                 Company  \
101519  Systems Engineer               JSW Steel   
157953  Systems Engineer    Publix Super Markets   
259665  Systems Engineer                BT Group   
800489  Systems Engineer          Brewin Dolphin   
560364  Systems Engineer  Casey's General Stores   

                                          Job Description  \
101519  As a Cloud Systems Engineer, you will be respo...   
157953  As a Cloud Systems Engineer, you will be respo...   
259665  As a Cloud Systems Engineer, you will be respo...   
800489  As a Cloud Systems Engineer, you will be respo...   
560364  As a Cloud Systems Engineer, you will be respo...   

                                                   skills  
101519  Cloud systems engineering Cloud infrastructure...  
157953  Cloud systems engineering Cloud infrastructure...  
259665  Cloud systems engineering Cloud infrastructure...  
800489  Cloud systems engineering Cloud infras

In [11]:
def parse_resume(resume_text):
    """Extract the labelled sections of a resume with regular expressions.

    Looks for the first "Skills: ", "Experience: " and "Education: " labels and
    captures the rest of that line. No NLP model is needed for this step.

    Args:
        resume_text: Raw resume text.

    Returns:
        Dict with keys 'skills', 'experience' and 'education'. Each value is
        the captured text with surrounding whitespace removed (including a
        trailing carriage return), or '' when the label is absent.
    """
    section_patterns = (
        ('skills', r"Skills: (.+)"),
        ('experience', r"Experience: (.+)"),
        ('education', r"Education: (.+)"),
    )

    parsed = {}
    for field, pattern in section_patterns:
        match = re.search(pattern, resume_text)
        # '.' also matches '\r', so strip to drop Windows line-ending residue
        parsed[field] = match.group(1).strip() if match else ''
    return parsed

def recommend_jobs(resume_text, job_descriptions_df, top_n=5):
    """Recommend postings whose job description best matches the resume's skills.

    NOTE(review): this definition replaces the earlier
    ``recommend_jobs(job_seeker_skills, top_n=5)`` in this notebook; after this
    cell runs, calls written for the earlier signature will fail. Consider
    giving one of the two a distinct name.

    Args:
        resume_text: Raw resume text; its "Skills: " line is used as the query.
        job_descriptions_df: DataFrame with 'Job Description', 'Job Title',
            'Company' and 'skills' columns.
        top_n: Maximum number of postings to return (default 5).

    Returns:
        DataFrame with 'Job Title', 'Company' and 'skills', most similar first.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Parse the resume
    resume_data = parse_resume(resume_text)

    # Job descriptions from the dataset
    job_descriptions = job_descriptions_df['Job Description'].fillna('').tolist()

    # Fit TF-IDF on the descriptions plus the resume skills (last row) so both
    # share one vocabulary. This refits on every call.
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(job_descriptions + [resume_data['skills']])

    # Similarity of the resume row against every description row
    cosine_sim = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

    # Reverse the ascending argsort, then slice, so top_n=0 yields an empty result
    top_matches = cosine_sim.argsort()[::-1][:top_n]
    recommended_jobs = job_descriptions_df.iloc[top_matches][['Job Title', 'Company', 'skills']]

    return recommended_jobs

# Example usage: the same sample resume, matched against every posting in df
resume_text = """
John Doe
Python Developer with 5 years of experience in machine learning and web development.
Skills: Python, Django, TensorFlow, Docker, AWS
Experience: Python Developer at ABC Corp (2018-2023)
Education: BSc in Computer Science, XYZ University
Contact: johndoe@example.com, +1234567890
"""

# Only the "Skills: " line of the resume is used as the query text
recommended_jobs = recommend_jobs(resume_text, df)
print(recommended_jobs)


               Job Title                 Company  \
101519  Systems Engineer               JSW Steel   
157953  Systems Engineer    Publix Super Markets   
259665  Systems Engineer                BT Group   
800489  Systems Engineer          Brewin Dolphin   
560364  Systems Engineer  Casey's General Stores   

                                                   skills  
101519  cloud systems engineering cloud infrastructure...  
157953  cloud systems engineering cloud infrastructure...  
259665  cloud systems engineering cloud infrastructure...  
800489  cloud systems engineering cloud infrastructure...  
560364  cloud systems engineering cloud infrastructure...  


In [16]:
from sentence_transformers import SentenceTransformer

# Parse the resume and keep the extracted fields in resume_data
resume_data = parse_resume(resume_text)

# Load a pre-trained BERT-based sentence-embedding model
# NOTE(review): this rebinds `model`, the name an earlier cell uses for the XGBRegressor;
# re-running the prediction/plot cell after this one will fail. Consider a distinct name.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encode the resume skills
resume_embedding = model.encode(resume_data['skills'])

# Encode each distinct job description only once and map the scores back to the rows.
# The saved outputs show many postings sharing the same description text, so this
# should shrink the encoding workload considerably (verify on the full dataset).
description_codes, unique_descriptions = pd.factorize(pd.Series(job_descriptions))
unique_embeddings = model.encode(unique_descriptions.tolist(), show_progress_bar=True)

# Compute cosine similarities per distinct description, then expand to one score per row
unique_sim = cosine_similarity([resume_embedding], unique_embeddings).flatten()
cosine_sim = unique_sim[description_codes]

# Get top job matches, most similar first
top_matches = cosine_sim.argsort()[-5:][::-1]
recommended_jobs = df.iloc[top_matches][['Job Title', 'Company', 'skills']]

print(recommended_jobs)


KeyboardInterrupt: 