In [1]:
#Load the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the jobs dataset from a CSV in the current working directory.
# The path is relative, so the notebook must be started from the folder
# that contains Jobs_Dataset.csv.
job = pd.read_csv("Jobs_Dataset.csv")
job

FileNotFoundError: [Errno 2] No such file or directory: 'Jobs_Dataset.csv'

In [None]:
# Print a per-column summary of the jobs frame
job.info()

In [None]:
# Preview the first rows of the jobs frame
job.head()

In [None]:
# Data structure: the Python type of the loaded object and its (rows, columns) shape
print(type(job))
print(job.shape)

In [None]:
# Number of rows in the jobs frame
print(len(job))

In [None]:
# Data type of every column in the jobs frame
job.dtypes

In [None]:
# Count of missing values per column
job.isnull().sum()

In [None]:
# Visualise where values are missing: one heatmap cell per (row, column),
# drawn with a two-colour palette.
colours = ['blue', 'pink']
cols = job.columns
missing_mask = job[cols].isnull()
sns.heatmap(missing_mask, cmap=sns.color_palette(colours), cbar=True)

In [None]:
# Narrow the jobs frame to the text and party columns inspected in the next cells
df = job[['jobTitle','description','skills','client','recruiter']]

In [None]:
# Preview the first three rows of the subset
df.head(3)

In [None]:
# Inspect the job title column
df['jobTitle']

In [None]:
# Inspect the job description column
df['description']

In [None]:
# Rows of the subset that repeat an earlier row across all selected columns
df[df.duplicated()]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the jobs dataset again so this cell can run on its own
jobs_df = pd.read_csv("Jobs_Dataset.csv")

# Rows without a minBudget cannot be plotted, so drop only those rows
jobs_df = jobs_df.dropna(subset=["minBudget"])

# Two vertically stacked panels; the histogram gets three times the height of the boxplot
fig, axes = plt.subplots(2, 1, figsize=(8, 6), gridspec_kw={'height_ratios': [1, 3]})

# Boxplot (top panel): pass the values as `x` so the box is drawn horizontally
sns.boxplot(x=jobs_df["minBudget"], ax=axes[0], color='skyblue', width=0.5, orient='h')
axes[0].set_title("Boxplot of minBudget Distribution")
axes[0].set_xlabel("minBudget")

# Histogram with KDE (bottom panel)
sns.histplot(jobs_df["minBudget"], kde=True, ax=axes[1], color='purple', bins=30)
axes[1].set_title("Histogram with KDE for minBudget Distribution")
axes[1].set_xlabel("minBudget")
# The bar heights are numbers of observations per bin, so label the axis accordingly
axes[1].set_ylabel("Count")

# Adjust layout so titles and labels of the two panels do not overlap
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# One-hot encode jobTitle and skills: each distinct value becomes its own indicator column
data1 = pd.get_dummies(job,columns=['jobTitle','skills'])
data1

In [None]:
# Read the education dataset from a CSV in the current working directory
edu = pd.read_csv("education.csv")
edu

In [None]:
# Count of missing values per column in the education frame
edu.isnull().sum()

In [None]:
# How many records fall on each graduation pass-out year
edu["graduationPassoutYear"].value_counts()

In [None]:
# Print a per-column summary of the education frame
edu.info()

#### Observations:
- The education dataset has a column applicantId.
- The employment dataset lists jobs but doesn't have direct user interaction data.
- Therefore we will build a content-based job recommendation system, where each user's education profile is matched with the most relevant job descriptions/requirements.

### Preprocess the Job profiles dataset

In [None]:
import warnings
# Silences every warning for the rest of the kernel session
warnings.filterwarnings("ignore")

# Build the cleaned jobs frame as one non-mutating chain. The previous
# `job_df['minBudget'].fillna(0, inplace=True)` form is chained in-place
# assignment, which pandas deprecates and which does not update the frame
# under Copy-on-Write.
job_df = (
    # Keep only relevant columns
    job[['jobId', 'jobTitle', 'skills', 'location', 'minExp', 'maxExp',
         'minBudget', 'maxBudget', 'description', 'jobType']]
    # Drop rows with null values in critical columns
    .dropna(subset=['jobId', 'skills', 'description', 'minExp', 'maxExp'])
    # Treat a missing budget as 0 instead of dropping the job
    .fillna({'minBudget': 0, 'maxBudget': 0})
    .assign(
        # Average experience required: midpoint of the min/max range
        avgExp=lambda d: (d['minExp'] + d['maxExp']) / 2,
        # Combine skills and description into a single text column for NLP
        jobText=lambda d: d['skills'] + ' ' + d['description'],
    )
    # Reset index after cleaning so row labels run 0..n-1
    .reset_index(drop=True)
)
job_df

### Preprocess the Education data

In [None]:
# Year against which experience is estimated; update when the analysis is re-run later
REFERENCE_YEAR = 2025

# Build the cleaned education frame as one non-mutating chain
edu_df = (
    # Keep essential columns
    edu[['applicantId', 'graduationPercentage', 'graduationPassoutYear']]
    # Drop rows with missing applicantId or graduation year
    .dropna(subset=['applicantId', 'graduationPassoutYear'])
    .assign(
        # Estimated experience = reference year - graduation year, floored at 0
        # so a pass-out year in the future does not give negative experience.
        # NOTE(review): a pass-out year of 0 yields an estimate equal to
        # REFERENCE_YEAR - confirm whether such rows should be excluded.
        estExp=lambda d: (REFERENCE_YEAR - d['graduationPassoutYear']).clip(lower=0),
        # Normalize education percentage (optional but useful for scoring)
        edu_score=lambda d: d['graduationPercentage'] / 100.0,
    )
    # Reset index after cleaning
    .reset_index(drop=True)
)
edu_df

In [None]:
# Count rows where the percentage / pass-out year is recorded as exactly 0
zero_percentage_rows = (edu_df["graduationPercentage"] == 0).sum()
zero_year_rows = (edu_df["graduationPassoutYear"] == 0).sum()
print(zero_percentage_rows)
print(zero_year_rows)

### NLP: Vectorizing Job Content with TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer: English stop words removed, vocabulary capped at 500 terms
tfidf = TfidfVectorizer(stop_words='english', max_features=500)

# Fit and transform the jobText column; the result has one row per row of job_df
job_tfidf_matrix = tfidf.fit_transform(job_df['jobText'])

# Save the vocabulary if needed later for inverse transform
tfidf_feature_names = tfidf.get_feature_names_out()
tfidf_feature_names

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_jobs(applicant_id, top_n=5):
    """Recommend up to ``top_n`` jobs for an applicant based on estimated experience.

    Uses the notebook-level ``edu_df``, ``job_df`` and ``job_tfidf_matrix``.
    Row ``i`` of ``job_tfidf_matrix`` must describe the ``i``-th row of ``job_df``.

    Parameters
    ----------
    applicant_id : value compared against ``edu_df['applicantId']``
    top_n : int, default 5
        Maximum number of jobs to return.

    Returns
    -------
    pandas.DataFrame or str
        The best-scoring eligible jobs with their ``similarity`` score, or a
        message string when the applicant is unknown or no job is eligible.

    Notes
    -----
    The candidate profile is a placeholder: it is the mean TF-IDF vector of the
    eligible jobs, so applicants with the same estimated experience receive the
    same ranking.
    """
    matches = edu_df[edu_df['applicantId'] == applicant_id]
    if matches.empty:
        return f"No candidate found with applicantId {applicant_id}"

    # Use the first record when an applicant appears more than once
    cand_exp = matches.iloc[0]['estExp']

    # A job is eligible when its average required experience does not exceed the estimate
    eligible_mask = (job_df['avgExp'] <= cand_exp).to_numpy()
    if not eligible_mask.any():
        return f"No jobs found matching experience level of {cand_exp} years"

    eligible_jobs = job_df[eligible_mask].copy()

    # Select TF-IDF rows by position, not by index label, so the lookup stays
    # correct even if job_df's index is not 0..n-1
    eligible_positions = np.flatnonzero(eligible_mask)
    eligible_tfidf = job_tfidf_matrix[eligible_positions]

    # Candidate vector (mean of job vectors as a placeholder profile);
    # np.asarray turns the matrix result into a plain 2-D array
    candidate_vector = np.asarray(eligible_tfidf.mean(axis=0))

    similarities = cosine_similarity(candidate_vector, eligible_tfidf).flatten()
    eligible_jobs['similarity'] = similarities

    top_jobs = eligible_jobs.sort_values(by='similarity', ascending=False).head(top_n)

    return top_jobs[['jobId', 'jobTitle', 'location', 'minExp', 'maxExp', 'skills', 'similarity']]


In [None]:
# Recommend jobs for a sample applicant; a message string is returned instead
# of a table when the id is not present in edu_df or no job is eligible
recommend_jobs('AISA9946', top_n=5)