In [2]:
import pandas as pd
import numpy as np
import re
import gradio as gr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
import pdfplumber
import nltk
from nltk.corpus import stopwords

In [3]:
# Download NLTK resources
# 'punkt' serves older NLTK releases; 'punkt_tab' is the tokenizer data that
# nltk.word_tokenize looks up on NLTK >= 3.8.2. Requesting both keeps the
# notebook runnable on either version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Jainivas
[nltk_data]     Anandhan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jainivas
[nltk_data]     Anandhan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Load IT Job Roles dataset
# Forward slashes are accepted on Windows, Linux and macOS alike; the previous
# "data\\..." literal only resolved on Windows.
jobs_df = pd.read_csv("data/IT_Job_Roles_Skills.csv", encoding="ISO-8859-1")
jobs_df.head()

Unnamed: 0,Job Title,Job Description,Skills,Certifications
0,Admin Big Data,Responsible for managing and overseeing big da...,"Hadoop, Spark, MapReduce, Data Lakes, Data War...","Cloudera Certified Professional (CCP), Hortonw..."
1,Ansible Operations Engineer,Focuses on automating IT processes using Ansib...,"Ansible, Linux, Automation, Cloud Platforms, C...",Red Hat Certified Specialist in Ansible Automa...
2,Artifactory Administrator,Manages the Artifactory repository for build a...,"Artifactory, CI/CD, Jenkins, Docker, Maven, Gr...","JFrog Artifactory Certification, DevOps Instit..."
3,Artificial Intelligence / Machine Learning Leader,"Leads AI/ML projects and teams, defining strat...","AI Strategy, Machine Learning, Team Management...","AI-900: Microsoft Azure AI Fundamentals, Certi..."
4,Artificial Intelligence / Machine Learning Sr....,Senior role overseeing multiple AI/ML initiati...,"AI Strategy, Machine Learning, Team Management...",Certified Artificial Intelligence Practitioner...


In [5]:
# Load Coursera Courses dataset
# Forward slashes are accepted on Windows, Linux and macOS alike; the previous
# "data\\..." literal only resolved on Windows.
courses_df = pd.read_csv("data/coursera_course_dataset_v2_no_null.csv")
courses_df.head()

Unnamed: 0.1,Unnamed: 0,Title,Organization,Skills,Ratings,Review counts,Metadata
0,0,Google Cybersecurity,Google,"Network Security, Python Programming, Linux, ...",4.8,4.8(20K reviews),Beginner · Professional Certificate · 3 - 6 Mo...
1,1,Google Data Analytics,Google,"Data Analysis, R Programming, SQL, Business C...",4.8,4.8(137K reviews),Beginner · Professional Certificate · 3 - 6 Mo...
2,2,Google Project Management:,Google,"Project Management, Strategy and Operations, ...",4.8,4.8(100K reviews),Beginner · Professional Certificate · 3 - 6 Mo...
3,3,IBM Data Science,IBM,"Python Programming, Data Science, Machine Lea...",4.6,4.6(120K reviews),Beginner · Professional Certificate · 3 - 6 Mo...
4,4,Google Digital Marketing & E-commerce,Google,"Digital Marketing, Marketing, Marketing Manag...",4.8,4.8(23K reviews),Beginner · Professional Certificate · 3 - 6 Mo...


In [6]:
# Preprocessing: keep only the columns used downstream.
# .copy() makes each result an independent frame, so later column assignments
# do not operate on a slice of the raw data (SettingWithCopyWarning).
courses_df = courses_df[['Title', 'Skills', 'Ratings', 'Review counts']].copy()
jobs_df = jobs_df[['Job Title', 'Skills']].copy()

# Handling missing values.
# Reassign instead of inplace=True, and reset the index so that row labels equal
# row positions: later cells address rows of these frames by position
# (iloc / rows and columns of the similarity matrix).
courses_df = courses_df.dropna().reset_index(drop=True)
jobs_df = jobs_df.dropna().reset_index(drop=True)

In [7]:
# Extract numeric ratings from "Ratings" column.
# Take the first number found in each entry (e.g. "4.8(20K reviews)" -> 4.8).
# str.extract yields NaN when an entry contains no number, whereas
# re.search(...).group() raised AttributeError on such entries; whole-number
# ratings such as "5" are accepted as well.
rating_text = courses_df['Ratings'].astype(str).str.extract(r"(\d+(?:\.\d+)?)", expand=False)

# Drop NaN ratings (entries that could not be parsed)
courses_df = (
    courses_df
    .assign(Ratings=pd.to_numeric(rating_text, errors='coerce'))
    .dropna(subset=['Ratings'])
)


In [8]:
# The collaborative-filtering loader expects (user, item, rating) columns, so a
# placeholder user column is synthesised: row i receives User_ID i (0 .. n-1).
courses_df = courses_df.assign(User_ID=np.arange(len(courses_df)))

# TF-IDF Model

In [10]:
# Skill matching with TF-IDF:
#   1. learn the vocabulary and IDF weights from the course skill strings,
#   2. project both courses and jobs into that same vector space,
#   3. score every (job, course) pair by cosine similarity
#      -> similarity_matrix[j, c] is the score of job row j against course row c.
tfidf = TfidfVectorizer().fit(courses_df['Skills'])
courses_tfidf = tfidf.transform(courses_df['Skills'])
jobs_tfidf = tfidf.transform(jobs_df['Skills'])
similarity_matrix = cosine_similarity(X=jobs_tfidf, Y=courses_tfidf)


# Collaborative Filtering

In [12]:
# Collaborative Filtering (CF) using SVD
# Ratings are declared to Surprise as lying on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
# Surprise expects the columns in (user, item, rating) order.
data = Dataset.load_from_df(courses_df[['User_ID', 'Title', 'Ratings']], reader)
# Train on every available rating (no hold-out split).
trainset = data.build_full_trainset()
# SVD initialises its factors randomly; a fixed seed makes the fitted model
# (and therefore the pickled artifact and the predictions) reproducible.
svd = SVD(random_state=42)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2213294dc10>

In [31]:
import pickle

# Persist the fitted SVD model and the job-to-course similarity matrix as
# pickle files in the current working directory.
for artifact_path, artifact in (
    ('svd.pkl', svd),
    ('similarity_matrix.pkl', similarity_matrix),
):
    with open(artifact_path, 'wb') as out_file:
        pickle.dump(artifact, out_file)

# Resume Skill Extraction

In [14]:
# Vocabulary of known skills: every comma-separated entry of jobs_df['Skills'],
# stripped of surrounding whitespace, with blanks and duplicates removed.
# Built with a set comprehension (linear) instead of sum(list_of_lists, [])
# (quadratic), and sorted so the list is identical from run to run.
SKILLS_DB = sorted({
    skill.strip()
    for skills in jobs_df['Skills'].dropna().astype(str).str.split(',')
    for skill in skills
    if skill.strip()
})

def extract_text_from_pdf(file_path):
    """Return the text of all pages of a PDF, each page followed by one space.

    Args:
        file_path: Value handed to ``pdfplumber.open`` to open the document.

    Returns:
        str: Concatenation of ``page_text + " "`` for every page whose
        ``extract_text()`` result is truthy; pages yielding ``None`` or ``""``
        are skipped. Empty string when no page has text.
    """
    chunks = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # Extract once and reuse: calling extract_text() in both the test
            # and the concatenation parsed every page twice.
            page_text = page.extract_text()
            if page_text:
                chunks.append(page_text)
                chunks.append(" ")
    return "".join(chunks)

def extract_skills_from_text(text):
    """Return the entries of ``SKILLS_DB`` that are mentioned in ``text``.

    Matching is case-insensitive, ignores differences in whitespace, and only
    counts a skill when it appears as a whole term (not embedded in a longer
    word), so "Java" is not reported for "javascript" and a one-letter skill
    such as "R" is not reported for every word containing that letter.

    The NLTK tokenisation that used to run here was removed: its result was
    never used, and it rebuilt the stop-word list once per token.

    Args:
        text: Free text to scan. ``None`` or an empty string yields ``[]``.

    Returns:
        list[str]: Matched skills with surrounding whitespace stripped,
        de-duplicated and sorted.
    """
    if not text:
        return []

    # Collapse runs of whitespace (including line breaks) so multi-word skills
    # still match when the source text wraps between their words.
    haystack = " ".join(text.lower().split())

    found = set()
    for raw_skill in SKILLS_DB:
        skill = raw_skill.strip()
        if not skill:
            # An empty entry would otherwise "match" every text.
            continue
        needle = " ".join(skill.lower().split())
        # Look-arounds instead of \b so skills ending in symbols ("C++", "C#")
        # are still delimited correctly.
        pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
        if re.search(pattern, haystack):
            found.add(skill)
    return sorted(found)


In [15]:
def recommend(skills_input, resume_file, job_role):
    """Run a skill-gap analysis for a job role and suggest up to five courses.

    Args:
        skills_input: Comma-separated skills typed by the user. Only used when
            no resume file is supplied; ``None`` is treated as empty.
        resume_file: Uploaded PDF resume, given either as a path string or as
            an object exposing the path through ``.name``; ``None`` if absent.
        job_role: Job title to analyse; matched against ``jobs_df['Job Title']``.

    Returns:
        tuple: ``(markdown_text, course_rows)``. ``course_rows`` is a list of
        ``[title, rating]`` lists. When the request cannot be processed,
        ``markdown_text`` holds the explanation and ``course_rows`` is ``None``.
    """
    if not job_role:
        return "Please select a job role.", None

    # Extract skills from either file or textbox (the file wins when both are given)
    typed_skills = (skills_input or "").strip()
    if resume_file is not None:
        # Accept a plain path as well as a file wrapper that carries it in .name
        pdf_path = getattr(resume_file, "name", resume_file)
        resume_skills = extract_skills_from_text(extract_text_from_pdf(pdf_path))
    elif typed_skills:
        resume_skills = [s.strip() for s in typed_skills.split(',') if s.strip()]
    else:
        return "Please upload a resume or enter your skills.", None

    if not resume_skills:
        return "No skills found in the input. Please check the content.", None

    # Locate the job row by POSITION. The index label is not a safe substitute:
    # after rows are dropped from jobs_df, labels and positions can differ, yet
    # both .iloc and similarity_matrix are addressed positionally.
    matches = np.flatnonzero((jobs_df['Job Title'] == job_role).to_numpy())
    if matches.size == 0:
        return "Selected job role was not found. Please choose one from the list.", None
    job_pos = int(matches[0])
    required_skills = [
        skill.strip()
        for skill in str(jobs_df['Skills'].iloc[job_pos]).split(',')
        if skill.strip()
    ]

    # Skill Gap Analysis (case-insensitive, so "python" satisfies "Python")
    known_skills = {skill.casefold() for skill in resume_skills}
    missing_skills = [skill for skill in required_skills if skill.casefold() not in known_skills]

    # Content-based recommendations: the five courses most similar to the job.
    # A stable sort on the negated scores keeps ties in their original order.
    scores = np.asarray(similarity_matrix[job_pos])
    top_positions = np.argsort(-scores, kind="stable")[:5]
    skill_based = courses_df.iloc[top_positions]

    # Collaborative filtering: the five courses with the highest predicted rating
    cf_recommend = (
        courses_df
        .assign(Predicted_Rating=courses_df['Title'].map(
            lambda title: svd.predict(uid="user", iid=title).est))
        .sort_values(by='Predicted_Rating', ascending=False)
        .head(5)
    )

    # Merge both recommendations.
    # NOTE(review): content-based rows come first and duplicates keep their first
    # occurrence, so CF rows only appear when the content-based list contributes
    # fewer than five distinct titles.
    hybrid = pd.concat([skill_based, cf_recommend]).drop_duplicates(subset="Title").head(5)

    # Output formatting
    result = "### ✅ Extracted Skills:\n" + ", ".join(resume_skills) + "\n\n"
    result += "### ❌ Missing Skills for '{}':\n".format(job_role) + (", ".join(missing_skills) if missing_skills else "None") + "\n\n"
    result += "### 📚 Recommended Courses:\n"

    course_list = hybrid[['Title', 'Ratings']].values.tolist()
    return result, course_list

# Gradio UI

In [17]:
# Dropdown options: a blank entry followed by the unique job titles in sorted
# order. The blank entry is the dropdown's initial value, which recommend()
# answers with its "Please select a job role." message.
job_list = [""] + sorted(jobs_df['Job Title'].unique().tolist())

# Components are laid out in the order they are created inside the context
# managers below, so the statement order here defines the page layout.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Resume Skill Gap Analyzer + Course Recommender")
    
    # Two alternative ways to provide skills, placed side by side in one row
    with gr.Row():
        resume_file = gr.File(label="📄 Upload Resume (PDF)", file_types=[".pdf"])
        skills_input = gr.Textbox(label="✍️ Or Enter Your Skills (comma-separated)", placeholder="e.g. Python, SQL, Machine Learning")

    job_dropdown = gr.Dropdown(choices=job_list, label="🎯 Select Desired Job Role", value="")

    recommend_btn = gr.Button("🚀 Recommend")

    # Targets for the two values returned by recommend(): markdown text and course rows
    output_text = gr.Markdown()
    output_courses = gr.Dataframe(headers=["Course Title", "Rating"], interactive=False)

    # The inputs list is passed positionally, so its order must match
    # recommend(skills_input, resume_file, job_role).
    recommend_btn.click(
        fn=recommend,
        inputs=[skills_input, resume_file, job_dropdown],
        outputs=[output_text, output_courses]
    )

# Start the app
demo.launch()

* Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.


