In [47]:
pip install --upgrade numpy scikit-learn pyarrow


Collecting numpy
  Downloading numpy-2.3.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting pyarrow
  Downloading pyarrow-21.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Downloading pyarrow-21.0.0-cp312-cp312-win_amd64.whl (26.2 MB)
   ---------------------------------------- 0.0/26.2 MB ? eta -:--:--
   - -------------------------------------- 1.0/26.2 MB 5.6 MB/s eta 0:00:05
   --- ------------------------------------ 2.1/26.2 MB 5.6 MB/s eta 0:00:05
   ---- ----------------------------------- 3.1/26.2 MB 5.8 MB/s eta 0:00:04
   ------ --------------------------------- 4.2/26.2 MB 5.2 MB/s eta 0:00:05
   ------- -------------------------------- 5.0/26.2 MB 5.0 MB/s eta 0:00:05
   -------- ------------------------------- 5.8/26.2 MB 4.9 MB/s eta 0:00:05
   ---------- ----------------------------- 6.8/26.2 MB 4.8 MB/s eta 0:00:05
   ------------ --------------------------- 7.9/26.2 MB 4.9 MB/s eta 0:00:04
   ------------- -------------------------- 8.7/26.2 MB 4.9 MB/s eta 0:0

In [17]:
# Import necessary Libraries

import PyPDF2
import re
import spacy
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

In [19]:
# Load spaCy English model

nlp = spacy.load("en_core_web_sm")

In [21]:
#2. Function to Extract Text from Resume PDF

def extract_text_from_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text.strip()

In [23]:

#3. Split Resume into Sections to Understand the Resume Templeate

def split_sections(text):
    text = text.lower()
    sections = {
        "skills": "",
        "education": "",
        "experience": ""
    }

    pattern = r"(skills|education|academic background|experience|work experience|internship|projects)"
    matches = list(re.finditer(pattern, text))

    for i in range(len(matches)):
        start = matches[i].end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        section_title = matches[i].group().strip()
        content = text[start:end].strip()

        if "skill" in section_title:
            sections["skills"] += content
        elif "education" in section_title or "academic" in section_title:
            sections["education"] += content
        elif "experience" in section_title or "internship" in section_title:
            sections["experience"] += content

    return sections

In [25]:
# 4. Extract Informations from Each Section

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

def extract_skills(text):
    doc = nlp(text)
    skills = set()
    for chunk in doc.noun_chunks:
        if len(chunk.text.split()) <= 4:
            skills.add(chunk.text.strip().lower())
    return list(skills)

def extract_education(text):
    # Use regex to match degree-like patterns
    patterns = r"(b\.tech|btech|m\.tech|mtech|b\.sc|m\.sc|mba|phd|bachelor|master|ba|ma)"
    matches = re.findall(patterns, text, re.IGNORECASE)
    return list(set([m.lower() for m in matches])) if matches else ["Not found"]

def extract_experience(text):
    match = re.search(r'(\d+)\s+years?', text)
    return match.group(0) if match else "Not mentioned"


In [27]:
# 7.Training of Job Role Classifier

def train_classifier(dataset_path):
    df = pd.read_csv(dataset_path)
    df['resume_text'] = df['resume_text'].apply(clean_text)

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(max_iter=1000))
    ])

    pipeline.fit(df['resume_text'], df['job_role'])
    joblib.dump(pipeline, "job_role_classifier.joblib")
    print("!!!Model trained and saved.!!!")


In [29]:
# 5. Predict N-Job Role According to Ranking Based on the Skills 

def predict_job_role(resume_path, model_path="job_role_classifier.joblib",top_n=5):
    model = joblib.load(model_path)
    raw_text = extract_text_from_pdf(resume_path)
    sections = split_sections(raw_text)

#Extract details from resume sections
    skills = extract_skills(sections['skills'])
    education = extract_education(sections['education'])
    experience = extract_experience(sections['experience'])

#Predict top N roles
    cleaned_text = clean_text(raw_text)
    probabilities = model.predict_proba([cleaned_text])[0]
    job_roles = model.classes_

    top_indices = probabilities.argsort()[-top_n:][::-1]
    top_roles = [(job_roles[i], round(probabilities[i] * 100, 2)) for i in top_indices]

#Final Output
    print("\n📄 Extracted Resume Details:")
    print("Skills     :", skills if skills else "Not found")
    print("Education  :", education)
    print("Experience :", experience)
    print("\n Top {top_n} Predicted Job Roles:")
    for role, score in top_roles:
        print(f"- {role} ({score}%)")


In [31]:
# 6. Main Entry

train_classifier(r"C:\Users\HP\Downloads\job_role_dataset.csv") 
predict_job_role(r"C:\Users\HP\Downloads\Jeswin-P-Vincent-Resume.pdf", top_n=5)

✅ Model trained and saved.

📄 Extracted Resume Details:
Skills     : ['postman api', 'keras', 'rest apis', 'r programming', 'ethic,\nresponsibility', 'tableau', 'that', 'professional growth', 'ineterests', 'a dynamic work environment', 'adaptability', 'innovation', 'ml', 'ms\nexcel', 'power bi', 'deep\nlearning', 'the companyʼs success', 'django rest framework', 'nlp\nsoft: team work', 'pandas', 'ai', 'tensor flow', 'a role', 'english malayalam hindi', 'nltk', 'creativity\nlanguages', 'neural networks', 'my sql', 'python', 'c programming', 'data analysis', 'django']
Education  : ['b.tech', 'ma']
Experience : Not mentioned

 Top {top_n} Predicted Job Roles:
- Backend Developer (17.59%)
- Data Analyst (15.05%)
- Accountant (14.0%)
- Data Engineer (14.0%)
- ML Engineer (13.79%)


In [None]:

# --------------------------------------

# --------------------------------------

# --------------------------------------

