In [252]:
import pandas as pd
import numpy as np

In [254]:
import re
import nltk
import joblib

In [256]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [258]:
from sklearn.model_selection import train_test_split

In [260]:
from sklearn.naive_bayes import MultinomialNB

In [262]:
from imblearn.over_sampling import SMOTE

In [264]:
from sklearn.ensemble import RandomForestClassifier

In [266]:
from sklearn.model_selection import GridSearchCV

In [268]:
from sklearn.pipeline import Pipeline

In [270]:
from sklearn.metrics import accuracy_score , classification_report

In [272]:
from nltk.corpus import stopwords

In [274]:
from nltk.tokenize import word_tokenize

In [276]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/keerthika/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/keerthika/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [280]:
def preprocess_text(text):
    " " " Clean and Preprocess text data." " "
    if not isinstance(text,str):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ',text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stopwords.words('english')]
    return ''.join(words)

In [25]:
#Load dataset

In [454]:
data = {
    "resume": [
       "Experienced software engineer with expertise in Python, Java, and cloud computing.",
        "Data scientist skilled in machine learning, deep learning, and data visualization.",
        "Marketing specialist with experience in social media marketing and SEO.",
        "HR professional with experience in recruitment, employee relations, and compliance.",
        "Python developer with experience in Django, Flask, and backend systems.",
        "Machine learning engineer with expertise in neural networks and deep learning.",
        "Digital marketing expert in content creation and social media advertising.",
        "Human resources manager with experience in talent acquisition and training.",
        "Full-stack developer experienced in JavaScript, React, Node.js, and MongoDB.",
        "Business analyst with skills in data analysis, Power BI, and financial modeling.",
        "Cybersecurity analyst skilled in threat detection, penetration testing, and network security.",
        "Graphic designer with expertise in Adobe Photoshop, Illustrator, and branding.",
        "Sales executive with a strong background in lead generation and CRM management.",
        "AI researcher with experience in reinforcement learning and generative AI models."
    ],
    "category": ["Software Engineer", "Data Scientist", "Marketing", "HR",
        "Software Engineer", "Data Scientist", "Marketing", "HR",
        "Software Engineer", "Business Analyst", "Cybersecurity", "Graphic Designer",
        "Sales", "Data Scientist"]
}

In [456]:
df=pd.DataFrame(data)

In [458]:
df

Unnamed: 0,resume,category
0,Experienced software engineer with expertise i...,Software Engineer
1,"Data scientist skilled in machine learning, de...",Data Scientist
2,Marketing specialist with experience in social...,Marketing
3,HR professional with experience in recruitment...,HR
4,"Python developer with experience in Django, Fl...",Software Engineer
5,Machine learning engineer with expertise in ne...,Data Scientist
6,Digital marketing expert in content creation a...,Marketing
7,Human resources manager with experience in tal...,HR
8,Full-stack developer experienced in JavaScript...,Software Engineer
9,"Business analyst with skills in data analysis,...",Business Analyst


In [460]:
df.dropna (inplace=True)

In [462]:
df= df[df['resume'].str.strip() != " "]

In [464]:
df['cleaned_resume']=df['resume'].apply(preprocess_text)

In [466]:
df['cleaned_resume']

0     experiencedsoftwareengineerexpertisepythonjava...
1     datascientistskilledmachinelearningdeeplearnin...
2     marketingspecialistexperiencesocialmediamarket...
3     hrprofessionalexperiencerecruitmentemployeerel...
4     pythondeveloperexperiencedjangoflaskbackendsys...
5     machinelearningengineerexpertiseneuralnetworks...
6     digitalmarketingexpertcontentcreationsocialmed...
7     humanresourcesmanagerexperiencetalentacquisiti...
8     fullstackdeveloperexperiencedjavascriptreactno...
9     businessanalystskillsdataanalysispowerbifinanc...
10    cybersecurityanalystskilledthreatdetectionpene...
11    graphicdesignerexpertiseadobephotoshopillustra...
12    salesexecutivestrongbackgroundleadgenerationcr...
13    airesearcherexperiencereinforcementlearninggen...
Name: cleaned_resume, dtype: object

In [476]:
df = pd.concat([df] * 3, ignore_index=True) 

In [478]:
df

Unnamed: 0,resume,category,cleaned_resume
0,Experienced software engineer with expertise i...,Software Engineer,experiencedsoftwareengineerexpertisepythonjava...
1,"Data scientist skilled in machine learning, de...",Data Scientist,datascientistskilledmachinelearningdeeplearnin...
2,Marketing specialist with experience in social...,Marketing,marketingspecialistexperiencesocialmediamarket...
3,HR professional with experience in recruitment...,HR,hrprofessionalexperiencerecruitmentemployeerel...
4,"Python developer with experience in Django, Fl...",Software Engineer,pythondeveloperexperiencedjangoflaskbackendsys...
5,Machine learning engineer with expertise in ne...,Data Scientist,machinelearningengineerexpertiseneuralnetworks...
6,Digital marketing expert in content creation a...,Marketing,digitalmarketingexpertcontentcreationsocialmed...
7,Human resources manager with experience in tal...,HR,humanresourcesmanagerexperiencetalentacquisiti...
8,Full-stack developer experienced in JavaScript...,Software Engineer,fullstackdeveloperexperiencedjavascriptreactno...
9,"Business analyst with skills in data analysis,...",Business Analyst,businessanalystskillsdataanalysispowerbifinanc...


In [480]:
x = df['cleaned_resume']
y = df['category']

In [484]:
X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=8,random_state=42,stratify =y )

In [486]:
pipeline= Pipeline([
    ('vectorizer',TfidfVectorizer(max_features=5000,ngram_range=(1,2))),
    ('classifier',RandomForestClassifier(random_state=42))
])

In [620]:
param_grid= {
    'vectorizer__max_features' : [3000,5000],
    'vectorizer__ngram_range' : [(1,1),(1,2)],
    'classifier__n_estimators' : [100,200],
    'classifier__max_depth' : [3,10,None],
    'classifier__min_samples_split' : [5,10],
    'classifier__min_samples_leaf':[1,2]
}

In [622]:
grid_search = GridSearchCV(pipeline, param_grid, cv=2, n_jobs=-1,verbose=1)

In [624]:
print(Y_train.value_counts())

category
Data Scientist       7
Software Engineer    7
HR                   5
Marketing            5
Graphic Designer     3
Cybersecurity        3
Sales                2
Business Analyst     2
Name: count, dtype: int64


In [626]:
grid_search.fit(X_train,Y_train)

Fitting 2 folds for each of 96 candidates, totalling 192 fits


In [627]:
best_model = grid_search.best_estimator_

In [630]:
#Save the model
joblib.dump(best_model,'resume_classifier.pkl')

['resume_classifier.pkl']

In [632]:
pred = best_model.predict(X_test)

In [634]:
print(f"Model Accuracy: {accuracy_score(Y_test,pred):.4f}")

print(f"\nClassification Report: \n ", classification_report(Y_test,pred))

Model Accuracy: 1.0000

Classification Report: 
                     precision    recall  f1-score   support

 Business Analyst       1.00      1.00      1.00         1
   Data Scientist       1.00      1.00      1.00         2
               HR       1.00      1.00      1.00         1
        Marketing       1.00      1.00      1.00         1
            Sales       1.00      1.00      1.00         1
Software Engineer       1.00      1.00      1.00         2

         accuracy                           1.00         8
        macro avg       1.00      1.00      1.00         8
     weighted avg       1.00      1.00      1.00         8



In [636]:
def classify_resume(resume_text):
    model = joblib.load('resume_classifier.pkl')
    return model.predict([resume_text])[0]

In [566]:
new_resume = "AI researcher with experience in reinforcement learning and generative AI models."

In [514]:
print("Predicted category :" ,classify_resume(new_resume))

Predicted category : Data Scientist


In [538]:
new_resume = "Sales executive with a strong background in lead generation and CRM management."

In [540]:
print("Predicted category :", classify_resume(new_resume))

Predicted category : Software Engineer


In [638]:
new_resume = "Machine learning engineer with expertise in neural networks and deep learning."

In [640]:
print("Predicted category: ", classify_resume(new_resume))

Predicted category:  Data Scientist
