In [1]:

import pandas as pd

data = pd.read_csv("Resume Screening.csv")

print(data.head())

print(data.shape)
print(data.info())

# Check missing values
print(data.isnull().sum())



       Category                                             Resume
0  Data Science  Skills * Programming Languages: Python (pandas...
1  Data Science  Education Details \r\nMay 2013 to May 2017 B.E...
2  Data Science  Areas of Interest Deep Learning, Control Syste...
3  Data Science  Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4  Data Science  Education Details \r\n MCA   YMCAUST,  Faridab...
(962, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 962 entries, 0 to 961
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  962 non-null    object
 1   Resume    962 non-null    object
dtypes: object(2)
memory usage: 15.2+ KB
None
Category    0
Resume      0
dtype: int64


In [2]:
print(data['Category'].value_counts())


Category
Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Blockchain                   40
ETL Developer                40
Operations Manager           40
Data Science                 40
Sales                        40
Mechanical Engineer          40
Arts                         36
Database                     33
Electrical Engineering       30
Health and fitness           30
PMO                          30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
SAP Developer                24
Civil Engineer               24
Advocate                     20
Name: count, dtype: int64


# data cleaning

In [3]:

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

STOPWORDS = set(stopwords.words('english'))
TECH_TERMS = {"ms", "os", "c", "r", "sql"}  # Keep important short words
LEMMATIZER = WordNetLemmatizer()

def clean_resume_v3(text):
    text = text.encode('ascii', 'ignore').decode('utf-8')
    text = text.lower()
    text = re.sub(r'\S+@\S+', ' ', text)
    text = re.sub(r'http\S+|www\.\S+', ' ', text)
    text = re.sub(r'\+?\d[\d -]{8,}\d', ' ', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = nltk.word_tokenize(text)
    cleaned = []
    for w in tokens:
        if (w not in STOPWORDS) and (len(w) > 1 or w in {"a", "i"} or w in TECH_TERMS):
            cleaned.append(LEMMATIZER.lemmatize(w))
    # Remove consecutive duplicates
    final = []
    for word in cleaned:
        if not final or word != final[-1]:
            final.append(word)
    return ' '.join(final)

# Apply cleaning
data['clean_text'] = data['Resume'].apply(clean_resume_v3)
data




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KARAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KARAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KARAN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\KARAN\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,Category,Resume,clean_text
0,Data Science,Skills * Programming Languages: Python (pandas...,skill programming language python panda numpy ...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education detail may uit rgpv data scientist d...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",area interest deep learning control system des...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skill r python sap hana tableau sap hana sql s...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education detail mca ymcaust faridabad haryana...
...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skill proficient m office word basic ...
958,Testing,â Willingness to accept the challenges. â ...,willingness accept challenge positive thinking...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skill quick learner eagerness learn n...
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skill software knowledge m power poin...



# Encoding Job Categories into Numerical Labels for Model Training

In [4]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
data['Category_Label'] = label_encoder.fit_transform(data['Category'])

print(label_encoder.classes_)  # Check category names


['Advocate' 'Arts' 'Automation Testing' 'Blockchain' 'Business Analyst'
 'Civil Engineer' 'Data Science' 'Database' 'DevOps Engineer'
 'DotNet Developer' 'ETL Developer' 'Electrical Engineering' 'HR' 'Hadoop'
 'Health and fitness' 'Java Developer' 'Mechanical Engineer'
 'Network Security Engineer' 'Operations Manager' 'PMO' 'Python Developer'
 'SAP Developer' 'Sales' 'Testing' 'Web Designing']



# Text Feature Extraction using TF-IDF Vectorization

In [5]:
from sklearn.model_selection import train_test_split

train_data, temp_data = train_test_split(
    data, test_size=0.3, stratify=data['Category'], random_state=42
)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, stratify=temp_data['Category'], random_state=42
)

print("Train size:", train_data.shape)
print("Validation size:", val_data.shape)
print("Test size:", test_data.shape)


Train size: (673, 4)
Validation size: (144, 4)
Test size: (145, 4)



# Dataset Splitting into Train, Validation, and Test Sets

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Define and fit TF-IDF
tfidf = TfidfVectorizer(max_features=20000,  stop_words='english')
tfidf.fit(data['clean_text'])

# Features and labels
X = tfidf.transform(data['clean_text'])
y = data['Category']

# Split into train, validation, and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print("Train size:", X_train.shape)
print("Validation size:", X_val.shape)
print("Test size:", X_test.shape)




Train size: (673, 6236)
Validation size: (144, 6236)
Test size: (145, 6236)


# Defining and traing model

In [7]:
from sklearn.linear_model import LogisticRegression

logisticR = LogisticRegression(
    solver="liblinear",      # compatible solver
    penalty="l2",
    C=0.1,                   # regularization
    max_iter=5000,
    class_weight="balanced",
    multi_class="ovr"
)

logisticR.fit(X_train, y_train)
print("LogisticRegression")
print(logisticR)


LogisticRegression
LogisticRegression(C=0.1, class_weight='balanced', max_iter=5000,
                   multi_class='ovr', solver='liblinear')



# Model Evaluation using Confusion Matrix and Classification Report

In [8]:
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

# Predict on test data
y_pred = logisticR.predict(X_test)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=label_encoder.classes_, columns=label_encoder.classes_)

print("Confusion Matrix (with labels):\n")
print(cm_df)

# Classification Report (Precision, Recall, F1-score)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print("\nClassification Report:\n", report)



Confusion Matrix (with labels):

                           Advocate  Arts  Automation Testing  Blockchain  \
Advocate                          3     0                   0           0   
Arts                              0     6                   0           0   
Automation Testing                0     0                   3           0   
Blockchain                        0     0                   0           6   
Business Analyst                  0     0                   0           0   
Civil Engineer                    0     0                   0           0   
Data Science                      0     0                   0           0   
Database                          0     0                   0           0   
DevOps Engineer                   0     0                   0           0   
DotNet Developer                  0     0                   0           0   
ETL Developer                     0     0                   0           0   
Electrical Engineering            0     0  

In [9]:
import joblib

joblib.dump(tfidf, "tfidf_vectorizer.pkl")

joblib.dump(logisticR, "logistic_model.pkl")

joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']