In [32]:
import pandas as pd 
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split , cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
df = pd.read_csv('processed_data.csv')
df

Unnamed: 0,Category,Resume
0,Java Developer,skills language java operating system windows ...
1,Java Developer,java spring hibernate sql rest apis git maven ...
2,Java Developer,java spring hibernate sql rest apis git maven ...
3,Java Developer,computer skills languages and script jsp servl...
4,Java Developer,java hibernate sql rest apis maven numpypatric...
...,...,...
2431,SAP Developer,sap abap sap fiori sap hana sap modules sql da...
2432,SAP Developer,sap abap sap fiori sap modules data migration ...
2433,SAP Developer,education details july to february be computer...
2434,SAP Developer,sap abap sap fiori sap hana sap modules sqlrya...


In [3]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
df

Unnamed: 0,Category,Resume
0,11,selenium appium testng junit cicd python java ...
1,8,informatica talend sql etl testing seleniumale...
2,4,technical proficiencies platform ubuntufedorac...
3,13,sap abap sap hana sql data migration hadoop pa...
4,7,solidity ethereum smart contracts truffle webj...
...,...,...
2431,0,sql mysql oracle postgresql tsql stored proced...
2432,7,key skills programing languages c c python ape...
2433,7,solidity ethereum smart contracts truffle webj...
2434,8,informatica data warehousing python etl testin...


In [7]:
df['Category'].value_counts()

Category
11    174
8     174
4     174
13    174
7     174
3     174
2     174
9     174
10    174
5     174
6     174
12    174
0     174
1     174
Name: count, dtype: int64

In [6]:
df['Category'] = df['Category'].map({
"Java Developer" : 1,
"Testing": 2,
"DevOps Engineer" : 3,
"Python Developer": 4,
"Web Designing" : 5,
"Hadoop" : 6,
"Blockchain" : 7,
"ETL Developer" : 8,
"Data Science" : 9,
"Database" : 0,
"DotNet Developer" : 10,
"Automation Testing" : 11,
"Network Security Engineer" : 12,
"SAP Developer" : 13})

In [10]:
df.isna().sum()

Category    0
Resume      0
dtype: int64

In [12]:
tfidf = TfidfVectorizer(
    max_features=2000,       
    ngram_range=(1, 2),  
    sublinear_tf = True, 
    min_df = 3, 
    max_df =0.9,
    stop_words='english'     
)

X = tfidf.fit_transform(df['Resume'])

In [15]:
le = LabelEncoder()
y = le.fit_transform(df['Category'])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, df['Category'], test_size=0.2, random_state=42)

param_grid = {
    'C': [0.1, 1, 5],             
    'max_iter': [200, 400, 600],  
    'solver': ['lbfgs', 'saga']   
}

log_reg = LogisticRegression(multi_class='multinomial', n_jobs=-1)
grid = GridSearchCV(log_reg, param_grid, cv=3, scoring='f1_macro', verbose=2)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
print("Best Parameters:", grid.best_params_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END ..................C=0.1, max_iter=200, solver=lbfgs; total time=   4.2s
[CV] END ..................C=0.1, max_iter=200, solver=lbfgs; total time=   1.7s
[CV] END ..................C=0.1, max_iter=200, solver=lbfgs; total time=   1.8s
[CV] END ...................C=0.1, max_iter=200, solver=saga; total time=   0.7s
[CV] END ...................C=0.1, max_iter=200, solver=saga; total time=   0.7s
[CV] END ...................C=0.1, max_iter=200, solver=saga; total time=   0.8s
[CV] END ..................C=0.1, max_iter=400, solver=lbfgs; total time=   2.7s
[CV] END ..................C=0.1, max_iter=400, solver=lbfgs; total time=   2.6s
[CV] END ..................C=0.1, max_iter=400, solver=lbfgs; total time=   2.6s
[CV] END ...................C=0.1, max_iter=400, solver=saga; total time=   0.6s
[CV] END ...................C=0.1, max_iter=400, solver=saga; total time=   0.7s
[CV] END ...................C=0.1, max_iter=400,

In [39]:
model = LogisticRegression(
    C=5,
    max_iter=200,
    solver='lbfgs',
    n_jobs=-1,
    random_state=42
)
model.fit(X_train_tfidf, y_train)

In [40]:
y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        38
           2       1.00      1.00      1.00        43
           3       1.00      1.00      1.00        31
           4       1.00      1.00      1.00        47
           5       1.00      1.00      1.00        32
           6       1.00      1.00      1.00        27
           7       1.00      1.00      1.00        38
           8       1.00      1.00      1.00        31
           9       1.00      1.00      1.00        35
          10       1.00      1.00      1.00        37
          11       1.00      1.00      1.00        29
          12       1.00      1.00      1.00        35
          13       1.00      1.00      1.00        25

    accuracy                           1.00       488
   macro avg       1.00      1.00      1.00       488
weighted avg       1.00      1.00      1.

In [38]:
X_train, X_test, y_train, y_test = train_test_split(df_noisy['Resume'], df_noisy['Category'], test_size=0.2, random_state=42)

tfidf = TfidfVectorizer(
    max_features=1000,       
    ngram_range=(1, 2),  
    sublinear_tf = True, 
    min_df = 3, 
    max_df =0.9,
    stop_words='english'     
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [41]:
scores = cross_val_score(model, X_train_tfidf, y_train, cv=5)
print("Cross-validation accuracy:", scores.mean())


Cross-validation accuracy: 0.9979447630347373


In [34]:
import random

soft_skills = [
    "communication skills", "problem solving", "teamwork", "time management",
    "leadership", "critical thinking", "adaptability", "creativity",
    "attention to detail", "decision making", "analytical thinking",
    "interpersonal skills", "collaboration", "project management",
    "emotional intelligence", "negotiation", "work ethic", "self motivation"
]

technical_soft_skills = [
    "mathematics", "statistics", "data analysis", "research skills",
    "presentation skills", "logical reasoning", "technical writing",
    "documentation", "troubleshooting", "analytical problem solving"
]

resume_fillers = [
    "seeking to leverage skills in a dynamic environment",
    "committed to continuous learning and development",
    "experienced in working under pressure to meet deadlines",
    "collaborated with cross-functional teams to achieve goals",
    "delivered high-quality solutions within tight timelines",
    "enthusiastic about innovation and growth opportunities",
    "motivated self-starter with a passion for technology"
]


In [35]:
def add_resume_noise(text):

    soft = random.sample(soft_skills, k=random.randint(1, 3))
    tech = random.sample(technical_soft_skills, k=random.randint(1, 2))
    filler = random.sample(resume_fillers, k=random.randint(1, 2))

    additions = " ".join(soft + tech + filler)

    if random.random() < 0.33:
        return additions + " " + text
    elif random.random() < 0.66:
        parts = text.split('. ')
        mid = len(parts)//2
        return '. '.join(parts[:mid]) + ". " + additions + ". " + '. '.join(parts[mid:])
    else:
        return text + " " + additions


In [36]:
import numpy as np

df_noisy = df.copy()
mask = np.random.rand(len(df_noisy)) < 0.7
df_noisy.loc[mask, 'Resume'] = df_noisy.loc[mask, 'Resume'].apply(add_resume_noise)


In [37]:
print(df_noisy['Resume'].sample(3).values)

['firewalls idsips network monitoring penetration testing pythonerica anderson is an experienced network security engineer with skills in firewalls idsips network monitoring penetration testing python adept at delivering quality work in fastpaced environmentsnetwork security engineer at montoya group worked on projects involving network monitoring idsips pythonmba from murphysullivan university class of project developed using python firewalls penetration testing'
 '. problem solving adaptability documentation data analysis collaborated with cross-functional teams to achieve goals enthusiastic about innovation and growth opportunities. selenium appium testng junit cicd python java python sqlmichael johnson is an experienced automation testing with skills in selenium appium testng junit cicd python java python sql adept at delivering quality work in fastpaced environmentsautomation testing at martinez johnson and meyer worked on projects involving python appium java automation testing a

In [49]:
sample_resume = ["""Experienced in Java, Spring, Hibernate, SQL, Git, and Maven. Built backend services and web applications. Some knowledge of JavaScript and HTML for frontend integration.\n\nGraduated with B.Tech in Computer Engineering from IIT Bombay in 2015.\n\nProject 1: Developed enterprise applications using Spring and Hibernate.\nProject 2: Optimized database queries and data processing.\nProject 3: Created APIs consumed by web applications."""]

sample_tfidf = tfidf.transform(sample_resume)

predicted_label = model.predict_proba(sample_tfidf)[0]

label_map = {
    0: "Database",
    1: "Java Developer",
    2: "Testing",
    3: "DevOps Engineer",
    4: "Python Developer",
    5: "Web Designing",
    6: "Hadoop",
    7: "Blockchain",
    8: "ETL Developer",
    9: "Data Science",
    10: "DotNet Developer",
    11: "Automation Testing",
    12: "Network Security Engineer",
    13: "SAP Developer"
}

# Print results
for i, prob in enumerate(predicted_label):
    print(f"{label_map[i]}: {prob:.2f}")


Database: 0.01
Java Developer: 0.89
Testing: 0.00
DevOps Engineer: 0.01
Python Developer: 0.01
Web Designing: 0.01
Hadoop: 0.01
Blockchain: 0.01
ETL Developer: 0.01
Data Science: 0.01
DotNet Developer: 0.00
Automation Testing: 0.01
Network Security Engineer: 0.00
SAP Developer: 0.01
