# Step 1: Import Libraries

In [63]:
import pandas as pd
import numpy as np
import joblib
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error

# Step 2: Load Dataset

In [64]:
df = pd.read_csv("../data/UpdatedResumeDataSet.csv")
df = df[['Resume', 'Category']]

# Step 3: Text Preprocessing

In [65]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^A-Za-z\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.lower().strip()
df['Cleaned_Resume'] = df['Resume'].apply(clean_text)

# Step 4: Encode Target Labels

In [66]:
le = LabelEncoder()
df['Encoded_Category'] = le.fit_transform(df['Category'])

# Step 5: Vectorization

In [67]:

vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['Cleaned_Resume']).toarray()
y = df['Encoded_Category']

# Step 6: Train Classifier

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

In [69]:
# Print classification report
y_pred = classifier.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

Classification Report:
                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         3
                     Arts       1.00      1.00      1.00         6
       Automation Testing       1.00      1.00      1.00         5
               Blockchain       1.00      1.00      1.00         7
         Business Analyst       1.00      1.00      1.00         4
           Civil Engineer       1.00      1.00      1.00         9
             Data Science       1.00      1.00      1.00         5
                 Database       1.00      1.00      1.00         8
          DevOps Engineer       1.00      0.93      0.96        14
         DotNet Developer       1.00      1.00      1.00         5
            ETL Developer       1.00      1.00      1.00         7
   Electrical Engineering       1.00      1.00      1.00         6
                       HR       1.00      1.00      1.00        12
                   Hadoop       1.00  

# Step 7: Save Classifier and Vectorizer

In [70]:
import os
import joblib

model_dir = "../model"
os.makedirs(model_dir, exist_ok=True)

# Save classifier, vectorizer, and label encoder
joblib.dump(classifier, os.path.join(model_dir, "classifier.pkl"))
joblib.dump(vectorizer, os.path.join(model_dir, "vectorizer.pkl"))
joblib.dump(le, os.path.join(model_dir, "label_encoder.pkl"))

print("Models saved successfully to '../model/'")

Models saved successfully to '../model/'


In [71]:
# import pickle
# import os

# def load_model(model_path):
#     with open(model_path, "rb") as f:
#         return pickle.load(f)

# def load_label_encoder(encoder_path):
#     with open(encoder_path, "rb") as f:
#         return pickle.load(f)

# # Usage:
# model = load_model(os.path.join("..", "model", "classifier.pkl"))
# label_encoder = load_label_encoder(os.path.join("..", "model", "label_encoder.pkl"))


In [72]:
# joblib.dump(classifier, "../model/classifier.pkl")
# joblib.dump(vectorizer, "../model/vectorizer.pkl")
# joblib.dump(le, "../model/label_encoder.pkl")

# Step 8: Train Resume Scoring Model (Regression)

In [73]:
# Create artificial score based on keyword count or heuristic (for demo)
def calculate_score(resume):
    keywords = ['python', 'data', 'project', 'ml', 'ai', 'sql', 'analysis']
    score = sum([resume.lower().count(kw) for kw in keywords])
    return score

df['Score'] = df['Resume'].apply(calculate_score)

In [74]:
# Use same features from vectorizer
y_score = df['Score']
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X, y_score, test_size=0.2, random_state=42)

scorer_model = RandomForestRegressor()
scorer_model.fit(X_train_s, y_train_s)

In [75]:
# Evaluate
pred_score = scorer_model.predict(X_test_s)
print("Resume Score RMSE:", np.sqrt(mean_squared_error(y_test_s, pred_score)))

Resume Score RMSE: 0.864891798917181


# Step 9: Save Scorer Model

In [76]:
joblib.dump(scorer_model, "../model/scorer_model.pkl")


['../model/scorer_model.pkl']