In [29]:
import os
import re

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load Dataset
df = pd.read_csv('resume_dataset.csv')

# Preprocessing: the resume text is the only feature, the job category the target
X = df['Resume_str']
y = df['Category']

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text using TF-IDF.
# The vocabulary is fitted on the training split only, so no test-set
# information leaks into the features.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Encode the job categories.
# The encoder is fitted on the full label column: it only learns the label
# vocabulary, and doing so prevents `transform(y_test)` from raising
# ValueError when the (unstratified) split puts a rare category in the test
# split only. When every category is present in the training split the
# resulting encoding is identical to fitting on `y_train`.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train_enc = label_encoder.transform(y_train)
y_test_enc = label_encoder.transform(y_test)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train_enc)

# Persist the fitted artefacts. All three are needed at inference time:
# the vectorizer to featurize text, the model to predict, and the label
# encoder to map predicted ids back to category names.
# Create the output directory first; joblib.dump does not create it.
os.makedirs('model', exist_ok=True)
joblib.dump(model, 'model/model.pkl')
joblib.dump(vectorizer, 'model/vectorizer.pkl')  # If you need to save it separately
joblib.dump(label_encoder, 'model/label_encoder.pkl')

# Evaluate the model (mean accuracy on the held-out split)
accuracy = model.score(X_test_vec, y_test_enc)
print(f'Accuracy: {accuracy}')

def extract_skills(resume_text):
    """Return the known skills that are mentioned in a resume.

    Matching is case-insensitive and on whole words, so ``'java'`` is not
    reported for "JavaScript" and ``'sql'`` is not reported for "NoSQL".
    The words of a multi-word skill may be separated by any run of
    whitespace (spaces, tabs, line breaks).

    Args:
        resume_text: Raw resume text. Non-string values (e.g. ``None`` or the
            NaN pandas uses for an empty cell) are treated as an empty resume.

    Returns:
        list[str]: The matched skills, in the order of the predefined list.
    """
    # Predefined list of skills
    skills_list = ['python', 'java', 'sql', 'machine learning', 'deep learning', 'data analysis']

    # A missing resume mentions no skills.
    if not isinstance(resume_text, str):
        return []

    # Lowercase once instead of once per skill.
    text = resume_text.lower()

    skills = []
    for skill in skills_list:
        # Allow any whitespace between the words of a multi-word skill.
        phrase = r'\s+'.join(re.escape(word) for word in skill.lower().split())
        # Lookarounds enforce that the match is not embedded in a longer word.
        if re.search(rf'(?<!\w){phrase}(?!\w)', text):
            skills.append(skill)
    return skills

def calculate_experience_score(resume_text):
    """Score how strongly a resume signals work experience.

    Each keyword phrase found in the text (case-insensitive substring match)
    adds 20 points. Note that ``'years of experience'`` contains
    ``'experience'``, so a resume with that phrase scores at least 40; with
    the three keywords below the maximum reachable score is 60, so the cap
    of 100 is only a safeguard.

    Args:
        resume_text: Raw resume text. Non-string values (e.g. ``None`` or the
            NaN pandas uses for an empty cell) score 0.

    Returns:
        int: A score between 0 and 100, in steps of 20.
    """
    keywords = ['years of experience', 'worked for', 'experience']

    # A missing resume carries no experience signal.
    if not isinstance(resume_text, str):
        return 0

    # Lowercase once instead of once per keyword.
    text = resume_text.lower()
    score = sum(1 for word in keywords if word in text)
    return min(score * 20, 100)  # Scale to 100

def calculate_education_score(resume_text):
    """Score how strongly a resume signals formal education.

    Each education keyword found in the text adds 20 points, capped at 100.
    Matching is a case-insensitive substring test, so e.g. ``'graduate'``
    also matches "undergraduate" and ``'master'`` also matches "masters".

    Args:
        resume_text: Raw resume text. Non-string values (e.g. ``None`` or the
            NaN pandas uses for an empty cell) score 0.

    Returns:
        int: A score between 0 and 100, in steps of 20.
    """
    education_keywords = ['bachelor', 'master', 'phd', 'graduate', 'diploma']

    # A missing resume carries no education signal.
    if not isinstance(resume_text, str):
        return 0

    # Lowercase once instead of once per keyword.
    text = resume_text.lower()
    score = sum(1 for word in education_keywords if word in text)
    return min(score * 20, 100)  # Scale to 100


Accuracy: 0.6338028169014085
