# 📄 Resume Classification & Intelligent Ranking System
This notebook covers the end-to-end pipeline for extracting text from resumes, classifying them into job categories using machine learning, and ranking them against a job description using cosine similarity.

### 1. Install Required Libraries
Uncomment the following line if you haven't installed the dependencies yet.

In [None]:
# Run once per environment. Inside a notebook, prefer `%pip` over `!pip` so the
# packages are installed into the environment of the running kernel.
# !pip install pandas scikit-learn xgboost python-docx PyPDF2 matplotlib seaborn

### 2. Imports

In [None]:
import os
import pickle
import re
from collections import Counter
from pathlib import Path

import docx
import matplotlib.pyplot as plt
import pandas as pd
import PyPDF2
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# cosine_similarity lives in sklearn.metrics.pairwise; it is not exported by sklearn.metrics.
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

### 3. Utility Functions: Extraction & Cleaning

In [None]:
def extract_text(file_path):
    """Extract the plain text of a resume file.

    The reader is chosen by the case-insensitive file suffix:

    * ``.docx`` - paragraph text read with python-docx, joined by spaces.
    * ``.pdf``  - page text read with PyPDF2, one page per line.
    * ``.txt``  - read as UTF-8, silently dropping undecodable bytes.

    Args:
        file_path: Location of the file (``str`` or ``pathlib.Path``).

    Returns:
        The extracted text with leading/trailing whitespace removed. An empty
        string is returned for unsupported suffixes. Read errors are reported
        with ``print`` rather than raised, so one corrupt file does not abort
        a whole batch; whatever text was collected before the error is kept.
    """
    file_path = Path(file_path)
    suffix = file_path.suffix.lower()
    text = ""
    try:
        if suffix == ".docx":
            doc = docx.Document(file_path)
            text = " ".join(p.text for p in doc.paragraphs)
        elif suffix == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                for page in reader.pages:
                    # Terminate every page with a newline so the last word of
                    # one page is not fused with the first word of the next.
                    # extract_text() may return None for image-only pages.
                    text += (page.extract_text() or "") + "\n"
        elif suffix == ".txt":
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
    except Exception as e:
        # Deliberate best-effort: log and fall through with what we have.
        print(f"Error reading {file_path}: {e}")
    return text.strip()

def clean_text(text):
    """Normalise raw resume text for TF-IDF vectorisation.

    Steps, in order: lowercase; drop URLs; drop the standalone tokens
    ``rt`` and ``cc``; drop hashtags and @-mentions; replace ASCII
    punctuation with spaces; replace non-ASCII characters with spaces;
    collapse runs of whitespace.

    Args:
        text: The raw text (``str``).

    Returns:
        The cleaned, lowercase text with single spaces between tokens and no
        leading/trailing whitespace.
    """
    text = text.lower()
    text = re.sub(r'http\S+\s*', ' ', text)
    # The text is already lowercase, so match lowercase tokens, and only as
    # whole words; otherwise "cc" is stripped out of words like "account".
    text = re.sub(r'\b(?:rt|cc)\b', ' ', text)
    text = re.sub(r'#\S+', ' ', text)
    text = re.sub(r'@\S+', ' ', text)
    # Raw string: avoids the invalid "\]" escape of a regular string literal.
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    # Negated class: remove characters OUTSIDE the ASCII range. Without the
    # "^" every ASCII character is blanked and the result is always empty.
    text = re.sub(r'[^\x00-\x7f]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

### 4. Data Loading

In [None]:
# Location of the resume dataset. Set the RESUME_DATA_DIR environment variable
# to point somewhere else; the fallback is the original local path.
base_dir = Path(os.environ.get(
    "RESUME_DATA_DIR",
    r"c:\Users\gopar\OneDrive\Desktop\Resume\Resume classification dataset\Dataset\Resumes",
))
if not base_dir.is_dir():
    raise FileNotFoundError(
        f"Resume directory not found: {base_dir}. "
        "Set RESUME_DATA_DIR or update base_dir."
    )

supported_suffixes = {".docx", ".pdf", ".txt"}
data = []

# Folder name -> category label. Folders not listed here use their own name.
categories = {
    "Peoplesoft resumes": "Peoplesoft",
    "SQL Developer Lightning insight": "SQL Developer",
    "workday resumes": "Workday"
}

# Sort directory listings: filesystem order is not guaranteed, and the row
# order determines which resumes land in the (seeded) train/test split.
for folder in sorted(base_dir.iterdir()):
    if folder.is_dir():
        cat_name = categories.get(folder.name, folder.name)
        for file in sorted(folder.rglob("*")):
            # is_file() skips directories whose name happens to end in ".pdf" etc.
            if file.is_file() and file.suffix.lower() in supported_suffixes:
                text = extract_text(file)
                if text:
                    data.append({"file_name": file.name, "text": text, "category": cat_name})
    elif folder.suffix.lower() in supported_suffixes:
        # Loose files directly under base_dir are labelled from their file name.
        text = extract_text(folder)
        if text:
            label = "React Developer" if "React" in folder.name else "Other"
            data.append({"file_name": folder.name, "text": text, "category": label})

# Explicit columns keep the frame well-formed even when nothing was read.
df = pd.DataFrame(data, columns=["file_name", "text", "category"])
if df.empty:
    raise ValueError(f"No readable .docx/.pdf/.txt resumes found under {base_dir}")
df['cleaned_text'] = df['text'].apply(clean_text)

print(f"Total Resumes Processed: {len(df)}")
print(df['category'].value_counts())
df.head()

### 5. Exploratory Data Analysis (EDA)

In [None]:
# Class balance: how many resumes fall into each category.
fig_categories, ax_categories = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='category', palette='viridis', ax=ax_categories)
ax_categories.set_title("Distribution of Resume Categories")
ax_categories.tick_params(axis='x', labelrotation=45)
plt.show()

# Resume length, measured in whitespace-separated tokens of the raw text.
df['word_count'] = [len(str(raw_text).split()) for raw_text in df['text']]

fig_lengths, ax_lengths = plt.subplots(figsize=(10, 6))
sns.histplot(df['word_count'], bins=15, kde=True, color='skyblue', ax=ax_lengths)
ax_lengths.set_title("Resume Word Count Distribution")
plt.show()

### 6. Feature Engineering & Preprocessing

In [None]:
# Encode the category names as integer class ids.
le = LabelEncoder()
df['category_encoded'] = le.fit_transform(df['category'])
y = df['category_encoded']

# Split the raw documents BEFORE fitting TF-IDF so that the vocabulary and IDF
# weights are learned from the training resumes only (no test-set leakage).
split_kwargs = {"test_size": 0.2, "random_state": 42}
try:
    # Stratify so every category is represented in both partitions.
    text_train, text_test, y_train, y_test = train_test_split(
        df['cleaned_text'], y, stratify=y, **split_kwargs
    )
except ValueError as err:
    # Raised when a class is too small to stratify (e.g. a single resume).
    print(f"Stratified split not possible ({err}); falling back to a random split.")
    text_train, text_test, y_train, y_test = train_test_split(
        df['cleaned_text'], y, **split_kwargs
    )

tfidf = TfidfVectorizer(stop_words='english', max_features=2000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)
# Full-corpus matrix in the same feature space, kept for downstream use.
X = tfidf.transform(df['cleaned_text'])

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

### 7. Model Training & Comparison

In [None]:
# Candidate classifiers, all seeded for reproducibility.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True, kernel='linear', random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

# Precision/recall/F1 are support-weighted across classes; a class with no
# predicted samples scores 0 instead of raising a warning.
weighted_kwargs = {"average": "weighted", "zero_division": 0}

performance_metrics = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    scores_row = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, **weighted_kwargs),
        "Recall": recall_score(y_test, y_pred, **weighted_kwargs),
        "F1 Score": f1_score(y_test, y_pred, **weighted_kwargs),
    }
    performance_metrics.append(scores_row)
    print(f"{name} - Accuracy: {scores_row['Accuracy']:.4f}, F1: {scores_row['F1 Score']:.4f}")

# One row per model; displayed as the cell output.
perf_df = pd.DataFrame(performance_metrics)
perf_df

### 8. Visualization of Model Performance

In [None]:
# Bar chart of test-set accuracy per model.
fig_accuracy, ax_accuracy = plt.subplots(figsize=(10, 6))
sns.barplot(data=perf_df, x='Model', y='Accuracy', palette='coolwarm', ax=ax_accuracy)
ax_accuracy.set_title("Model Accuracy Comparison")
# Leave headroom above 1.0 so full-height bars are not clipped.
ax_accuracy.set_ylim(0, 1.1)
plt.show()

### 9. Saving the Best Model

In [None]:
# The winner is the model with the highest weighted F1 score.
models_by_f1 = perf_df.sort_values(by='F1 Score', ascending=False)
best_model_name = models_by_f1.iloc[0]['Model']
best_model = models[best_model_name]

# Persist everything needed to classify new resumes later: the fitted model,
# the TF-IDF vectoriser and the label encoder.
artifacts = {
    "model.pkl": best_model,
    "tfidf.pkl": tfidf,
    "label_encoder.pkl": le,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

# Snapshot of the processed dataset alongside the model artifacts.
df.to_csv("processed_resumes.csv", index=False)
print(f"Best model ({best_model_name}) and artifacts saved successfully.")

### 10. AI Ranking & Shortlisting Simulation
Edit `job_description` in the cell below to rank the loaded resumes against it by TF-IDF cosine similarity.

In [None]:
job_description = "We are looking for a SQL Developer with experience in database management, query optimization, and PL/SQL."

# Put the job description through the same cleaning and TF-IDF space as the resumes.
jd_cleaned = clean_text(job_description)
jd_vec = tfidf.transform([jd_cleaned])

# One similarity value per resume: a (1, n_resumes) matrix flattened to 1-D.
resume_vecs = tfidf.transform(df['cleaned_text'])
similarity_matrix = cosine_similarity(jd_vec, resume_vecs)
scores = similarity_matrix.flatten()

# Express the similarity as a percentage rounded to two decimals.
df['Match Score'] = (scores * 100).round(2)
ranking_columns = ['file_name', 'category', 'Match Score']
ranking = df[ranking_columns].sort_values(by='Match Score', ascending=False)

print("TOP 5 MATCHES:")
ranking.head(5)