<a href="https://colab.research.google.com/github/Malaiyarasan/ai-resume-screening-system/blob/main/notebooks/resume_screening_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- CELL 1 ---
# Install libs (Colab)
!pip install -q scikit-learn pandas numpy joblib gradio

# --- CELL 1B ---
# Imports
import os
from pathlib import Path
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Confirmation banner; the emoji had been stored with a broken encoding.
print("🚀 Imports ready")


🚀 Imports ready


In [2]:
# --- CELL 2 ---
# Demo resume dataset: generated once and cached as a CSV under data/.
# (If you already have your own CSV, skip creation and load it instead.)
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)
demo_csv = DATA_DIR / "resumes_demo.csv"

if demo_csv.exists():
    # Reuse the cached copy so re-running the cell does not rewrite the file.
    df_demo = pd.read_csv(demo_csv)
    print("Loaded existing dataset at", demo_csv)
else:
    # Eight short resumes, alternating between the "fit" and "not_fit" labels.
    demo_texts = [
        "Experienced data scientist with Python, pandas, TensorFlow, and model deployment experience. Built predictive models and dashboards.",
        "Mechanical engineer with 5 years experience in CAD and manufacturing. Skilled in SolidWorks and CNC programming.",
        "Machine learning engineer: CNN, computer vision, OpenCV, PyTorch, productionized models and CI/CD pipelines.",
        "Sales manager with strong client handling and target achievement. Expert in B2B sales and CRM tools.",
        "Data analyst with SQL, Excel, Power BI, dashboarding experience and performing EDA for stakeholders.",
        "Graphic designer skilled in Adobe Photoshop and Illustrator. Portfolio includes branding projects.",
        "NLP engineer worked on text classification, transformers, named entity recognition and deployment.",
        "HR recruiter with experience in hiring, onboarding and employee relations.",
    ]
    demo_labels = ["fit", "not_fit", "fit", "not_fit", "fit", "not_fit", "fit", "not_fit"]
    df_demo = pd.DataFrame({"text": demo_texts, "label": demo_labels})
    df_demo.to_csv(demo_csv, index=False)
    print("Created demo dataset at", demo_csv)

# Quick preview of the whole (8-row) dataset
df_demo.head(8)


Created demo dataset at data/resumes_demo.csv


Unnamed: 0,text,label
0,"Experienced data scientist with Python, pandas...",fit
1,Mechanical engineer with 5 years experience in...,not_fit
2,"Machine learning engineer: CNN, computer visio...",fit
3,Sales manager with strong client handling and ...,not_fit
4,"Data analyst with SQL, Excel, Power BI, dashbo...",fit
5,Graphic designer skilled in Adobe Photoshop an...,not_fit
6,"NLP engineer worked on text classification, tr...",fit
7,"HR recruiter with experience in hiring, onboar...",not_fit


In [3]:
# --- CELL A: train & save ---
import os
from pathlib import Path
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Paths
DATA_DIR = Path("data")
MODEL_DIR = Path("models")
NOTEBOOKS_DIR = Path("notebooks")
MODEL_DIR.mkdir(exist_ok=True)
NOTEBOOKS_DIR.mkdir(exist_ok=True)

# Load dataset (if you used resumes_demo.csv, adapt the path).
# The matches are sorted because glob yields them in no guaranteed order;
# sorting makes the choice reproducible and puts resumes.csv ahead of
# resumes_demo.csv when both are present.
csv_candidates = sorted(DATA_DIR.glob("resumes*.csv"))
if not csv_candidates:
    raise FileNotFoundError("No resumes CSV found in data/. Put resumes.csv or resumes_demo.csv in data/")
csv_path = csv_candidates[0]
print("Using dataset:", csv_path)

raw_df = pd.read_csv(csv_path)

# Fail early with a readable message if the CSV lacks the columns used below.
missing_cols = {"text", "label"} - set(raw_df.columns)
if missing_cols:
    raise ValueError(f"{csv_path} is missing required column(s): {sorted(missing_cols)}")

# Rows without text or label cannot be used: TfidfVectorizer rejects NaN
# documents and a NaN label cannot be stratified on.
df = raw_df.dropna(subset=["text", "label"])
print("Classes distribution:\n", df['label'].value_counts())

# Train/test split (stratified so both classes appear in train and test)
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"],
                                                    test_size=0.2, random_state=42, stratify=df["label"])

# Build pipeline (match your repo script): TF-IDF features -> logistic regression
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(lowercase=True, stop_words="english", max_features=20000, ngram_range=(1,2))),
    ("clf", LogisticRegression(max_iter=300))
])

# Fit
print("Training model...")
pipeline.fit(X_train, y_train)

# Eval on the held-out split
preds = pipeline.predict(X_test)
print("\nAccuracy:", accuracy_score(y_test, preds))
print("\nClassification Report:\n", classification_report(y_test, preds))
cm = confusion_matrix(y_test, preds)
print("\nConfusion Matrix:\n", cm)

# Save the whole pipeline (vectorizer + classifier) so inference needs one file
model_path = MODEL_DIR / "resume_model.joblib"
joblib.dump(pipeline, model_path)
print("\nSaved trained pipeline to:", model_path)

# Save a small README snippet for the model (optional).
# Explicit encoding keeps the file identical across platforms.
with open(MODEL_DIR / "README.txt", "w", encoding="utf-8") as f:
    f.write("resume_model.joblib - TF-IDF + LogisticRegression pipeline\n")


Using dataset: data/resumes_demo.csv
Classes distribution:
 label
fit        4
not_fit    4
Name: count, dtype: int64
Training model...

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

         fit       1.00      1.00      1.00         1
     not_fit       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2


Confusion Matrix:
 [[1 0]
 [0 1]]

Saved trained pipeline to: models/resume_model.joblib


In [4]:
# --- CELL B: Gradio demo ---
import gradio as gr
import joblib
from pathlib import Path

# Load the pipeline written by the training cell; stop with a hint if it is absent.
model_path = Path("models/resume_model.joblib")
if model_path.exists():
    # NOTE: joblib files are pickle-based, so only load artifacts you created or trust.
    pipeline = joblib.load(model_path)
else:
    raise FileNotFoundError("Model not found. Run the training cell first.")

def predict_resume(text):
    """Classify one resume text and format the verdict for display.

    Parameters
    ----------
    text : str
        Resume text (or a short summary) entered by the user.

    Returns
    -------
    str
        A prompt asking for input when ``text`` is not a string or is blank.
        Otherwise two lines: "✅ Fit" when the predicted label is "fit"
        (else "❌ Not Fit"), then "Confidence: " followed by the highest
        class probability formatted to two decimals.
    """
    # Guard against empty submissions before touching the model.
    if not isinstance(text, str) or text.strip() == "":
        return "Please paste resume text or a short summary."
    pred = pipeline.predict([text])[0]
    # Confidence is the largest class probability for this single document.
    proba = pipeline.predict_proba([text])[0].max()
    label = "✅ Fit" if pred == "fit" else "❌ Not Fit"
    return f"{label}\nConfidence: {proba:.2f}"

# Single-textbox UI wired to predict_resume; the result is shown as plain text.
demo = gr.Interface(
    fn=predict_resume,
    inputs=gr.Textbox(lines=6, placeholder="Paste resume text or short summary here..."),
    outputs="text",
    title="AI Resume Screening (Demo)",
    description="Paste resume text (or short summary) and see if the candidate is a fit for data/ML roles."
)

# Launch with share=True to get public link you can add to README/portfolio
gradio_result = demo.launch(share=True)

# launch() returns a tuple (app, local_url, share_url), not an object with a
# `share_url` attribute, so the previous hasattr() check never found the link.
# Read it from the interface first and fall back to the tuple.
share_url = getattr(demo, "share_url", None)
if not share_url and isinstance(gradio_result, (tuple, list)) and len(gradio_result) >= 3:
    share_url = gradio_result[2]
print("Gradio launched. Public URL:", share_url or "Check Colab output for the share link.")


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bc0655af6e7d532ec1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Gradio launched. Public URL: Check Colab output for the share link.


In [5]:
# Download the trained model to the local machine (Colab only).
# google.colab is importable only inside a Colab runtime, so guard the import
# to keep "Run All" working in other Jupyter environments.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab; the model is available at models/resume_model.joblib")
else:
    files.download('models/resume_model.joblib')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>