In [None]:
# job_monitoring_system.py

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import time
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import joblib
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Constants
BASE_URL = "https://www.karkidi.com"
# Search endpoint template: {page} is the result page number, {query} the search keyword.
SEARCH_URL = BASE_URL + "/Find-Jobs/{page}/all/India?search={query}"
HEADERS = {"User-Agent": "Mozilla/5.0"}
MODEL_DIR = "model"      # where the fitted vectorizer and clustering model are stored
DATA_PATH = "data/jobs.csv"
N_CLUSTERS = 5           # number of KMeans clusters requested at training time

# Ensure directories exist
for _required_dir in (MODEL_DIR, "data"):
    os.makedirs(_required_dir, exist_ok=True)

def log(msg):
    """Print ``msg`` to stdout, prefixed with the current local time in brackets.

    The timestamp format is ``YYYY-MM-DD HH:MM:SS``.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"[{timestamp}] {msg}")

# Column order of the frame returned by scrape_jobs (also used for empty results).
_JOB_COLUMNS = [
    "title",
    "company",
    "location",
    "experience",
    "skills",
    "summary",
    "url",
    "scraped_at",
]


def _tag_text(tag):
    """Return the stripped text of a BeautifulSoup tag, or "" when the tag is None."""
    return tag.get_text(strip=True) if tag is not None else ""


def _parse_job_block(job):
    """Turn one ``ads-details`` block into a job record dict keyed by ``_JOB_COLUMNS``.

    Raises AttributeError when the block has no ``<h4>`` title; every other
    missing field degrades to an empty string.
    """
    from urllib.parse import urljoin

    heading = job.find("h4")
    title = heading.get_text(strip=True)  # no title -> AttributeError -> caller skips the block

    company = job.find("a", href=lambda x: x and "Employer-Profile" in x)
    skills_label = job.find("span", string="Key Skills")
    summary_label = job.find("span", string="Summary")

    # The block is not always wrapped in an anchor; fall back to a link inside
    # the title heading (assumes the title links to the posting - TODO confirm
    # against the live markup).
    link = job.find_parent("a")
    if link is None:
        link = heading.find("a")
    href = link.get("href") if link is not None else None

    return {
        "title": title,
        "company": _tag_text(company),
        "location": _tag_text(job.find("p")),
        "experience": _tag_text(job.find("p", class_="emp-exp")),
        "skills": _tag_text(skills_label.find_next("p")) if skills_label is not None else "",
        "summary": _tag_text(summary_label.find_next("p")) if summary_label is not None else "",
        # urljoin handles root-relative, relative and already-absolute hrefs.
        "url": urljoin(BASE_URL, href) if href else "",
        "scraped_at": datetime.now().isoformat(),
    }


# Scrape job listings from multiple pages
def scrape_jobs(keyword="data science", pages=2):
    """Scrape job listings for ``keyword`` from the first ``pages`` result pages.

    Args:
        keyword: Free-text search term; it is percent-encoded before being
            placed in the search URL.
        pages: Number of result pages to request, starting at page 1.

    Returns:
        A DataFrame with the columns in ``_JOB_COLUMNS`` (always present, even
        when nothing was scraped). Rows that are identical in every field
        except ``scraped_at`` are dropped, so a listing repeated across pages
        appears once.

    Pages that cannot be fetched (network error or HTTP error status) and job
    blocks that cannot be parsed are logged and skipped.
    """
    from urllib.parse import quote

    records = []
    query = quote(keyword, safe="")  # encodes spaces as %20 and also &, +, #, /

    for page in range(1, pages + 1):
        if page > 1:
            time.sleep(1)  # be polite between requests, including after a failed one

        page_url = SEARCH_URL.format(page=page, query=query)
        log(f"Scraping page {page}: {page_url}")

        try:
            response = requests.get(page_url, headers=HEADERS, timeout=10)
            response.raise_for_status()  # do not parse 4xx/5xx error pages as results
        except requests.RequestException as e:
            log(f"❌ Failed to fetch page {page}: {e}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        for job in soup.find_all("div", class_="ads-details"):
            try:
                records.append(_parse_job_block(job))
            except Exception as e:  # best effort: one malformed block must not abort the run
                log(f"❌ Error parsing job block: {e}")

    df = pd.DataFrame(records, columns=_JOB_COLUMNS)
    content_columns = [col for col in _JOB_COLUMNS if col != "scraped_at"]
    return df.drop_duplicates(subset=content_columns).reset_index(drop=True)

# Preprocess skill strings
def preprocess_skills(skills_series):
    """Normalize a Series of skill strings for vectorization.

    Missing values become "", every value is converted to text and lowercased,
    and every character other than letters, digits, commas and spaces is removed.

    Args:
        skills_series: pandas Series of skill strings; may contain NaN/None or
            be entirely missing values.

    Returns:
        A Series of cleaned strings with the same index as the input.
    """
    # Fill and stringify *before* using .str: the accessor raises
    # AttributeError on a non-string column such as an all-NaN float64 one.
    as_text = skills_series.astype(object).fillna("").astype(str)
    return as_text.str.lower().str.replace(r"[^a-zA-Z0-9, ]", "", regex=True)

# Train clustering model
def train_model(jobs_df):
    """Fit a TF-IDF + KMeans model on the jobs' skills, label the jobs, and save the model.

    Args:
        jobs_df: DataFrame with a ``skills`` column. It is modified in place:
            a ``cluster`` column holding each row's KMeans label is added.

    Returns:
        The same ``jobs_df`` with the ``cluster`` column set.

    Side effects:
        Writes ``skill_vectorizer.pkl`` and ``clustering_model.pkl`` into
        ``MODEL_DIR``. Nothing is written when ``jobs_df`` has no rows.

    Raises:
        ValueError: propagated from scikit-learn, e.g. when the cleaned skills
            contain no usable tokens.
    """
    n_jobs = len(jobs_df)
    if n_jobs == 0:
        # Nothing to fit; leave any previously saved model untouched.
        log("⚠️ No jobs to train on; skipping model training.")
        jobs_df['cluster'] = pd.Series(dtype="int64", index=jobs_df.index)
        return jobs_df

    skills_cleaned = preprocess_skills(jobs_df['skills'])
    # Split on commas as well as whitespace so "python," and "python" are the
    # same feature; a regex pattern (unlike a custom function) also keeps the
    # fitted vectorizer picklable without depending on this module.
    vectorizer = TfidfVectorizer(token_pattern=r"[^,\s]+", stop_words="english")
    skill_vectors = vectorizer.fit_transform(skills_cleaned)

    # KMeans requires n_clusters <= n_samples.
    n_clusters = min(N_CLUSTERS, n_jobs)
    # n_init is pinned because its default differs between scikit-learn releases.
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    model.fit(skill_vectors)

    jobs_df['cluster'] = model.labels_

    # Evaluate with silhouette score (only defined for 2..n_samples-1 distinct labels)
    distinct_labels = len(set(model.labels_))
    if 2 <= distinct_labels < n_jobs:
        score = silhouette_score(skill_vectors, model.labels_)
        log(f"📊 Silhouette Score: {score:.3f}")
    else:
        log(f"📊 Silhouette Score: n/a ({distinct_labels} distinct cluster(s) for {n_jobs} job(s))")

    # Manual inspection of clusters
    for i in range(n_clusters):
        log(f"\n📂 Cluster {i} sample skills:")
        print(jobs_df[jobs_df['cluster'] == i]['skills'].head(3).to_string(index=False))

    joblib.dump(vectorizer, os.path.join(MODEL_DIR, "skill_vectorizer.pkl"))
    joblib.dump(model, os.path.join(MODEL_DIR, "clustering_model.pkl"))

    return jobs_df

# Classify new jobs with saved model
def classify_jobs(jobs_df):
    """Assign each job to a cluster using the vectorizer and model saved in ``MODEL_DIR``.

    Args:
        jobs_df: DataFrame with a ``skills`` column. It is modified in place:
            a ``cluster`` column with the predicted label is added.

    Returns:
        The same ``jobs_df`` with the ``cluster`` column set. An empty frame
        gets an empty ``cluster`` column and the saved model is not loaded.
    """
    if len(jobs_df) == 0:
        # transform/predict reject zero-sample input, e.g. after a failed scrape.
        log("⚠️ No jobs to classify.")
        jobs_df['cluster'] = pd.Series(dtype="int64", index=jobs_df.index)
        return jobs_df

    # NOTE(security): joblib.load unpickles arbitrary objects; only load
    # artifacts this script wrote itself, never files from an untrusted source.
    vectorizer = joblib.load(os.path.join(MODEL_DIR, "skill_vectorizer.pkl"))
    model = joblib.load(os.path.join(MODEL_DIR, "clustering_model.pkl"))

    skills_cleaned = preprocess_skills(jobs_df['skills'])
    skill_vectors = vectorizer.transform(skills_cleaned)
    jobs_df['cluster'] = model.predict(skill_vectors)
    return jobs_df

# Save to CSV
def save_jobs(jobs_df):
    """Write ``jobs_df`` to ``DATA_PATH`` as CSV (without the index) and log the row count."""
    row_count = len(jobs_df)
    jobs_df.to_csv(DATA_PATH, index=False)
    log(f"✅ Saved {row_count} jobs to {DATA_PATH}")

# Notify users
def notify_users(jobs_df, user_interest_clusters):
    """Print the jobs whose ``cluster`` value is one of ``user_interest_clusters``.

    Shows the title, company, location and url columns of the matching rows,
    or logs that nothing matched.
    """
    is_of_interest = jobs_df['cluster'].isin(user_interest_clusters)
    hits = jobs_df[is_of_interest]

    if hits.empty:
        log("No matching jobs found today.")
        return

    log("🔔 New jobs matching user interests:")
    print(hits[['title', 'company', 'location', 'url']])

# Main daily job: scrape, cluster (train or classify), save, notify
if __name__ == "__main__":
    log("🚀 Starting job scraping and classification...")
    jobs_df = scrape_jobs(keyword="data science", pages=2)

    # classify_jobs loads both artifacts, so reuse the saved model only when
    # both files are present; otherwise (re)train, which rewrites both.
    model_artifacts = [
        os.path.join(MODEL_DIR, "clustering_model.pkl"),
        os.path.join(MODEL_DIR, "skill_vectorizer.pkl"),
    ]
    if all(os.path.exists(path) for path in model_artifacts):
        log("🔎 Classifying with existing model...")
        jobs_df = classify_jobs(jobs_df)
    else:
        log("🔧 Training new clustering model...")
        jobs_df = train_model(jobs_df)

    save_jobs(jobs_df)

    # Example user preferences. Cluster ids are whatever KMeans assigned when
    # the saved model was trained, so they are only meaningful for that model.
    user_interest_clusters = [0, 2]
    notify_users(jobs_df, user_interest_clusters)

    log("✅ Job monitoring completed.")


[2025-05-20 06:29:05] 🚀 Starting job scraping and classification...
[2025-05-20 06:29:05] Scraping page 1: https://www.karkidi.com/Find-Jobs/1/all/India?search=data%20science
[2025-05-20 06:29:17] Scraping page 2: https://www.karkidi.com/Find-Jobs/2/all/India?search=data%20science
[2025-05-20 06:29:25] 🔎 Classifying with existing model...
[2025-05-20 06:29:25] ✅ Saved 20 jobs to data/jobs.csv
[2025-05-20 06:29:25] 🔔 New jobs matching user interests:
                                        title         company  \
4   Applied AI ML Director - Machine Learning  JPMorgan Chase   
5                     Senior Product Designer      Observe.AI   
7                              Data Scientist         Spotify   
14  Applied AI ML Director - Machine Learning  JPMorgan Chase   
15                    Senior Product Designer      Observe.AI   
17                             Data Scientist         Spotify   

                       location url  
4   Hyderabad, Telangana, India      
5   Bangalore,