In [None]:
# Install necessary libraries if not already installed
!pip install schedule
!pip install python-telegram-bot

Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl.metadata (3.8 kB)
Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2
Collecting python-telegram-bot
  Downloading python_telegram_bot-22.1-py3-none-any.whl.metadata (17 kB)
Downloading python_telegram_bot-22.1-py3-none-any.whl (702 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m702.3/702.3 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-telegram-bot
Successfully installed python-telegram-bot-22.1


In [2]:
import re
import time
from urllib.parse import quote

import joblib
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
def _labelled_text(job, label):
    """Return the text of the first <p> after the <span> whose text is `label`, or "" if absent."""
    label_tag = job.find("span", string=label)
    if label_tag is None:
        return ""
    paragraph = label_tag.find_next("p")
    return paragraph.get_text(strip=True) if paragraph is not None else ""


def scrape_karkidi_jobs(keyword="data science", pages=2):
    """Scrape job postings from karkidi.com search result pages.

    Parameters
    ----------
    keyword : str
        Search phrase. It is percent-encoded before being placed in the URL,
        so characters such as ``&``, ``#`` or ``+`` are safe to use.
    pages : int
        Number of result pages to fetch, starting at page 1.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed posting with the columns
        ``Title, Company, Location, Experience, Summary, Skills``.
        The columns are present even when no posting was found.

    Notes
    -----
    Connection errors and timeouts raised by ``requests`` propagate to the
    caller. Pages answered with an HTTP error status are reported and skipped.
    A posting whose required fields cannot be located is reported and skipped.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    base_url = "https://www.karkidi.com/Find-Jobs/{page}/all/India?search={query}"
    columns = ["Title", "Company", "Location", "Experience", "Summary", "Skills"]
    # Percent-encode everything (spaces become %20, as before) so that reserved
    # characters in the keyword cannot break the query string.
    query = quote(keyword, safe="")
    jobs_list = []

    for page in range(1, pages + 1):
        if page > 1:
            # Be polite to the server: pause between consecutive page requests.
            time.sleep(1)

        url = base_url.format(page=page, query=query)
        print(f"Scraping page: {page}")
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, headers=headers, timeout=30)
        if not response.ok:
            print(f"Skipping page {page}: HTTP {response.status_code}")
            continue
        soup = BeautifulSoup(response.content, "html.parser")

        for job in soup.find_all("div", class_="ads-details"):
            try:
                jobs_list.append({
                    "Title": job.find("h4").get_text(strip=True),
                    "Company": job.find(
                        "a", href=lambda x: x and "Employer-Profile" in x
                    ).get_text(strip=True),
                    # The first <p> inside the block is used as the location.
                    "Location": job.find("p").get_text(strip=True),
                    "Experience": job.find("p", class_="emp-exp").get_text(strip=True),
                    "Summary": _labelled_text(job, "Summary"),
                    "Skills": _labelled_text(job, "Key Skills"),
                })
            except Exception as e:
                # Best effort: a malformed block must not abort the whole scrape.
                print(f"Error parsing job block: {e}")

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(jobs_list, columns=columns)


# Quick smoke run of the scraper: fetch two result pages for "data science"
# and preview the first rows. `df_jobs` is inspected by the cells that follow.
if __name__ == "__main__":
    df_jobs = scrape_karkidi_jobs(pages=2, keyword="data science")
    print(df_jobs.head())


Scraping page: 1
Scraping page: 2
                                               Title         Company  \
0          Machine Learning Physical Design Engineer          Google   
1  Staff Software Engineer - Monetization, Poe (R...     Quora, Inc.   
2  Staff Backend Engineer - Bot Creator Ecosystem...     Quora, Inc.   
3  Senior Backend Engineer - Bot Creator Ecosyste...     Quora, Inc.   
4                         Data Scientist Lead - AIML  JPMorgan Chase   

                      Location Experience  \
0  Bengaluru, Karnataka, India   4-6 year   
1                        India  8-10 year   
2                        India  8-10 year   
3                        India   6-8 year   
4  Bengaluru, Karnataka, India   6-8 year   

                                             Summary  \
0  Minimum qualifications:Bachelor's degree in El...   
1  About Quora:Quora’s mission is to grow and sha...   
2  About Quora:Quora’s mission is to grow and sha...   
3  About Quora:Quora’s mission is to g

In [None]:
# (rows, columns) of the scraped frame.
df_jobs.shape

(20, 6)

In [None]:
# Column labels of the scraped frame.
df_jobs.columns

Index(['Title', 'Company', 'Location', 'Experience', 'Summary', 'Skills'], dtype='object')

In [4]:
def preprocess_skills(df):
    """Normalise the ``Skills`` column into a clean, de-duplicated skill list.

    Each cell is lower-cased, stripped of every character other than
    ``a-z``, comma and space, split on commas, whitespace-trimmed and
    de-duplicated. The surviving skills are re-joined with ``", "`` in the
    order they first appeared, so the output is identical from run to run.

    Note that the character filter removes digits and symbols, so e.g.
    ``"c++"`` becomes ``"c"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Skills`` column. The column is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``Skills`` normalised. Cells that are not
        strings (e.g. NaN or None) become the empty string.
    """

    def _normalize(raw):
        # Missing values (NaN/None) have no .lower(); treat them as "no skills".
        if not isinstance(raw, str):
            return ""
        cleaned = re.sub(r"[^a-z, ]", "", raw.lower())
        # Split on bare commas and trim, so "a,b" and "a ,  b" both yield a, b.
        skills = (" ".join(part.split()) for part in cleaned.split(","))
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would make the order vary between interpreter runs.
        return ", ".join(dict.fromkeys(skill for skill in skills if skill))

    df["Skills"] = df["Skills"].apply(_normalize)
    return df

def cluster_jobs(df, n_clusters=5, model_path="kmeans_model.pkl", vectorizer_path="vectorizer.pkl"):
    """Cluster job postings by the TF-IDF representation of their skills.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Skills`` text column. A ``cluster`` column is added
        to it in place.
    n_clusters : int
        Requested number of clusters. If fewer postings than this are
        available, the number of postings is used instead and a note is printed.
    model_path : str
        Where the fitted KMeans model is written with joblib.
    vectorizer_path : str
        Where the fitted TfidfVectorizer is written with joblib.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the integer ``cluster`` label per row.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("cluster_jobs needs at least one job posting to cluster")

    # TfidfVectorizer rejects NaN documents, so blank out missing skills first.
    skills_text = df["Skills"].fillna("").astype(str)

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(skills_text)

    # KMeans raises when asked for more clusters than there are samples.
    n_samples = X.shape[0]
    effective_k = min(n_clusters, n_samples)
    if effective_k < n_clusters:
        print(f"Only {n_samples} jobs available; using {effective_k} clusters instead of {n_clusters}.")

    # n_init is set explicitly because its default differs between
    # scikit-learn releases, which would change the labels across environments.
    kmeans = KMeans(n_clusters=effective_k, n_init=10, random_state=42)
    kmeans.fit(X)

    df["cluster"] = kmeans.labels_

    # Save model and vectorizer
    joblib.dump(kmeans, model_path)
    joblib.dump(vectorizer, vectorizer_path)

    return df

# Full Pipeline Execution
def main():
    """Run the whole pipeline: scrape, normalise skills, cluster, export to CSV.

    Progress is printed at each stage. The clustered frame is written to
    ``clustered_jobs.csv`` in the current working directory. If the scrape
    yields no postings, the remaining stages are skipped with a message
    instead of failing inside preprocessing or clustering.
    """
    print("Scraping jobs from Karkidi.com...")
    df = scrape_karkidi_jobs()
    print(f"Scraped {len(df)} jobs.")

    # Nothing to preprocess or cluster; an empty frame would crash downstream.
    if df.empty:
        print("No jobs scraped; skipping preprocessing, clustering and CSV export.")
        return

    print("Preprocessing skills...")
    df = preprocess_skills(df)

    print("Clustering jobs...")
    df = cluster_jobs(df, n_clusters=5)

    print(df.head())
    df.to_csv("clustered_jobs.csv", index=False)
    print("Saved clustered job data to 'clustered_jobs.csv'")

# Run the full pipeline when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
    main()


Scraping jobs from Karkidi.com...
Scraping page: 1
Scraping page: 2
Scraped 20 jobs.
Preprocessing skills...
Clustering jobs...
                                               Title         Company  \
0          Machine Learning Physical Design Engineer          Google   
1  Staff Software Engineer - Monetization, Poe (R...     Quora, Inc.   
2  Staff Backend Engineer - Bot Creator Ecosystem...     Quora, Inc.   
3  Senior Backend Engineer - Bot Creator Ecosyste...     Quora, Inc.   
4                         Data Scientist Lead - AIML  JPMorgan Chase   

                      Location Experience  \
0  Bengaluru, Karnataka, India   4-6 year   
1                        India  8-10 year   
2                        India  8-10 year   
3                        India   6-8 year   
4  Bengaluru, Karnataka, India   6-8 year   

                                             Summary  \
0  Minimum qualifications:Bachelor's degree in El...   
1  About Quora:Quora’s mission is to grow and sha...   
