In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('dataset.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,Job Title,Job Description
0,0,Flutter Developer,We are looking for hire experts flutter develo...
1,1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...
2,2,Machine Learning,"Data Scientist (Contractor)\n\nBangalore, IN\n..."


In [3]:
df.shape

(2277, 3)

## Data Cleaning

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2277 entries, 0 to 2276
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       2277 non-null   int64 
 1   Job Title        2277 non-null   object
 2   Job Description  2277 non-null   object
dtypes: int64(1), object(2)
memory usage: 53.5+ KB


In [5]:
df.drop(columns=['Unnamed: 0'],inplace=True)

In [6]:
# from sklearn.preprocessing import LabelEncoder
# encoder = LabelEncoder()
# df['Job Title'] = encoder.fit_transform(df['Job Title'])

In [7]:
df.head(3)

Unnamed: 0,Job Title,Job Description
0,Flutter Developer,We are looking for hire experts flutter develo...
1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...
2,Machine Learning,"Data Scientist (Contractor)\n\nBangalore, IN\n..."


In [8]:
df.isnull().sum()

Job Title          0
Job Description    0
dtype: int64

In [9]:
df.duplicated().sum()

np.int64(0)

In [10]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vikas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vikas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Data Preprocessing

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vikas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# import nltk
# from nltk.corpus import stopwords
# import string
# from nltk.stem.porter import PorterStemmer
# ps = PorterStemmer()

# def transform_text(text):
#     text = text.lower()
#     text = nltk.word_tokenize(text)
#     y = []
#     for i in text:
#         if i.isalnum():   
#             y.append(i)
#     text = y[:]   
#     y.clear()     
    
#     for i in text:
#         if i not in stopwords.words('english') and i not in string.punctuation:
#             y.append(i)
#     text = y[:]
#     y.clear()

#     for i in text:
#         y.append(ps.stem(i))
    
#     return " ".join(y)

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lm = WordNetLemmatizer()
result = []
def transform_text(text):
    # Lowercase + keep only words
    tokens = re.findall(r'\b[a-zA-Z]+\b', text.lower())
    
    # Remove stopwords
    tokens = [w for w in tokens if w not in stop_words]
    
    # Lemmatization
    tokens = [lm.lemmatize(w) for w in tokens]
    
    for i in tokens:
        result.append(ps.stem(i))

        

    return " ".join(result)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vikas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vikas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
df['transformed_text']=df['Job Description'].apply(transform_text)

In [14]:
df.head(3)

Unnamed: 0,Job Title,Job Description,transformed_text
0,Flutter Developer,We are looking for hire experts flutter develo...,look hire expert flutter develop elig post app...
1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...,look hire expert flutter develop elig post app...
2,Machine Learning,"Data Scientist (Contractor)\n\nBangalore, IN\n...",look hire expert flutter develop elig post app...


## Model Building

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    tokenizer=str.split,      
    ngram_range=(1,2),       
    max_features=3000,
    min_df=3,                 
    max_df=0.85,              
    lowercase=False
)

X = vectorizer.fit_transform(df['transformed_text'])




In [None]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100, random_state=42)
X_lsa = svd.fit_transform(X)   


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

best_k, best_score = None, -1
for k in range(2, 11):
    km = KMeans(n_clusters=k, init='k-means++', random_state=42, n_init=10)
    labels = km.fit_predict(X_lsa)
    score = silhouette_score(X_lsa, labels)
    print(f"k={k}, silhouette={score:.3f}")
    if score > best_score:
        best_k, best_score = k, score

print(" Best k =", best_k, "with silhouette score =", best_score)


In [None]:
kmn = KMeans(n_clusters=best_k, init='k-means++', random_state=42, n_init=10)
df['Cluster'] = kmn.fit_predict(X_lsa)


In [None]:

print(df['Cluster'].value_counts().sort_values(ascending=False))

for c in range(best_k):
    print(f"\nCluster {c} — sample titles:")
    print(df.loc[df['Cluster']==c, 'Job Title'].value_counts().head(10))


In [None]:

terms = vectorizer.get_feature_names_out()
centroids = kmn.cluster_centers_ @ svd.components_ 
order = np.argsort(centroids, axis=1)[:, ::-1]

for c in range(best_k):
    print(f"\nCluster {c} keywords: {', '.join(terms[order[c, :12]])}")
import numpy as np

terms = vectorizer.get_feature_names_out()
centroids = kmn.cluster_centers_ @ svd.components_  
order = np.argsort(centroids, axis=1)[:, ::-1]

for c in range(best_k):
    print(f"\nCluster {c} keywords: {', '.join(terms[order[c, :12]])}")
