In [1]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('../../datasets/training-dataset-job-categorisation.csv')
df.isnull().sum()

job_id                  0
job_title               0
category                0
subcategory             0
role                  875
Updated category    10973
importance          19888
dtype: int64

In [3]:
# Fill null values with the most frequent value in each column
df['role'].fillna(df['role'].mode()[0], inplace=True)
df['importance'].fillna(df['importance'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['role'].fillna(df['role'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['importance'].fillna(df['importance'].mode()[0], inplace=True)


In [4]:
df = df.dropna(subset=['Updated category'])

In [5]:
df.isnull().any()

job_id              False
job_title           False
category            False
subcategory         False
role                False
Updated category    False
importance          False
dtype: bool

In [6]:
import re


def clean_text(text):
    # Convert to lowercase
    text = str(text).lower()
    # Remove special characters and extra spaces
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

df['job_title_clean'] = df['job_title'].apply(clean_text)

In [7]:
# Encode categorical features
le = LabelEncoder()
df['category_encoded'] = le.fit_transform(df['category'])
df['subcategory_encoded'] = le.fit_transform(df['subcategory'])

In [8]:
# Create word count feature
df['title_word_count'] = df['job_title_clean'].str.count('\s+') + 1

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF for job titles
tfidf = TfidfVectorizer(max_features=100)
title_features = tfidf.fit_transform(df['job_title_clean'])

In [11]:
df.tail(5)

Unnamed: 0,job_id,job_title,category,subcategory,role,Updated category,importance,job_title_clean,category_encoded,subcategory_encoded,title_word_count
39688,76651593,Cluster Director of Marcom,Marketing & Communications,Management,Cluster-Director,Marketing/Sales,Niveau 2,cluster director of marcom,21,122,4
39689,76567802,"Marketing Executive, Cruises",Marketing & Communications,Marketing Communications,marketing-executive,Marketing/Sales,Niveau 3,marketing executive cruises,21,134,3
39691,76653277,"Marketing Specialist, APAC",Marketing & Communications,Marketing Communications,marketing-specialist,Marketing/Sales,Niveau 3,marketing specialist apac,21,134,3
39692,76674488,Talent Acquisition Partner,Human Resources & Recruitment,Recruitment - Internal,talent-acquisition-partner,Human Resources (HR),Niveau 3,talent acquisition partner,16,185,3
39694,76653289,Banca Specialist (Klang Valley),Banking & Financial Services,Banking - Retail/Branch,specialist,Finance,Niveau 3,banca specialist klang valley,3,29,4


In [12]:
# Encode target variable
target_encoder = LabelEncoder()
df['target'] = target_encoder.fit_transform(df['Updated category'])

In [13]:
df.head(3)

Unnamed: 0,job_id,job_title,category,subcategory,role,Updated category,importance,job_title_clean,category_encoded,subcategory_encoded,title_word_count,target
2,74679363,Purchasing Executive,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",purchasing-executive,Operations/Logistics,Niveau 1,purchasing executive,20,179,2,7
3,74657915,PURCHASING EXECUTIVE,Engineering,Project Engineering,purchasing-executive,Operations/Logistics,Niveau 1,purchasing executive,11,175,2,7
5,74602737,Admin Assistant,Administration & Office Support,Administrative Assistants,administration-officer,Operations/Logistics,Niveau 3,admin assistant,1,5,2,7


In [14]:
# Combine all features
from scipy.sparse import hstack

X = hstack([
    title_features,
    df[['category_encoded', 'subcategory_encoded', 'title_word_count']].values
])
y = df['target']

In [15]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (22977, 103)
Testing set size: (5745, 103)


In [17]:
from sklearn.ensemble import RandomForestClassifier
# Recommended settings
rf = RandomForestClassifier(
    n_estimators=200,        # More trees for stability
    max_depth=10,           # Control overfitting
    min_samples_split=5,    # More conservative splits
    max_features='sqrt',    # Good for text features
    class_weight='balanced' # Handle imbalanced categories
)

In [18]:
from sklearn.metrics import accuracy_score, classification_report

# Train the RandomForest model
rf.fit(X_train, y_train)

In [19]:
# Predict on the test set
y_pred = rf.predict(X_test)

In [21]:
from sklearn.metrics import f1_score, recall_score

# Evaluate the model

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Calculate F1 score and recall

f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print(f"F1 Score: {f1:.2f}")
print(f"Recall: {recall:.2f}")
# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.91
F1 Score: 0.91
Recall: 0.91
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94       337
           1       0.60      1.00      0.75        25
           2       0.73      1.00      0.84       487
           3       0.92      0.98      0.95       775
           4       0.95      0.78      0.86       199
           5       0.35      0.89      0.51        72
           6       0.97      0.93      0.95      1811
           7       1.00      0.83      0.91      1830
           8       0.83      0.99      0.91       209

    accuracy                           0.91      5745
   macro avg       0.81      0.93      0.84      5745
weighted avg       0.93      0.91      0.91      5745

