In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

job_sectors = [
    "Internship/ Summer Jobs/ Trainee", "Healthcare", "Industrial Work", "Installation and Maintenance",
    "Building and Construction", "Customer Service", "Hotel and Restaurant Work",
    "Sales", "Community and Social Services", "Logistics",
    "Administration", "Manufacturing", "Property Maintenance",
    "Finance", "Education", "Engineering", "IT",
    "Management","Other"
    ]

df = pd.read_csv(r"C:\Users\hesam\OneDrive - LUT University\Desktop\job_samples.csv", encoding="utf-8")

df = df[df["category"].isin(job_sectors)]

x = df["title"]
y = df["category"]

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) # use random_state for reproducibility, it means you can get the same split every time you run the code


c:\Users\hesam\AppData\Local\Programs\Python\Python312\python.exe


In [42]:
import optuna
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score

# finnish stop words
finnish_stopwords = stopwords.words("finnish")
english_stopwords = stopwords.words("english")
all_stopwords = list(set(finnish_stopwords + english_stopwords))

def create_pipeline(trial):
    penalty = trial.suggest_categorical("penalty", ["l2"])
    C = trial.suggest_loguniform("C", 0.01, 10)
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(
            lowercase=True,
            stop_words=all_stopwords,
            max_features=200_000
        )),
        ("clf", LogisticRegression(
            max_iter=2000,
            solver="saga",
            n_jobs=-1,
            penalty=penalty,
            C=C,
        ))
    ])

    # train the model
    pipeline.fit(X_train, y_train) 
    preds = pipeline.predict(X_test)
    return accuracy_score(y_test, preds)


study = optuna.create_study(direction="maximize")
study.optimize(create_pipeline, n_trials=20)

print("Best hyperparameters: ", study.best_params)
print("Best accuracy: ", study.best_value)

# Best hyperparameters:  {'penalty': 'l2', 'C': 2.306691231862669}
# Best accuracy:  0.7942407046667231

[I 2025-08-31 22:50:51,951] A new study created in memory with name: no-name-fd6e03ee-cf7e-4da0-9341-c0d73ac1ba42
  C = trial.suggest_loguniform("C", 0.01, 10)
[I 2025-08-31 22:51:06,993] Trial 0 finished with value: 0.7938341661726095 and parameters: {'penalty': 'l2', 'C': 3.9388777343323116}. Best is trial 0 with value: 0.7938341661726095.
  C = trial.suggest_loguniform("C", 0.01, 10)
[I 2025-08-31 22:51:20,145] Trial 1 finished with value: 0.7939696790039807 and parameters: {'penalty': 'l2', 'C': 3.2496765934045575}. Best is trial 1 with value: 0.7939696790039807.
  C = trial.suggest_loguniform("C", 0.01, 10)
[I 2025-08-31 22:51:35,592] Trial 2 finished with value: 0.7938341661726095 and parameters: {'penalty': 'l2', 'C': 3.965733896438594}. Best is trial 1 with value: 0.7939696790039807.
  C = trial.suggest_loguniform("C", 0.01, 10)
[I 2025-08-31 22:51:46,023] Trial 3 finished with value: 0.5979842466333531 and parameters: {'penalty': 'l2', 'C': 0.010601983472868866}. Best is trial

Best hyperparameters:  {'penalty': 'l2', 'C': 2.7493888827317488}
Best accuracy:  0.7945625476412298


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score

# finnish stop words
finnish_stopwords = stopwords.words("finnish")
english_stopwords = stopwords.words("english")
all_stopwords = list(set(finnish_stopwords + english_stopwords))


pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        stop_words=all_stopwords,
        max_features=200_000
    )),
    ("clf", LogisticRegression(
        max_iter=2000,
        solver="saga",
        n_jobs=-1,
        penalty="l2",
        C=2.486691231862669,
    ))
])

pipeline.fit(X_train, y_train) 


In [35]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = pipeline.predict(X_test)

train_preds = pipeline.predict(X_train)
test_preds = pipeline.predict(X_test)

train_accuracy = accuracy_score(y_train, train_preds)
test_accuracy = accuracy_score(y_test, y_pred)

print(classification_report(y_test, y_pred))
print(f"model Accuracy: {pipeline.score(X_test, y_test)}")
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")


                                  precision    recall  f1-score   support

                  Administration       0.80      0.53      0.63       684
       Building and Construction       0.80      0.81      0.80      2476
   Community and Social Services       0.77      0.67      0.72      3151
                Customer Service       0.62      0.57      0.59      1828
                       Education       0.93      0.94      0.94      7464
                     Engineering       0.83      0.78      0.80      2100
                         Finance       0.81      0.82      0.81      1234
                      Healthcare       0.92      0.90      0.91      5096
       Hotel and Restaurant Work       0.91      0.91      0.91      3349
                              IT       0.86      0.86      0.86      3795
                 Industrial Work       0.55      0.47      0.50      2243
    Installation and Maintenance       0.86      0.86      0.86      3202
Internship/ Summer Jobs/ Trainee     

In [45]:
import joblib

joblib.dump(pipeline, "job_tagger_pipeline.pkl")

['job_tagger_pipeline.pkl']

In [None]:
# test the model with the random job title

job_tagger_pipeline = joblib.load("job_tagger_pipeline.pkl")


new_jobs = [
    "React Engineer",
    "Rakennusinsinööri",
    # "Sairaanhoitaja",
    # "Opettaja",
    # "Keittiöapulainen",
    # "Varastotyöntekijä",
    # "Taloushallinnon assistentti",
    # "Automekaanikko",
    # "Siivooja",
    # "Projektipäällikkö",
    # "IT-asiantuntija",
    # "Data Scientist",
    # "Product Manager",
    # "Chief Financial Officer",
    # "Marketing Specialist",
    # "Mechanical Engineer",
    # "School Teacher",
    # "Nurse",
]

# Get the predicted probabilities
probs = job_tagger_pipeline.predict_proba(new_jobs)
max_probs = probs.max(axis=1)

predictions = job_tagger_pipeline.predict(new_jobs)
for i in range(len(new_jobs)):
    print(new_jobs[i], predictions[i], max_probs[i])

React Engineer IT 0.7030764893229988
Rakennusinsinööri Engineering 0.8635619595737497
Sairaanhoitaja Healthcare 0.9999996184385805
Opettaja Education 0.9996775727484922
Keittiöapulainen Hotel and Restaurant Work 0.9943885584918005
Varastotyöntekijä Logistics 0.9793144368526907
Taloushallinnon assistentti Finance 0.6854933788462086
Automekaanikko Installation and Maintenance 0.7387973020996846
Siivooja Property Maintenance 0.4980600096061379
Projektipäällikkö Management 0.9928271975752226
IT-asiantuntija IT 0.42288710519431283
