In [1]:
import pandas as pd
from kategorije_lemmatized_stanza import reset_lozinke, cijena_paketa, problem_prijave, ostalo
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import stanza
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import os
import pickle

In [2]:
# Number of samples per category, in the order:
# reset_lozinke, problem_prijave, cijena_paketa, ostalo.
tuple(map(len, (reset_lozinke, problem_prijave, cijena_paketa, ostalo)))

(90, 90, 90, 90)

In [3]:
# Flat list of every lemmatized sentence, concatenated category by category.
# The concatenation is intentionally NOT wrapped in an extra [...]: that would
# produce a one-element list holding the whole corpus, which cannot be paired
# row-by-row with per-sentence labels.
data = reset_lozinke + cijena_paketa + problem_prijave + ostalo

In [4]:
# All lemmatized sentences, concatenated category by category.
data = reset_lozinke + cijena_paketa + problem_prijave + ostalo

# One label per sentence, emitted in the same category order as `data`
# so that row i of `labels` describes row i of `data`.
labels = [
    category
    for category, samples in (
        ("reset_lozinke", reset_lozinke),
        ("cijena_paketa", cijena_paketa),
        ("problem_prijave", problem_prijave),
        ("ostalo", ostalo),
    )
    for _ in samples
]

# Two-column frame: the lemmatized sentence and its category label.
df = pd.DataFrame({"lemmatized_text": data, "label": labels})

df.head()

Unnamed: 0,lemmatized_text,label
0,zaboraviti biti lozinak moći li ja pomoći da b...,reset_lozinke
1,moliti vi trebati nov lozinak za svoj račun,reset_lozinke
2,ne sjećati sebe šifra kako moći promijeniti pr...,reset_lozinke
3,kako da zamijeniti svoj lozinak jer biti biti ...,reset_lozinke
4,trebati link za reset lozinak moliti,reset_lozinke


In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["lemmatized_text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features feeding a logistic-regression classifier, both with default settings.
model = Pipeline(steps=[("tfidf", TfidfVectorizer()), ("clf", LogisticRegression())])

# Fit on the training split, then label the held-out texts.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy plus support-weighted precision / recall / F1 across the classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average="weighted")
    for scorer in (precision_score, recall_score, f1_score)
)

metrics = dict(
    zip(
        ("Accuracy", "Precision", "Recall", "F1-Score"),
        (accuracy, precision, recall, f1),
    )
)

# One row per metric, in a single column named after the lemmatizer used.
stanza_metrics = pd.DataFrame.from_dict(metrics, orient="index", columns=["stanza"])
stanza_metrics

Unnamed: 0,stanza
Accuracy,0.847222
Precision,0.862606
Recall,0.847222
F1-Score,0.849218


In [6]:
# Count rows whose lemmatized text repeats an earlier row (0 means every sample is unique).
print(df["lemmatized_text"].duplicated().sum())

0


In [7]:
# Fetch the Croatian ("hr") Stanza resources, then build a pipeline restricted to
# the tokenize, pos and lemma processors.
stanza.download(lang='hr')
nlp = stanza.Pipeline(lang='hr', processors='tokenize,pos,lemma')

def stanza_lemmatize_croatian(text):
    """Lemmatize ``text`` with the module-level Stanza pipeline ``nlp``.

    Args:
        text: String holding one or more sentences to lemmatize.

    Returns:
        The lemma of every word, in document order, joined by single spaces.
        Empty or whitespace-only input yields "" without running the pipeline.
        A word for which the pipeline reports no lemma (``None``) is kept in
        its surface form.
    """
    # Blank input has nothing to lemmatize (clean_text returns "" when every
    # token is filtered out), so skip the pipeline call entirely.
    if not text.strip():
        return ""

    doc = nlp(text)

    # str.join raises TypeError on None, so fall back to the original token
    # text whenever a word has no lemma.
    return ' '.join(
        word.lemma if word.lemma is not None else word.text
        for sent in doc.sentences
        for word in sent.words
    )

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-07-14 12:31:00 INFO: Downloaded file to /home/ikar/stanza_resources/resources.json
2025-07-14 12:31:00 INFO: Downloading default packages for language: hr (Croatian) ...
2025-07-14 12:31:01 INFO: File exists: /home/ikar/stanza_resources/hr/default.zip
2025-07-14 12:31:02 INFO: Finished downloading models and saved to /home/ikar/stanza_resources
2025-07-14 12:31:02 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-07-14 12:31:02 INFO: Downloaded file to /home/ikar/stanza_resources/resources.json
2025-07-14 12:31:03 INFO: Loading these models for language: hr (Croatian):
| Processor | Package      |
----------------------------
| tokenize  | set          |
| pos       | set_nocharlm |
| lemma     | set_nocharlm |

2025-07-14 12:31:03 INFO: Using device: cpu
2025-07-14 12:31:03 INFO: Loading: tokenize
2025-07-14 12:31:04 INFO: Loading: pos
2025-07-14 12:31:05 INFO: Loading: lemma
2025-07-14 12:31:05 INFO: Done loading processors!


In [8]:
# Raw user utterance to classify.
text = "Kako ću lozinku resetirati?"

# The model was fitted on the lemmatized_text column, so lemmatize the input too.
lemmatized_text = stanza_lemmatize_croatian(text)

# predict_proba / predict take a collection of documents, hence the one-element list.
lemmatized_input = [lemmatized_text]

# One row of class probabilities per input document.
proba = model.predict_proba(lemmatized_input)

# Print class probabilities with labels. The loop variable is deliberately not
# named `text`, so the raw utterance defined above is not overwritten.
for lemmatized_sentence, probs in zip(lemmatized_input, proba):
    print(f"\nInput: {lemmatized_sentence}")
    for cls, prob in zip(model.classes_, probs):
        print(f"Class: {cls}, Probability: {prob:.4f}")

# Print predicted class
print("\nPredicted class:", model.predict(lemmatized_input)[0])


Input: kako htjeti lozinak resetirati ?
Class: cijena_paketa, Probability: 0.1329
Class: ostalo, Probability: 0.3360
Class: problem_prijave, Probability: 0.1195
Class: reset_lozinke, Probability: 0.4115

Predicted class: reset_lozinke


In [9]:
# Raw user utterance to classify.
text = "Pojasnite mi vaše pakete."

# The model was fitted on the lemmatized_text column, so lemmatize the input too.
lemmatized_text = stanza_lemmatize_croatian(text)

# predict_proba / predict take a collection of documents, hence the one-element list.
lemmatized_input = [lemmatized_text]

# One row of class probabilities per input document.
proba = model.predict_proba(lemmatized_input)

# Print class probabilities with labels. The loop variable is deliberately not
# named `text`, so the raw utterance defined above is not overwritten.
for lemmatized_sentence, probs in zip(lemmatized_input, proba):
    print(f"\nInput: {lemmatized_sentence}")
    for cls, prob in zip(model.classes_, probs):
        print(f"Class: {cls}, Probability: {prob:.4f}")

# Print predicted class
print("\nPredicted class:", model.predict(lemmatized_input)[0])


Input: pojasniti ja vaš paket .
Class: cijena_paketa, Probability: 0.7718
Class: ostalo, Probability: 0.0653
Class: problem_prijave, Probability: 0.0957
Class: reset_lozinke, Probability: 0.0672

Predicted class: cijena_paketa


## With Stopword Removal

In [11]:
# The stopword list is looked up one directory above the current working directory.
stopwords_path = os.path.join(os.path.abspath(os.pardir), 'croatian_stopwords.txt')

# One stopword per line: trim surrounding whitespace and skip blank lines.
with open(stopwords_path, encoding='utf-8') as f:
    stopwords_hr = set(filter(None, map(str.strip, f)))

In [12]:
def clean_text(text, stopwords):
    """Normalize ``text`` and drop stopwords.

    Steps, in order: trim and lowercase; delete punctuation; delete digit
    runs; split on whitespace; discard tokens found in ``stopwords``.

    Punctuation means every character in ``string.punctuation`` plus any
    character whose Unicode general category starts with "P". This covers
    typographic quotes, dashes and the ellipsis, which would otherwise stay
    glued to a token and stop it from matching a stopword.

    Args:
        text: Raw input string.
        stopwords: Container of lowercase tokens to remove (membership is
            tested with ``in``).

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
    """
    # Local import keeps this cell self-contained without touching the import cell.
    import unicodedata

    text = text.strip().lower()

    # Deleting (not replacing with a space) matches the previous ASCII-only behaviour.
    text = "".join(
        ch
        for ch in text
        if ch not in string.punctuation
        and not unicodedata.category(ch).startswith("P")
    )

    text = re.sub(r"\d+", "", text)

    return " ".join(token for token in text.split() if token not in stopwords)

In [13]:
# Raw utterance(s) to classify
text = ["Kako ću lozinku resetirati?"]

# Step 1: lowercase, strip punctuation/digits and drop stopwords.
# NOTE(review): the training sample shown by df.head() still contains words such
# as "kako", which this step removes here — confirm that filtering stopwords only
# at prediction time is intended.
new_text_clean = [clean_text(raw, stopwords_hr) for raw in text]

# Step 2: lemmatize whatever survived the cleaning.
lemmatized_texts = [stanza_lemmatize_croatian(cleaned) for cleaned in new_text_clean]

# Step 3: one row of class probabilities per lemmatized sentence.
proba = model.predict_proba(lemmatized_texts)

# Report every class with its probability, sentence by sentence.
for sentence, probs in zip(lemmatized_texts, proba):
    print(f"\nInput: {sentence}")
    for cls, prob in zip(model.classes_, probs):
        print(f"Class: {cls}, Probability: {prob:.4f}")

# Final label for the first (here: only) sentence.
print("\nPredicted class:", model.predict(lemmatized_texts)[0])


Input: lozinak resetirati
Class: cijena_paketa, Probability: 0.1378
Class: ostalo, Probability: 0.1642
Class: problem_prijave, Probability: 0.1539
Class: reset_lozinke, Probability: 0.5441

Predicted class: reset_lozinke


In [14]:
# Raw utterance(s) to classify
text = ["Pojasnite mi vaše pakete."]

# Step 1: lowercase, strip punctuation/digits and drop stopwords.
# NOTE(review): the model was fitted on text that was apparently not
# stopword-filtered (see the df.head() sample) — confirm that filtering only at
# prediction time is intended.
new_text_clean = [clean_text(raw, stopwords_hr) for raw in text]

# Step 2: lemmatize whatever survived the cleaning.
lemmatized_texts = [stanza_lemmatize_croatian(cleaned) for cleaned in new_text_clean]

# Step 3: one row of class probabilities per lemmatized sentence.
proba = model.predict_proba(lemmatized_texts)

# Report every class with its probability, sentence by sentence.
for sentence, probs in zip(lemmatized_texts, proba):
    print(f"\nInput: {sentence}")
    for cls, prob in zip(model.classes_, probs):
        print(f"Class: {cls}, Probability: {prob:.4f}")

# Final label for the first (here: only) sentence.
print("\nPredicted class:", model.predict(lemmatized_texts)[0])



Input: pojasniti paket
Class: cijena_paketa, Probability: 0.6597
Class: ostalo, Probability: 0.1219
Class: problem_prijave, Probability: 0.1294
Class: reset_lozinke, Probability: 0.0890

Predicted class: cijena_paketa


In [16]:
# Persist the fitted pipeline (vectorizer + classifier) to model.bin using pickle.
# NOTE: pickle files can execute code when loaded; only unpickle from a trusted source.
with open('model.bin', mode='wb') as f:
    pickle.dump(model, f)