In [1]:
import pandas as pd
import classla
from kategorije_lemmatized_classla import reset_lozinke, cijena_paketa, problem_prijave, ostalo
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import os
import pickle

In [2]:
# Number of examples in each imported category list (quick class-balance check).
tuple(len(examples) for examples in (reset_lozinke, problem_prijave, cijena_paketa, ostalo))

(90, 90, 90, 90)

In [3]:
# Concatenate the four category lists into one flat list with one entry per
# example (not a list that holds a single combined list).
data = reset_lozinke + cijena_paketa + problem_prijave + ostalo

In [4]:
# Flat list of all lemmatized examples, category by category.
data = (
    reset_lozinke
    + cijena_paketa
    + problem_prijave
    + ostalo
)

# One label per example, generated in the same category order as `data`,
# so position i of `labels` describes position i of `data`.
labels = [
    label
    for label, examples in (
        ("reset_lozinke", reset_lozinke),
        ("cijena_paketa", cijena_paketa),
        ("problem_prijave", problem_prijave),
        ("ostalo", ostalo),
    )
    for _ in examples
]

# Two-column frame: the lemmatized text and its intent label.
df = pd.DataFrame({"lemmatized_text": data, "label": labels})

df.head()

Unnamed: 0,lemmatized_text,label
0,zaboraviti biti lozinka moći li ja pomoći da o...,reset_lozinke
1,moliti vi trebati nov lozinka za svoj račun,reset_lozinke
2,ne sjećati sebe šifra kako moći promijeniti pr...,reset_lozinke
3,kako da zamijeniti svoj lozinka jer biti on za...,reset_lozinke
4,trebati link za reset lozinka moliti,reset_lozinke


In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): no `stratify` argument is passed, so class proportions in the
# train/test parts are left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    df['lemmatized_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features feeding a logistic-regression classifier, both with default settings.
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

# Fit on the training part, then label the held-out part.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Accuracy plus support-weighted precision / recall / F1 over the four classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

metrics = dict(
    zip(
        ('Accuracy', 'Precision', 'Recall', 'F1-Score'),
        (accuracy, precision, recall, f1),
    )
)

# One column named after the lemmatizer, one row per metric.
classla_metrics = pd.DataFrame.from_dict(metrics, orient='index', columns=['classla'])
classla_metrics

Unnamed: 0,classla
Accuracy,0.847222
Precision,0.862606
Recall,0.847222
F1-Score,0.849218


In [6]:
# Count rows whose lemmatized text repeats an earlier row (0 means every example is unique).
print(df['lemmatized_text'].duplicated().sum())

0


In [16]:
# Fetch the Croatian ('hr') CLASSLA resources, then build a pipeline limited to
# tokenization, POS tagging and lemmatization.
# NOTE(review): the recorded download log also lists depparse and ner packages,
# which this pipeline does not request; consider restricting the download
# (confirm the supported arguments against the classla API).
classla.download('hr')
nlp = classla.Pipeline('hr', processors='tokenize,pos,lemma')

def classla_lemmatize_croatian(text):
    """Return `text` as a single space-separated string of lemmas.

    Runs `text` through the module-level `nlp` pipeline and joins the lemma of
    every word of every sentence, in document order, with single spaces.
    """
    lemmas = []
    for sentence in nlp(text).sentences:
        for token in sentence.words:
            lemmas.append(token.lemma)
    return ' '.join(lemmas)

Downloading https://raw.githubusercontent.com/clarinsi/classla-resources/main/resources_2.2.json: 15.9kB [00:00, 2.59MB/s]                   
2025-07-14 14:38:03 INFO: Downloading these customized packages for language: hr (Croatian)...
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |
| depparse  | standard |
| ner       | standard |
| pretrain  | standard |

2025-07-14 14:38:03 INFO: File exists: /home/ikar/classla_resources/hr/pos/standard.pt.
2025-07-14 14:38:04 INFO: File exists: /home/ikar/classla_resources/hr/lemma/standard.pt.
2025-07-14 14:38:04 INFO: File exists: /home/ikar/classla_resources/hr/depparse/standard.pt.
2025-07-14 14:38:04 INFO: File exists: /home/ikar/classla_resources/hr/ner/standard.pt.
2025-07-14 14:38:05 INFO: File exists: /home/ikar/classla_resources/hr/pretrain/standard.pt.
2025-07-14 14:38:05 INFO: Finished downloading models and saved to /home/ikar/classla_resources.
2025-07-14 14

KeyboardInterrupt: 

In [8]:
text = "Kako ću lozinku resetirati?"

# The classifier was fitted on lemmatized text, so lemmatize the query first.
lemmatized_text = classla_lemmatize_croatian(text)

# Wrap the lemmatized text in a list: the pipeline expects a collection of documents.
lemmatized_input = [lemmatized_text]

# Probability predictions, one row per input document.
proba = model.predict_proba(lemmatized_input)

# Print class probabilities with labels. The loop variable is deliberately not
# named `text`, so the raw query defined above is not overwritten.
for lemmatized_query, probs in zip(lemmatized_input, proba):
    print(f"\nInput: {lemmatized_query}")
    for cls, prob in zip(model.classes_, probs):
        print(f"Class: {cls}, Probability: {prob:.4f}")

# Print predicted class
print("\nPredicted class:", model.predict(lemmatized_input)[0])


Input: kako htjeti lozinka resetirati ?
Class: cijena_paketa, Probability: 0.1311
Class: ostalo, Probability: 0.3335
Class: problem_prijave, Probability: 0.1146
Class: reset_lozinke, Probability: 0.4208

Predicted class: reset_lozinke


In [9]:
text = "Pojasnite mi vaše pakete."

# The classifier was fitted on lemmatized text, so lemmatize the query first.
lemmatized_text = classla_lemmatize_croatian(text)

# Wrap the lemmatized text in a list: the pipeline expects a collection of documents.
lemmatized_input = [lemmatized_text]

# Probability predictions, one row per input document.
proba = model.predict_proba(lemmatized_input)

# Print class probabilities with labels. The loop variable is deliberately not
# named `text`, so the raw query defined above is not overwritten.
for lemmatized_query, probs in zip(lemmatized_input, proba):
    print(f"\nInput: {lemmatized_query}")
    for cls, prob in zip(model.classes_, probs):
        print(f"Class: {cls}, Probability: {prob:.4f}")

# Print predicted class
print("\nPredicted class:", model.predict(lemmatized_input)[0])


Input: pojasniti ja vaš paket .
Class: cijena_paketa, Probability: 0.7698
Class: ostalo, Probability: 0.0673
Class: problem_prijave, Probability: 0.0969
Class: reset_lozinke, Probability: 0.0659

Predicted class: cijena_paketa


## With stopword removal (surprisingly, not a clear improvement: confidence rises for the password query but drops for the package query)

In [10]:
# The stopword list sits one directory above the current working directory.
stopwords_path = os.path.join(os.path.abspath('..'), 'croatian_stopwords.txt')

# Read one stopword per line, trimming whitespace and skipping blank lines.
with open(stopwords_path, encoding='utf-8') as f:
    stopwords_hr = set(filter(None, map(str.strip, f)))

In [11]:
def clean_text(text, stopwords):
    """Normalize `text` and drop stopwords.

    The text is stripped and lower-cased, ASCII punctuation and digit runs are
    deleted (not replaced by spaces), and the result is split on whitespace.
    Tokens found in `stopwords` are removed; the rest are re-joined with single
    spaces.

    Args:
        text: Raw input string.
        stopwords: Container of lower-case tokens to discard (membership test only).

    Returns:
        The cleaned string; empty if nothing survives.
    """
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    normalized = text.strip().lower()
    # Punctuation is removed first, then digits, each by plain deletion.
    for pattern in (punctuation_class, r"\d+"):
        normalized = re.sub(pattern, "", normalized)
    return " ".join(token for token in normalized.split() if token not in stopwords)

In [12]:
# Raw query, kept in a list so several sentences could be processed at once.
text = ["Kako ću lozinku resetirati?"]

# Cleaning (lower-casing, punctuation/digit and stopword removal) runs on the
# raw sentence, before lemmatization.
new_text_clean = list(map(lambda raw: clean_text(raw, stopwords_hr), text))

# Lemmatize the cleaned sentences so they match the form the model was fitted on.
lemmatized_texts = list(map(classla_lemmatize_croatian, new_text_clean))

# One row of class probabilities per sentence.
proba = model.predict_proba(lemmatized_texts)

# Per-sentence report: the model input followed by every class probability.
for row, sentence in enumerate(lemmatized_texts):
    print(f"\nInput: {sentence}")
    probs = proba[row]
    for cls, prob in zip(model.classes_, probs):
        print(f"Class: {cls}, Probability: {prob:.4f}")

# Predicted label of the first sentence only.
first_prediction = model.predict(lemmatized_texts)[0]
print("\nPredicted class:", first_prediction)



Input: lozinka resetirati
Class: cijena_paketa, Probability: 0.1317
Class: ostalo, Probability: 0.1588
Class: problem_prijave, Probability: 0.1421
Class: reset_lozinke, Probability: 0.5673

Predicted class: reset_lozinke


In [13]:
# Raw query, kept in a list so several sentences could be processed at once.
text = ["Pojasnite mi vaše pakete."]

# Cleaning (lower-casing, punctuation/digit and stopword removal) runs on the
# raw sentence, before lemmatization.
new_text_clean = list(map(lambda raw: clean_text(raw, stopwords_hr), text))

# Lemmatize the cleaned sentences so they match the form the model was fitted on.
lemmatized_texts = list(map(classla_lemmatize_croatian, new_text_clean))

# One row of class probabilities per sentence.
proba = model.predict_proba(lemmatized_texts)

# Per-sentence report: the model input followed by every class probability.
for row, sentence in enumerate(lemmatized_texts):
    print(f"\nInput: {sentence}")
    probs = proba[row]
    for cls, prob in zip(model.classes_, probs):
        print(f"Class: {cls}, Probability: {prob:.4f}")

# Predicted label of the first sentence only.
first_prediction = model.predict(lemmatized_texts)[0]
print("\nPredicted class:", first_prediction)



Input: pojasniti paket
Class: cijena_paketa, Probability: 0.6584
Class: ostalo, Probability: 0.1245
Class: problem_prijave, Probability: 0.1304
Class: reset_lozinke, Probability: 0.0866

Predicted class: cijena_paketa


In [None]:
# Save the trained model: serialize the whole fitted pipeline (TF-IDF vectorizer
# plus classifier) to 'model.bin' in the current working directory.
# The pipeline was fitted on lemmatized text, so anything loading this file must
# lemmatize its input the same way before predicting.
# Only unpickle this file from a trusted source: loading a pickle can execute code.
with open('model.bin', 'wb') as f:
    pickle.dump(model, f)