In [1]:
import pandas as pd
from kategorije_lemmatized_nltk import reset_lozinke, cijena_paketa, problem_prijave, ostalo
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import os
import pickle

In [2]:
# Number of example sentences per category (same order as the original display)
tuple(len(examples) for examples in (reset_lozinke, problem_prijave, cijena_paketa, ostalo))

(90, 90, 90, 90)

In [3]:
# NOTE(review): the outer brackets make `data` a one-element list whose single
# item is the concatenation of the four category sequences. The next cell
# rebinds `data` to the flat concatenation, so this value is never used.
data = [reset_lozinke + cijena_paketa + problem_prijave + ostalo]

In [4]:
# All example texts, concatenated category by category
data = reset_lozinke + cijena_paketa + problem_prijave + ostalo

# One label per example, repeated in the same category order as `data`
labels = [
    category
    for category, examples in (
        ("reset_lozinke", reset_lozinke),
        ("cijena_paketa", cijena_paketa),
        ("problem_prijave", problem_prijave),
        ("ostalo", ostalo),
    )
    for _ in examples
]

# Two-column frame: the text and its target class
df = pd.DataFrame({"lemmatized_text": data, "label": labels})

df.head()

Unnamed: 0,lemmatized_text,label
0,zaboravio sam lozinku možete li mi pomoći da j...,reset_lozinke
1,molim va trebam novu lozinku za svoj račun,reset_lozinke
2,ne sjećam se šifre kako mogu promijeniti prist...,reset_lozinke
3,kako da zamijenim svoju lozinku jer sam je zab...,reset_lozinke
4,trebam link za reset lozinke molim,reset_lozinke


In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['lemmatized_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features feeding a logistic-regression classifier
model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

# Fit on the training split, then predict the held-out split
y_pred = model.fit(X_train, y_train).predict(X_test)

# Accuracy plus class-weighted precision / recall / F1 on the held-out split
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
}

# One row per metric, single column named 'nltk'
nltk_metrics = pd.DataFrame.from_dict(metrics, orient='index', columns=['nltk'])
nltk_metrics

Unnamed: 0,nltk
Accuracy,0.875
Precision,0.884959
Recall,0.875
F1-Score,0.876421


In [6]:
# Count rows whose text repeats an earlier row
print(df['lemmatized_text'].duplicated().sum())

0


In [7]:
''' NLTK in wordnet only in English'''
# Fetch the WordNet corpus used by WordNetLemmatizer. As the note above says,
# WordNet covers English only, so non-English tokens are not expected to be
# normalised by the lemmatizer.
nltk.download('wordnet')

# Module-level Porter stemmer and WordNet lemmatizer, shared by the helper
# functions defined below in this cell.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Function to apply stemming
def stem_text(text):
    """Stem each whitespace-separated token of `text` with the shared Porter
    stemmer and return the stems joined by single spaces."""
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

# Function to apply lemmatization
def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of `text` with the shared
    WordNet lemmatizer and return the results joined by single spaces."""
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

[nltk_data] Downloading package wordnet to /home/ikar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Raw user query to classify
text = "Kako ću lozinku resetirati?"

# Run the query through the WordNet-based lemmatizer helper
lemmatized_text = lemmatize_text(text)

# The pipeline expects an iterable of documents, so wrap the single string in a list
lemmatized_input = [lemmatized_text]

# Per-class probabilities, one row per input document
proba = model.predict_proba(lemmatized_input)

# Print class probabilities with labels.
# The loop variable is deliberately not named `text`: reusing that name would
# overwrite the raw query above with the lemmatized string once the loop ends.
for document, probs in zip(lemmatized_input, proba):
    print(f"\nInput: {document}")
    for cls, prob in zip(model.classes_, probs):
        print(f"Class: {cls}, Probability: {prob:.4f}")

# Print predicted class
print("\nPredicted class:", model.predict(lemmatized_input)[0])


Input: Kako ću lozinku resetirati?
Class: cijena_paketa, Probability: 0.1720
Class: ostalo, Probability: 0.3328
Class: problem_prijave, Probability: 0.1334
Class: reset_lozinke, Probability: 0.3619

Predicted class: reset_lozinke


In [9]:
# Raw user query to classify
text = "Pojasnite mi vaše pakete."

# Run the query through the WordNet-based lemmatizer helper
lemmatized_text = lemmatize_text(text)

# The pipeline expects an iterable of documents, so wrap the single string in a list
lemmatized_input = [lemmatized_text]

# Per-class probabilities, one row per input document
proba = model.predict_proba(lemmatized_input)

# Print class probabilities with labels.
# The loop variable is deliberately not named `text`: reusing that name would
# overwrite the raw query above with the lemmatized string once the loop ends.
for document, probs in zip(lemmatized_input, proba):
    print(f"\nInput: {document}")
    for cls, prob in zip(model.classes_, probs):
        print(f"Class: {cls}, Probability: {prob:.4f}")

# Print predicted class
print("\nPredicted class:", model.predict(lemmatized_input)[0])


Input: Pojasnite mi vaše pakete.
Class: cijena_paketa, Probability: 0.3929
Class: ostalo, Probability: 0.1717
Class: problem_prijave, Probability: 0.2264
Class: reset_lozinke, Probability: 0.2090

Predicted class: cijena_paketa


## With Stopword Removal

In [10]:
# The stopword file is looked up in the parent of the current working directory
stopwords_path = os.path.join(
    os.path.abspath('..'),
    'croatian_stopwords.txt',
)

# Each line is treated as one entry: trim whitespace and drop lines that end up empty
with open(stopwords_path, encoding='utf-8') as f:
    stopwords_hr = set(filter(None, map(str.strip, f)))

In [11]:
def clean_text(text, stopwords):
    """Normalise a sentence and drop stopwords.

    The text is trimmed and lower-cased, every character in
    ``string.punctuation`` and every run of digits is deleted, and the
    remaining whitespace-separated tokens that are not in ``stopwords``
    are returned joined by single spaces.
    """
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    lowered = text.strip().lower()
    without_punct = re.sub(punctuation_class, "", lowered)
    without_digits = re.sub(r"\d+", "", without_punct)

    kept = []
    for token in without_digits.split():
        if token in stopwords:
            continue
        kept.append(token)
    return " ".join(kept)

In [12]:
# Raw query, wrapped in a list so the steps below work on a batch
text = ["Kako ću lozinku resetirati?"]

# Normalise and strip stopwords from every sentence
new_text_clean = [clean_text(raw, stopwords_hr) for raw in text]

# Lemmatize the cleaned sentences
lemmatized_texts = list(map(lemmatize_text, new_text_clean))

# Per-class probabilities, one row per sentence
proba = model.predict_proba(lemmatized_texts)

# Report the probability of each class for each sentence
for row, document in enumerate(lemmatized_texts):
    print(f"\nInput: {document}")
    for col, cls in enumerate(model.classes_):
        print(f"Class: {cls}, Probability: {proba[row, col]:.4f}")

# Report the winning class for the first sentence
print("\nPredicted class:", model.predict(lemmatized_texts)[0])


Input: lozinku resetirati
Class: cijena_paketa, Probability: 0.2010
Class: ostalo, Probability: 0.1826
Class: problem_prijave, Probability: 0.1876
Class: reset_lozinke, Probability: 0.4289

Predicted class: reset_lozinke


In [13]:
# Raw query, wrapped in a list so the steps below work on a batch
text = ["Pojasnite mi vaše pakete."]

# Normalise and strip stopwords from every sentence
new_text_clean = [clean_text(raw, stopwords_hr) for raw in text]

# Lemmatize the cleaned sentences
lemmatized_texts = list(map(lemmatize_text, new_text_clean))

# Per-class probabilities, one row per sentence
proba = model.predict_proba(lemmatized_texts)

# Report the probability of each class for each sentence
for row, document in enumerate(lemmatized_texts):
    print(f"\nInput: {document}")
    for col, cls in enumerate(model.classes_):
        print(f"Class: {cls}, Probability: {proba[row, col]:.4f}")

# Report the winning class for the first sentence
print("\nPredicted class:", model.predict(lemmatized_texts)[0])



Input: pojasnite pakete
Class: cijena_paketa, Probability: 0.3895
Class: ostalo, Probability: 0.2393
Class: problem_prijave, Probability: 0.2196
Class: reset_lozinke, Probability: 0.1516

Predicted class: cijena_paketa


In [14]:
# Save the trained model to a file
# Serialises the fitted pipeline `model` with pickle into 'model.bin', a path
# relative to the current working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a trusted source.
with open('model.bin', 'wb') as f:
    pickle.dump(model, f)