1. Import i dane

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK resources the preprocessing helpers rely on: the stop-word
# lists (read via stopwords.words) and the WordNet data (presumably what
# WordNetLemmatizer looks words up in - confirm against the NLTK docs).
nltk.download("stopwords")
nltk.download("wordnet")

2. Czyszczenie tekstu i przetwarzanie

In [None]:
# Funkcje przetwarzające tekst
def clean_text(text):
    """Normalize a raw document into lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase, strip URLs (``http...`` and ``www.`` forms),
    drop every character that is not ``a-z`` or whitespace, collapse runs of
    whitespace and trim the ends.

    Args:
        text: The raw document. Anything that is not a ``str`` (e.g. NaN/None)
            is treated as missing.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # The dot after "www" must be escaped. Unescaped, "." also matches a space,
    # so input such as "awww cute" was treated as a URL and lost "www cute".
    text = re.sub(r"http\S+|www\.\S+", "", text)
    # Removed characters are deleted, not replaced, so "don't" becomes "dont".
    text = re.sub(r"[^a-z\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()

# Lazily filled cache for objects that are expensive to rebuild for every row
# (the stop-word set is read from the NLTK corpus on each stopwords.words call).
_TEXT_RESOURCES = {}


def _get_text_resource(key, factory):
    """Return the cached object stored under ``key``, building it once via ``factory``."""
    if key not in _TEXT_RESOURCES:
        _TEXT_RESOURCES[key] = factory()
    return _TEXT_RESOURCES[key]


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only on first use."""
    return _get_text_resource(
        "stop_words", lambda: frozenset(stopwords.words("english"))
    )


def apply_stemming(text):
    """Remove English stop words from ``text`` and stem the remaining tokens.

    Args:
        text: Whitespace-separated text (tokens are obtained with ``str.split``).

    Returns:
        The stemmed tokens joined by single spaces; ``""`` when nothing remains.
    """
    stemmer = _get_text_resource("stemmer", PorterStemmer)
    stop_words = _english_stop_words()
    return " ".join(
        stemmer.stem(word) for word in text.split() if word not in stop_words
    )


def apply_lemmatization(text):
    """Remove English stop words from ``text`` and lemmatize the remaining tokens.

    Args:
        text: Whitespace-separated text (tokens are obtained with ``str.split``).

    Returns:
        The lemmatized tokens joined by single spaces; ``""`` when nothing remains.
    """
    lemmatizer = _get_text_resource("lemmatizer", WordNetLemmatizer)
    stop_words = _english_stop_words()
    return " ".join(
        lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words
    )


3. Wczytywanie i przygotowanie danych

In [None]:
# Load the raw training data.
df = pd.read_csv("data/train.csv")
# Drop rows missing either the text or its Category: a NaN Category would
# otherwise reach LabelEncoder below, which cannot sort mixed str/NaN labels.
df = df.dropna(subset=["text", "Category"]).drop_duplicates()

# Derive the cleaned text and its two normalized variants.
df["clean_text"] = df["text"].apply(clean_text)
df["stemmed"] = df["clean_text"].apply(apply_stemming)
df["lemmatized"] = df["clean_text"].apply(apply_lemmatization)

# Encode the Category labels as integers.
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["Category"])


4. Balansowanie danych

In [None]:
def balance_dataframe(df, label_col, n_per_class, random_state=42):
    """Down-sample every class to at most ``n_per_class`` rows.

    Groups are sampled one by one and concatenated instead of going through
    ``groupby().apply()``: applying over a frame that includes the grouping
    column is deprecated, and pandas versions that exclude it would return a
    result without ``label_col``.

    Args:
        df: Source frame; it is not modified.
        label_col: Name of the column holding the class of each row.
        n_per_class: Maximum number of rows kept per class. Classes with fewer
            rows are kept whole.
        random_state: Seed passed to ``DataFrame.sample`` for every class.

    Returns:
        A new frame with all original columns, classes in sorted order, and a
        fresh ``0..n-1`` index. Rows whose label is NaN are not included
        (``groupby`` skips them by default).
    """
    sampled_groups = [
        group.sample(n=min(n_per_class, len(group)), random_state=random_state)
        for _, group in df.groupby(label_col)
    ]
    if not sampled_groups:
        # pd.concat raises on an empty list; return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(sampled_groups).reset_index(drop=True)

# Keep at most 200 rows per Category, then encode the sampled categories
# with the already-fitted label_encoder.
df_balanced = balance_dataframe(df, label_col="Category", n_per_class=200)
df_balanced = df_balanced.assign(
    label=label_encoder.transform(df_balanced["Category"])
)


5. Modele i ewaluacja krzyżowa

In [None]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Registry of the classifiers compared in cross-validation, keyed by display name.
# The liblinear-backed estimators (both LogisticRegression variants and LinearSVC)
# use a random number generator while fitting, so they get a fixed random_state
# to keep scores reproducible across runs, like the random forests.
models = {
    "MultinomialNB": MultinomialNB(),
    "ComplementNB": ComplementNB(),
    "BernoulliNB": BernoulliNB(),
    "LogReg_l2": LogisticRegression(penalty='l2', solver='liblinear', max_iter=1000, random_state=42),
    "LogReg_l1": LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000, random_state=42),
    "LinearSVC": LinearSVC(max_iter=1000, random_state=42),
    "SVC_rbf": SVC(kernel='rbf'),
    "SVC_poly": SVC(kernel='poly'),
    "RF_gini": RandomForestClassifier(criterion="gini", n_estimators=100, random_state=42),
    "RF_entropy": RandomForestClassifier(criterion="entropy", n_estimators=100, random_state=42)
}

def evaluate_models_cv(X_texts, y_labels, title_suffix):
    """Cross-validate every classifier in the global ``models`` registry.

    Each classifier is wrapped in a pipeline behind a ``TfidfVectorizer``
    (``max_features=5000``) and scored for accuracy with a shuffled 5-fold
    ``StratifiedKFold`` (``random_state=42``). One summary line per model is
    printed, prefixed with ``title_suffix``.

    Args:
        X_texts: The documents to vectorize.
        y_labels: The class label of each document.
        title_suffix: Tag placed at the start of every printed line.

    Returns:
        Dict mapping model name to ``(mean accuracy, accuracy std)``.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    summary = {}
    for name, model in models.items():
        steps = [
            ("tfidf", TfidfVectorizer(max_features=5000)),
            ("clf", model),
        ]
        fold_scores = cross_val_score(
            Pipeline(steps),
            X_texts,
            y_labels,
            cv=folds,
            scoring="accuracy",
            n_jobs=-1,
        )
        mean_acc = fold_scores.mean()
        std_acc = fold_scores.std()
        summary[name] = (mean_acc, std_acc)
        print(f"{title_suffix} | {name} — Accuracy: {mean_acc:.4f} ± {std_acc:.4f}")
    return summary


6. Uruchomienie czterech ewaluacji

In [None]:
# Cross-validate all models on each (dataset, text variant) combination;
# keys are the chart titles, the last tuple item is the tag used in printed lines.
results = {
    run_name: evaluate_models_cv(frame[text_col], frame["label"], log_tag)
    for run_name, frame, text_col, log_tag in (
        ("Stemowane (pełne)", df, "stemmed", "Stem-klasyczne"),
        ("Lematyzowane (pełne)", df, "lemmatized", "Lem-klasyczne"),
        ("Stemowane (zbal.)", df_balanced, "stemmed", "Stem-zbalansowane"),
        ("Lematyzowane (zbal.)", df_balanced, "lemmatized", "Lem-zbalansowane"),
    )
}


7. Wizualizacja porównania

In [None]:
def plot_cv_results(results_dict):
    """Draw one horizontal bar chart of cross-validation accuracy per dataset.

    Bars are drawn with matplotlib directly rather than ``sns.barplot``:
    forwarding a per-bar ``xerr`` array together with ``palette`` (without
    ``hue``) through seaborn depends on how a given seaborn version batches its
    bar calls, and the palette-without-hue form is deprecated.

    Args:
        results_dict: Maps a dataset title to a dict of
            ``model name -> (mean accuracy, std)``.

    Returns:
        None. Each figure is shown and then closed.
    """
    for dataset_name, scores_dict in results_dict.items():
        if not scores_dict:
            # Nothing to draw for this dataset.
            continue

        # Named model_names so the global `models` registry is not shadowed.
        model_names = list(scores_dict.keys())
        means = [score[0] for score in scores_dict.values()]
        stds = [score[1] for score in scores_dict.values()]

        fig, ax = plt.subplots(figsize=(10, 6))
        bar_colors = sns.color_palette("viridis", n_colors=len(model_names))
        ax.barh(model_names, means, xerr=stds, color=bar_colors)
        # barh puts the first entry at the bottom; flip so it is at the top.
        ax.invert_yaxis()
        ax.set_title(f"Ewaluacja krzyżowa – {dataset_name}")
        ax.set_xlabel("Średnia dokładność (± std)")
        ax.set_xlim(0, 1)
        for row, (mean, std) in enumerate(zip(means, stds)):
            # Place the label past the error bar so the two do not overlap.
            ax.text(mean + std + 0.01, row, f"{mean:.3f}", va="center")
        fig.tight_layout()
        plt.show()
        # Release the figure so repeated runs do not accumulate open figures.
        plt.close(fig)

# Render one accuracy chart for each entry of `results`.
plot_cv_results(results)
