<a href="https://colab.research.google.com/github/Glitch0110/AI-GiganciProgramowania/blob/main/L21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import json
import random
import math
# ------------------------------
# 1. Wczytanie danych treningowych
# ------------------------------
def load_email_data(file_path):
    """Load the labelled e-mail dataset from a JSON file.

    Args:
        file_path: Path of the JSON file to read; it is opened as UTF-8 text.

    Returns:
        Whatever object the JSON document decodes to.
    """
    with open(file_path, encoding="utf-8") as handle:
        return json.load(handle)

def train_test_split(data, test_ratio=0.2):
    """Randomly split ``data`` into a training part and a test part.

    The records are shuffled on a copy, so the caller's sequence is left
    untouched (shuffling the argument in place would silently reorder the
    caller's data as a side effect).

    Args:
        data: Iterable of records to split.
        test_ratio: Fraction of the records that goes to the test part.

    Returns:
        A ``(train, test)`` tuple of lists; ``train`` holds the first
        ``int(len(data) * (1 - test_ratio))`` shuffled records and ``test``
        holds the rest.
    """
    shuffled = list(data)  # work on a copy; never reorder the caller's data
    random.shuffle(shuffled)
    cut = int(len(shuffled) * (1 - test_ratio))
    return shuffled[:cut], shuffled[cut:]

In [21]:
# ------------------------------
# 2. Trenowanie klasyfikatora Bayesa
# ------------------------------
# czyszczenie tekstu i zmiana tekstu na listę słów
def preprocess(text):
    """Lower-case ``text`` and split it into a list of words.

    The characters "–", "-", ",", ".", "!" and "?" are each turned into a
    space before splitting on whitespace. Any other punctuation (for example
    ":" or "%") stays attached to its token.
    """
    separators = str.maketrans(dict.fromkeys("–-,.!?", " "))
    return text.lower().translate(separators).split()


#trenowanie już na przygotowanych danych
def train_naive_bayes(train_data, alpha=1.0):
    """Collect the counts needed by the naive Bayes classifier.

    Args:
        train_data: Sequence of records, each with a ``"label"`` and a
            ``"text"`` entry.
        alpha: Smoothing constant stored in the model for later use.

    Returns:
        A dict with these keys:
        ``"class_counts"`` (documents per label),
        ``"word_counts"`` (per label, occurrences of each word),
        ``"total_words"`` (per label, total number of word occurrences),
        ``"vocab"`` (set of every word seen in any label),
        ``"alpha"`` and ``"total_docs"`` (``len(train_data)``).
    """
    docs_per_class = {}
    tokens_per_class = {}
    token_totals = {}

    for record in train_data:
        cls = record["label"]
        docs_per_class[cls] = docs_per_class.get(cls, 0) + 1
        per_word = tokens_per_class.setdefault(cls, {})

        tokens = preprocess(record["text"])
        for token in tokens:
            per_word[token] = per_word.get(token, 0) + 1
        token_totals[cls] = token_totals.get(cls, 0) + len(tokens)

    # The vocabulary is the union of the words seen under every label.
    vocabulary = {
        token
        for per_word in tokens_per_class.values()
        for token in per_word
    }

    return {
        "class_counts": docs_per_class,
        "word_counts": tokens_per_class,
        "total_words": token_totals,
        "vocab": vocabulary,
        "alpha": alpha,
        "total_docs": len(train_data),
    }

In [22]:
# ------------------------------
# 3. Klasyfikacja wiadomości
# ------------------------------
def log_prob(model, words, class_name):
    """Return the unnormalised log-probability of ``words`` under ``class_name``.

    The score is the log of the class prior (documents of the class divided
    by all documents) plus, for every word, the log of its smoothed relative
    frequency ``(count + alpha) / (total_words + alpha * |vocab|)``.

    Args:
        model: Dict in the layout produced by ``train_naive_bayes``.
        words: List of tokens of the message being scored.
        class_name: Label whose score is computed.

    Returns:
        The accumulated log-probability as a float.
    """
    score = math.log(model["class_counts"][class_name] / model["total_docs"])
    vocab_size = len(model["vocab"])
    smoothing = model["alpha"]
    if not words:
        # Nothing to add to the prior.
        return score

    counts = model["word_counts"][class_name]
    # The denominator is identical for every word, so compute it once.
    denominator = model["total_words"][class_name] + smoothing * vocab_size
    for token in words:
        score += math.log((counts.get(token, 0) + smoothing) / denominator)
    return score

def predict(model, text):
    """Return the label with the highest log-probability for ``text``.

    The text is tokenised with ``preprocess`` and scored against every label
    in ``model["class_counts"]``. On a tie the label encountered first wins;
    ``None`` is returned when no label scores above negative infinity (for
    example when the model has no classes).
    """
    tokens = preprocess(text)
    scores = {
        label: log_prob(model, tokens, label)
        for label in model["class_counts"]
    }

    winner = None
    top_score = float("-inf")
    for label, score in scores.items():
        if score > top_score:
            winner = label
            top_score = score
    return winner

def evaluate_model(model, test_data):
    """Measure and print the classifier's accuracy on ``test_data``.

    Args:
        model: Trained model passed through to ``predict``.
        test_data: Sequence of records with ``"text"`` and ``"label"`` entries.

    Returns:
        The fraction of records whose predicted label equals the stored
        label, or ``0.0`` when ``test_data`` is empty.
    """
    if not test_data:
        # An empty test set would otherwise divide by zero below.
        print("Zbiór testowy jest pusty - nie można obliczyć skuteczności.")
        return 0.0

    correct = sum(
        1 for rec in test_data if predict(model, rec["text"]) == rec["label"]
    )
    accuracy = correct / len(test_data)
    print(f"Skuteczność na zbiorze testowym: {accuracy * 100:.2f}%")
    return accuracy

In [23]:
from pprint import pprint


def main():
    """Train the classifier on ``spam_ham.json`` and classify typed messages.

    Loads the dataset, splits it, trains the model, prints the model and its
    accuracy on the test part, then reads messages from standard input and
    prints the predicted label for each until the user types ``exit``
    (case-insensitive).
    """
    dataset = load_email_data("spam_ham.json")
    train, test = train_test_split(dataset)
    model = train_naive_bayes(train)

    pprint(model)
    evaluate_model(model, test)

    prompt = "Wpisz wiadomość do klasyfikacji (lub 'exit' aby zakończyć):\n> "
    message = input(prompt)
    while message.lower() != "exit":
        label = predict(model, message)
        print(f"Klasyfikacja: {label.upper()}")
        message = input(prompt)
    print("Zakończono.")

In [24]:
# 5. Entry point: run the interactive classifier only when executed as a script.
if __name__ == "__main__":
    main()

{'alpha': 1.0,
 'class_counts': {'ham': 24, 'spam': 32},
 'total_docs': 56,
 'total_words': {'ham': 119, 'spam': 190},
 'vocab': {'1',
           '10:00',
           '300',
           '500',
           '7',
           '90%',
           '99',
           '9999',
           'aby',
           'analiza',
           'anulowania',
           'aplikację',
           'apple',
           'bardzo',
           'bez',
           'bik',
           'bonus',
           'budżet',
           'było',
           'celny',
           'ciebie',
           'czeka',
           'cześć',
           'czy',
           'darmowa',
           'darmowego',
           'darmowy',
           'dla',
           'dni',
           'dnia',
           'do',
           'dokonanie',
           'domu',
           'dostarczone',
           'dostęp',
           'dziennie',
           'dziękujemy',
           'dziękuję',
           'dziś',
           'dziś:',
           'ekskluzywna',
           'faktura',
           'finansowa',
  