# In [None]:
import math
import re
from collections import defaultdict

import nltk
from nltk import ngrams
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

# Download the NLTK tokenizer models used by word_tokenize below.
nltk.download('punkt')

books = ['HP1', 'HP2', 'HP3', 'HP4', 'HP5', 'HP6', 'HP7']
# tokens[i] holds the tokenized pages of loaded_books[i].  Book names are
# tracked separately from `books` because a book that fails to load is
# skipped; indexing `books` by position in `tokens` would then attach the
# wrong title to every book after the missing one.
tokens = []
loaded_books = []

for book in books:
    try:
        with open(f'assets/harry_potter/{book}.txt', encoding='utf8') as file:
            file_content = file.read()
        # Drop every character that is neither a word character nor
        # whitespace, then lower-case the text.
        file_content = re.sub(r'[^\w\s]', '', file_content).lower()
        # Each line of the file becomes one "page", i.e. one sample.
        # NOTE(review): blank lines produce samples without any tokens --
        # confirm that this is intended.
        pages = file_content.split('\n')
        book_tokens = [word_tokenize(page_content) for page_content in pages]
    except FileNotFoundError:
        print(f"File {book}.txt not found.")
    except Exception as e:
        print(f"An error occurred while processing {book}.txt: {e}")
    else:
        # Only record the book once it has been processed completely, so
        # `tokens` and `loaded_books` always stay aligned.
        tokens.append(book_tokens)
        loaded_books.append(book)

# data[i] is the list of tri-gram strings of one page; labels[i] is the name
# of the book that page came from.
data = []
labels = []

for book, book_tokens in zip(loaded_books, tokens):
    for page in book_tokens:
        page_tri_grams = [' '.join(ngram) for ngram in ngrams(page, 3)]
        data.append(page_tri_grams)
        labels.append(book)

# Split the data into training, validation, and test sets: 15% of all samples
# are held out for testing, then 15% of the remainder for validation.
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.15, random_state=42)
train_data, validation_data, train_labels, validation_labels = train_test_split(train_data, train_labels, test_size=0.15, random_state=42)

class NaiveBayesClassifier:
    """Naive Bayes text classifier over tri-gram and word features.

    A document is a sequence of tri-gram strings whose words are separated by
    single spaces.  Every tri-gram contributes one tri-gram feature plus one
    word feature for each word it contains.

    Probabilities use add-one (Laplace) smoothing over a vocabulary shared by
    all classes, with one extra slot reserved for features never seen during
    training, so likelihoods are comparable across classes and are never zero.

    Attributes:
        classes: Set of labels seen by the most recent ``train`` call.
        class_probs: Maps label -> prior probability of that label.
        tri_gram_probs: Maps label -> {tri-gram: smoothed probability} for
            the tri-grams observed with that label.
        word_probs: Maps label -> {word: smoothed probability} for the words
            observed with that label.
    """

    def __init__(self):
        self.word_probs = {}
        self.class_probs = {}
        self.tri_gram_probs = {}
        self.classes = set()
        # Per-label probability of a feature that was not observed with the
        # label (including features never observed at all).
        self._unseen_tri_gram_probs = {}
        self._unseen_word_probs = {}

    @staticmethod
    def _smooth(counts_by_label, labels):
        """Turn per-label feature counts into add-one smoothed probabilities.

        Args:
            counts_by_label: Maps label -> {feature: count}.
            labels: Labels to produce a distribution for; a label without an
                entry in ``counts_by_label`` is treated as having no features.

        Returns:
            A pair ``(probs, unseen)`` where ``probs[label]`` maps each
            feature observed with ``label`` to its probability and
            ``unseen[label]`` is the probability of any other feature.
        """
        vocabulary = set()
        for counts in counts_by_label.values():
            vocabulary.update(counts)
        # One extra slot for unknown features keeps the denominator positive
        # even when no features were observed at all.
        slots = len(vocabulary) + 1

        probs = {}
        unseen = {}
        for label in labels:
            counts = counts_by_label.get(label, {})
            denominator = sum(counts.values()) + slots
            probs[label] = {
                feature: (count + 1) / denominator
                for feature, count in counts.items()
            }
            unseen[label] = 1 / denominator
        return probs, unseen

    def train(self, train_data, labels):
        """Fit the classifier, discarding any previously learned state.

        Args:
            train_data: Iterable of documents, each an iterable of tri-gram
                strings.
            labels: Sequence where ``labels[i]`` is the label of the i-th
                document.

        Raises:
            IndexError: If ``labels`` has fewer entries than ``train_data``.
        """
        word_counts = defaultdict(lambda: defaultdict(int))
        tri_gram_counts = defaultdict(lambda: defaultdict(int))
        class_counts = defaultdict(int)
        n_samples = 0

        for i, text in enumerate(train_data):
            label = labels[i]
            class_counts[label] += 1
            n_samples += 1

            for trigram in text:
                tri_gram_counts[label][trigram] += 1
                for word in trigram.split(' '):
                    word_counts[label][word] += 1

        # Replace the state wholesale so that labels from an earlier call
        # cannot linger without matching probabilities.
        self.classes = set(class_counts)
        # Divide by the number of documents actually counted so the priors
        # always sum to one.
        self.class_probs = {
            label: count / n_samples for label, count in class_counts.items()
        }
        self.tri_gram_probs, self._unseen_tri_gram_probs = self._smooth(
            tri_gram_counts, class_counts
        )
        self.word_probs, self._unseen_word_probs = self._smooth(
            word_counts, class_counts
        )

    def predict(self, text):
        """Return the most probable label for a document.

        Scores are accumulated as log-probabilities; multiplying raw
        probabilities underflows to 0.0 for documents of realistic length,
        which would make every class tie.

        Args:
            text: Iterable of tri-gram strings.

        Returns:
            The label with the highest score.  Ties go to the label that
            appeared first in the training data.

        Raises:
            ValueError: If the classifier has not been trained.
        """
        if not self.class_probs:
            raise ValueError("predict() requires a trained classifier")

        # Materialize once: the document is scanned once per class.
        text = list(text)

        scores = {}
        for label, prior in self.class_probs.items():
            tri_gram_probs = self.tri_gram_probs[label]
            word_probs = self.word_probs[label]
            unseen_tri_gram = self._unseen_tri_gram_probs[label]
            unseen_word = self._unseen_word_probs[label]

            score = math.log(prior)
            for trigram in text:
                score += math.log(tri_gram_probs.get(trigram, unseen_tri_gram))
                for word in trigram.split(' '):
                    score += math.log(word_probs.get(word, unseen_word))
            scores[label] = score

        return max(scores, key=scores.get)

# Build a fresh classifier and fit it on the training split
classifier = NaiveBayesClassifier()
classifier.train(train_data, train_labels)

# Measure accuracy on the held-out test split: count the samples whose
# predicted label matches the true label
correct = sum(
    1
    for sample, expected in zip(test_data, test_labels)
    if classifier.predict(sample) == expected
)

accuracy = correct / len(test_data)
print(f'Accuracy: {accuracy * 100:.2f}%')
