In [116]:
import re
from collections import defaultdict

import nltk
from nltk import ngrams
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [117]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JeannotMunganga\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Read Files

In [118]:
books = ['HP1', 'HP2', 'HP3', 'HP4', 'HP5', 'HP6', 'HP7']
tokens = []

for book in books:
    try:
        with open(f'assets/harry_potter/{book}.txt', encoding='utf8') as file:
            file_content = file.read()
            file_content = re.sub(r'[^\w\s]', '', file_content).lower()
            pages = file_content.split('\n')
            
            book_tokens = []
            for page_content in pages:
                page_tokens = word_tokenize(page_content)
                book_tokens.append(page_tokens)
            tokens.append(book_tokens)
    except FileNotFoundError:
        print(f"File {book}.txt not found.")
    except Exception as e:
        print(f"An error occurred while processing {book}.txt: {e}")

# Generate 3-grams and split data

In [119]:
data = []
labels = []

for index, book in enumerate(tokens):
    for page in book:
        page_tri_grams = [' '.join(ngram) for ngram in ngrams(page, 3)]
        data.append(page_tri_grams)
        labels.append(books[index])

In [120]:
len(data)

4707

In [121]:
len(labels)

4707

In [122]:
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.15, random_state=42)
train_data, validation_data, train_labels, validation_labels = train_test_split(train_data, train_labels, test_size=0.15, random_state=42)

# Create Classifier

In [123]:
class NaiveBayesClassifier:
    def __init__(self):
        self.word_probs = {}
        self.class_probs = {}
        self.tri_gram_probs = {}
        self.classes = set()

    def train(self, train_data, labels):
        word_counts = defaultdict(lambda: defaultdict(int))
        tri_grams_count = defaultdict(lambda: defaultdict(int))
        class_counts = defaultdict(int)

        # Count words and classes
        for i, text in enumerate(train_data):
            label = labels[i]
            self.classes.add(label)
            class_counts[label] += 1

            for trigram in text:
                tri_grams_count[label][trigram] += 1
                
                for word in trigram.split(' '):
                    word_counts[label][word] += 1

        # Calculate class probabilities
        self.class_probs = {c: count / len(labels) for c, count in class_counts.items()}
        
        # Calculate 3-gram probabilities for each class
        for label in self.classes:
            total_count = sum(tri_grams_count[label].values())
            self.tri_gram_probs[label] = {
                tg: (count + 1) / (total_count + len(tri_grams_count[label]))
                for tg, count in tri_grams_count[label].items()
            }
        
        # Calculate word probabilities for each class
        for label in self.classes:
            total_count = sum(word_counts[label].values())
            self.word_probs[label] = {
                word: (count + 1) / (total_count + len(word_counts[label]))
                for word, count in word_counts[label].items()
            }

    def predict(self, text):
        results = {}
        for label in self.classes:
            prob = self.class_probs[label]
    
            for trigram in text:
                if trigram in self.tri_gram_probs[label]:
                    prob *= self.tri_gram_probs[label][trigram]
                else:
                    prob *= 1 / (sum(self.tri_gram_probs[label].values()) + len(self.tri_gram_probs[label]))
                    
                for word in trigram.split(' '):
                    if word in self.word_probs[label]:
                        prob *= self.word_probs[label][word]
                    else:
                        prob *= 1 / (sum(self.word_probs[label].values()) + len(self.word_probs[label]))
    
            results[label] = prob
    
        return max(results, key=results.get)

In [124]:
classifier = NaiveBayesClassifier()
classifier.train(train_data, train_labels)

# Evaluating Model

In [125]:
# Evaluate the classifier on the test set
correct = 0
for i, text in enumerate(test_data):
    prediction = classifier.predict(text)
    if prediction == test_labels[i]:
        correct += 1

accuracy = correct / len(test_data)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 14.00%


In [126]:
classifier.classes

{'HP1', 'HP2', 'HP3', 'HP4', 'HP5', 'HP6', 'HP7'}

In [127]:
classifier.class_probs

{'HP4': 0.1761764705882353,
 'HP3': 0.10235294117647059,
 'HP2': 0.08147058823529411,
 'HP1': 0.07205882352941176,
 'HP7': 0.17470588235294118,
 'HP5': 0.24,
 'HP6': 0.15323529411764705}

In [128]:
classifier.word_probs

{'HP6': {'rely': 2.9749750507774152e-05,
  'on': 0.005411750069641462,
  'his': 0.012778870104475714,
  'horcruxes': 0.00022988443574189116,
  'he': 0.01905606746161606,
  'would': 0.0026179780446841253,
  'need': 0.0005706543051945769,
  'nothing': 0.0006626080794913334,
  'more': 0.001744417188864939,
  'if': 0.0020419146939426804,
  'only': 0.0011061498143345116,
  'could': 0.002501683565426463,
  'regain': 1.0818091093736055e-05,
  'a': 0.018847819208061643,
  'human': 5.9499501015548304e-05,
  'form': 3.515879605464218e-05,
  'was': 0.013519909344396635,
  'already': 0.0006761306933585035,
  'immortal': 2.7045227734340137e-05,
  'you': 0.012635530397483712,
  'see': 0.0018120302582007892,
  'or': 0.0019553699651927918,
  'as': 0.006912760208897339,
  'close': 0.0002542251407027973,
  'to': 0.024957336153249077,
  'any': 0.000784311604295864,
  'man': 0.0006599035567178993,
  'can': 0.0015902593907792001,
  'be': 0.004557120873236313,
  'but': 0.0061365621729217775,
  'now': 0.0024