In [1]:
import os
import requests
import random
import torch
import logging
import re
from typing import List, Tuple, Dict, Optional, Set
from collections import Counter
from pathlib import Path
from unicodedata import normalize, category
import importlib

In [2]:
# Import the local text_processor module and reload it, so that edits made to
# the module since it was first imported in this kernel are picked up without
# a kernel restart. TextProcessor is then bound from the module after the
# reload has run.
import text_processor
importlib.reload(text_processor)
from text_processor import TextProcessor

In [3]:
# Configure the text-cleaning pipeline. The keyword names are TextProcessor's
# own; the groupings below are only for readability.
processor = TextProcessor(
    # Word-level thresholds.
    min_word_freq=2, min_word_length=2, max_word_length=30,
    # Normalisation switches.
    lowercase=True, remove_numbers=True, remove_punctuation=True,
    remove_urls=True, remove_emails=True,
    # Extra regex filters, each paired with an empty replacement string.
    custom_filters=[
        (r'#\w+', ''),  # Remove hashtags
        (r'@\w+', ''),  # Remove mentions
    ],
)

# Load the corpus.
# NOTE(review): load_text presumably reads 'input.txt' and uses the URL as the
# download source when the file is missing -- confirm in text_processor.
text = processor.load_text(
    'input.txt',
    'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt',
)

# Stop words are loaded before cleaning; process_text then returns the cleaned
# corpus together with its cleaned paragraphs.
processor.load_stop_words()
cleaned_text, cleaned_paragraphs = processor.process_text(text)

2024-11-19 19:35:56,497 - INFO - Loaded text with length: 1115394
2024-11-19 19:35:56,501 - INFO - Built 7222 paragraphs


2024-11-19 19:35:56,692 - INFO - Cleaned 7222 paragraphs


In [4]:
# Preview the first 1000 characters of the cleaned corpus as a sanity check.
print(cleaned_text[:1000])

<SPEAKER> first citizen before we proceed any further hear me speak
<SPEAKER> all speak speak
<SPEAKER> first citizen you all resolved rather die than famish
<SPEAKER> all resolved resolved
<SPEAKER> first citizen first you know caius marcius chief enemy people
<SPEAKER> all we know we know
<SPEAKER> first citizen let us kill him we ll have corn our own price verdict
<SPEAKER> all no more talking let done away away
<SPEAKER> second citizen one word good citizens
<SPEAKER> first citizen we accounted poor citizens patricians good what authority surfeits would relieve us if they would yield us but superfluity while wholesome we might guess they relieved us humanely but they think we too dear leanness afflicts us object our misery inventory particularise their abundance our sufferance gain them let us revenge this our pikes ere we become rakes gods know speak this hunger bread not thirst revenge
<SPEAKER> second citizen would you proceed especially against caius marcius
<SPEAKER> all again

In [5]:
# Build the word -> index and index -> word lookup tables from the cleaned
# corpus.
word2idx, idx2word = processor.build_vocab(cleaned_text)

# Report the vocabulary size and a short sample rather than printing the whole
# mapping: the full table is thousands of entries long and buries the rest of
# the notebook's output.
print(f"vocabulary size: {len(word2idx)}")
print(list(word2idx.items())[:10])

2024-11-19 19:36:05,364 - INFO - Built vocabulary with 6506 words




In [6]:
# Hyperparameters and compute device used by the cells that follow.
block_size = 3
embedding_dim = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Shuffle a *copy* of the paragraphs. Shuffling cleaned_paragraphs in place
# made this cell non-idempotent: every re-run reshuffled an already-shuffled
# list, so the train/val/test membership silently changed between runs.
# With the seed reset here and the input left untouched, re-running this cell
# always reproduces the same split.
random.seed(42)
shuffled_paragraphs = list(cleaned_paragraphs)
random.shuffle(shuffled_paragraphs)

# 80% / 10% / 10% split boundaries (paragraph indices).
n1 = int(0.8 * len(shuffled_paragraphs))
n2 = int(0.9 * len(shuffled_paragraphs))
print(n1, n2)

# Turn each paragraph split into (input, target) tensors on the chosen device.
x_train, y_train = processor.build_dataset(shuffled_paragraphs[:n1], block_size=block_size, device=device)
x_val, y_val = processor.build_dataset(shuffled_paragraphs[n1:n2], block_size=block_size, device=device)
x_test, y_test = processor.build_dataset(shuffled_paragraphs[n2:], block_size=block_size, device=device)

5777 6499


2024-11-19 19:36:13,153 - INFO - Built dataset with 119288 examples
2024-11-19 19:36:13,174 - INFO - Built dataset with 15964 examples
2024-11-19 19:36:13,192 - INFO - Built dataset with 15900 examples


In [42]:
# Display the currently selected torch device.
device

device(type='cuda')

In [8]:
# Peek at the first five training input rows and their corresponding targets.
print(x_train[:5], y_train[:5])

tensor([[   1,    3, 2306],
        [   3, 2306, 1000],
        [2306, 1000, 6494],
        [1000, 6494, 2658],
        [6494, 2658, 4531]], device='cuda:0') tensor([1000, 6494, 2658, 4531, 3462], device='cuda:0')


In [16]:
# Import the local mlp module and reload it, so that edits made to the module
# since it was first imported in this kernel take effect; the model class is
# then bound from the module after the reload has run.
import mlp
importlib.reload(mlp)
from mlp import NeuralProbabilisticLanguageModel

In [17]:
# Instantiate the language model.
# NOTE(review): the constructor signature is not visible here. The literal 3
# presumably mirrors block_size (also 3) and len(word2idx) is presumably the
# vocabulary size; 30 and 200 look like embedding/hidden sizes -- confirm in
# the mlp module. The embedding_dim variable (16) defined with the other
# hyperparameters is not used by this call.
model = NeuralProbabilisticLanguageModel(3, len(word2idx), 30, 200, device)

In [None]:
# Train the model on the training split with the hyperparameters given here.
model.fit(x_train, y_train, num_epochs=1000, batch_size=32, learning_rate=0.1)

In [40]:
# Encode a short prompt for generation.
# Distinct names are used for each stage so that the corpus held in `text` is
# not overwritten by the prompt, and `context` only ever means "list of token
# indices" (the form the generation cells consume).
prompt_text = "<SPEAKER> No more"

# process_text returns (cleaned_text, cleaned_paragraphs); the prompt is a
# single paragraph, so take the first cleaned paragraph's tokens.
context_words = processor.process_text(prompt_text)[1][0]
print(context_words)

# Map the prompt tokens to their vocabulary indices.
context = processor.word_to_index(context_words)
print(context)

2024-11-19 22:25:38,510 - INFO - Built 1 paragraphs
2024-11-19 22:25:38,511 - INFO - Cleaned 1 paragraphs


['<SPEAKER>', 'no', 'more']
[3, 3799, 3657]


In [None]:
# Extend the encoded prompt with the model's continuation, then decode every
# index back to its word and print the result as one space-separated line.
# NOTE(review): 10 is presumably the number of tokens to generate -- confirm
# against NeuralProbabilisticLanguageModel.generate.
out = context + model.generate(
    context,
    10,
    word2idx,
    end_token='<END>',
)
print(" ".join([idx2word[token_id] for token_id in out]))

In [None]:
# Generate a continuation of the encoded prompt and print it as words:
# prompt indices + generated indices, each looked up in idx2word.
# NOTE(review): 10 is presumably the number of tokens to generate -- confirm
# against NeuralProbabilisticLanguageModel.generate.
out = context + model.generate(
    context,
    10,
    word2idx,
    end_token='<END>',
)
print(" ".join([idx2word[token_id] for token_id in out]))

In [103]:
# Display the raw index sequence: the prompt indices concatenated with the
# output of model.generate (no decoding back to words).
context + model.generate(context, 10, word2idx, end_token='<END>')

[5322, 3, 6226, 6301, 6151, 3799, 5699, 5681, 2772, 3372, 3919, 5412, 2658]

In [95]:
# Evaluate on the validation split; the cell displays whatever
# model.evaluate returns.
# NOTE(review): presumably a loss value -- confirm in the mlp module.
model.evaluate(x_val, y_val)

7.6549923133850095