In [2]:
# Fetch the BBC text-classification CSV into the local datasets folder.
# -nc skips the download when the file is already present; -P sets the target directory.
!wget -nc -P ../datasets/articlespinner/ https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

File ‘../datasets/articlespinner/bbc_text_cls.csv’ already there; not retrieving.



In [70]:
import numpy as np
import pandas as pd

import nltk
import textwrap
from collections import defaultdict
from nltk import word_tokenize
from nltk import TreebankWordDetokenizer

# Keep NLTK resources in a project-local directory; the path is named once so
# the download target and the search path cannot drift apart.
NLTK_DATA_DIR = "../datasets/nltk/"

# Register the directory only once, so re-running this cell does not keep
# appending duplicate entries to nltk.data.path.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# 'punkt' is the tokenizer resource downloaded for word_tokenize.
nltk.download('punkt', download_dir=NLTK_DATA_DIR)

[nltk_data] Downloading package punkt to ../datasets/nltk/...
[nltk_data]   Package punkt is already up-to-date!


In [112]:
class Dataset:
    """Load a labelled text corpus and expose tokenised train/test splits.

    The CSV at ``path`` must contain a ``labels`` column and a ``text`` column.
    Rows whose label equals ``train_label`` become the training documents and
    rows whose label equals ``test_label`` become the test documents. Both
    splits are lower-cased and tokenised with ``word_tokenize``.

    ``len()``, indexing and iteration all operate on the tokenised training
    documents; each document is a list of string tokens.

    Attributes:
        labels: Set of distinct values found in the ``labels`` column.
        df: Raw training texts (a Series; the attribute name is kept for
            compatibility with existing callers).
        test_df: Raw test texts (a Series).
        token_df: Tokenised training documents (a Series of token lists).
        test_token_df: Tokenised test documents (a Series of token lists).
    """

    def __init__(self, path: str, train_label: str, test_label: str):
        """Read ``path`` and build the raw and tokenised splits.

        Args:
            path: Location of the CSV file, passed to ``pandas.read_csv``.
            train_label: Label value selecting the training documents.
            test_label: Label value selecting the test documents.
        """
        full_df, self.labels = self._load_dataset(path)

        self.test_df = self._filter_dataset(full_df, test_label)
        self.df = self._filter_dataset(full_df, train_label)

        self.token_df = self._tokenize(self.df)
        self.test_token_df = self._tokenize(self.test_df)

    def _load_dataset(self, path: str) -> tuple[pd.DataFrame, set]:
        """Return the full frame read from ``path`` and its set of labels."""
        df = pd.read_csv(path)
        labels = set(df["labels"])
        return df, labels

    def _filter_dataset(self, df: pd.DataFrame, label: str) -> pd.Series:
        """Return the ``text`` values of the rows whose label equals ``label``.

        The original row index is preserved. A label that matches no row
        yields an empty Series.
        """
        # Single .loc lookup instead of chained indexing (df[mask]["text"]).
        return df.loc[df["labels"] == label, "text"]

    def _tokenize(self, df: pd.Series) -> pd.Series:
        """Lower-case and tokenise every text in ``df``.

        Args:
            df: Series of raw text strings.

        Returns:
            A Series with the same index whose values are lists of tokens.
        """
        def tokenize(text: str) -> list[str]:
            return word_tokenize(text.lower())
        return df.apply(tokenize)

    def __getitem__(self, idx: int) -> list[str]:
        """Return the ``idx``-th tokenised training document (positional)."""
        return self.token_df.iloc[idx]

    def __iter__(self):
        """Iterate over the tokenised training documents in order.

        Defined explicitly so iteration does not depend on the legacy
        ``__getitem__``/``IndexError`` fallback protocol.
        """
        return iter(self.token_df)

    def __len__(self) -> int:
        """Return the number of training documents."""
        return len(self.token_df)

    @property
    def get_test_df(self) -> pd.Series:
        """Tokenised test documents.

        This is a property, so it is accessed without parentheses.
        """
        return self.test_token_df

class N_Gram:
    """Trigram "middle word" model used to spin text.

    For every window ``(previous, middle, next)`` in the training documents
    the model counts how often ``middle`` occurs between ``previous`` and
    ``next``. Spinning a document replaces each token with a word sampled
    from that distribution, conditioned on the token's original neighbours.

    Attributes:
        tokens: The training documents passed to the constructor.
        tri_grams: Raw counts, indexed as ``[previous][next][middle]``.
        tri_grams_prob: Probabilities with the same indexing; for each
            ``(previous, next)`` pair the values sum to 1.
        detokenizer: ``TreebankWordDetokenizer`` used to join tokens.
    """

    START = "<s>"
    END = "<e>"

    def __init__(self, tokens: "Dataset", seed=None):
        """Fit the model on ``tokens``.

        Args:
            tokens: Iterable of documents, each a list of string tokens.
            seed: Optional seed for a private random generator. When left
                as ``None`` sampling uses NumPy's global random state, so
                ``np.random.seed`` keeps working as before.
        """
        self.tokens = tokens
        self._rng = np.random.default_rng(seed) if seed is not None else np.random

        self.tri_grams = self.get_counts(self.tokens)
        self.tri_grams_prob = self.get_middle_probabilities(self.tri_grams)

        self.detokenizer = TreebankWordDetokenizer()

    def get_counts(self, tokens: "Dataset"):
        """Count middle words for every ``(previous, next)`` context.

        Each document is padded with ``START`` and ``END`` so its first and
        last tokens also have a context.

        Returns:
            Nested ``defaultdict`` indexed as ``[previous][next][middle]``
            holding integer counts.
        """
        counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

        for line in tokens:
            padded = [self.START] + list(line) + [self.END]
            # Slide a window of three tokens over the padded document.
            for t_last, middle, t_next in zip(padded, padded[1:], padded[2:]):
                counts[t_last][t_next][middle] += 1
        return counts

    def get_middle_probabilities(self, counts: defaultdict) -> dict:
        """Normalise ``counts`` into per-context probability distributions.

        A new nested dict is built; ``counts`` is left untouched so the raw
        counts stay available on ``tri_grams``.

        Returns:
            Plain nested dict indexed as ``[previous][next][middle]`` with
            float probabilities.
        """
        probabilities = {}
        for prev_t, next_dict in counts.items():
            per_next = {}
            for next_t, middle_dict in next_dict.items():
                n_tokens = sum(middle_dict.values())
                per_next[next_t] = {
                    middle_t: value / n_tokens
                    for middle_t, value in middle_dict.items()
                }
            probabilities[prev_t] = per_next
        return probabilities

    def spin_text(self, test: pd.Series) -> tuple[list[str], list[str]]:
        """Spin every document in ``test``.

        Each token is replaced by a word sampled for its original
        neighbours, so replacements never influence one another.

        Args:
            test: Iterable of documents, each a list of string tokens.

        Returns:
            ``(real_text, spin_text)``: two lists of detokenised strings,
            one entry per document, holding the original and spun text.
        """
        spin_text = []
        real_text = []

        for line in test:
            padded = [self.START] + list(line) + [self.END]

            real_tokens = []
            spin_tokens = []
            # The window naturally stops at the END padding. No early exit
            # is needed, and a literal "<e>" token inside a document no
            # longer truncates it.
            for t_last, middle, t_next in zip(padded, padded[1:], padded[2:]):
                real_tokens.append(middle)
                spin_tokens.append(self.sample_word(t_last, t_next, middle))

            real_text.append(self.detokenizer.detokenize(real_tokens))
            spin_text.append(self.detokenizer.detokenize(spin_tokens))

        return real_text, spin_text

    def sample_word(self, prev_t: str, next_t: str, middle: str) -> str:
        """Sample a word to place between ``prev_t`` and ``next_t``.

        Returns ``middle`` unchanged when the context was never seen in
        training.
        """
        options = self.tri_grams_prob.get(prev_t, {}).get(next_t, {})
        if not options:
            return middle

        candidates = list(options.keys())
        weights = list(options.values())
        return str(self._rng.choice(candidates, p=weights))
    

In [117]:
# Train on the "business" articles and hold out the "tech" articles for spinning.
dataset = Dataset(
    path="../datasets/articlespinner/bbc_text_cls.csv",
    train_label="business",
    test_label="tech",
)

# get_test_df is a property, so it is read without parentheses.
test = dataset.get_test_df

In [118]:
# Word sampling draws from NumPy's global random state; seed it so the spun
# text is reproducible across Restart & Run All.
np.random.seed(42)

ngram = N_Gram(dataset)
real, spin = ngram.spin_text(test)

In [119]:
import textwrap

# Index of the held-out article to inspect.
i = 3

# Original article, wrapped for display.
print(
    textwrap.fill(
        real[i],
        replace_whitespace=False,
        fix_sentence_endings=True,
    )
)

digital guru floats sub- $100 pc nicholas negroponte, chairman and
founder of mit's media labs, says he is developing a laptop pc that
will go on sale for less than $100 (£53). he told the bbc world
service programme go digital he hoped it would become an education
tool in developing countries . he said one laptop per child could be
"very important to the development of not just that child but now the
whole family, village and neighbourhood". he said the child could use
the laptop like a text book . he described the device as a stripped
down laptop, which would run a linux-based operating system, "we have
to get the display down to below $20, to do this we need to rear
project the image rather than using an ordinary flat panel . "the
second trick is to get rid of the fat, if you can skinny it down you
can gain speed and the ability to use smaller processors and slower
memory ." the device will probably be exported as a kit of parts to be
assembled locally to keep costs down . mr negrop

In [120]:
# Spun counterpart of article i, wrapped the same way as the original.
print(
    textwrap.fill(
        spin[i],
        replace_whitespace=False,
        fix_sentence_endings=True,
    )
)


digital guru floats sub- $100 pc nicholas negroponte, mothercare and
independence of may's media business," it is bringing a laptop pc that
will be on more for more than $3bn (bse). bat oversaw the whole world
food programme go digital he said more to submit an education tool in
poor countries . it admitted one laptop per child could be viewed an
early to the hands of for convinced that child but loosen the ambani
sector, discriminatory and workable" but renault expects the airline
could hit the laptop like a text book . he said the decision as a
stripped down 247p, taxation is face a linux-based operating company,
then we tried to recommend the day down by restate $15bn% airlines do
this we tried to the project the uk rather than providing an ordinary
flat sales that "the second trick is to get out of the us, if that can
see it down you can gain speed at the latter to fly smaller ones and
slower memory . "the owner will not be worth as a range of moving to
be assembled locally to cut 