In [3]:
# Download dataset
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

File 'bbc_text_cls.csv' already there; not retrieving.



In [6]:
# import stuff
import pandas as pd
import numpy as np
import textwrap
import nltk
from nltk import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
# download punctuation
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dimitris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:

# import dataset
df = pd.read_csv('bbc_text_cls.csv')#, on_bad_lines='skip')
df.head()

labels = set(df['labels'])

### CONSTANTS ####

# {'politics', 'sport', 'tech', 'business', 'entertainment'}
label = 'politics' # train only from chosen labels
context_window = 1 # How many words before and after to take as a context
change_prob = 0.3 # Propability to change a word  

In [23]:
def collect_counts(texts):
    # collect counts

    probs = {} # key: [(wt-1), (wt+1)] value: [ wt / count(wt)]

    for doc in texts:
        lines = doc.split("\n")
        for line in lines:
            tokens = word_tokenize(line)
            # for lenght of tokens - context window 
            for i in range(len(tokens)-(context_window*2+1)):
                # for context window
                t = []
                for j in range(context_window*2 + 1):
                    t.append(tokens[i+j])
                    
                wt = t[len(t)//2]
                del[t[len(t)//2]]
                key = tuple(t)
                
                if key not in probs:
                    probs[key] = {}
                    
                # add count for middle token
                if wt not in probs[key]:
                    probs[key][wt] = 1
                else: 
                    probs[key][wt] += 1
    return probs
  
  
def convert_to_probs(probs):
    for key, d in probs.items():
        # d should represent a distibution
        total = sum(d.values())
        # access the dictionary of dictionaries, in corresponding key
        # devide curernt count with total
        for k, v in d.items():
            d[k] = v/total  
            
    return probs
        
        
def sample_word(d):
    p0 = np.random.random()
    cumulative = 0
    for t,p in d.items():
        cumulative += p
        if p0 < cumulative:
            return t
    assert(False)        
    
def spin_line(line,probs):
    tokens = word_tokenize(line)
    i = 0
    output = [tokens[0]]
    

    while i <(len(tokens)-(context_window*2+1)):
        t = []
        for j in range(context_window*2 + 1):
            t.append(tokens[i+j])
      
        wt = t[len(t)//2]
        del[t[len(t)//2]]
        key = tuple(t)
        
        
        p_dist = probs[key]

        if len(p_dist) > 1 and np.random.random() < change_prob:
            # replace middle word
            middle = sample_word(p_dist)
            output.append(wt)
            output.append("<"+middle+">")
            output.append(t[len(t)//2])
            
            # skip 2 steps 
            i +=2
        
        else:
            # dont replace middle word
            output.append(wt)
            i+=1
        
        
    # append the final token
    if i == (len(tokens)-2):
        output.append(tokens[-1])
    return detokenizer.detokenize(output)    


def spin_document(doc,probs):
    # split document into lines
    lines = doc.split("\n")
    output = []
    for line in lines:
        if line:
            new_line = spin_line(line,probs)
        else:
            new_line = line
        output.append(new_line)
        
    return "\n".join(output)
                

In [24]:
texts = df[df['labels'] == label]['text']
probs = collect_counts(texts)
norm_probs = convert_to_probs(probs)

# text is split on paragraphs
texts.iloc[0].split("\n")


detokenizer = TreebankWordDetokenizer()
texts.iloc[0].split("\n")[2]
detokenizer.detokenize(word_tokenize(texts.iloc[0].split("\n")[2]))


'Maternity pay for new mothers is to rise by £1,400 as part of new proposals announced by the Trade and Industry Secretary Patricia Hewitt.'

In [25]:

np.random.seed(1234)
i = np.random.choice(texts.shape[0])
doc = texts.iloc[i]
new_doc = spin_document(doc,norm_probs)
print(textwrap.fill(new_doc, replace_whitespace=False,fix_sentence_endings=True))

UKIP outspent Labour on

The UK Independence Party outspent both
Labour and the Liberal Democrats in the European elections <history>,
new figures

UKIP, which campaigned <starts> on a slogan <way> of "Say
no to Europe" <has>, spent £2.36m on the campaign <party> - second
<that> only to the Conservatives' £3.13m <manifesto>. The campaign
took UKIP into third place with an extra 10 MEPs . Labour's campaign
cost £1.7m, the <the> Lib Dems' £1.19m and the Greens' £404,000,
according to figures revealed by the Electoral <European> Commission
on Wednesday . Much of the UKIP funding came from Yorkshire
millionaire Sir Paul Sykes, who helped bankroll <make> the party's
billboard campaign . Critics <We> have accused <seen> the party of
effectively buying votes . But <In> a UKIP <Labour> spokesman said
Labour and the Conservatives <Treasury> had spent £10m between them on
the last <next> general election . "With the advantages of public
<wasting> money the others <women> have, the only way the s