In [1]:
import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Peek at the raw file as plain text: print the header line and the first record.
# A forward slash is used as the separator so the relative path resolves on
# Windows, macOS and Linux alike (a backslash is only a separator on Windows).
with open('tmp/BBC News Train.csv', 'r') as csvfile:
    print(f'=> First line (header) looks like this:\n\n{csvfile.readline()}')
    print(f'=> Each data point looks like this:\n\n{csvfile.readline()}')

=> First line (header) looks like this:

ArticleId,Text,Category

=> Each data point looks like this:




<h2 style=color:#3498db>Removing Stopwords Function</h2>

In [3]:
# Built once when the cell runs. A frozenset gives constant-time membership
# tests, instead of rebuilding a list and scanning it linearly for every word
# of every sentence.
_STOPWORDS = frozenset([
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any",
    "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both",
    "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for",
    "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here",
    "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
    "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my",
    "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out",
    "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that",
    "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they",
    "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
    "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's",
    "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you",
    "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
])


def remove_stopwords(sentence):
    """Lowercase a sentence and drop every token that is a stopword.

    The sentence is split on whitespace only, so a token must match a
    stopword exactly to be removed; a word with punctuation attached
    (for example ``"the,"``) is kept. Runs of whitespace collapse to a
    single space in the result.

    Args:
        sentence: The text to clean.

    Returns:
        The lowercased text with stopwords removed, tokens joined by single
        spaces. An empty or all-stopword input yields ``""``.
    """
    kept_words = [word for word in sentence.lower().split() if word not in _STOPWORDS]

    return " ".join(kept_words)

<h2 style=color:#3498db>Reading The Raw Data</h2>

In [4]:
def parse_data_from_csv_file(filename):
    """Read a labelled-article CSV and return its labels and cleaned texts.

    The first row is treated as a header and skipped. For every remaining
    row the text is taken from the second column and the category from the
    third column; the text is passed through ``remove_stopwords`` before it
    is stored.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(labels, sentences)`` tuple of two lists of equal length, in file
        order. Both lists are empty when the file has no data rows.

    Raises:
        IndexError: If a non-empty row has fewer than three columns.
    """
    labels = []
    sentences = []

    # newline='' lets the csv module handle line endings itself, as its
    # documentation requires for files passed to csv.reader.
    with open(filename, 'r', newline='') as file:
        reader = csv.reader(file)

        # Skip the header through the reader so it is parsed as a CSV record;
        # the None default keeps an empty file from raising StopIteration.
        next(reader, None)

        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue

            labels.append(row[2])
            sentences.append(remove_stopwords(row[1]))

    return labels, sentences

In [5]:
# Forward slash keeps the relative path valid on every OS, not only Windows.
labels, sentences = parse_data_from_csv_file("tmp/BBC News Train.csv")

In [7]:
# Sanity check: show the first category together with its cleaned article text.
labels[0], sentences[0]

('business',

In [8]:
def fit_tokenizer(sentences, num_words=200, oov_token="<OOV>"):
    """Create a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        sentences: Texts to fit the tokenizer on.
        num_words: Forwarded to ``Tokenizer(num_words=...)``. Defaults to 200,
            the value this notebook previously hard-coded. Per the Keras
            documentation this caps which words are kept when texts are later
            converted to sequences; ``word_index`` still lists every word
            seen during fitting.
        oov_token: Forwarded to ``Tokenizer(oov_token=...)``; the placeholder
            used for words outside the kept vocabulary. Defaults to
            ``"<OOV>"``.

    Returns:
        The fitted ``Tokenizer``.
    """
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)

    tokenizer.fit_on_texts(sentences)

    return tokenizer

In [9]:
# Fit the tokenizer on the cleaned articles and keep its word -> index mapping.
sentences_tokenizer = fit_tokenizer(sentences)
sentences_word_index = sentences_tokenizer.word_index

# Report how many entries the mapping holds.
print(f"Vocabulary contains {len(sentences_word_index)} words")

Vocabulary contains 24963 words


In [10]:
def get_padded_sequences(tokenizer, sentences):
    """Encode texts as integer sequences and pad them with ``pad_sequences``.

    Args:
        tokenizer: Object exposing ``texts_to_sequences`` (here, the fitted
            Keras tokenizer).
        sentences: Texts to encode.

    Returns:
        Whatever ``pad_sequences`` returns for the encoded texts with
        ``padding="post"``. No ``maxlen`` is passed, so the padded length is
        left to ``pad_sequences``' default.
    """
    encoded = tokenizer.texts_to_sequences(sentences)

    # "post" places the padding after each sequence's own tokens.
    return pad_sequences(encoded, padding="post")

In [11]:
# Encode and pad every cleaned article using the tokenizer fitted earlier.
padded_sequences = get_padded_sequences(sentences_tokenizer, sentences)

In [12]:
# Dimensions of the padded result.
padded_sequences.shape

(1490, 1881)

In [13]:
# Inspect the encoded form of the entry at index 1.
padded_sequences[1]

array([  1, 158,   1, ...,   0,   0,   0])

In [14]:
def tokenize_labels(labels):
    """Fit a fresh ``Tokenizer`` on the labels and encode them with it.

    Args:
        labels: The label strings to encode.

    Returns:
        A ``(label_sequences, label_word_index)`` tuple: the output of
        ``texts_to_sequences`` for ``labels`` and the tokenizer's
        ``word_index`` mapping.
    """
    encoder = Tokenizer()
    encoder.fit_on_texts(labels)

    return encoder.texts_to_sequences(labels), encoder.word_index

In [15]:
# Encode the category labels, then print the label -> index mapping and a
# sample of the encoded labels.
label_sequences, label_word_index = tokenize_labels(labels)
print(f"Vocabulary of labels looks like this {label_word_index}\n")
print(f"First ten sequences {label_sequences[:10]}\n")

Vocabulary of labels looks like this {'sport': 1, 'business': 2, 'politics': 3, 'entertainment': 4, 'tech': 5}

First ten sequences [[2], [2], [2], [5], [2], [3], [1], [4], [2], [4]]

