### Project: news article

A dataset consists of news articles that cover four separate topics: World, Business, Sports, and Sci/Tech are provided. 
The following code will predict the topic, generate text and then test the top performing NLP classifier.

In [100]:
#load data
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

train = load_doc('train.csv')
test = load_doc('test.csv')

In [101]:
# prepare y
import pandas as pd
traindata = pd.read_csv('train.csv', names=['label','title','text'])
testdata = pd.read_csv('test.csv', names=['label','title','text'])

ytrain = traindata['label']
ytest = testdata['label']
y_train = ytrain.values
y_test = ytest.values

In [102]:
y_train

array([2, 4, 3, ..., 2, 3, 4])

# 1. TF-IDF with Naive Bayes

In [103]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import re
import string
from string import punctuation
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
import time
import pandas as pd

In [104]:
# clean doc
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each word
    tokens = [re.sub(r"[{}]+".format(punctuation)," ",w) for w in tokens]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 2]
    # filter out numbers
    tokens = [word for word in tokens if word.isalpha()]
    #lower tokens
    tokens = [word.lower() for word in tokens]
    tokens = ' '.join(tokens)
    return tokens

In [105]:
#prepare training and test dataset
def process_docs(doc):
    documents = list()
    doc2 = doc.split('\n')
    for line in doc2:
        line = clean_doc(line)
        documents.append(line)
    documents.pop()
    return documents

X_train = process_docs(train)
X_test = process_docs(test)

In [106]:
X_train[0:5]

['guard blake out week washington wizards point guard steve blake miss first month season injuring ankle pickup',
 'acquires synstar move designed help better compete said acquired synstar technology services',
 'airways pilots vote salary airways pilots voted approve new labor agreement yesterday reduce salaries percent save airline million',
 'risks panel examining radiation risks says official estimates dangers health may wide mark',
 'fate line partisans sides calling polarized important election presidential recall referendum today determine course democracy could buffet world oil both campaigns also utterly convinced']

In [127]:
tfidf = TfidfVectorizer()
tfidf.fit(X_train)
text_vector=tfidf.transform(X_train).toarray()
train_text_vector=tfidf.transform(X_train)
test_text_vector=tfidf.transform(X_test)

In [128]:
# fit NB model and calculate running time
start = time.time()
clf = MultinomialNB()
clf = clf.fit(train_text_vector,y_train)
end = time.time()
NB_time = end-start
print('NB running time:', NB_time)

NB running time: 0.02191925048828125


In [129]:
#make presiction and calculate accuracy
prediction=clf.predict(test_text_vector)
accuracy = accuracy_score(y_test, prediction)
print('NB-tfidf accuracy: ',accuracy)

NB-tfidf accuracy:  0.88625


# 2. Word2vec with CNN

In [112]:
from os import listdir
from collections import Counter
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.layers.convolutional import MaxPooling1D
from sklearn import preprocessing
from keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential
import time
import warnings
warnings.filterwarnings("ignore")
from keras.utils import plot_model
import gensim

### 2.1 Define a Vocabulary

In [113]:
# turn a doc into clean tokens
def clean_doc2(doc):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each word
    tokens = [re.sub(r"[{}]+".format(punctuation)," ",w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [158]:
#apply
tokens = clean_doc2(train)
vocab = Counter()
vocab.update(tokens)
print(len(vocab))
print(vocab.most_common(10))

45724
[('said', 5567), ('The', 4549), ('new', 3370), ('US', 2897), ('first', 2730), ('two', 2491), ('last', 1998), ('company', 1860), ('one', 1823), ('New', 1754)]


In [159]:
# keep tokens with a min occurrence
min_occurrence = 2
tokens = [k for k,c in vocab.items() if c >= min_occurrence]
print(len(tokens))

28474


In [160]:
# save list to file
def save_list(lines, filename):
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(data)
    # close file
    file.close()

# save tokens to a vocabulary file
save_list(tokens, 'vocab.txt')

### 2.2 Encode and Pad

In [161]:
# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

In [162]:
# turn doc into clean tokens
def clean_doc3(doc, vocab):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each word
    [re.sub(r"[{}]+".format(punctuation)," ",w) for w in tokens]
    # filter out tokens not in vocab
    tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens

In [163]:
#prepare training and test dataset
def process_docs(doc,vocab):
    documents = list()
    doc2 = doc.split('\n')
    for line in doc2:
        line = clean_doc3(line, vocab)
        documents.append(line)
    documents.pop()
    return documents
X_train2 = process_docs(train,vocab)
X_test2 = process_docs(test,vocab)

In [164]:
# calculate the maximum sequence length
max_length = max([len(s.split()) for s in X_train2])
print('max_length', max_length)

max_length 71


In [165]:
#pad X and one-hot y
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train2)

sequences = tokenizer.texts_to_sequences(X_train2)
word_index = tokenizer.word_index
Xtrain = pad_sequences(sequences, maxlen=max_length)
labels_train= to_categorical(np.asarray(y_train))

In [166]:
# define vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size:', vocab_size)

Vocabulary size: 22540


### 2.3 Define word2vec model to embedding layer

In [167]:
sentence= []
labels = []
for line in X_train2:
    splits = line.split(' ')
    sentence.append(splits)
for line in ytrain:
    labels.append(line)

In [168]:
# define word2vec model
model = gensim.models.Word2Vec(sentence, size=100, sg=1, iter=8)  
model.wv.save_word2vec_format('word2Vec' + '.bin', binary=True) 
wordVec = gensim.models.KeyedVectors.load_word2vec_format('word2Vec.bin', binary=True)

In [169]:
# train word2vec model
VECTOR_DIR = 'word2Vec.bin'
EMBEDDING_DIM = 100
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=True)
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items(): 
    if word in w2v_model:
        embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32')
embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=max_length, trainable=False)

### 2.4 Train CNN Model

In [170]:
#develop CNN model
CNN_model = Sequential()
CNN_model.add(embedding_layer)
CNN_model.add(Dropout(0.2))
CNN_model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
CNN_model.add(MaxPooling1D(3))
CNN_model.add(Flatten())
CNN_model.add(Dense(100, activation='relu'))
CNN_model.add(Dense(labels_train.shape[1], activation='softmax'))
CNN_model.summary()

CNN_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 71, 100)           2254000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 71, 100)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 69, 250)           75250     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 23, 250)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 5750)              0         
_________________________________________________________________
dense_23 (Dense)             (None, 100)               575100    
_________________________________________________________________
dense_24 (Dense)             (None, 5)                 505       
Total para

In [171]:
#fit model and calculate time
start = time.time()
CNN_model.fit(Xtrain, labels_train, epochs=10, verbose=1)
end = time.time()
CNN_time = end-start
print('CNN running time:',CNN_time)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CNN running time: 72.33172035217285


In [172]:
#Evaluate Model
sequences = tokenizer.texts_to_sequences(X_test2)
Xtest = pad_sequences(sequences, maxlen=max_length)
labels_test = to_categorical(np.asarray(y_test))

_, acc = CNN_model.evaluate(Xtrain, labels_train, verbose=0)
print('Train Accuracy:', acc)
# evaluate model on test dataset
_, acc = CNN_model.evaluate(Xtest, labels_test, verbose=0)
print('Test Accuracy:', acc)

Train Accuracy: 0.9794
Test Accuracy: 0.8215


# 3. Text Generation with LSTM

In [114]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.layers import LSTM

### 3.1 Prepare data

In [115]:
# define training & testing data
Sports_traindata = traindata[traindata['label'] == 2]['text'].tolist()
Sports_testdata = testdata[traindata['label'] == 2]['text'].tolist()

In [116]:
# turn a doc into clean tokens
train_str = "".join(Sports_traindata)
tokens = clean_doc2(train_str)

In [117]:
# clean text
def process_docs(doc):
    documents = list()
    for line in doc:
        line = clean_doc(line)
        documents.append(line)
    documents.pop()
    return documents

Sports_train = process_docs(Sports_traindata)
Sports_test = process_docs(Sports_testdata)
Sports_train[0:5]

['washington wizards point guard steve blake miss first month season injuring ankle pickup',
 'baseball fans across japan saturday honored ichiro seattle mariners japanese player broke major league baseball record season',
 'two years ago ernie els explained possible tiger woods could overtaken world time south african',
 'what appeared mismatch took turn competitive second set regrouped',
 'serena williams blamed headache poor play upset loss qualifier alina jidkova second round generali ladies']

In [118]:
# organize into sequences of tokens
length = 15 + 1
sequences = list()
for i in range(length, len(tokens)+1):
    # select sequence of tokens
    seq = tokens[i - length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 154733


In [119]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()
# save sequences to file
out_filename = 'sports_sequences.txt'
save_doc(sequences, out_filename)

In [120]:
#load doc
doc = load_doc('sports_sequences.txt')
lines = doc.split('\n')
lines[0]

'AP Washington Wizards point guard Steve Blake miss first month season injuring ankle pickup fans across'

### 3.2 Encode Sequences

In [121]:
# encode sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [122]:
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
vocab_size

16058

In [123]:
#Sequence Inputs and Output
sequences = array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]
sequences = array(sequences)
sequences

array([[  134,   122,  1471, ..., 16056,   100,  1206],
       [  122,  1471,   407, ...,   100,  1206,   502],
       [ 1471,   407,   253, ...,  1206,   502,    25],
       ...,
       [ 3813,  4088,  3185, ...,   431,    19,  1376],
       [ 4088,  3185, 16055, ...,    19,  1376,  3554],
       [ 3185, 16055,  3208, ...,  1376,  3554,    10]])

In [124]:
# separate into input and output
sequences = array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

### 3.3 Fit model

In [130]:
# define the model
def define_model(vocab_size, seq_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 50, input_length=seq_length))
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))
    # compile network
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    # summarize defined model
    model.summary()
    #plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [131]:
# define model
model = define_model(vocab_size, seq_length)
# fit model
model.fit(X, y, batch_size=128, epochs=25)
# save the model to file
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 15, 50)            802900    
_________________________________________________________________
lstm_21 (LSTM)               (None, 15, 100)           60400     
_________________________________________________________________
lstm_22 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_21 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_22 (Dense)             (None, 16058)             1621858   
Total params: 2,575,658
Trainable params: 2,575,658
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/

In [132]:
# load cleaned text sequences
in_filename = 'sports_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [133]:
seq_length = len(lines[0].split()) - 1

In [134]:
# load the model
model = load_model('model.h5')

In [135]:
# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

### 3.4 Generate Text

In [150]:
# select a seed text
seed_text = lines[randint(0, len(lines))]
print(seed_text + '\n')

Wednesday outstanding match FL The Tampa Bay Lightning captain Dave Andreychuk In accordance team financial terms



In [137]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [147]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 15)
print(generated)

new york yankees win new york patriots abrupt back quarterback john brown hit homer lead


In [154]:
samples = list()
for i in range(100):
    generated = generate_seq(model, tokenizer, seq_length, lines[randint(0, len(lines))], 15)
    samples.append(generated)
samples[0:5]

['the world cup qualifier retief schumacher hit homer lead new york yankees beat new york',
 'ruled return formula one team players said win world cup qualifier retief schumacher hit homer',
 'another olympic committee said said regarding john gibbs said statement victor drug disclose victor conte',
 'new york yankees marathon new york yankees beat city the world cup qualifier bode sharapova',
 'season torn room called new york yankees interconference marathon world cup qualifier retief schumacher homered']

# 4. Use samples to test NB and CNN 

In [192]:
sports_y = [2 for i in range(100)]
sports_y = np.array(sports_y)
sports_vector=tfidf.transform(samples)
prediction_sports=clf.predict(sports_vector)
accuracy = accuracy_score(sports_y, prediction_sports)
print('NB-tfidf accuracy: ',accuracy)

NB-tfidf accuracy:  1.0


In [205]:
#Evaluate Model
sequences = tokenizer.texts_to_sequences(samples)
sportstest = pad_sequences(sequences, maxlen=max_length)
yadd = np.concatenate((sports_y,y_test),axis=0)
sportslabels = to_categorical(np.asarray(yadd))[0:100]

_, acc = CNN_model.evaluate(sportstest, sportslabels, verbose=0)
print('CNN-Word2vec Accuracy:', acc)

CNN-Word2vec Accuracy: 0.99
