In [196]:
%matplotlib inline
# import libraries
import re
import numpy as np  # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import matplotlib.pyplot as plt
import os, os.path

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
# function for transforming documents into counts
from sklearn.feature_extraction.text import CountVectorizer
# function for encoding categories
from sklearn.preprocessing import LabelEncoder

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model


# Notebook-wide configuration constants.
# NOTE(review): in the cells visible here only EMBEDDING_DIM is referenced;
# the other constants appear unused -- confirm before relying on them.
BASE_DIR = ''
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
# Width of each word vector; used to size the embedding matrix and layer.
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [275]:
# grab the data of news article headlines

data_dir = '../datasets/news-data'

# Drop hidden entries (e.g. .DS_Store) *before* numbering the files, so class
# ids are contiguous (0..k-1) and no id is reserved for a skipped entry.
# Sorting makes the file -> label mapping stable, because os.listdir returns
# entries in arbitrary order.
tsv_files = sorted(name for name in os.listdir(data_dir)
                   if not name.startswith('.'))
if not tsv_files:
    raise ValueError('No data files found in %s' % data_dir)

# Each element of `news` is a (headline, class id) pair; the class id is the
# position of the source file in `tsv_files`.
news = []
for label, tsv_file in enumerate(tsv_files):
    sample_url = os.path.join(data_dir, tsv_file)
    # Only the first 2000 rows of every file are read.
    sample = pd.read_csv(sample_url, sep='\t', nrows=2000)
    news += [(title, label) for title in sample['title'].astype(str)]

# Highest class id in use; later code passes news_length + 1 as the number
# of classes.
news_length = len(tsv_files) - 1

# Shuffle the pairs together so texts and labels stay aligned.
news = shuffle(news)
texts = [pair[0] for pair in news]
labels = [pair[1] for pair in news]

def to_one_hot(x, n):
    """Return a one-hot encoding of integer class ids.

    Parameters
    ----------
    x : sequence of int
        Class id of each sample (1-D).
    n : int
        Number of columns (classes) in the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(x), n) with a 1 in column x[i] of row i
        and 0 everywhere else. An empty `x` yields an array of shape (0, n).

    Raises
    ------
    IndexError
        If any id in `x` is out of range for `n` columns.
    """
    ids = np.asarray(x)
    if ids.size == 0:
        # An empty sequence becomes a float64 array, which NumPy refuses to
        # use as an index; there is nothing to mark, so return early.
        return np.zeros((0, n))
    z = np.zeros((ids.shape[0], n))
    # Fancy indexing: for every row r, set column ids[r] to 1.
    z[np.arange(ids.shape[0]), ids] = 1
    return z

# Replace the integer class ids with their one-hot rows. news_length is the
# highest index assigned while enumerating the data files, so
# news_length + 1 columns are requested.
labels = to_one_hot(labels, news_length+1)
print(labels)
# Build the word vocabulary from the headline texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Convert every headline into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(texts)

[[ 0.  0.  0. ...,  1.  0.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  1.  0.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]]


In [276]:
# Mapping from each word seen by the tokenizer to its integer index.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad the index lists into one 2-D array. No maxlen is passed, so
# MAX_SEQUENCE_LENGTH is not applied here.
data = pad_sequences(sequences)

Found 12473 unique tokens.


In [302]:
# Report the shapes of the padded inputs and the one-hot labels. Each message
# is formatted into a single string so the output reads the same whether
# print is a statement (Python 2) or a function (Python 3); passing two
# arguments prints a tuple repr under Python 2.
print('Shape of data tensor: %s' % (data.shape,))
print('Shape of label tensor: %s' % (labels.shape,))

('Shape of data tensor:', (11885, 24))
('Shape of label tensor:', (11885, 7))


In [309]:
# split into training and tests sets
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=.3)

print(x_train.shape)

# Size of the validation set: 60% of the test-set size, which works out to
# roughly 1/3 of the training rows that remain after the validation rows are
# removed.
val_int = int(x_test.shape[0] * .6)

# Carve the validation rows off the front of the training data *before*
# shrinking the training arrays. Slicing x_train first and then taking
# x_train[val_int:] from the already-truncated array would leave the
# validation set empty.
x_val, y_val = x_train[:val_int], y_train[:val_int]
x_train, y_train = x_train[val_int:], y_train[val_int:]

(8319, 24)


In [310]:
# Make our embedding layer

# Parse the pre-trained GloVe vectors: every line is a word followed by its
# coefficients. The `with` block closes the file even if parsing fails.
embeddings_index = {}
with open('../datasets/glove.6B/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# Row i of the matrix holds the vector of the word whose index is i. One extra
# row is allocated because the tokenizer's word indices start at 1.
# Words that have no pre-trained vector keep their all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Frozen embedding layer initialised with the pre-trained vectors.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=False)

Found 400000 word vectors.


In [None]:
# Build our Model

# One sample is one padded headline, so the input length is the number of
# columns of `data` (tokens per headline), not the number of rows (samples).
sequence_input = Input(shape=(data.shape[1],), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# The padded headlines are short (24 tokens in the data shape printed
# earlier), so a 35-wide 'valid' convolution cannot fit. A 5-wide kernel with
# 'same' padding keeps every convolution valid for short sequences.
x = Conv1D(128, 5, padding='same', activation='relu')(embedded_sequences)
x = MaxPooling1D(2)(x)
x = Conv1D(128, 5, padding='same', activation='relu')(x)
x = MaxPooling1D(2)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
# One softmax unit per class, matching the width of the one-hot labels that
# categorical_crossentropy expects. A single softmax unit would always
# output 1 and could not learn anything.
preds = Dense(labels.shape[1], activation='softmax')(x)

from keras import optimizers
optimizer = optimizers.Adam(0.001)
model = Model(sequence_input, preds)
# To combine several objectives later (e.g. a sentiment head next to the
# category head), give the model extra outputs and a loss per output.
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['acc'])

In [177]:
# Train for 10 epochs in mini-batches of 128, evaluating on the held-out
# validation split after every epoch.
model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=128,
    validation_data=(x_val, y_val),
)