In [1]:
from pprint import pprint
from pymongo import MongoClient
from tensorflow import keras
import numpy as np
import pickle
import json
from unidecode import unidecode_expect_nonascii, unidecode
# NOTE(review): connect=False presumably defers opening the connection until the
# first operation (the mongod service is only started in the next cell) -- confirm
# against the pymongo docs.
client = MongoClient(connect=False)
# Database handle read as the global `db` by show_schema() and Corpus.get_all_rows().
db = client['newscraper']

  from ._conv import register_converters as _register_converters


In [2]:
!sudo service mongod start

In [6]:
def show_schema(table='articles_cleaned'):
    '''Pretty-print one document from `table` so its fields can be inspected.

    Args:
        table: name of the collection in the global `db` to sample.

    An empty collection prints a short notice instead of raising
    StopIteration, which is what `next(db[table].find())` did.
    '''
    sample = db[table].find_one()
    if sample is None:
        print('no documents found in', repr(table))
        return
    pprint(sample)

In [7]:
# Print one document from the raw 'articles' collection (output below).
show_schema('articles')

{'_id': ObjectId('5a2730f35cedcc6022e9026e'),
 'flags': ['left-center', 'very high'],
 'source': 'https://brookings.edu',
 'text': 'A chronicle of the year that changed Soviet Russia—and molded the '
         'future path of one of America’s pre-eminent diplomatic '
         'correspondents\n'
         '\n'
         '1956 was an extraordinary year in modern Russian history. It was '
         'called “the year of the thaw”—a time when Stalin’s dark legacy of '
         'dictatorship died in February only to be reborn later that December. '
         'This historic arc from rising hope to crushing despair opened with a '
         'speech by Nikita Khrushchev, then the unpredictable leader of the '
         'Soviet Union. He astounded everyone by denouncing the one figure '
         'who, up to that time, had been hailed as a “genius,” a wizard of '
         'communism—Josef Stalin himself. Now, suddenly, this once '
         'unassailable god was being portrayed as a “madman” whose '
    

In [3]:
class Corpus:
    ''' Retrieves data from MongoDB

    Holds the query settings (collection, text field, vocabulary size) and
    the fixed list of label names used to build the target vectors.
    '''

    def __init__(self, db_table='articles', field='text', n_words=20000):
        '''
        Args:
            db_table: collection in the global `db` to read from.
            field: document key holding the article text.
            n_words: vocabulary size used by the vectorizer subclasses.
        '''
        self.n_words = n_words
        self.field = field
        self.db_table = db_table
        # The position of a label in this list is its index in the y vectors.
        self.labels = [
            'center', 'conspiracy', 'extreme left', 'extreme right',
            'fake news', 'hate', 'high', 'left', 'left-center', 'low', 'mixed',
            'pro-science', 'propaganda', 'right', 'right-center', 'satire',
            'very high'
        ]

    def get_all_rows(self):
        ''' Retrieve target table from db

        Stores every document whose `field` is present and non-empty in
        `self.articles` and their count in `self.n_articles`.
        '''
        print(self.n_words)
        # .get() so a document lacking the field is skipped rather than
        # aborting the whole load with a KeyError.
        self.articles = [
            doc for doc in db[self.db_table].find() if doc.get(self.field)
        ]
        self.n_articles = len(self.articles)


from keras.preprocessing import text as Text


class KerasVectorizer(Corpus):
    ''' Performs vectorization and text preprocessing

    Training mode (predict_str falsy): articles are the documents loaded
    from MongoDB and x_y() yields the model inputs and label vectors.
    Prediction mode (predict_str is a list of raw strings): the tokenizer
    cached by an earlier fit() is loaded on demand, so transform_x() and
    transform_x_onehot() can be called directly.
    '''

    # Cache files written by fit() and reused by later runs.
    VECTOR_PATH = 'vector234.pkl'
    LOOKUP_PATH = 'lookup234.json'
    # Characters that clean() treats as separators.
    FILTERS = '''1234567890!"#$%&()*+,-\n./—:;<=>?@[\\]^_`{|}~\t\'“”'''

    def __init__(self, dnn_type='seq', max_len=1000, predict_str=False):
        '''
        Args:
            dnn_type: 'seq' for padded index sequences, 'bow' for the
                tokenizer's texts_to_matrix() output.
            max_len: fixed sequence length used by transform_x_onehot().
            predict_str: list of raw texts to vectorize; when falsy the
                training corpus is loaded from the database instead.
        '''
        super().__init__()
        if not predict_str:
            self.get_all_rows()
            self.train = True
        else:
            self.articles = predict_str
            self.train = False
        self.dnn_type = dnn_type
        self.max_len = max_len

    def clean(self, seq):
        ''' Transliterate to ASCII, lower-case and strip FILTERS characters.

        Returns '' for empty input (previously None, which crashed the
        tokenizer downstream).
        '''
        if not seq:
            return ''
        seq = unidecode(seq)
        return ' '.join(Text.text_to_word_sequence(seq, filters=self.FILTERS))

    def _cleaned_texts(self):
        ''' Cleaned text of every article, whatever the mode. '''
        if self.train:
            # Training rows are documents; the text lives under self.field.
            return [self.clean(row[self.field]) for row in self.articles]
        # Prediction rows are already raw strings.
        return [self.clean(raw) for raw in self.articles]

    def _use_tokenizer(self, tokenizer):
        ''' Install `tokenizer` and derive the word -> index lookup from it. '''
        self.corpus_vector = tokenizer
        self.lookup = {
            word: idx
            for word, idx in tokenizer.word_index.items()
            if idx < self.n_words
        }

    def _ensure_fitted(self):
        ''' Load the cached tokenizer when fit() has not run on this instance. '''
        if hasattr(self, 'corpus_vector'):
            return
        # NOTE: pickle executes arbitrary code on load; only use cache files
        # written by this notebook.
        with open(self.VECTOR_PATH, 'rb') as fh:
            self._use_tokenizer(pickle.load(fh))

    def fit(self):
        ''' Fit vectorizer on corpus

        Reuses the tokenizer cached in VECTOR_PATH when present; otherwise
        cleans the corpus, fits a new tokenizer and caches it. The lookup
        is always (re)written to LOOKUP_PATH.
        '''
        try:
            # NOTE: pickle executes arbitrary code on load; only use cache
            # files written by this notebook.
            with open(self.VECTOR_PATH, 'rb') as fh:
                tokenizer = pickle.load(fh)
        except FileNotFoundError:
            if not self.train:
                # Never build (and cache) a vocabulary from prediction input.
                raise
            # The corpus is only cleaned when a tokenizer must be fitted.
            print('cleaning text')
            texts = self._cleaned_texts()
            print('fitting vector')
            tokenizer = Text.Tokenizer(self.n_words)
            tokenizer.fit_on_texts(texts)
            with open(self.VECTOR_PATH, 'wb') as fh:
                pickle.dump(tokenizer, fh)
        self._use_tokenizer(tokenizer)

        with open(self.LOOKUP_PATH, 'w') as fh:
            json.dump(self.lookup, fh)

    def gen_x_onehot(self):
        ''' Yield each article as a list of vocabulary indices.

        Words missing from the lookup are dropped. Text is cleaned in both
        modes so prediction input is tokenized like the training corpus.
        '''
        self._ensure_fitted()
        for entry in self._cleaned_texts():
            words = Text.text_to_word_sequence(entry)
            yield [self.lookup[word] for word in words if word in self.lookup]

    def transform_x_onehot(self):
        ''' Yield each article as an index array of exactly max_len entries.

        Longer articles keep their last max_len tokens; shorter ones are
        left-padded with 0. Also records self.rev_lookup (index -> word)
        and self.lens (unpadded lengths).
        '''
        sequences = list(self.gen_x_onehot())
        self.rev_lookup = {idx: word for word, idx in self.lookup.items()}
        v_len = self.max_len
        print('using limit of', v_len)
        self.lens = []
        for seq in sequences:
            self.lens.append(len(seq))
            if len(seq) >= v_len:
                yield np.array(seq[-v_len:])
            else:
                yield np.array([0] * (v_len - len(seq)) + seq)

    def transform_y(self):
        ''' Vectorizes y labels

        Yields one 0/1 array per article, ordered like self.labels.
        '''
        for entry in self.articles:
            yield np.array(
                [1 if label in entry['flags'] else 0 for label in self.labels])

    def transform_x(self):
        ''' Transforms texts to the vector

        Returns the tokenizer's texts_to_matrix() output for all articles.
        '''
        self._ensure_fitted()
        return self.corpus_vector.texts_to_matrix(self._cleaned_texts())

    def x_y(self):
        ''' Fit the vectorizer and return (x, y) for training.

        Raises:
            ValueError: if dnn_type is neither 'seq' nor 'bow'.
        '''
        if self.dnn_type not in ('seq', 'bow'):
            raise ValueError(
                "dnn_type must be 'seq' or 'bow', got %r" % (self.dnn_type, ))
        self.fit()
        print('producing x, y data')
        y = list(self.transform_y())

        if self.dnn_type == 'seq':
            x = list(self.transform_x_onehot())
        else:
            x = self.transform_x()
        return x, y


def prep_data():
    '''Vectorize the training corpus.

    Returns:
        (vectorizer, x, y): the KerasVectorizer plus the data from its x_y().
    '''
    # max_len=2000; article-length reference:
    #http://www.newswhip.com/2013/12/article-length/
    vectorizer = KerasVectorizer(max_len=2000)
    features, targets = vectorizer.x_y()
    print('data prepared')
    print(features[0].shape)
    return vectorizer, features, targets


def predict_data(text):
    '''Vectorize a single raw text for prediction.

    Args:
        text: the article text; it is wrapped in a one-element list.

    Returns:
        (vectorizer, x): the KerasVectorizer and its transform_x() output.
    '''
    vectorizer = KerasVectorizer(max_len=2000, predict_str=[text])
    matrix = vectorizer.transform_x()
    print('data prepared')
    print(matrix[0].shape)
    return vectorizer, matrix

Using TensorFlow backend.


In [4]:
# !rm *.pkl
# 

In [5]:
#%%time


def _val_split(x, y, val_size, seed):
    '''Shuffle x and y together and split off the last `val_size` fraction.'''
    assert len(y) == len(x)
    n_rows = len(x)
    val_ind = int(n_rows * val_size)
    print(val_ind, n_rows)

    order = np.arange(n_rows)
    # seed=None keeps using numpy's global RNG, as before.
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(order)

    x = np.array(x)[order]
    y = np.array(y)[order]

    # Slice both parts from the SAME shuffled arrays. Previously x and y were
    # truncated first and the "validation" rows were then taken from the
    # truncated arrays, so they were also training rows and the real hold-out
    # was thrown away. An explicit split index also handles val_ind == 0,
    # where x[:-0] would have emptied the training set.
    split = n_rows - val_ind
    return x[:split], y[:split], x[split:], y[split:]


def train_setup(val_size=.15, seed=None):
    '''Prepare the corpus and split it into training and validation sets.

    Args:
        val_size: fraction of rows held out for validation.
        seed: optional seed for a reproducible shuffle.

    Returns:
        (x, y, x_val, y_val, k_v) where k_v is the fitted vectorizer
        returned by prep_data().
    '''
    k_v, X, Y = prep_data()
    x, y, x_val, y_val = _val_split(X, Y, val_size, seed)
    return x, y, x_val, y_val, k_v


def load_pickles():
    '''Yield the cached x, y, x_val, y_val and k_v objects, in that order.

    Each object is read from '<name>.pkl' in the working directory.

    Raises:
        FileNotFoundError: when iteration reaches a missing cache file.
    '''
    for name in ('x', 'y', 'x_val', 'y_val', 'k_v'):
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # written by this notebook.
        with open(name + '.pkl', 'rb') as fh:
            obj = pickle.load(fh)
        # Yield after the file is closed so no handle stays open while the
        # generator is suspended.
        yield obj


def save_pickles():
    '''Run train_setup() and cache its five results as '<name>.pkl' files.

    This is a generator: nothing runs until it is iterated (callers use
    `list(save_pickles())`). It yields None once per file written.
    '''
    x, y, x_val, y_val, k_v = train_setup()
    print('saving pickles')
    to_save = {'x': x, 'y': y, 'x_val': x_val, 'y_val': y_val, 'k_v': k_v}
    for name, obj in to_save.items():
        # Context manager so each file is flushed and closed before the next.
        with open(name + '.pkl', 'wb') as fh:
            pickle.dump(obj, fh)
        yield None


# Load the cached training data, rebuilding the cache when it cannot be read.
try:
    x, y, x_val, y_val, k_v = list(load_pickles())
    print('pickles loaded')
except Exception as e:
    # Deliberately broad: any unreadable cache (missing, truncated, ...)
    # triggers a rebuild. The reason is printed first so it is visible
    # before the long-running rebuild starts.
    print(e)
    list(save_pickles())
    # Reload only on this path; a successful first load is no longer
    # repeated by a `finally` clause.
    x, y, x_val, y_val, k_v = list(load_pickles())
    

pickles loaded


In [1]:
# i = np.random.randint(0, len(x))
# print(i)
# print(x[i])
# #print([k_v.labels[n] for n,v in enumerate(y[i]) if v >0])
# for word in x[i]:
#     if word:
#         print(k_v.rev_lookup[word])

In [None]:
def dnn(model_type='cnn'):
    '''Build, compile and train a multi-label classifier.

    Reads the notebook globals x, y, x_val, y_val and k_v. The model is
    checkpointed to 'tester.h5' and TensorBoard logs go to './logs/CNN'.

    Args:
        model_type: which architecture to train -- 'cnn' (the default and
            the one previously hard-wired), 'rnn', or 'bow' for the dense
            network over vocabulary-sized inputs.

    Raises:
        ValueError: if model_type is not one of the names above.
    '''
    Sequential = keras.models.Sequential
    Model = keras.models.Model
    Input = keras.layers.Input
    Embedding = keras.layers.Embedding
    Reshape = keras.layers.Reshape
    Conv2D = keras.layers.Conv2D
    MaxPool2D = keras.layers.MaxPool2D
    Concatenate = keras.layers.Concatenate
    Flatten = keras.layers.Flatten
    Dropout = keras.layers.Dropout
    Dense = keras.layers.Dense
    Activation = keras.layers.Activation
    BatchNormalization = keras.layers.BatchNormalization
    ModelCheckpoint = keras.callbacks.ModelCheckpoint

    # One sigmoid output per label; derived from the vectorizer so it
    # cannot drift from the y vectors.
    n_classes = len(k_v.labels)

    def define_model_rnn():
        '''Embedding -> GRU -> sigmoid outputs, over index sequences.'''
        vector_len = x[0].shape[0]
        vocab_size = k_v.n_words
        embedding_dim = 10
        model = Sequential()
        model.add(
            Embedding(vocab_size, embedding_dim, input_shape=(vector_len, )))
        model.add(keras.layers.GRU(3, dropout=0.2, recurrent_dropout=0.2))
        model.add(Activation('relu'))
        model.add(Dense(n_classes, ))
        model.add(Activation('sigmoid'))
        return model

    def define_model():
        '''Dense network whose input width is the vocabulary size.'''
        vector_len = k_v.n_words
        model = Sequential()
        model.add(Dense(128, input_shape=(vector_len, )))
        model.add(Activation('relu'))
        model.add(BatchNormalization())
        model.add(Dense(32))
        model.add(Activation('relu'))
        model.add(BatchNormalization())
        model.add(Dense(n_classes, ))
        model.add(Activation('sigmoid'))
        return model

    def define_model_cnn():
        '''Parallel convolutions over the embedded sequence, max-pooled
        over the whole sequence, then two dense layers and sigmoid outputs.'''
        sequence_length = x.shape[1]
        vocabulary_size = k_v.n_words
        embedding_dim = 10
        filter_sizes = [1, 2, 3]
        num_filters = 128
        drop = 0.5

        inputs = Input(shape=(sequence_length, ), dtype='int32')
        embedding = Embedding(
            input_dim=vocabulary_size,
            output_dim=embedding_dim,
            input_length=sequence_length)(inputs)
        # Trailing channel axis so Conv2D can slide over (tokens, embedding).
        reshape = Reshape((sequence_length, embedding_dim, 1))(embedding)

        # One conv + pool branch per filter size; each kernel spans the
        # full embedding width and the pool spans the full conv output.
        pooled = []
        for size in filter_sizes:
            conv = Conv2D(
                num_filters,
                kernel_size=(size, embedding_dim),
                padding='valid',
                kernel_initializer='normal',
                activation='relu')(reshape)
            pooled.append(
                MaxPool2D(
                    pool_size=(sequence_length - size + 1, 1),
                    strides=(1, 1),
                    padding='valid')(conv))

        concatenated_tensor = Concatenate(axis=1)(pooled)

        flatten = Flatten()(concatenated_tensor)
        dropout = Dropout(drop)(flatten)
        dense1 = Dense(64, activation='relu')(dropout)
        bnorm1 = BatchNormalization()(dense1)
        dense2 = Dense(32, activation='relu')(bnorm1)
        bnorm2 = BatchNormalization()(dense2)
        output = Dense(units=n_classes, activation='sigmoid')(bnorm2)

        model = Model(inputs=inputs, outputs=output)
        # summary() prints by itself; wrapping it in print() added a stray None.
        model.summary()
        return model

    builders = {
        'cnn': define_model_cnn,
        'rnn': define_model_rnn,
        'bow': define_model,
    }
    if model_type not in builders:
        raise ValueError('model_type must be one of %s, got %r' %
                         (sorted(builders), model_type))

    print('starting training')

    def train():
        '''Compile the selected model and fit it with early stopping.'''
        model = builders[model_type]()

        early_stop = keras.callbacks.EarlyStopping(
            monitor='val_loss',
            min_delta=0,
            patience=5,
            verbose=1,
            mode='auto')
        # save_best_only=True: with patience=5 the final epoch is by
        # construction not the best one, so saving every epoch left a
        # worse model in tester.h5 than the best seen on validation.
        checkpointer = ModelCheckpoint(
            filepath='tester.h5', verbose=1, save_best_only=True)

        model.compile(
            loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        model.fit(
            np.array(x),
            np.array(y),
            epochs=80,
            verbose=1,
            validation_data=(x_val, y_val),
            callbacks=[
                keras.callbacks.TensorBoard(
                    log_dir='./logs/CNN', write_graph=False),
                early_stop,
                checkpointer,
            ])

    train()


# Train the CNN model defined above; the training log follows.
dnn()

starting training
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 2000)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 2000, 10)     200000      input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 2000, 10, 1)  0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 2000, 1, 128) 1408        reshape_1[0][0]                  
___________________________________________________________________________________________


Epoch 12/80

Epoch 13/80

Epoch 14/80

Epoch 15/80

Epoch 16/80

Epoch 17/80

Epoch 18/80

Epoch 19/80

Epoch 20/80

Epoch 21/80

Epoch 22/80

Epoch 23/80

Epoch 24/80

Epoch 25/80

Epoch 26/80

Epoch 27/80

Epoch 28/80

Epoch 29/80

Epoch 30/80

Epoch 31/80

Epoch 32/80

Epoch 33/80

Epoch 34/80

Epoch 35/80



Epoch 36/80

Epoch 37/80

Epoch 38/80

Epoch 39/80

In [None]:
%%time
    
load_model = keras.models.load_model
model = load_model('tester.h5')

# Output-unit index -> label name, in the order of k_v.labels.
label_dict = {i: k for i, k in enumerate(k_v.labels)}


def _scores_by_label(pred):
    '''Map each label name to its predicted score, rounded to 6 decimals.'''
    return {
        label_dict[i]: round(float(p), 6)
        for i, p in enumerate(pred.flatten())
    }


# Score the first 10 validation rows in one batched predict() call instead of
# one call per row. `preds` stays a list of (1, n_labels) arrays.
sample_batch = np.stack([np.array(text).reshape(-1) for text in x_val[:10]])
preds = [row.reshape(1, -1) for row in model.predict(sample_batch)]
np.set_printoptions(precision=3, suppress=True)

pred_dict = _scores_by_label(preds[0])
final_output = [_scores_by_label(pred) for pred in preds]

In [None]:
from pprint import pprint

# For each scored article print: [top predicted label], [true labels], and
# 1 if the top prediction is among the true labels (else 0).
# Iterates over final_output itself: it holds only the rows that were scored
# (10), so the former range(15) ran past its end with an IndexError.
for i, scores in enumerate(final_output):
    t = [
        label for label, _score in sorted(
            scores.items(), key=lambda kv: kv[1], reverse=True)[:1]
    ]

    p = [k_v.labels[j] for j, flag in enumerate(y_val[i]) if flag > 0]
    print(t, '\t', p, len(set(p) & set(t)))

In [None]:
import pandas as pd
%matplotlib inline

# Label each scored article with its row number and true labels. The raw
# y_val slice is a 2-D 0/1 array, which is not a usable DataFrame index.
true_labels = [
    '{}: {}'.format(
        n, ', '.join(name for name, flag in zip(k_v.labels, flags)
                     if flag > 0))
    for n, flags in enumerate(y_val[:len(final_output)])
]
res = pd.DataFrame(final_output, index=true_labels)
ax = res.transpose().plot(kind='barh')
ax.set(
    xlabel='predicted score',
    ylabel='label',
    title='Predicted label scores per validation article');


In [None]:
from itertools import islice

def metadata():
    '''Write the vocabulary to ./logs/test/metadata.tsv and print its line count.

    The file has a 'word<TAB>value' header, a NULL row, then one row per
    entry of k_v.lookup ordered by index. The target directory is created
    when missing (previously a FileNotFoundError).
    '''
    import os

    path = './logs/test/metadata.tsv'
    os.makedirs(os.path.dirname(path), exist_ok=True)

    with open(path, 'w') as meta:
        meta.write('word\tvalue\n')
        # NOTE(review): presumably a placeholder row for index 0, the padding
        # value used in the sequences -- confirm.
        meta.write('NULL\tNULL\n')
        for word, idx in sorted(k_v.lookup.items(), key=lambda kv: kv[1]):
            meta.write(word + '\t' + str(idx) + '\n')

    # Read the file back so the printed count reflects what is on disk.
    with open(path) as meta_read:
        print(len(meta_read.readlines()))
        

        
def labels():
    '''Write the label names to ./logs/test/metadata_labels.tsv and print the file.

    Rows follow the 'label<TAB>number' header: label name first, then its
    index in k_v.labels. Previously the two columns were written in the
    opposite order from the header. The target directory is created when
    missing.
    '''
    import os

    path = './logs/test/metadata_labels.tsv'
    os.makedirs(os.path.dirname(path), exist_ok=True)

    with open(path, 'w') as meta:
        meta.write('label\tnumber\n')
        for number, label in enumerate(k_v.labels):
            meta.write(str(label) + '\t' + str(number) + '\n')

    with open(path) as meta_read:
        print(meta_read.read())
# Write the vocabulary metadata file; the label file export is left disabled.
metadata()
# labels()




def visualize(start=18999, count=1000):
    '''Pretty-print a window of (word, index) pairs from k_v.lookup.

    Args:
        start: number of leading entries to skip (in the lookup's own
            iteration order).
        count: maximum number of entries to print.

    A lookup shorter than start + count just prints fewer entries; the
    manual next() loops used before raised StopIteration in that case.
    '''
    pprint(list(islice(k_v.lookup.items(), start, start + count)))
    