### Imports

In [1]:

import pickle as pkl
import pandas as pd
import pandas
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize 
import gensim 
from gensim.models import Word2Vec 
import rouge
from nltk.tokenize.treebank import TreebankWordDetokenizer

import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Activation, Bidirectional, Dropout, Input, concatenate, Reshape, TimeDistributed, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras import Model
import tensorflow.keras.backend as K


### Loading Data

In [2]:
data = pd.read_pickle('data/tokenized.pkl')
data

Unnamed: 0.1,Unnamed: 0,publication,content
0,0,New York Times,"[WASHINGTON, —, Congressional, Republicans, ha..."
1,1,New York Times,"[After, the, bullet, shells, get, counted, ,, ..."
2,2,New York Times,"[When, Walt, Disney, ’, s, “, Bambi, ”, opened..."
3,3,New York Times,"[Death, may, be, the, great, equalizer, ,, but..."
4,4,New York Times,"[SEOUL, ,, South, Korea, —, North, Korea, ’, s..."
...,...,...,...
47220,47220,BBC_tech,"[BT, is, introducing, two, initiatives, to, he..."
47221,47221,BBC_tech,"[Computer, users, across, the, world, continue..."
47222,47222,BBC_tech,"[A, new, European, directive, could, put, soft..."
47223,47223,BBC_tech,"[The, man, making, sure, US, computer, network..."


In [5]:
all_sentences = list(data['content'])

### Getting Relevant publications

In [6]:
# selected_publications = [
#  'Breitbart',
#  'CNN',
#  'New York Times',
#  'NPR',
#  'Fox News',
#  'Reuters']
selected_publications = [
 'Breitbart',
 'CNN',
 'New York Times']

In [7]:
all_publications = list(set(data['publication']))
all_publications

['Atlantic',
 'Talking Points Memo',
 'Buzzfeed News',
 'New York Times',
 'National Review',
 'BBC_sport',
 'BBC_tech',
 'Vox',
 'Washington Post',
 'CNN',
 'BBC_entertainment',
 'Business Insider',
 'Breitbart',
 'New York Post',
 'Fox News',
 'Guardian',
 'BBC_politics',
 'NPR',
 'BBC_business',
 'Reuters']

In [8]:
# Take only the contents from publications with >= 3000 samples.
publications = [pub for pub in all_publications if len(data[data['publication'] == pub]) >= 3000 and pub in selected_publications]
publications

['New York Times', 'CNN', 'Breitbart']

In [9]:
contents = []
for pub in publications:
    contents.append(np.asarray(data[data['publication'] == pub]['content']))

### Padding with special Character

In [10]:
max_seq_length = max([len(seq) for content in contents for seq in content])
max_seq_length

307

In [11]:
end_token = '~?@_'

In [12]:
for content in contents:
    for seq in content:
        seq.extend([end_token] * (max_seq_length - len(seq)))

In [13]:
max_seq_length = max([len(seq) for content in contents for seq in content])
max_seq_length

307

### Vectorize Words

In [14]:
word_dim = 100

In [15]:

word2vec = gensim.models.Word2Vec(all_sentences, min_count = 1,  
                              size = word_dim, window = 5) 

In [16]:
print("Cosine similarity between 'congress' " + 
               "and 'senate' - CBOW : ", 
    word2vec.wv.similarity('congress', 'senate')) 
      
print("Cosine similarity between 'congress' " +
                 "and 'house' - CBOW : ", 
    word2vec.wv.similarity('congress', 'house')) 

Cosine similarity between 'congress' and 'senate' - CBOW :  0.5585541
Cosine similarity between 'congress' and 'house' - CBOW :  0.20196319


In [17]:
word2vec.wv['Congressional']

array([-1.81409   , -1.8259698 , -0.5213424 , -1.0039812 , -1.1219344 ,
       -1.1313971 ,  1.053734  , -2.0641692 ,  0.08805798, -1.3682878 ,
       -1.0811073 , -1.4631051 , -0.7301839 ,  0.65787923,  0.07922222,
        0.11877885, -1.6522729 , -1.4045465 , -1.2127597 ,  1.4468809 ,
       -0.9213013 , -0.24367768, -2.4355125 ,  1.3575794 ,  0.0561852 ,
       -0.24700265,  0.4540977 ,  2.4848862 ,  1.3756186 , -0.47934413,
       -1.4292005 , -0.05118076,  0.5859316 , -1.1900172 , -0.52885246,
       -1.2477894 ,  1.9266349 ,  1.2263732 ,  0.91029304,  1.9951388 ,
        1.9347837 ,  1.3460112 , -1.2425102 ,  0.5068862 , -0.25383335,
       -0.6194678 ,  2.034936  ,  1.4890639 ,  0.3475387 ,  1.3723586 ,
       -1.1544163 ,  0.41810307, -0.36372223,  3.263224  ,  0.8378256 ,
        0.03856661,  1.5172393 , -0.5618742 , -0.01595489, -0.18518563,
       -0.19397275,  0.8147149 , -0.46721897, -1.6012917 ,  0.53174263,
        1.0769796 , -0.46443713, -2.0040765 ,  0.23908596, -1.56

In [18]:
word2vec.wv.similar_by_vector(word2vec.wv['Congressional'])

[('Congressional', 0.9999998807907104),
 ('Budget', 0.8419904708862305),
 ('Ethics', 0.7599501609802246),
 ('Government', 0.7324299812316895),
 ('Fairness', 0.7036246657371521),
 ('Accountability', 0.6974054574966431),
 ('Information', 0.6918414831161499),
 ('Freedom', 0.6913105249404907),
 ('Services', 0.6477600932121277),
 ('Relations', 0.6472127437591553)]

In [19]:
contents = np.asarray(contents)

In [20]:
samples = np.zeros(shape=(contents.shape[0], contents.shape[1], max_seq_length, word_dim))

In [21]:
for i in range(contents.shape[0]):
    for j in range(contents.shape[1]):
        for k in range(max_seq_length):
            samples[i, j, k, :] = word2vec.wv[contents[i, j][k]]

In [22]:
# TODO use closest cosine distance to find output word.

## Funciton Definitions

In [23]:
def squareError(xTrue, xPred):
    return K.square(xTrue - xPred)


In [24]:
def reconstructionLoss(sample, encoder, decoder, f_w, weight): # (L_1 from the paper)
    return K.mean(squareError(sample, decoder(encoder(sample)))) + K.mean(weight*K.log(f_w(encoder(sample))))


In [25]:
def divergenceLoss(f_w, encoder, sample, z_j, n_j): # Mean of log f_w(E_theta_i(x_j)) + log (1-f_w(z_j, n_j)) from the paper (L_2).
    return K.mean(K.log(f_w(encoder(sample)))) + K.mean(K.log(1 - f_w([z_j, n_j])))


In [26]:
def sample(data, domain, num_samples):
    N = data.shape[1]
    return tf.convert_to_tensor(data[domain, np.random.choice(N, num_samples, replace=True),:,:], dtype=tf.float32)


In [27]:
# Currently just doing a restriction to the last z variables, might want to do a matrix multiplication?
# pi_Z from the paper. projects a latent distribution in (z, n) to z
def projectZ(encoded):
    return encoded[0] # take zs.

In [28]:
def projectN(encoded):
    return encoded[1] # taek Ns.

In [29]:
# takes in two inputs, n and z, and outputs samples.
def createDecoder(z_dims, n_dims, time_steps, output_dims):
    # TODO MAYBE: Add in more regularization or different than dropout?

    z_inputs = Input(shape=(z_dims,))
    n_inputs = Input(shape=(n_dims,))
    inputs = concatenate([z_inputs, n_inputs])
#     # 150 is arbitrary rn...
#     dense = Dense(150)(inputs)
    dense = Dense(time_steps*output_dims)(inputs)
    reshape = Reshape((time_steps, output_dims))(dense)
    # TODO Reshape to enforce time_steps?
    bilstm = Bidirectional(LSTM(32, activation='tanh', return_sequences=True))(reshape)
    bilstm = Dropout(0.2)(bilstm)
    bilstm = Bidirectional(LSTM(32, activation='tanh', return_sequences=False))(bilstm)
    
    dense = Dense(time_steps*output_dims, activation='linear')(bilstm)
    outputs = Reshape((time_steps, output_dims))(dense)
    
    model = Model(inputs=[z_inputs, n_inputs], outputs=outputs)
    
    return model

In [30]:
def createEncoder(time_steps, input_num, z_dims, n_dims):
    # TODO MAYBE: Add in more regularization or different than dropout?
    inputs = Input(shape=(time_steps, input_num,))
    bilstm = Bidirectional(LSTM(32, activation='tanh', return_sequences=True))(inputs)
    bilstm = Dropout(0.2)(bilstm)
    dense = Bidirectional(LSTM(32, activation='tanh', return_sequences=False))(bilstm)
    dense = Dropout(0.5)(dense)
    z_output = Dense(z_dims, activation='linear')(dense)
    n_output = Dense(n_dims, activation='linear')(dense)
    
    model = Model(inputs=inputs, outputs=[z_output, n_output])
    
    return model

In [31]:
def createDiscriminator(z_dims, n_dims):
    z_inputs = Input(shape=(z_dims,))
    n_inputs = Input(shape=(n_dims,))
    inputs = concatenate([z_inputs, n_inputs])
    
    # 150, 100 is arbitrary rn...
    dense = Dense(150, activation='relu')(inputs)
    dense = Dense(100, activation='relu')(dense)
    output = Dense(1, activation='sigmoid')(dense)
    
    model = Model(inputs=[z_inputs, n_inputs], outputs=output)
    
    return model

In [48]:
lr = 5e-4
enc_optimizer = tf.keras.optimizers.Adam(lr)
dec_optimizer = tf.keras.optimizers.Adam(lr)
disc_optimizer = tf.keras.optimizers.Adam(lr)

### When $P_Z$ is known... 

In [49]:

# k is num of domains.
# encoders is a list of encoders.
# decoders is list of decoders.
# samples is a K x N x Timesteps x dim, array of samples, where the 0th index is the domain,
# the 1th index is the # of the sample in that domain, 2th index is the # timesteps per sequence, 3th index is the #
# of dimensions at each timestep
# original_domains is a list of the original domains P_z was derived from.

# Currently assuming P_Z is known. Must approximate P_Z first.
def trainAutoencodersWithPz(samples, encoders, decoders, discriminator, num_samples, original_domains, epochs, weight=1.0):
    N = samples.shape[1]
    k = samples.shape[0]
        
    
    for i in range(k):
        if i not in original_domains:
            original_domain = np.random.choice(original_domains)
            encoder = encoders[i]
            decoder = decoders[i]
            original_encoder = encoders[original_domain]
            epoch = 0
            while(epoch < epochs): # TOOD: could also do until some convergence criteria.
                p_Xi_samples = sample(samples, i, num_samples)
                p_Z_samples = projectZ(original_encoder(sample(samples, original_domain, num_samples)))
                p_Ni_samples = projectN(encoder(sample(samples, i, num_samples)))

                with tf.GradientTape() as enc_tape, tf.GradientTape() as dec_tape, tf.GradientTape() as disc_tape:

                    reconstruction_loss = reconstructionLoss(p_Xi_samples, encoder, decoder, discriminator, weight)

                    # negative b/c gradient ascent.
                    divergence_loss = -1 * divergenceLoss(discriminator, encoder, p_Xi_samples, p_Z_samples, p_Ni_samples)

                gradients_of_encoder = enc_tape.gradient(reconstruction_loss, encoder.trainable_variables)
                gradients_of_decoder = dec_tape.gradient(reconstruction_loss, decoder.trainable_variables)
                gradients_of_discriminator = disc_tape.gradient(divergence_loss, discriminator.trainable_variables)


                enc_optimizer.apply_gradients(zip(gradients_of_encoder, encoder.trainable_variables))
                dec_optimizer.apply_gradients(zip(gradients_of_decoder, decoder.trainable_variables))
                disc_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
                
                print('Domain {}, Epoch {}:\n\tReconstruction Loss: {}\n\tDivergence Loss: {}'.format(i, epoch+1, reconstruction_loss, divergence_loss))
                epoch+=1
            

### When $P_Z$ is unknown...
"A straight-forward approach for learning the latent distribution PZ is to train a regularized autoencoder on data from a
single representative domain. However, such a representation could potentially capture variability that is specific to
that one domain. To learn a more invariant latent representation, we propose the following extension of our autoencoder
framework. The basic idea is to alternate between training
multiple autoencoders until they agree on a latent representation that is effective for their respective domains. This is
particularly relevant for applications to biology; for example, often one is interested in learning a latent representation
that integrates all of the data modalities."

In [50]:

# k is num of domains.
# encoders is a list of encoders.
# decoders is list of decoders.
# samples is a K x N x Timesteps x dim, array of samples, where the 0th index is the domain,
# the 1th index is the # of the sample in that domain, 2th index is the # timesteps per sequence, 3th index is the #
# of dimensions at each timestep
# domains is a list of the domains we are currently training over.

def trainAutoencodersInitial(samples, encoders, decoders, discriminator, num_samples, domains, epochs, weight=1.0):
    N = samples.shape[1]
    k = samples.shape[0]
    
    for i in domains:
        encoder = encoders[i]
        decoder = decoders[i]
        for j in domains:
            if i != j:
                j_encoder = encoders[j]
                epoch = 0
                while(epoch < epochs): # TOOD: could also do until some convergence criteria.
                    p_Xi_samples = sample(samples, i, num_samples)
                    p_Zj_samples = projectZ(j_encoder(sample(samples, j, num_samples)))
                    p_Ni_samples = projectN(encoder(sample(samples, i, num_samples)))

                    with tf.GradientTape() as enc_tape, tf.GradientTape() as dec_tape, tf.GradientTape() as disc_tape:

                        reconstruction_loss = reconstructionLoss(p_Xi_samples, encoder, decoder, discriminator, weight)
#                         print(p_Xi_samples)

                        # negative b/c gradient ascent.
                        divergence_loss = -1 * divergenceLoss(discriminator, encoder, p_Xi_samples, p_Zj_samples, p_Ni_samples)
#                         print(p_Zj_samples)
#                         print(p_Ni_samples)
                        
                    gradients_of_encoder = enc_tape.gradient(reconstruction_loss, encoder.trainable_variables)
                    gradients_of_decoder = dec_tape.gradient(reconstruction_loss, decoder.trainable_variables)
                    gradients_of_discriminator = disc_tape.gradient(divergence_loss, discriminator.trainable_variables)
            

                    enc_optimizer.apply_gradients(zip(gradients_of_encoder, encoder.trainable_variables))
                    dec_optimizer.apply_gradients(zip(gradients_of_decoder, decoder.trainable_variables))
                    disc_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
                    
                    print('Domain {}, Epoch {}:\n\tReconstruction Loss: {}\n\tDivergence Loss: {}'.format(i, epoch+1, reconstruction_loss, divergence_loss))
                    epoch+=1
            

In [51]:
# samples is a K x N x Timesteps x dim, array of samples, where the 0th index is the domain,
# the 1th index is the # of the sample in that domain, 2th index is the # timesteps per sequence, 3th index is the #
# of dimensions at each timestep

def initModel(samples, z_dims, n_dims):
    
    k = samples.shape[0]
    N = samples.shape[1]
    time_steps = samples.shape[2]
    dim = samples.shape[3]
    
    
    
    discriminator = createDiscriminator(z_dims, n_dims)
    
    encoders = []
    decoders = []
    
    for i in range(k):
        encoders.append(createEncoder(time_steps, dim, z_dims, n_dims))
        decoders.append(createDecoder(z_dims, n_dims, time_steps, dim))
    
    return encoders, decoders, discriminator

In [52]:
def translate(start_sequences, samples, encoders, decoders, start_domain, end_domain):
    N = samples.shape[1]
    print(start_sequences.shape)
    num_samples = start_sequences.shape[0]
    
    start_encoder = encoders[start_domain]
    end_encoder = encoders[end_domain]
    end_decoder = decoders[end_domain]
    
    z = projectZ(start_encoder(start_sequences))
    n = projectN(end_encoder(sample(samples, end_domain, num_samples)))
    
    end_sequences = end_decoder([z, n])
    return end_sequences
    

In [53]:
def vecSeqToSentence(sequence):
    sequence = K.eval(sequence)
    sentence = []
    for i in range(sequence.shape[0]):
        word = sequence[i,:]
#         print(word)
#         print(word2vec.wv.similar_by_vector(word))
        sentence.append(word2vec.wv.similar_by_vector(word)[0][0])
    print(sentence)

In [54]:
n_dims = 20 # len(n)
z_dims = 80 # len(Z)

num_epochs = 1
num_samples = 128

original_domains = [0, 1]



In [55]:
# samples = tf.convert_to_tensor(samples)

In [56]:
encoders, decoders, discriminator = initModel(samples, z_dims, n_dims)

### Original First Sentence from NYT

In [57]:
' '.join(contents[0, 0])

'WASHINGTON — Congressional Republicans have a new fear when it comes to their health care lawsuit against the Obama administration : They might win . The incoming Trump administration could choose to no longer defend the executive branch against the suit , which challenges the administration ’ s authority to spend billions of dollars on health insurance subsidies for and Americans , handing House Republicans a big victory on issues . But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode , leaving millions of people without access to health insurance before Republicans have prepared a replacement . That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government . To stave off that outcome , Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law , angering conservative voters who have been d

In [58]:
seq = tf.convert_to_tensor(np.asarray([samples[0, 0, :, :]]), dtype=tf.float32)
translation = translate(seq, samples, encoders, decoders, original_domains[0], original_domains[1])

(1, 307, 100)


### Original First Sentence from NYT translated to CNN before Training (Random)

In [59]:

vecSeqToSentence(translation[0,:,:])

['bid.', 'stiches', 'woke', 'riverboat', 'ensued.', 'hemorrhaged', '112mph', 'Unfettered', 'accomodate', 'Kindergarten', 'Understood', 'modernizer', 'this✌🏻our', 'classfied', 'into', 'Filipinos', '351', '£1', 'Haggadah', 'criticism', '103.05', 'importance', 'Trumpileaks', 'KCTV', 'Stallion', '3099', 'cataclysms', 'little', 'Limit', '4.89', 'barstool', 'Succumbing', 'USICH', 'incongrous', 'Belgium.', 'smokehouse', 'treasury', 'PVDA', 'Heartbroken', 'Headboards', 'Neave', 'Bangko', 'serious', 'tMe', 'soon-to-retire', 'first', 'hideouts', 'dependability', 'Dimes', 'junta', 'cathartically', 'Hildebrandt', 'CYO', 'particular', 'Employing', 'eyeshadow', 'vital', 'taking', 'fences.', 'LAVC', 'McQueary', 'TamirRice', 'Firman', 'preliminary', 'healthspan', 'hex', 'psychotropic', 'Seaport', '19:15', 'Lonnergan', 'Downplaying', 'domestically', 'multi-billion', 'beseeching', 'fables', "'50/50", 'LeBlanc', 'homicide', 'Wass', 'sky.', 'Asmussen', 'resumed', 'abstinence', 'readouts', 'Chicago.', 'tir

In [None]:
trainAutoencodersInitial(samples, encoders, decoders, discriminator, num_samples, original_domains, epochs=10, weight=1.0)


Domain 0, Epoch 1:
	Reconstruction Loss: 0.8598251342773438
	Divergence Loss: 1.355478048324585


### Original First Sentence from NYT translated to CNN after training

In [150]:
translate(seq, samples, encoders, decoders, original_domains[0], original_domains[1])
vecSeqToSentence(translation[0,:,:])

(1, 307, 100)
['Corsica', 'harbours', 'Assurance', 'not', 'Hoerbranz', 'cohered', '6,000-strong', 'ini', 'klansman', 'dudgeon', 'Seltzer', 'Splitting', 'WKMG', 'incarcerating', 'Suchitra', 'givemecookies', 'losses.', 'internists', 'intranet', 'recalled', 'Co-operation', 'unsolvable', 'Poms', 'monolingual', 'Vannevar', 'pap', 'Licht', 'guinea', 'butt.', '50mg', 'Montengro', "'Throughout", 'Hounds', 'biweekly', 'LAUGHTER', 'Fung', 'Femi', 'pigeonholed', 'secular.', 'sheikhs', 'idle', 'perused', 'Zawia', 'épater', 'City', 'recommitment', '31m', 'pulverizes', '1561', 'SB193', 'Fraternal', 'Manteca', 'Ryen', 'Chiemingo', 'newspapers', 'Chaikin', 'depends', 'hollows', '464,000', 'Purcellville', 'double-digit', 'Haka', '572,900', 'correctly.', 'unmemorable', 'Unprocessed', '1:27:34', '1981.', 'A1C', 'adopter', 'Instagramming', 'Carolinians', 'Photoshopped', 'Cottom', 'gamesmanship', 'arguable', 'glitziest', 'Rapides', 'snowboards', 'unconcern', 'Reidar', 'Hegeman', 'stooping', 'embrace', 'yea

In [153]:
trainAutoencodersWithPz(samples, encoders, decoders, discriminator, num_samples, original_domains, epochs=10, weight=1.0)

Domain 2, Epoch 1:
	Reconstruction Loss: 0.48000383377075195
	Divergence Loss: 1.581259846687317
Domain 2, Epoch 2:
	Reconstruction Loss: 0.4359170198440552
	Divergence Loss: 1.6108452081680298
Domain 2, Epoch 3:
	Reconstruction Loss: 0.3379688262939453
	Divergence Loss: 1.640911340713501


### Original First Sentence from NYT translated to Breitbart after Training

## Evaluation with Rouge

In [None]:
detok = TreebankWordDetokenizer()

evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                        max_n=4,
                        limit_length=True,
                        length_limit=100,
                        length_limit_type='words',
                        apply_avg=False,
                        apply_best=True,
                        alpha=0.5, # Default F1_score
                        weight_factor=1.2,
                        stemming=True)

In [None]:
def evaluateOnArticles(articles, encoder, decoder):
    translated = decoder(encoder(articles))
       
    original_sentences = [detok.detokenize(tokens) for tokens in articles]
    
    translated_sentences = [vecSeqToSentence(tokens) for tokens in translated]
    
    scores = evaluator.get_scores(translated_sentences, original_sentences)
    
    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        print('\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * results['p'], 'R', 100.0 * results['r'], 'F1', 100.0 * results['f']))

In [None]:
def evaluate(articles_df, encoders, decoders):
    
    publications = articles_df.publication.unique()
    for i in range(len(publications)):
        for j in range(len(publications)):
            if (i != j):
                pub1=publications[i]
                pub2=publications[j]
                source_articles = articles_df.loc[articles_df['publication'] == pub1]['content'].tolist()

                print(pub1,"to",pub2)
                evaluateOnArticles(source_articles, encoders[i], decoders[j])
                print()