===========================================

Title: 10.3 Project Milestone 4

Author: Chad Wood

Date: 17 Jan 2022

Modified By: Chad Wood

Description: This notebook is Project Milestone 4 of the term project. It demonstrates setting up a deep neural network to detect bias in news articles.

===========================================

### Importing Data

In [1]:
from pathlib import Path

import pandas as pd

# Directory holding the raw article CSVs, relative to the notebook's working directory
DATA_DIR = Path('data')

# Built with pathlib so the same paths resolve on both Windows and POSIX systems
# (a literal 'data\articles1.csv' is only a directory + file name on Windows)
files = [DATA_DIR / 'articles1.csv',
         DATA_DIR / 'articles2.csv',
         DATA_DIR / 'articles3.csv']

# Loads all articles, reading only the publisher name and the article text.
# ignore_index=True gives the combined frame a single unique 0..n-1 index; without it
# every file contributes its own 0-based labels, so the same label repeats across files.
articles = pd.concat((pd.read_csv(f, usecols=['publication', 'content']) for f in files),
                     ignore_index=True)

In [2]:
# Pulls the allsides.com ratings table used to decide each publisher's bias;
# only the outlet name and its rating are read
bias_df = pd.read_csv(
    'https://raw.githubusercontent.com/favstats/AllSideR/master/data/allsides_data.csv',
    usecols=['news_source', 'rating'],
)

# Distinct publisher names that appear in the articles data
publishers = articles['publication'].unique()

#### Bias-Scoring The Data
Building the score dictionary:

In [3]:
import re

# Publishers that actually have a name; a missing name cannot be matched against bias_df
named_publishers = [pub for pub in publishers if pd.notna(pub)]

# Regex alternation matching any publisher name. Each name is escaped so that
# characters such as '.' or '+' inside a name are matched literally, not as regex syntax.
publisher = '|'.join(r'(?:{})'.format(re.escape(str(x))) for x in named_publishers)

# Selects the bias_df rows whose news_source contains one of the publisher names.
# na=False treats a missing news_source as "no match" so the mask stays purely boolean.
df = bias_df.loc[bias_df['news_source'].str.contains(publisher, case=False, na=False)]

# Replaces bias_df publisher names with articles publisher names
pub_scores = df.copy()
for pub in named_publishers:
    matches = pub_scores['news_source'].str.contains(re.escape(str(pub)), case=False, na=False)
    pub_scores.loc[matches, 'news_source'] = pub

# Defines 3 positions for bias and scores them
label = '(left|center|right)'
scores = {'left': 0, 'center': 1, 'right': 2}

# Creates score column: the first left/center/right word found in each rating, mapped to its score.
# Ratings containing none of the three words end up with a missing score.
pub_scores['score'] = pub_scores['rating'].str.extract(label)[0].map(scores)

# Drops rows without a score first, so an unscorable row can never be the one kept
# for a publisher; then keeps one row per publisher and drops the redundant column
pub_scores = (pub_scores
              .dropna(subset=['score'])
              .drop_duplicates(['news_source'])
              .drop(columns=['rating']))

# Converts to dictionary of publisher name -> integer score
pub_scores = dict(zip(pub_scores['news_source'], pub_scores['score'].astype(int)))

Applying the scores:

In [4]:
# Keeps only the articles whose publisher has an entry in pub_scores
scored_rows = articles['publication'].isin(pub_scores.keys())
articles = articles.loc[scored_rows].copy()

# Looks up every article's publisher in pub_scores and stores the result as 'scores'
articles['scores'] = articles['publication'].map(pub_scores.get)

#### Cleaning Data

In [8]:
import re
import nltk
import spacy

# spaCy's small English pipeline; normalize() runs the cleaned text through it
nlp = spacy.load('en_core_web_sm')

# NLTK's English stop words, minus the words below, which are kept in the text
stop_words = nltk.corpus.stopwords.words('english')
for kept_word in ('no', 'but', 'not', 'against'):
    stop_words.remove(kept_word)

# Patterns used by _scrub_text, applied in this order
_HTML_TAG = re.compile('<.*?>')                  # HTML tags
_LINK = re.compile(r'\S*https?:\S*')             # links
_MENTION = re.compile('@[A-Za-z0-9_]+')          # mentions
_HASHTAG = re.compile('#([a-zA-Z0-9_]{1,50})')   # hashtags
_NON_LETTER = re.compile(r'[^a-zA-Z\s]')         # anything that is not a letter or whitespace
_MULTI_SPACE = re.compile(' +')                  # runs of spaces


def _scrub_text(text):
    """Removes markup, links, mentions, hashtags and non-letter characters from one string."""
    text = _HTML_TAG.sub('', text)
    text = _LINK.sub('', text)
    text = _MENTION.sub('', text)
    text = _HASHTAG.sub('', text)
    # 'A-Z' (not 'A-z'): the wider range would also let [ \ ] ^ _ ` through
    text = _NON_LETTER.sub('', text)
    text = _MULTI_SPACE.sub(' ', text)
    return text.strip()  # Removes extra whitespace at either end


def normalize(corpus):
    """Cleans a Series of raw texts and runs them through the spaCy pipeline.

    Each text is scrubbed with regexes, passed through ``nlp.pipe``, reduced to
    its lemmas with stop words filtered out, and lower-cased.

    Args:
        corpus: pandas Series of texts. Missing values are treated as empty text.

    Returns:
        Tuple ``(clean_Series, tokens)``: the cleaned lower-case text and the
        objects yielded by ``nlp.pipe``, one per input row. Both Series carry
        ``corpus.index``, so assigning them back to the source frame lines up
        row for row.
    """
    # Missing values become empty strings so every regex receives a str
    texts = corpus.fillna('').astype(str).apply(_scrub_text)

    # Set membership is constant-time; the module-level stop_words is a list
    stop_set = set(stop_words)

    # Runs text through pipeline
    clean_list = []  # Preserves cleaned text
    tok_list = []  # Preserves tokens
    for doc in nlp.pipe(texts):
        clean_text = ' '.join(word.lemma_  # Returns roots...
                              if word.lemma_ != '-PRON-'  # ...Unless lemma is the pronoun marker...
                              else word.text  # ...Then returns text for pronouns
                              for word in doc
                              if word.lemma_ not in stop_set)  # Filters stopwords

        tok_list.append(doc)
        clean_list.append(clean_text.lower())

    # Reuses the input's index instead of a fresh 0..n-1 one, so results stay tied to their rows
    clean_Series = pd.Series(clean_list, index=corpus.index)
    tokens = pd.Series(tok_list, index=corpus.index, dtype=object)

    return clean_Series, tokens

In [9]:
# Cleans text and retrieves spacy tokens
norm_content, tokens = normalize(articles['content'])

# normalize() returns one result per input row, in input order. Assigning plain arrays
# places them by row position; assigning the Series themselves would align on index
# labels, which are not guaranteed to match the labels of `articles`.
articles['norm_content'] = norm_content.to_numpy()
articles['tokens'] = tokens.to_numpy()

In [39]:
# Balanced subset: 1000 rows drawn from every bias score (kept small to limit
# computation time), followed by a reshuffle of the combined rows. Both draws are seeded.
data = (
    articles
    .dropna()
    .groupby(['scores'])
    .sample(n=1000, random_state=1)
    .sample(frac=1, random_state=1)
)

#### Model Setup

In [12]:
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
import numpy as np

# Splits data for supervised model
shuffled = data.sample(frac=1, random_state=42)  # Shuffles data
train_end = int(.7 * len(data))
validate_end = int(.85 * len(data))

# Positional slices: 0-70% train; 70-85% validate; 85-100% test.
# Plain iloc slicing is used because np.split on a DataFrame goes through
# DataFrame.swapaxes, which recent pandas versions deprecate.
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]


# Sizes of the text (feature) column in each split
print('Reviews:',
      'Train ', train.norm_content.shape,
      'Validate ', validate.norm_content.shape,
      'Test', test.norm_content.shape)

# Sizes of the label column in each split (previously this repeated norm_content)
print('Sentiments:',
      'Train ', train.scores.shape,
      'Validate ', validate.scores.shape,
      'Test', test.scores.shape)

Reviews: Train  (36550,) Validate  (7832,) Test (7833,)
Sentiments: Train  (36550,) Validate  (7832,) Test (7833,)


#### Building Features

In [40]:
from tqdm import trange # For status bar

# Universal Sentence Encoder setup: builds a dedicated graph holding a string
# placeholder, the TF-Hub encoder module applied to it, and a single op that
# groups the variable and table initializers
flow_graph = tf.Graph()
with flow_graph.as_default():
    text_input = tf.placeholder(dtype=tf.string, shape=[None])  # 1-D batch of strings, any length
    embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3") 
    embedded_text = embed(text_input)  # encoder output for whatever is fed to text_input
    init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
    
# Finalizes the graph (TensorFlow then treats it as read-only)
flow_graph.finalize()

# Initializing TensorFlow: opens a session on the graph and runs the initializers once.
# sim_matrix() reads `session`, `embedded_text` and `text_input` from module scope.
session = tf.Session(graph = flow_graph)
session.run(init_op)

# Computes embeddings in sentences
def sim_matrix(merge_list, batch_size=1):
    """Embeds every text in merge_list using the module-level encoder session.

    Args:
        merge_list: Sliceable sequence of strings (e.g. a list or NumPy array).
        batch_size: Number of texts fed to the encoder per ``session.run`` call.
            The default of 1 keeps the original one-text-per-call behaviour;
            larger values reduce the number of calls.

    Returns:
        Float array of shape ``(len(merge_list), 512)`` with one embedding per
        row, in input order. An empty input yields a ``(0, 512)`` array.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    # Array for embeddings (512 features per text)
    n_texts = len(merge_list)
    all_embeddings = np.zeros([n_texts, 512])

    # Builds matrix of all embeddings, one batch of rows at a time
    for start in trange(0, n_texts, batch_size):
        batch = list(merge_list[start:start + batch_size])
        # len(batch) rather than batch_size: the final batch may be shorter
        all_embeddings[start:start + len(batch), :] = session.run(
            embedded_text, feed_dict={text_input: batch})

    return all_embeddings

In [41]:
# Embeds the cleaned text of every sampled article; the result has one row per article
features_array = sim_matrix(data['norm_content'].values)

100%|████████████████████████████████████████████████████████████████████████████| 3000/3000 [1:00:54<00:00,  1.22s/it]


#### Building Model

In [16]:
# Sentence-embedding feature column for the 'content' key, backed by a frozen
# Universal Sentence Encoder module from TF-Hub.
# NOTE(review): this references universal-sentence-encoder/2, while features_array is
# built with universal-sentence-encoder-large/3 — confirm which module is intended.
embedding_feature = hub.text_embedding_column(
    trainable=False,  # module weights are not updated
    module_spec="https://tfhub.dev/google/universal-sentence-encoder/2",
    key='content')

In [55]:
from keras.regularizers import l1, l2
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense

# Number of bias classes the network predicts: left (0), center (1), right (2)
N_CLASSES = 3

# Optimizer: Adam with a small fixed step size. 'learning_rate' is the current
# name of the argument; 'lr' is its deprecated alias.
# NOTE(review): the optimizer comes from tf.keras while the layers come from the
# standalone keras package — confirm the installed versions accept this mix.
optimize = tf.keras.optimizers.Adam(learning_rate=0.00015)

# Creates NN with two hidden layers of 40 neurons.
# Input is the 512-feature sentence embedding; each hidden layer is L2-regularized
# and followed by 25% dropout.
DNN = Sequential()
DNN.add(Dense(40, input_dim=512, activation='relu', kernel_regularizer=l2(0.1)))
DNN.add(Dropout(0.25))
DNN.add(Dense(40, activation='relu', kernel_regularizer=l2(0.1)))
DNN.add(Dropout(0.25))

# Output layer of NN: one softmax unit per bias class (was 4 units for 3 classes)
DNN.add(Dense(N_CLASSES, activation='softmax'))

# Compiles the model; the sparse loss takes the integer class labels directly
DNN.compile(loss='sparse_categorical_crossentropy', optimizer=optimize, metrics=['acc'])

#### Training Model

#### Evaluation