===========================================

Title: 8.2 Project Milestone 3

Author: Chad Wood

Date: 17 Jan 2022

Modified By: Chad Wood

Description: This program is project milestone 3 towards the term project. It demonstrates wrangling the data to be used and feature engineering on its unstructured data (news articles).

===========================================

In [None]:
'''
Not all of these are used. As this is a milestone,
consider this a draft of the models that may be used.
Some modules are still imported at specific cells because
it was necessary to restart and import only necessary modules to prevent 
memory errors from arrising.
'''

# Typical modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Google SP (linux can use tf_sentencepiece)
import sentencepiece

# Keras and tensorflow
import tensorflow as tf
import tensorflow_hub as hub

from keras.regularizers import l1, l2
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense
from keras import optimizers
from keras import backend

#sklearn and imblearn
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

### Importing Data

In [None]:
import pandas as pd

files = [r'data\articles1.csv', 
         r'data\articles2.csv', 
         r'data\articles3.csv']

# Imports all articles
articles = pd.concat((pd.read_csv(f, usecols=['publication', 'content']) for f in files))

In [1]:
import pandas as pd

articles = pd.read_csv('articles_comp.csv')

In [2]:
# Imports allsides.com dataset for determining bias
bias_df = pd.read_csv('https://raw.githubusercontent.com/favstats/AllSideR/master/data/allsides_data.csv',
                      usecols=['news_source', 'rating'])

# Creates list of unique publishers
publishers = articles.publication.unique()

### Building a Bias-Score Dictionary

Building the score dictionary:

In [3]:
import re

# Regex pattern identifying publishers
publisher = '|'.join(r'(?:{})'.format(x) for x in publishers)

# Selects publishers from articles articles
df = bias_df.loc[bias_df['news_source'].str.contains(publisher, case=False)]

# Replaces bias_df publisher names with articles publisher names
pub_scores = df.copy()
for pub in publishers:
    pub_scores.loc[pub_scores.news_source.str.contains(pub, case=False), 'news_source'] = pub

# Defines 3 positions for bias and scores them
label = '(left|center|right)'
scores = {'left': 0, 'center': 1, 'right':2}

# Creates score column with score for each publishers rating
pub_scores['score'] = pub_scores['rating'].str.extract(label)[0].map(scores)

# Drops duplicate rows and redundant columns
pub_scores.drop_duplicates(['news_source'], inplace=True)
pub_scores.drop(columns=['rating'], inplace=True)

# Converts to dictionary
pub_scores = dict(zip(pub_scores.news_source, pub_scores.score))

Applying the scores:

In [4]:
# Removes articles that were not scored
articles = articles.loc[articles['publication'].isin(pub_scores.keys())].copy()

# Added scores as column for each publication
articles['scores'] = articles['publication'].apply(lambda x: pub_scores.get(x))

### Cleaning Data

Converts to lower and removes special characters:

In [8]:
import re

# Regex pattern
pattern = r'[^a-zA-z\s]'


articles['norm_content'] = articles.content.copy()
# Text to lowercase
articles['norm_content'] = articles.norm_content.apply(lambda x: str(x).lower())
# Removes special characters
articles['norm_content'] = articles.norm_content.apply(lambda x: re.sub(pattern, '', str(x)))

Filters stopwords and lemmatizes text:

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'textcat']) 

# Runs text through pipeline
nlp_list = []
def nlp_pipe(corpus):
    for doc in nlp.pipe(corpus, disable=['parser', 'ner', 'textcat']):
        nlp_list.append(' '.join(word.lemma_
                                 if word.lemma_ != '-PRON-' # Preserves pronouns
                                 else word.text for word in doc # Returns roots
                                 if not word.is_stop)) # Filters stopwords

    return pd.Series(nlp_list)

In [None]:
# Removes stopwords and lemmatizes text
articles['norm_content'] = nlp_pipe(articles['norm_content'])

### Collecting Features

In [9]:
# For reducing computation time
# Collects an even portion of each bias and drops raw content/pub cols
data = articles.dropna().groupby(['scores']).sample(n=17405, random_state=1).drop(columns=['content', 'publication'])

BOW:

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Gets bag of words features
cv = CountVectorizer(min_df=0., max_df=1.)
cv_X = cv.fit_transform(data.norm_content)
cv_names = cv.get_feature_names_out()

bow = pd.DataFrame(cv_X.toarray(), columns=cv_names)

In [14]:
bow.head()

Unnamed: 0,a_suwilm,aa,aaa,aaaaaah,aaaaah,aaaand,aaajiao,aaany,aaarena,aaas,...,zyngas,zyuganov,zyvex,zywicki,zyzo,zz,zzz,zzzanthropology,zzzs,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


TF-idf:

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer

# Initializes transformer instance
tfid = TfidfTransformer(norm='l2', use_idf=True)
# Fit to data, followed by transform count matrix
tfid_X = tfid.fit_transform(cv_X)

pd.DataFrame(np.round(tfid_X.toarray(), 2), columns=cv_names)


