<a href="https://colab.research.google.com/github/MatthewAlexOBrien/All-The-News/blob/master/code/subjects.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1A. Identifying Named Subjects**

> We start by extracting the subject of each article sentence using Stanford NLP Group's dependency parser pipeline, available through their Python NLP package 'Stanza' (https://stanfordnlp.github.io/stanza/depparse.html). Next, we use Stanza's Named Entity Recognition pipeline to determine which nominal subjects are in fact named entities (https://stanfordnlp.github.io/stanza/ner.html). Sentences for which the nominal subject is not a named entity are removed from the dataset.


*   Input: 50k subset of News Article Dataset. Has columns [date, year, month, day, title, article, url, section, publication]
*   Output: Sentence dataset with sentence subjects. Has columns [year, month, day, publication, article_id, sentence_id, sentence, article_names_partial, article_names_full, sentence_subject_names]

**Install and Import Libraries**


In [None]:
%%capture

# Packages not pre-installed on Python 3.7
!pip3 install stanza
!pip3 install nltk
!pip3 install ethnicolr
!pip3 install vaderSentiment
!pip3 install afinn
!pip3 install gender-guesser
!pip3 install spacy
!python -m spacy download en_core_web_sm

In [None]:
%%capture

# Imports
import itertools
import csv
import sys
import re
import stanza
import pandas
from stanza.models.common.doc import Document
import nltk
import spacy
import numpy as np
import pandas as pd
from google.colab import files
import glob
csv.field_size_limit(sys.maxsize)
stanza.download("en")
nltk.download('stopwords')

**Static Functions**

In [None]:
# Method to replace partial names in a list with the corresponding full names found in itself
def partial_to_full(names, fullnames=None):
    """Map every name in ``names`` to the longest candidate name containing it.

    Each candidate is split on whitespace, and every order-preserving
    combination of its words (joined by single spaces) is registered as a
    partial form of that candidate. When several candidates share a partial
    form, the candidate with the most characters wins; on a tie the one seen
    first is kept.

    Args:
        names: Names to resolve.
        fullnames: Optional candidate names to resolve against. When omitted,
            ``names`` itself supplies the candidates.

    Returns:
        A list with one entry per element of ``names``: the resolved
        candidate, or ``None`` when no candidate yields that name as one of
        its partial forms.
    """
    # Identity test: ``fullnames == None`` would compare element-wise on
    # array-like inputs instead of answering "was the argument omitted?".
    candidates = names if fullnames is None else fullnames

    partial_to_longest = {}
    for candidate in candidates:
        words = candidate.split()
        for size in range(1, len(words) + 1):
            for combo in itertools.combinations(words, size):
                partial = ' '.join(combo)
                current = partial_to_longest.get(partial)
                # Strictly longer only, so the first candidate wins ties.
                if current is None or len(candidate) > len(current):
                    partial_to_longest[partial] = candidate

    return [partial_to_longest.get(name) for name in names]


# Method to explode a dataframe text column into sentences
def split_sentences(data, textcol):
    """Explode ``data`` into one row per sentence of its ``textcol`` column.

    Args:
        data: DataFrame holding an ``article_id`` column and the text column
            named by ``textcol``.
        textcol: Name of the column whose text is split into sentences.

    Returns:
        A DataFrame with ``article_id``, ``sentence_id`` (numbered from 1
        within each row's text) and ``sentence``, left-merged with ``data``
        on ``article_id`` so the remaining columns of ``data`` are carried
        along; ``textcol`` itself is dropped from the result.
    """
    nlp_splitter = spacy.load('en_core_web_sm')
    rows_list = []
    # Walk the rows explicitly instead of collecting them as a side effect of
    # DataFrame.apply: apply gives no guarantee about how often the function
    # is invoked, so side-effecting callbacks can record a row more than once.
    for _, row in data.iterrows():
        doc = nlp_splitter(row[textcol])
        for sentence_id, sent in enumerate(doc.sents, start=1):
            rows_list.append({
                'article_id': row['article_id'],
                'sentence_id': sentence_id,
                'sentence': str(sent),
            })
    sentences = pandas.DataFrame(rows_list, columns=['article_id', 'sentence_id', 'sentence'])
    sentences = sentences.merge(data, on='article_id', how='left')
    # Keyword form: the positional axis argument (``drop(textcol, 1)``) is no
    # longer accepted by current pandas releases.
    sentences = sentences.drop(columns=textcol)
    return sentences

**Article Class**

In [None]:
class Article():
    """Person-name and sentence-subject extraction over a collection of texts.

    The methods rely on two module-level pipelines, ``nlp_stanza`` and
    ``nlp_spacy``, which must be defined before they are called. Each method
    can be used on an instance (``Article(texts).names()``) or in the unbound
    style ``Article.names(texts)``, where the iterable of texts is passed in
    place of ``self``.
    """

    def __init__(self, data):
        """Store ``data``, an iterable of text strings, as ``self.raw``."""
        self.raw = data

    @staticmethod
    def _texts(source):
        """Return the iterable of texts behind an instance or a raw iterable."""
        return source.raw if isinstance(source, Article) else source

    def names(self):
        """Return, for every text, the PERSON entities found by ``nlp_stanza``.

        Returns:
            A list with one inner list per text, holding the text of each
            entity whose type is ``"PERSON"``.
        """
        texts = Article._texts(self)
        docs = nlp_stanza([Document([], text=doccontent) for doccontent in texts])
        return [[ent.text for ent in doc.ents if ent.type == "PERSON"] for doc in docs]

    def subject_names(self, fullnames=None, searchnames=None):
        """Return, for every text, the nominal subjects that are known names.

        Args:
            fullnames: Iterable aligned with the texts; each element is the
                list of names to look for in the corresponding text.
            searchnames: Alias for ``fullnames``, accepted because the chunk
                loop in this notebook passes the name lists under that
                keyword. Ignored when ``fullnames`` is given.

        Returns:
            A list with one inner list per text, holding the ``nsubj`` tokens
            of that text which also appear in its name list.

        Raises:
            TypeError: If neither ``fullnames`` nor ``searchnames`` is given.
        """
        if fullnames is None:
            fullnames = searchnames
        if fullnames is None:
            raise TypeError("subject_names() missing required argument: 'fullnames'")

        relations = {'nsubj'}
        article_names_subjects = []
        for article, fulls in zip(Article._texts(self), fullnames):
            for name in fulls:
                if len(name.split()) > 1:
                    # Collapse a multi-word name into one token inside the
                    # text, presumably so the parser labels the whole name
                    # rather than a single word of it.
                    name = re.sub(r'[()\[\]]', '', name)
                    name_no_space = name.replace(' ', '')
                    # Literal replacement: the name is data, not a regex, so
                    # characters such as '.' or '+' must not be interpreted.
                    if name_no_space:
                        article = article.replace(name, name_no_space)
            doc = nlp_spacy(article)
            subjects = [word.text for word in doc if word.dep_ in relations]
            # Undo the collapsing: put a space back before each capital that
            # is neither at the start nor preceded by another capital.
            subjects = [re.sub(r"(?<![A-Z])(?<!^)([A-Z])", r" \1", subject) for subject in subjects]
            article_names_subjects.append([subject for subject in subjects if subject in fulls])
        return article_names_subjects



**Import and Clean Data**

In [None]:
%%capture
# import
cols = ['year', 'month', 'day', 'title', 'publication', 'article']
csv_path = '/Data/Articles/all_the_news50k.csv'
read_options = dict(usecols=cols, engine='python', encoding='utf-8')
try:
    # Current pandas spelling: skip malformed lines and warn about each one.
    news = pandas.read_csv(csv_path, on_bad_lines='warn', **read_options)
except TypeError:
    # Older pandas releases do not know on_bad_lines; use the legacy flag.
    news = pandas.read_csv(csv_path, error_bad_lines=False, **read_options)

# 'article_id' is selected below and used as the merge key when exploding to
# sentences, but it is not among the columns read from the file.
# NOTE(review): deriving it from the row index is an assumption — confirm that
# no other id scheme was intended.
news['article_id'] = news.index

# some basic cleaning
news = news[news['article'].str.count(' ') >= 50]
news = news.dropna(subset=cols)
news = news[['article_id'] + cols]

# collapse whitespace, normalise curly quotes, and remove sentence-ending
# punctuation within quotations
news['article'] = news['article'].replace(r'\s+', ' ', regex=True)
# Inside a character class '|' is a literal pipe, not alternation, so it is
# left out here; otherwise every pipe in the text would become a quote mark.
news['article'] = news['article'].replace(r'[“”]', '"', regex=True)
# The lookahead holds when an odd number of quote marks remain after the
# current position, i.e. when the position lies inside a quotation.
news['article'] = news['article'].replace(r'(?!(([^"]*"){2})*[^"]*$)[?.!]', '', regex=True)


In [None]:
# load processors
nlp_stanza = stanza.Pipeline('en', processors='tokenize, ner', tokenize_no_ssplit = True)
nlp_spacy = spacy.load('en_core_web_sm')

# getting subjects chunk by chunk (better for memory purposes)
# `count` numbers the files actually written; `chunk_number` is the position
# of the chunk in the split and is what gets reported when a chunk fails.
count = 0
for chunk_number, news_chunk in enumerate(np.array_split(news, 500)):
    try:
        # work on an independent copy so column assignment does not target a slice of `news`
        news_chunk = news_chunk.copy()

        # get names from each article (Article methods are called unbound, with the texts in place of self)
        news_chunk['article_names_partial'] = Article.names(news_chunk['article'])
        news_chunk['article_names_full'] = news_chunk['article_names_partial'].apply(partial_to_full)
        news_chunk = news_chunk[news_chunk['article_names_partial'].map(len) >= 1]

        # exploding dataset to sentences and extracting nominal subjects
        news_chunk = split_sentences(data = news_chunk, textcol = 'article')
        news_chunk = news_chunk[['year', 'month', 'day', 'publication', 'article_id', 'sentence_id', 'sentence','article_names_partial', 'article_names_full']]
        news_chunk = news_chunk[news_chunk.apply(lambda x: any(name in x.sentence for name in x.article_names_partial), axis=1)].copy()
        # the name lists are passed as `fullnames`, the parameter name in Article.subject_names
        news_chunk['sentence_subject_names'] = Article.subject_names(news_chunk['sentence'], fullnames=news_chunk['article_names_partial'])
        news_chunk = news_chunk[news_chunk['sentence_subject_names'].map(len) >= 1].copy()
        # index the row by column label rather than by position
        news_chunk['sentence_subject_names'] = news_chunk[['sentence_subject_names', 'article_names_full']].apply(
            lambda x: partial_to_full(x['sentence_subject_names'], fullnames=x['article_names_full']), axis = 1)

        # download chunk
        news_chunk.to_csv('Data/Sentences/sentences_' + str(count) + '.csv', index = False)
        count = count + 1
    except Exception as error:
        # keep going with the remaining chunks, but say which chunk failed and why
        print('issue with chunk #' + str(chunk_number) + ': ' + repr(error))

In [None]:
# concatenate sentence chunks together
path = 'Data/Sentences'
# glob returns matches in arbitrary order; sorting the paths keeps the row
# order of the combined file the same from run to run
all_files = sorted(glob.glob(path + "/*.csv"))

li = []
for filename in all_files:
    df = pandas.read_csv(filename, index_col=None, header=0)
    li.append(df)

# write dataframe
# pandas.concat raises ValueError when `li` is empty, i.e. when no chunk files were found
sentences = pandas.concat(li, axis=0, ignore_index=True)
sentences.to_csv('Data/sentences.csv', index = False)