In [71]:
# LOAD CLASS AND DATASET
# dataset: https://www.kaggle.com/rmisra/news-category-dataset
# start:    each doc in corpus contains an article link, category and other (irrelevant) key/value pairs
# end goal: each doc in corpus contains a sentence with POS tagging and gender polarity, and label vector with actual gender

# autoreload mode 2 re-imports modules before each cell runs, so edits to
# classes/data_prepper.py are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
from classes.data_prepper import DataPrepper

# `p` is the shared helper instance used by every later cell for JSON I/O,
# filtering, weighting and polarity computation.
p = DataPrepper()

# Raw news dataset; later cells read and overwrite data['articles'].
data = p.load_json('datasets/1_newsDataset.json')

constructor of DataPrepper


In [72]:
# FILTER CATEGORIES
# The two category lists below define which articles are kept; the scraping
# step uses membership in `men_categories` to assign the gender label.

men_categories = [
    'SPORTS',
    'MONEY',
    'BUSINESS',
]
women_categories = [
    'WOMEN',
    'STYLE & BEAUTY',
]

# Replace the article list with the filtered result (presumably only articles
# whose category is in the combined list -- confirm against DataPrepper),
# then persist the whole `data` object as stage 2 of the pipeline.
data['articles'] = p.filter_articles(
    [*men_categories, *women_categories],
    data['articles'],
)
p.write_json('datasets/2_filtered_news_data.json', data)

In [73]:
# SCRAPE LINKS
# This step is currently disabled (all code commented out). When enabled it
# fetches the text behind each article link, labels it 'M' or 'W' from the
# article's category, skips articles whose scraped text is empty, and writes
# the result to datasets/3_text_and_gender.json.
# NOTE(review): later cells load datasets/3_text_and_gender.json from disk, so
# that file must already exist on a fresh checkout -- presumably this cell was
# run once and then disabled because scraping is slow; confirm.

# scraped_data = {}
# scraped_data['articles'] = []
# textlessUrls = []

# for article in data['articles']:
#     text = p.scrape_url(article['link'], textlessUrls)
#     gender = 'M' if article['category'] in men_categories else 'W'
#     if text != "":
#         scraped_data['articles'].append({'gender': gender, 'text': text})

# p.write_json('datasets/3_text_and_gender.json', scraped_data)

In [74]:
# INSTANTIATE NLP FROM SPACY
# Load the large English spaCy pipeline once; `nlp` is reused by the weighting
# cell and by the final dataset-building cell. The "en_core_web_lg" model
# package must be installed in the kernel's environment.
import spacy

nlp = spacy.load("en_core_web_lg")

In [75]:
# Load the scraped article texts with their gender labels from disk (the file
# the commented-out SCRAPE LINKS cell writes); scraped_data['articles'] is the
# input to the term-weighting step.
scraped_data = p.load_json('datasets/3_text_and_gender.json')

In [76]:
# CALCULATE WEIGHTS FOR EACH TERM
# using tf-idf weighting from 'An Introduction to Information Retrieval (2009 Online Edition)'
# written by Christopher D. Manning, Prabhakar Raghavan & Hinrich Schütze

# Punctuation / whitespace tokens and spaCy entity labels passed to
# get_weights so they can be excluded from the weighting.
ignore_terms = ['.', ',', '...', ' ', '\u2019', '  ', '(', ')', '?', '\u00a3', '/', '"', ':', ';', '-', '--', '\u2015', "'", '!', '$', '#', '\u2014', '   ', '[',']']
ignore_ents = ['TIME', 'DATE', 'GPE', 'CARDINAL', 'PERSON', 'MONEY', 'PERCENT']

# One weight mapping per gender; both are consumed by the polarity step.
m_weights, w_weights = p.get_weights(scraped_data['articles'], nlp, ignore_terms, ignore_ents)

# Persist the weights ordered by descending weight. The previous version first
# wrote the unordered dicts to these same paths and then immediately overwrote
# them; that redundant write is removed, the files on disk end up the same.
p.write_json('datasets/4_word_weight_m.json', p.order_dict(m_weights, 'desc'))
p.write_json('datasets/4_word_weight_w.json', p.order_dict(w_weights, 'desc'))

In [79]:
# CALCULATE POLARITY FOR EACH TERM
# -1 (man) to 1 (woman)

# Combine the per-gender term weights into one polarity score per term.
# Note the argument order: women's weights first, men's second.
polarity_dict = p.get_polarity(w_weights, m_weights)
# Save both the raw mapping and a copy ordered by descending polarity.
p.write_json('datasets/5_word_polarity.json', polarity_dict)
p.write_json('datasets/5_word_polarity_ordered.json', p.order_dict(polarity_dict, 'desc'))

In [82]:
# WRAP ALL INTO FINAL DATASET
# split all texts into sentences into words, each assigned sentence #, word, lemma, pos, dep and polarity
import json

data = p.load_json('datasets/3_text_and_gender.json')

# corpus['X'] holds one record per token with the keys
#   'sentence', 'word', 'lemma', 'pos', 'dep', 'polarity'
# corpus['label'] holds one {'gender': ...} record per token, index-aligned
# with corpus['X'].
corpus = {
    'X': [],
    'label': [],
}

# Running sentence number across the whole corpus (1-based). The previous
# version hard-coded 1 for every token, so sentences could not be told apart
# in the flat token list.
sentence_id = 0

for article in data['articles']:
    doc = nlp(article['text'])
    # Sentence boundaries are required to iterate doc.sents below.
    assert doc.has_annotation("SENT_START")
    for sent in doc.sents:
        sentence_id += 1
        for token in sent:
            corpus['X'].append({
                'sentence': sentence_id,
                'word': token.text,
                'lemma': token.lemma_,
                'pos': token.pos_,
                'dep': token.dep_,
                # Lemmas without a computed polarity are treated as neutral (0).
                'polarity': polarity_dict.get(token.lemma_, 0),
            })
            # Every token inherits the gender label of its article.
            corpus['label'].append({'gender': article['gender']})

# NOTE(review): every other path in this notebook is 'datasets/...'; this one
# is '../datasets/...'. Left unchanged -- confirm which directory is intended.
with open('../datasets/6_dataset.json', "w") as outFile:
    json.dump(corpus, outFile)