In [7]:
# LOAD CLASS AND DATASET 
# dataset: https://www.kaggle.com/rmisra/news-category-dataset
# start:    each doc in corpus contains an article link, category and other (irrelevant) key/value pairs
# end goal: each doc in corpus contains a sentence with POS tagging and gender polarity, and label vector with actual gender
%reload_ext autoreload
%autoreload 2
from classes.data_prepper import DataPrepper

p = DataPrepper()

data = p.load_json('datasets/1_newsDataset.json')

constructor of DataPrepper


In [16]:
# FILTER CATEGORIES

men_categories = ['SPORTS', 'MONEY', 'BUSINESS']
women_categories = ['WOMEN', 'STYLE & BEAUTY']

data['articles'] = p.filter_articles(men_categories+women_categories, data['articles'])

p.write_json('datasets/2_filtered_news_data.json', data)

In [17]:
# SCRAPE LINKS

scraped_data = {}
scraped_data['articles'] = []
textlessUrls = []

for article in data['articles']:
    text = p.scrape_url(article['link'], textlessUrls)
    gender = 'M' if article['category'] in men_categories else 'W'
    if text != "":
        scraped_data['articles'].append({'gender': gender, 'text': text})

p.write_json('datasets/3_text_and_gender.json', scraped_data)

In [13]:
# INSTANTIATE NLP FROM SPACY
import spacy

nlp = spacy.load("en_core_web_lg")

In [20]:
# CALCULATE WEIGHTS FOR EACH TERM
# using tf-idf weighting from 'An Introduction to Information Retrieval (2009 Online Edition)'
# written by Christopher D. Manning, Prabhakar Raghavan & Hinrich Schütze
ignore_terms = ['.', ',', '...', ' ', '\u2019', '  ', '(', ')', '?', '\u00a3', '/', '"', ':', ';', '-', '--', '\u2015', "'", '!', '$', '#', '\u2014', '   ', '[',']']
ignore_ents = ['TIME', 'DATE', 'GPE', 'CARDINAL', 'PERSON', 'MONEY', 'PERCENT']

m_weights, w_weights = p.get_weights(scraped_data['articles'], nlp, ignore_terms, ignore_ents)

p.write_json('datasets/4_word_weight_m.json', m_weights)
p.write_json('datasets/4_word_weight_w.json', w_weights)

p.write_json('datasets/4_word_weight_m_ordered.json', p.order_dict(m_weights, 'desc'))
p.write_json('datasets/4_word_weight_w_ordered.json', p.order_dict(w_weights, 'desc'))

In [10]:
# NORMALIZE WEIGHTS

p.normalize_dict(m_weights)
p.normalize_dict(w_weights)

p.write_json('datasets/5_word_weight_m_norm.json', m_weights)
p.write_json('datasets/5_word_weight_w_norm.json', w_weights)

p.write_json('datasets/5_word_weight_m_norm_ordered.json', p.order_dict(m_weights, 'desc'))
p.write_json('datasets/5_word_weight_w_norm_ordered.json', p.order_dict(w_weights, 'desc'))

In [11]:
# CALCULATE POLARITY FOR EACH TERM
# -1 (man) to 1 (woman)

polarity_dict = p.get_polarity(w_weights, m_weights)
p.write_json('datasets/6_word_polarity.json', polarity_dict)
p.write_json('datasets/6_word_polarity_ordered.json', p.order_dict(polarity_dict, 'desc'))

In [14]:
# WRAP ALL INTO FINAL DATASET
# split all texts into sentences into words, each assigned sentence #, word, lemma, pos, dep and polarity
# runtime: ~31 min
import json

data = p.load_json('datasets/3_text_and_gender.json')
polarity_dict = p.load_json('datasets/6_word_polarity.json')

corpus =    {
                'Sentence #': [], 
                'Word': [],
                'Lemma': [],
                'POS': [],
                'Tag': [],
                'Dep': [],
                'Polarity': [],
                'Label': []
            }
# corpus['X'].append({'Sentence #', 'Word', 'Lemma', 'POS', 'Tag', 'Dep', 'Polarity'})
# corpus['label'].append({'Gender'})

for i, article in enumerate(data['articles']):
    doc = nlp(article['text'])
    assert doc.has_annotation("SENT_START")
    for sent in doc.sents:
        for token in sent:
            corpus['Sentence #'].append(i)
            corpus['Word'].append(token.text)
            corpus['Lemma'].append(token.lemma_)
            corpus['POS'].append(token.pos_)
            corpus['Tag'].append(token.tag_)
            corpus['Dep'].append(token.dep_)
            corpus['Polarity'].append(polarity_dict[token.lemma_] if token.lemma_ in polarity_dict else 0)
            corpus['Label'].append(article['gender'])

with open('datasets/7_dataset.json', "w") as outFile:
    json.dump(corpus, outFile)

In [15]:
# LOAD DATASET WITH PANDA 
import pandas as pd

df = pd.read_json('datasets/7_dataset.json')
df

Unnamed: 0,Sentence #,Word,Lemma,POS,Tag,Dep,Polarity,Label
0,0,At,at,ADV,RB,advmod,0.000000,W
1,0,least,least,ADV,RBS,advmod,0.000000,W
2,0,two,two,NUM,CD,nummod,0.000000,W
3,0,organizations,organization,NOUN,NNS,nsubj,-0.193345,W
4,0,have,have,AUX,VBP,aux,-0.046438,W
...,...,...,...,...,...,...,...,...
13209400,23310,spot,spot,NOUN,NN,pobj,-0.001930,M
13209401,23310,for,for,ADP,IN,prep,0.000000,M
13209402,23310,Howard,Howard,PROPN,NNP,pobj,-0.001477,M
13209403,23310,.,.,PUNCT,.,punct,0.000000,M
