In [1]:
# LOAD CLASS AND DATASET
# dataset: https://www.kaggle.com/rmisra/news-category-dataset
# start:    each doc in corpus contains an article link, category and other (irrelevant) key/value pairs
# end goal: each doc in corpus contains a sentence with POS tagging and gender polarity, and label vector with actual gender
#
# NOTE(review): the saved output of this cell is
# "ImportError: attempted relative import with no known parent package".
# Presumably it is raised by a relative import inside classes/data_prepper.py
# rather than by the import below - confirm before relying on a fresh-kernel run.

# Re-import edited project modules automatically so changes to DataPrepper are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
from classes.data_prepper import DataPrepper

# Single helper instance reused by every later cell.
p = DataPrepper()

# Raw news dataset; later cells read and overwrite data['articles'].
data = p.load_json('../datasets/1_newsDataset.json')

ImportError: attempted relative import with no known parent package

In [None]:
# FILTER CATEGORIES
# Restrict the corpus to the categories used as gender labels further down and
# persist the filtered result as stage 2 of the pipeline.
# NOTE(review): presumably filter_articles keeps only the listed categories -
# confirm against DataPrepper.

men_categories = ['SPORTS', 'MONEY', 'BUSINESS']
women_categories = ['WOMEN', 'STYLE & BEAUTY']

data['articles'] = p.filter_articles(
    [*men_categories, *women_categories],
    data['articles'],
)

p.write_json('../datasets/2_filtered_news_data.json', data)

In [None]:
# SCRAPE LINKS
# Fetch the text behind every article link and store it together with a gender
# label derived from the article's category. Articles whose scrape comes back as
# an empty string are left out of the result.

scraped_data = {'articles': []}
# Passed to scrape_url on every call; presumably it collects the links that
# yielded no text - TODO confirm against DataPrepper.scrape_url.
textlessUrls = []

for article in data['articles']:
    text = p.scrape_url(article['link'], textlessUrls)
    # Anything that is not a men's category is labelled 'W'.
    gender = 'W' if article['category'] not in men_categories else 'M'
    if text == "":
        continue
    scraped_data['articles'].append({'gender': gender, 'text': text})

p.write_json('../datasets/3_text_and_gender.json', scraped_data)

In [2]:
# INSTANTIATE NLP FROM SPACY
# Load the spaCy pipeline once; the weighting and dataset-building cells below
# all reuse this `nlp` object.
import spacy

# Requires the "en_core_web_lg" model package to be installed in the kernel's environment.
nlp = spacy.load("en_core_web_lg")

In [10]:
# SPLITTING TRAINING AND TEST DATA
# Reload the scraped corpus from disk and keep only its first 75% in
# `scraped_data`; the trailing 25% is dropped from the in-memory copy.
scraped_data = p.load_json('../datasets/3_text_and_gender.json')
loaded_articles = scraped_data['articles']
split = round(0.75 * len(loaded_articles))
scraped_data['articles'] = loaded_articles[:split]

In [6]:
# CALCULATE WEIGHTS FOR EACH TERM
# using tf-idf weighting from 'An Introduction to Information Retrieval (2009 Online Edition)'
# written by Christopher D. Manning, Prabhakar Raghavan & Hinrich Schütze
#
# Produces one weight dictionary per gender from the articles currently held in
# `scraped_data`, then saves each dictionary twice: as returned, and passed
# through order_dict with 'desc'.

# Punctuation / whitespace tokens and spaCy entity types handed to get_weights
# as exclusions.
ignore_terms = ['.', ',', '...', ' ', '\u2019', '  ', '(', ')', '?', '\u00a3', '/', '"', ':', ';', '-', '--', '\u2015', "'", '!', '$', '#', '\u2014', '   ', '[',']']
ignore_ents = ['TIME', 'DATE', 'GPE', 'CARDINAL', 'PERSON', 'MONEY', 'PERCENT']

# Return order is (men, women).
m_weights, w_weights = p.get_weights(scraped_data['articles'], nlp, ignore_terms, ignore_ents)

p.write_json('../datasets/4_word_weight_m.json', m_weights)
p.write_json('../datasets/4_word_weight_w.json', w_weights)

p.write_json('../datasets/4_word_weight_m_ordered.json', p.order_dict(m_weights, 'desc'))
p.write_json('../datasets/4_word_weight_w_ordered.json', p.order_dict(w_weights, 'desc'))

In [7]:
# NORMALIZE WEIGHTS
# Normalize both weight dictionaries and save them as stage 5 of the pipeline,
# next to the other stage files under ../datasets/.
# NOTE(review): the return value of normalize_dict is not used, so it presumably
# rewrites the dictionary in place - re-running this cell without re-running the
# weighting cell may therefore normalize twice; confirm against DataPrepper.

p.normalize_dict(m_weights)
p.normalize_dict(w_weights)

p.write_json('../datasets/5_word_weight_m_norm.json', m_weights)
p.write_json('../datasets/5_word_weight_w_norm.json', w_weights)

p.write_json('../datasets/5_word_weight_m_norm_ordered.json', p.order_dict(m_weights, 'desc'))
p.write_json('../datasets/5_word_weight_w_norm_ordered.json', p.order_dict(w_weights, 'desc'))

In [8]:
# CALCULATE POLARITY FOR EACH TERM
# -1 (man) to 1 (woman)
#
# Derives the polarity dictionary from the two weight dictionaries and saves it
# twice: as returned, and passed through order_dict with 'desc'.
# NOTE(review): the women's weights are passed first - presumably that is what
# puts women at the positive end of the scale; confirm against
# DataPrepper.get_polarity.

polarity_dict = p.get_polarity(w_weights, m_weights)
p.write_json('../datasets/6_word_polarity.json', polarity_dict)
p.write_json('../datasets/6_word_polarity_ordered.json', p.order_dict(polarity_dict, 'desc'))

In [9]:
# WRAP ALL INTO FINAL DATASET
# Flatten every article into one row per kept token. Each row carries the running
# sentence number, the token text, lemma, fine- and coarse-grained tags, the
# dependency label, the term polarity (0 when the lemma has no entry) and the
# article's gender label.
# runtime: ~31 min

data = p.load_json('../datasets/3_text_and_gender.json')
polarity_dict = p.load_json('../datasets/6_word_polarity.json')

# Column-oriented table: one list per column, all lists grow in lockstep.
columns = ('Sentence #', 'Word', 'Lemma', 'Tag', 'POS', 'Dep', 'Polarity', 'Gender')
corpus = {name: [] for name in columns}

# Sentence numbering is 1-based and continues across articles.
sentenceCount = 1

ignore_terms = ['.', ',', '...', ' ', '\u2019', '  ', '(', ')', '?', '\u00a3', '/', '"', ':', ';', '-', '--', '\u2015', "'", '!', '$', '#', '\u2014', '   ', '[',']']
ignore_ents = ['TIME', 'DATE', 'GPE', 'CARDINAL', 'PERSON', 'MONEY', 'PERCENT']

for article in data['articles']:
    doc = nlp(article['text'])
    assert doc.has_annotation("SENT_START")
    for sent in doc.sents:
        for token in sent:
            lemma = token.lemma_
            # Skip stop words, ignored punctuation/whitespace lemmas and ignored entity types.
            if token.is_stop or lemma in ignore_terms or token.ent_type_ in ignore_ents:
                continue
            row = (
                sentenceCount,
                token.text,
                lemma,
                token.tag_,
                token.pos_,
                token.dep_,
                polarity_dict.get(lemma, 0),
                article['gender'],
            )
            for column, value in zip(columns, row):
                corpus[column].append(value)
        # Counted even when every token of the sentence was skipped.
        sentenceCount += 1

p.write_json('../datasets/7_dataset_w_tags.json', corpus)

In [5]:
# LOAD DATASET WITH PANDAS
# Read the tagged dataset into a DataFrame and show it as the cell's output.
# NOTE(review): no visible cell writes '7_dataset_SM.json' (the tagging cell
# writes '7_dataset_w_tags.json') - confirm the file exists or point this at the
# intended one. The saved SyntaxError output below does not match this code and
# looks stale.
import pandas as pd 

df = pd.read_json('../datasets/7_dataset_SM.json')
df

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (<ipython-input-5-037853430cd5>, line 4)

In [13]:
# SIMPLE DATASET
# each row contains sentence and gender
#
# Splits every article into sentences, runs each sentence through
# preprocess_text and stores the result with the article's gender label.
# Sentences that preprocess to nothing (None, empty or whitespace-only) are
# skipped so the dataset contains no blank rows.

data = p.load_json('../datasets/3_text_and_gender_SM.json')
# Not read in this cell; kept so the notebook-level names stay defined as before.
polarity_dict = p.load_json('../datasets/6_word_polarity.json')

corpus =    {
                'Text': [],
                'Gender': []
            }

# Not read in this cell either; kept for the same reason as polarity_dict.
ignore_terms = ['.', ',', '...', ' ', '\u2019', '  ', '(', ')', '?', '\u00a3', '/', '"', ':', ';', '-', '--', '\u2015', "'", '!', '$', '#', '\u2014', '   ', '[',']']
ignore_ents = ['TIME', 'DATE', 'GPE', 'CARDINAL', 'PERSON', 'MONEY', 'PERCENT']

for article in data['articles']:
    doc = nlp(article['text'])
    assert doc.has_annotation("SENT_START")
    for sent in doc.sents:
        text = p.preprocess_text(sent.text)
        # A comparison against a single space would let "" or longer blank
        # strings through; test for any non-whitespace content instead.
        if text and text.strip():
            corpus['Text'].append(text)
            corpus['Gender'].append(article['gender'])

p.write_json('../datasets/8_dataset_simple_SM.json', corpus)

KeyboardInterrupt: 

In [12]:
# Manual spot check of preprocess_text: the sample starts with punctuation and
# contains a comma and a double quote; the cell's saved output shows them removed
# while the words (including stop words) are kept.
test = '.. . testing , " at my home from of yes'

p.preprocess_text(test)

'testing at my home from of yes'