In [130]:
import json
import spacy
import math

# Load the article corpus. Later cells read data['articles'], where each entry
# has a 'text' and a 'gender' field.
# JSON is UTF-8 by specification, so pin the encoding instead of relying on the
# platform default (which is not UTF-8 on every OS).
with open('../datasets/3_text_and_gender.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# spaCy pipeline used downstream for lemmas, stop-word flags and entity labels.
nlp = spacy.load("en_core_web_lg")

In [131]:
# Lemmas that are punctuation, whitespace or symbols; tokens with these lemmas
# are never counted.
ignores = ['.', ',', '...', ' ', '\u2019', '  ', '(', ')', '?', '\u00a3', '/', '"', ':', ';', '-', '--', '\u2015', "'", '!', '$', '#', '\u2014', '   ', '[',']']
# Tokens whose spaCy entity label (token.ent_type_) is one of these are skipped.
ent_types = ['TIME', 'DATE', 'GPE', 'CARDINAL', 'PERSON', 'MONEY', 'PERCENT']
# Not used in this cell; kept because other cells may rely on these names.
m_words = {}
w_words = {}

# N counts every article in the dataset, regardless of gender.
N = len(data['articles'])
# *_tf_ds: one {lemma: count} dict per article of that gender.
# *_df_t:  {lemma: number of articles of that gender containing the lemma}.
m_tf_ds = []
m_df_t = {}
w_tf_ds = []
w_df_t = {}


def _lemma_counts(doc):
    """Count the lemmas of the tokens in `doc` that survive filtering.

    A token is dropped when it is a stop word, when its lemma is listed in
    `ignores`, or when its entity label is listed in `ent_types`.

    Returns a plain dict mapping lemma -> number of occurrences in `doc`.
    """
    counts = {}
    for token in doc:
        if token.is_stop or token.lemma_ in ignores or token.ent_type_ in ent_types:
            continue
        lemma = token.lemma_
        counts[lemma] = counts.get(lemma, 0) + 1
    return counts


# Route each article to the accumulators of its gender; both genders share the
# exact same counting logic.
_accumulators = {'M': (m_tf_ds, m_df_t), 'W': (w_tf_ds, w_df_t)}

for article in data['articles']:
    target = _accumulators.get(article['gender'])
    if target is None:
        # Articles with any other gender value contribute nothing, so skip
        # them before paying for the spaCy parse.
        continue
    tf_ds, df_t = target

    doc = nlp(article['text'])
    tf_d = _lemma_counts(doc)
    tf_ds.append(tf_d)

    # Each distinct lemma in this article adds one to its document frequency.
    for key in tf_d:
        df_t[key] = df_t.get(key, 0) + 1

def _mean_tf_idf(tf_ds, df_t, n_docs):
    """Average TF-IDF weight per term over `n_docs` documents.

    tf_ds:  list of {term: count} dicts, one per document.
    df_t:   {term: number of documents in `tf_ds` containing the term}.
    n_docs: document count used both inside the IDF logarithm and as the
            divisor of the final average.

    For every term, count * log(n_docs / df_t[term]) is summed over the
    documents that contain it, then divided by `n_docs`. Returns a dict
    {term: averaged weight}, with keys in first-seen order.
    """
    totals = {}
    for tf_d in tf_ds:
        for term, count in tf_d.items():
            score = count * math.log(n_docs / df_t[term])
            totals[term] = totals.get(term, 0) + score
    return {term: total / n_docs for term, total in totals.items()}


# NOTE(review): N is the size of the whole dataset (len(data['articles'])),
# while m_df_t / w_df_t are document frequencies within one gender only.
# Both the IDF term and the average therefore use the full-corpus count, not
# the per-gender count. This looks intentional (a per-term `len(v)` divisor
# was previously commented out) -- confirm.
m_tf_idfs = _mean_tf_idf(m_tf_ds, m_df_t, N)
w_tf_idfs = _mean_tf_idf(w_tf_ds, w_df_t, N)


# Write the per-gender word weights to disk, one JSON file per gender.
for _path, _weights in (
    ('../datasets/4_word_weight_m.json', m_tf_idfs),
    ('../datasets/4_word_weight_w.json', w_tf_idfs),
):
    with open(_path, "w") as outFile:
        json.dump(_weights, outFile)


In [133]:
from collections import OrderedDict

def _ranked(weights):
    """Return an OrderedDict of `weights` sorted by value, largest first."""
    pairs = sorted(weights.items(), key=lambda pair: pair[1], reverse=True)
    return OrderedDict(pairs)


m_descending = _ranked(m_tf_idfs)
w_descending = _ranked(w_tf_idfs)

# Write each ranking to its own JSON file; key order in the file follows the
# descending-weight order built above.
for _path, _ranking in (
    ('../datasets/4_word_weight_m_ordered.json', m_descending),
    ('../datasets/4_word_weight_w_ordered.json', w_descending),
):
    with open(_path, "w") as outFile:
        json.dump(_ranking, outFile)