In [2]:
# INSTANTIATE NLP FROM SPACY
import spacy

nlp = spacy.load("en_core_web_lg")

In [16]:
m = ["John Johnson is stronger than strong Jane", "I am a very strong person", "I thrive in a competitive setting"]
f = ["Jane Smalls is prettier than pretty John", "I am a very supporting person", "I thrive in a nurturing setting"]

print('base f:', f)

#### PREPROCESSING (LEMMATIZING AND FILTERING) ####
ignore_terms = ['.', ',', '...', ' ', '\u2019', '  ', '(', ')', '?', '\u00a3', '/', '"', ':', ';', '-', '--', '\u2015', "'", '!', '$', '#', '\u2014', '   ', '[',']']
ignore_ents = ['TIME', 'DATE', 'GPE', 'CARDINAL', 'PERSON', 'MONEY', 'PERCENT']


m_lem = []
f_lem = []

for sentence in m:
    doc = nlp(sentence)
    lemmatized_sentence = []
    for token in doc:
        if token.lemma_ not in ignore_terms and token.ent_type_ not in ignore_ents:
            lemmatized_sentence.append(token.lemma_)
    m_lem.append(lemmatized_sentence)

for sentence in f:
    doc = nlp(sentence)
    lemmatized_sentence = []
    for token in doc:
        if token.lemma_ not in ignore_terms and token.ent_type_ not in ignore_ents:
            lemmatized_sentence.append(token.lemma_)
    f_lem.append(lemmatized_sentence)

print('lemmatized m:', m_lem)
print('lemmatized f:', f_lem)

#### COMPUTING TERM FREQUENCY ####

m_tf = []
f_tf = []
for sentence in m_lem:
    tf_d = {}
    tf = []
    
    # Count term frequencies in each doc/sentence
    for lemma in sentence:
        if lemma in tf_d:
            tf_d[lemma] += 1
        else:
            tf_d[lemma] = 1

    # Append to array to show tf
    for lemma in sentence:
        tf.append(tf_d[lemma])

    m_tf.append(tf)

for sentence in f_lem:
    tf_d = {}
    tf = []
    
    # Count term frequencies in each doc/sentence
    for lemma in sentence:
        if lemma in tf_d:
            tf_d[lemma] += 1
        else:
            tf_d[lemma] = 1

    # Append to array to show tf
    for lemma in sentence:
        tf.append(tf_d[lemma])

    f_tf.append(tf)

print('tf m:', m_tf)
print('tf f:', f_tf)

#### COMPUTING DOCUMENT FREQUENCY ####
m_df = {}
f_df = {}
for sentence in m_lem:
    for unique_lemma in set(sentence):
        if unique_lemma in m_df:
            m_df[unique_lemma] += 1
        else:
            m_df[unique_lemma] = 1

for sentence in f_lem:
    for unique_lemma in set(sentence):
        if unique_lemma in f_df:
            f_df[unique_lemma] += 1
        else:
            f_df[unique_lemma] = 1

print('df m:', m_df)
print('df f:', f_df)

#### COMPUTING INVERSE DOCUMENT FREQUENCY ####
from collections import OrderedDict
import math
N = len(m) + len(f)

m_idf = {}
f_idf = {}

for key, value in m_df.items():
    m_idf[key] = round(math.log(N/value), 2)

for key, value in f_df.items():
    f_idf[key] = round(math.log(N/value), 2)

# m_idf = []
# f_idf = []
# for sentence in m_lem:
#     idf = []
#     for lemma in sentence:
#         idf.append(round(math.log(N/m_df[lemma]), 2))
#     m_idf.append(idf)

# for sentence in f_lem:
#     idf = []
#     for lemma in sentence:
#         idf.append(round(math.log(N/f_df[lemma]),2))
#     f_idf.append(idf)

m_idf = OrderedDict(sorted(m_idf.items(), key=lambda kv: kv[1], reverse=True))
f_idf = OrderedDict(sorted(f_idf.items(), key=lambda kv: kv[1], reverse=True))

print('idf m:', m_idf)
print('idf f:', f_idf)

#### COMPUTING TF-IDF ####
m_tf_idfs = {}
for i, sentence in enumerate(m_lem):
    tf_idfs = []
    for j, lemma in enumerate(sentence):
        tf_idf = m_tf[i][j] * m_idf[lemma]
        if lemma in m_tf_idfs:
            m_tf_idfs[lemma] += tf_idf
        else:
            m_tf_idfs[lemma] = tf_idf

for key, value in m_tf_idfs.items():
    m_tf_idfs[key] = round(value / N, 2)

f_tf_idfs = {}
for i, sentence in enumerate(f_lem):
    tf_idfs = []
    for j, lemma in enumerate(sentence):
        tf_idf = f_tf[i][j] * f_idf[lemma]
        if lemma in f_tf_idfs:
            f_tf_idfs[lemma] += tf_idf
        else:
            f_tf_idfs[lemma] = tf_idf

for key, value in f_tf_idfs.items():
    f_tf_idfs[key] = round(value / N, 2)


m_tf_idfs = OrderedDict(sorted(m_tf_idfs.items(), key=lambda kv: kv[1], reverse=True))
f_tf_idfs = OrderedDict(sorted(f_tf_idfs.items(), key=lambda kv: kv[1], reverse=True))

print('------------------------------------------')
print('sum tf-idf m:', m_tf_idfs)
print('sum tf-idf f:', f_tf_idfs)


print('------------------------------------------')
import numpy as np

def normalize_dict(d):
    v_min = min(d.values())
    v_max = max(d.values())
    print("normalize v_min", v_min, "v_max", v_max)
    for k in d:
        print("key", k, "d[k]-v_min", d[k]-v_min, "v_max-v_min", v_max-v_min, "result", (d[k]-v_min)/(v_max-v_min))
        d[k] = (d[k]-v_min)/(v_max-v_min)

# test = np.array([0.3, 0.37, 0.3, 0.3, 0.3, 0.37, 0.37, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3])
# norm = np.linalg.norm(test)
# ok = test/norm
# print("NORMMMMM ", ok)
# normalize_dict(m_tf_idfs)
# normalize_dict(f_tf_idfs)

normalize_dict(m_tf_idfs)
normalize_dict(f_tf_idfs)
print("normalized m", m_tf_idfs)


print('----------------------------------------------------------')
pol_dict = {key: f_tf_idfs[key] - m_tf_idfs.get(key, 0) for key in f_tf_idfs}
for k, v in m_tf_idfs.items():
    if k not in pol_dict:
        pol_dict[k] = -v

pol_dict = OrderedDict(sorted(pol_dict.items(), key=lambda kv: kv[1], reverse=True))
print('polarity dict:', pol_dict)

#idf
# 'than', 'very', 'person', 'in', 'thrive', 'setting', 'competitive', 'strong', 'be', 'I', 'a'
# 'than', 'pretty', 'John', 'support', 'very', 'person', 'in', 'thrive', 'set', 'nurture', 'be', 'I', 'a'

#tf-idf
# 'strong', 'be', 'I', 'a', 'than', 'very', 'person', 'thrive', 'in', 'competitive', 'setting',
# 'pretty', 'be', 'I', 'a', 'than', 'John', 'very', 'support', 'person', 'thrive', 'in', 'nurture', 'set',

#pol dict
# 'pretty', 'than', 'John', 'very', 'support', 'person', 'thrive', 'in', 'nurture', 'set', 'competitive', 'setting', 'be', 'I', 'a', 'strong'
# 'pretty', 1.0, 'than', 0.0, 'John', 0.0, 'very', 0.0, 'support', 0.0, 'person', 0.0, 'thrive', 0.0, 'in', 0.0, 'nurture', 0.0, 'set', 0.0, 'competitive', -0.0, 'setting', -0.0, 'be', -0.03425154041319317, 'I', -0.03425154041319317, 'a', -0.03425154041319317, 'strong', -1.0


base f: ['Jane Smalls is prettier than pretty John', 'I am a very supporting person', 'I thrive in a nurturing setting']
lemmatized m: [['be', 'strong', 'than', 'strong'], ['I', 'be', 'a', 'very', 'strong', 'person'], ['I', 'thrive', 'in', 'a', 'competitive', 'setting']]
lemmatized f: [['be', 'pretty', 'than', 'pretty', 'John'], ['I', 'be', 'a', 'very', 'support', 'person'], ['I', 'thrive', 'in', 'a', 'nurture', 'set']]
tf m: [[1, 2, 1, 2], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]
tf f: [[1, 2, 1, 2, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]
df m: {'strong': 2, 'than': 1, 'be': 2, 'I': 2, 'very': 1, 'a': 2, 'person': 1, 'in': 1, 'thrive': 1, 'setting': 1, 'competitive': 1}
df f: {'than': 1, 'pretty': 1, 'John': 1, 'be': 2, 'support': 1, 'I': 2, 'very': 1, 'a': 2, 'person': 1, 'in': 1, 'thrive': 1, 'set': 1, 'nurture': 1}
idf m: OrderedDict([('than', 1.79), ('very', 1.79), ('person', 1.79), ('in', 1.79), ('thrive', 1.79), ('setting', 1.79), ('competitive', 1.79), ('strong', 1.1), ('be'