# Additional analysis (exploratory)
In this notebook, I investigate the polarity scores of jobs, occupations and sentences up in three Danish sentiment lexicons: AFINN, SenTM, and SENTIDA

In [1]:
# imports
import pandas as pd
from afinn import Afinn
from sentm.sentm import senTM
from sentida import Sentida

In [2]:
# load gender corpus
corpus = pd.read_excel("gender_corpus.xlsx")
corpus.head()

Unnamed: 0,identity_term_DA,identity_term_EN,gender,job_title_DA,job_title_EN,gender_distribution,sentence_DA,sentence_EN
0,androgynen,the androgynous person,Q,bager,baker,female-dominated,Androgynen er bager.,The androgynous person is a(n) baker.
1,androgynen,the androgynous person,Q,bibliotekar,librarian,female-dominated,Androgynen er bibliotekar.,The androgynous person is a(n) librarian.
2,androgynen,the androgynous person,Q,optiker,optician,female-dominated,Androgynen er optiker.,The androgynous person is a(n) optician.
3,androgynen,the androgynous person,Q,boghandler,bookseller,female-dominated,Androgynen er boghandler.,The androgynous person is a(n) bookseller.
4,androgynen,the androgynous person,Q,praktiserende læge,general practitioner,female-dominated,Androgynen er praktiserende læge.,The androgynous person is a(n) general practit...


In [3]:
# utility funciton
def not_neutral_score(s:str, model, is_sentida:bool=False) -> float:
    """Returns the mean sentiment score of a sentence if it's not neutral, i.e. score != 0.0.

    Args:
        s (str): The input sentence.
        model: The sentiment model.
        is_sentida (bool, optional): Whether the model is the Sentida one or not. Defaults to False.

    Returns:
        float: The mean sentiment score if != 0.0.
    """
    # calculate mean score
    if is_sentida:
        mean_score = model.sentida(s)
    else:
        mean_score = model.score(s)/len(s.split())

    # return score if not neutral
    if mean_score != 0.0:
        return mean_score

# define test sentence
test_sent = "jeg er træt i dag"

### AFINN
See http://www.imm.dtu.dk/~faan/ps/Nielsen2016Danish.pdf and https://pypi.org/project/afinn/

In [4]:
# load and test model
afinn = Afinn(language="da")
#print("Total score:", afinn.score(test_sent)) # returns total score
print("Mean score :", afinn.score(test_sent)/len(test_sent.split())) # returns mean score

Mean score : -0.4


In [5]:
# print identity or job, if it is not categorised as neutral (i.e. if it gets a score of more or less than 0.0)

for (name, column) in [("Identities", "identity_term_DA"), ("Job", "job_title_DA")]:
    print(name+"...")
    c = 0
    for words in set(corpus[column]):
        score = not_neutral_score(words, afinn)
        if score is not None:
            print(words, score)
    print(f"{c} terms ({(c/len(set(corpus[column])))*100:.2f} %)\n")
print("Done!")

Identities...
0 terms (0.00 %)

Job...
0 terms (0.00 %)

Done!


### SenTM
See https://pypi.org/project/sentm/ and https://github.com/MadsLang/senTM

Note that (for a classifier) they propose that:
* Scores larger than 1 = "positiv"
* Scores between -1 and 1 = "neutral"
* Scores lower than -1 = "negativ"

In [6]:
# load and test model
sentm = senTM()
#print("Total score:", sentm.score(test_sent)) # returns total score
print("Mean score :", sentm.score(test_sent)/len(test_sent.split())) # returns mean score

  from .autonotebook import tqdm as notebook_tqdm


Mean score : -0.2


In [7]:
# print identity or job, if it is not categorised as neutral (i.e. if it gets a score of more or less than 0.0)

for (name, column) in [("Identities", "identity_term_DA"), ("Job", "job_title_DA")]:
    print(name+"...")
    c = 0 
    for words in set(corpus[column]):
        score = not_neutral_score(words, sentm)
        if score is not None:
            print(words, score)
            c += 1
    print(f"{c} terms ({(c/len(set(corpus[column])))*100:.2f} %)\n")
print("Done!")

Identities...
min genderqueer ven 0.3333333333333333
tøsen -1.0
2 terms (4.17 %)

Job...
humanistisk forsker 1.0
smed -1.0
tjener 2.0
3 terms (6.00 %)

Done!


In [8]:
# print synthetic sentence, if it is not categorised as neutral (i.e. if it gets a score of more or less than 0.0)

print("Synthetic sentences...")
c = 0 
for words in set(corpus["sentence_DA"]):
    score = not_neutral_score(words, sentm)
    if score is not None:
        print(words, score)
        c += 1
print(f"{c} terms ({(c/len(set(corpus['sentence_DA'])))*100:.2f} %)\n")
print("Done!")

Synthetic sentences...
Min søn er humanistisk forsker. 0.4
Denne tvekønnede person er humanistisk forsker. 0.3333333333333333
Min genderqueer ven er skotøjshandler. 0.2
Min genderqueer ven er glarmester. 0.2
Lgbt personen er humanistisk forsker. 0.4
Min genderqueer ven er bibliotekar. 0.2
Pigen er humanistisk forsker. 0.5
Min bedstemor er humanistisk forsker. 0.4
Min genderqueer ven er dyrlæge. 0.2
Denne fætter er humanistisk forsker. 0.4
Min genderqueer ven er humanistisk forsker. 0.5
Androgynen er humanistisk forsker. 0.5
Min genderqueer ven er murer. 0.2
Min genderqueer ven er lokomotivfører. 0.2
Min genderqueer ven er tagdækker. 0.2
Hermafroditten er humanistisk forsker. 0.5
Min genderqueer ven er event- og udstillingsmedarbejder. 0.14285714285714285
Drengen er humanistisk forsker. 0.5
Denne kusine er humanistisk forsker. 0.4
Hun er humanistisk forsker. 0.5
Han er humanistisk forsker. 0.5
Min genderqueer ven er boghandler. 0.2
Denne transperson er humanistisk forsker. 0.4
Min gende

In [9]:
# examine ratings of individual words
non_neutral = ["min genderqueer ven", "tøsen", "humanistisk forsker", "smed", "tjener"]
for words in non_neutral:
    print(words, sentm.score(words)/len(words.split()))
    for word in words.split():
        print("\t", word, sentm.score(word))

min genderqueer ven 0.3333333333333333
	 min 0.0
	 genderqueer 0.0
	 ven 1.0
tøsen -1.0
	 tøsen -1.0
humanistisk forsker 1.0
	 humanistisk 0.0
	 forsker 0.0
smed -1.0
	 smed -1.0
tjener 2.0
	 tjener 2.0


### Sentida
See https://github.com/Guscode/Sentida and https://pypi.org/project/sentida/

```
# default settings
from sentida import Sentida
Sentida().sentida(
    text,
    output = ["mean", "total", "by_sentence_mean", "by_sentence_total"],
    normal = True,
    speed = ["normal", "fast"]
)
```

In [10]:
# load and test model sentence
sv = Sentida()
#print("Total score:", sv.sentida(test_sent, "total")) # returns total score
print("Mean score :", sv.sentida(test_sent)) # returns mean score (default = mean)

Mean score : -0.19166666666666632


In [11]:
# print synthetic sentence, if it is not categorised as neutral (i.e. if it gets a score of more or less than 0.0)

for (name, column) in [("Identities", "identity_term_DA"), ("Job", "job_title_DA")]:
    print(name+"...")
    c = 0 
    for words in set(corpus[column]):
        score = not_neutral_score(words, sv, True)
        if score is not None:
            print(words, score)
            c += 1
    print(f"{c} terms ({(c/len(set(corpus[column])))*100:.2f} %)\n")
print("Done!")

Identities...
lgbt personen 0.0888888888888888
min genderqueer ven 0.6000000000000001
min søn 0.1333333333333334
denne dame 0.0666666666666666
fyren -0.16666666666666663
mit ikkebinære barn 0.1
ægtemanden 0.0666666666666666
min søster 0.0666666666666666
drengen 0.0666666666666666
min nonbinære forælder 0.1
denne kvinde 0.0666666666666666
konen 0.0666666666666666
denne transseksuelle medarbejder 0.0888888888888888
min far -0.12
denne tvekønnede person 0.0888888888888888
min bedstefar 0.1333333333333334
min intetkønnede kollega 0.0333333333333334
min datter 0.0222222222222222
pigen 0.0666666666666666
min mor 0.266666666666666
denne mand 0.0666666666666666
21 terms (43.75 %)

Job...
humanistisk forsker 0.1333333333333333
bager 0.01666666666666666
hjemmehjælper 0.1
naturvidenskabelig forsker 0.0666666666666666
praktiserende læge 0.0666666666666666
politibetjent -0.0666666666666666
soldat -0.1333333333333334
arkitekt 0.0666666666666666
murer -0.0222222222222222
tjener 0.0666666666666666
10 

Interesting that *mor* is positive, while *far* is negative! It's also interesting that in SenTM *tøsen* was negative. Here it's neutral, but *fyren* is negative.

In [12]:
# how many sentences are not neutral?
print("Synthetic sentences...")
c = 0
for sent in set(corpus["sentence_DA"]):
    if sv.sentida(sent) != 0.0:
        c += 1
print(f"{c} terms ({(c/len(set(corpus['sentence_DA'])))*100:.2f} %)")

print("\nDone!")

Synthetic sentences...
1309 terms (54.54 %)

Done!
