# Analysis of Historical Lexicons

## Imports

In [31]:
import pandas as pd
from scipy import stats as st

## Data Preparation

In [2]:
def linear_scaling(x, oldmin, oldmax, newmin,newmax):
    """Linearly map ``x`` from the range [oldmin, oldmax] onto [newmin, newmax].

    ``oldmin`` is sent to ``newmin`` and ``oldmax`` to ``newmax``; values in
    between are interpolated proportionally. Inputs outside the old range are
    extrapolated rather than clipped.

    Divides by ``oldmax - oldmin``, so for plain numbers a ZeroDivisionError is
    raised when the old range is empty (``oldmin == oldmax``).
    """
    new_span = newmax - newmin
    old_span = oldmax - oldmin
    offset = x - oldmin
    return (new_span * offset) / old_span + newmin

In [3]:
# Load the Warriner et al. ratings, indexed by the file's second column, and
# keep only the three mean-rating columns under shorter dimension names.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
warriner = (
    pd.read_csv(
        "/Users/sven/julie/resources/Warriner/Ratings_Warriner_et_al.csv",
        index_col=1,
    )[["V.Mean.Sum", "A.Mean.Sum", "D.Mean.Sum"]]
    .rename(
        columns={
            "V.Mean.Sum": "Valence",
            "A.Mean.Sum": "Arousal",
            "D.Mean.Sum": "Dominance",
        }
    )
)
warriner.head()

Unnamed: 0_level_0,Valence,Arousal,Dominance
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aardvark,6.26,2.41,4.27
abalone,5.3,2.65,4.95
abandon,2.84,3.73,3.32
abandonment,2.63,4.95,2.64
abbey,5.85,2.2,5.0


In [4]:
# Load the ANGST ratings (tab-separated, first column as index) and keep the
# three emotion dimensions.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
angst = pd.read_csv(
    "/Users/sven/julie/resources/ANGST2014/ANGST2014.csv",
    sep="\t",
    index_col=0,
)[["Valence", "Arousal", "Dominance"]]
# Only Valence is rescaled: mapped from the range [-3, 3] onto [1, 9] and
# rounded to two decimals (presumably to match the scale of the other
# columns; confirm against the ANGST documentation).
angst["Valence"] = [
    round(linear_scaling(value, -3, 3, 1, 9), 2) for value in angst["Valence"]
]
angst.head()

Unnamed: 0_level_0,Valence,Arousal,Dominance
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aas,2.2,5.9,4.56
Abenddämmerung,7.13,3.9,4.63
Abendessen,6.51,3.75,6.94
Abenteuer,7.4,6.45,5.81
Abfall,2.73,5.35,5.25


In [5]:
# Historical English gold ratings: tab-separated, with the three column names
# supplied explicitly. The leading word column ends up as the index, as the
# output below shows.
hist_en = pd.read_csv(
    "goldEN.vad",
    sep="\t",
    names=["Valence", "Arousal", "Dominance"],
)
hist_en.head()

Unnamed: 0,Valence,Arousal,Dominance
deal,5.5,6.0,6.0
study,5.5,6.5,5.5
afford,2.5,6.5,1.5
service,6.5,5.5,6.5
height,5.0,5.5,5.0


In [6]:
# Historical German gold ratings: tab-separated, with the three column names
# supplied explicitly. The leading word column ends up as the index, as the
# output below shows.
hist_de = pd.read_csv(
    "goldDE.vad",
    sep="\t",
    names=["Valence", "Arousal", "Dominance"],
)
hist_de.head()

Unnamed: 0,Valence,Arousal,Dominance
fliegen,6.0,6.67,4.0
vermehrt,4.33,5.67,6.0
Zeit,3.67,6.33,5.0
Jahr,6.0,6.0,3.67
lachen,7.67,7.0,5.67


In [7]:
# Words present in both the contemporary (ANGST) and the historical German
# lexicon; the cell displays how many there are.
common_index_de = list(set(angst.index) & set(hist_de.index))
len(common_index_de)

13

In [8]:
# Words present in both the contemporary (Warriner) and the historical English
# lexicon; the cell displays how many there are.
common_index_en = list(set(warriner.index) & set(hist_en.index))
len(common_index_en)

97

The overlap between the contemporary and the historical German datasets is very small (13 words, compared to 97 for English), most probably due to the sampling procedure employed. For this reason, the analysis below focuses on English only.

In [9]:
# Put historical and modern English ratings side by side. The inner join keeps
# only words found in both indexes, and `keys` adds a top column level that
# labels each group of three dimensions.
df = pd.concat(
    [hist_en, warriner],
    axis=1,
    join="inner",
    keys=["historical", "modern"],
)
df.head()

Unnamed: 0_level_0,historical,historical,historical,modern,modern,modern
Unnamed: 0_level_1,Valence,Arousal,Dominance,Valence,Arousal,Dominance
deal,5.5,6.0,6.0,6.14,3.9,6.12
study,5.5,6.5,5.5,5.78,2.55,5.33
afford,2.5,6.5,1.5,6.56,4.33,6.44
service,6.5,5.5,6.5,6.83,2.95,5.33
height,5.0,5.5,5.0,5.83,3.85,4.56


## Computing Change in Emotion Values

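We use the squared Euclidean distance between the historical and the modern rating of each word: the squared difference per dimension, plus the sum of these over all three dimensions.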

In [10]:
# Squared difference between historical and modern rating, one column per
# dimension under the new top-level group "diff".
for d in ("Valence", "Arousal", "Dominance"):
    delta = df[("historical", d)] - df[("modern", d)]
    df.loc[:, ("diff", d)] = delta ** 2

# Row-wise total over the three dimensions. It is computed from the
# historical/modern groups rather than from the "diff" columns, so re-running
# this cell never sums an already existing "sum" column.
squared_deltas = (df["modern"] - df["historical"]) ** 2
df.loc[:, ("diff", "sum")] = squared_deltas.sum(axis=1)
df.head()

Unnamed: 0_level_0,historical,historical,historical,modern,modern,modern,diff,diff,diff,diff
Unnamed: 0_level_1,Valence,Arousal,Dominance,Valence,Arousal,Dominance,Valence,Arousal,Dominance,sum
deal,5.5,6.0,6.0,6.14,3.9,6.12,0.4096,4.41,0.0144,4.834
study,5.5,6.5,5.5,5.78,2.55,5.33,0.0784,15.6025,0.0289,15.7098
afford,2.5,6.5,1.5,6.56,4.33,6.44,16.4836,4.7089,24.4036,45.5961
service,6.5,5.5,6.5,6.83,2.95,5.33,0.1089,6.5025,1.3689,7.9803
height,5.0,5.5,5.0,5.83,3.85,4.56,0.6889,2.7225,0.1936,3.605


In [11]:
# Five words with the largest squared change in Valence.
df.sort_values(by=("diff", "Valence"), ascending=False).head(5)

Unnamed: 0_level_0,historical,historical,historical,modern,modern,modern,diff,diff,diff,diff
Unnamed: 0_level_1,Valence,Arousal,Dominance,Valence,Arousal,Dominance,Valence,Arousal,Dominance,sum
doctor,1.5,7.0,2.5,5.93,4.05,4.69,19.6249,8.7025,4.7961,33.1235
afford,2.5,6.5,1.5,6.56,4.33,6.44,16.4836,4.7089,24.4036,45.5961
receive,3.5,6.5,2.0,7.14,4.3,6.79,13.2496,4.84,22.9441,41.0337
daughter,3.5,4.0,4.0,6.73,5.0,5.06,10.4329,1.0,1.1236,12.5565
narrow,2.0,4.0,4.0,4.95,4.53,5.19,8.7025,0.2809,1.4161,10.3995


In [12]:
# Five words with the largest squared change in Arousal.
df.sort_values(by=("diff", "Arousal"), ascending=False).head(5)

Unnamed: 0_level_0,historical,historical,historical,modern,modern,modern,diff,diff,diff,diff
Unnamed: 0_level_1,Valence,Arousal,Dominance,Valence,Arousal,Dominance,Valence,Arousal,Dominance,sum
divine,7.0,7.0,2.0,7.15,3.05,5.96,0.0225,15.6025,15.6816,31.3066
study,5.5,6.5,5.5,5.78,2.55,5.33,0.0784,15.6025,0.0289,15.7098
species,4.5,7.5,5.0,5.26,3.73,4.31,0.5776,14.2129,0.4761,15.2666
rank,6.5,7.0,5.5,5.26,3.67,5.57,1.5376,11.0889,0.0049,12.6314
country,7.5,7.0,6.0,6.14,3.71,6.35,1.8496,10.8241,0.1225,12.7962


In [13]:
# Five words with the largest squared change in Dominance.
df.sort_values(by=("diff", "Dominance"), ascending=False).head(5)

Unnamed: 0_level_0,historical,historical,historical,modern,modern,modern,diff,diff,diff,diff
Unnamed: 0_level_1,Valence,Arousal,Dominance,Valence,Arousal,Dominance,Valence,Arousal,Dominance,sum
afford,2.5,6.5,1.5,6.56,4.33,6.44,16.4836,4.7089,24.4036,45.5961
receive,3.5,6.5,2.0,7.14,4.3,6.79,13.2496,4.84,22.9441,41.0337
strange,2.0,6.5,1.0,4.72,3.5,5.28,7.3984,9.0,18.3184,34.7168
assume,4.0,5.5,2.0,4.41,4.2,6.21,0.1681,1.69,17.7241,19.5822
tear,2.0,7.5,1.5,3.14,4.8,5.68,1.2996,7.29,17.4724,26.062


In [14]:
# Five words with the largest summed squared change over all three dimensions.
df.sort_values(by=("diff", "sum"), ascending=False).head(5)

Unnamed: 0_level_0,historical,historical,historical,modern,modern,modern,diff,diff,diff,diff
Unnamed: 0_level_1,Valence,Arousal,Dominance,Valence,Arousal,Dominance,Valence,Arousal,Dominance,sum
afford,2.5,6.5,1.5,6.56,4.33,6.44,16.4836,4.7089,24.4036,45.5961
receive,3.5,6.5,2.0,7.14,4.3,6.79,13.2496,4.84,22.9441,41.0337
strange,2.0,6.5,1.0,4.72,3.5,5.28,7.3984,9.0,18.3184,34.7168
doctor,1.5,7.0,2.5,5.93,4.05,4.69,19.6249,8.7025,4.7961,33.1235
divine,7.0,7.0,2.0,7.15,3.05,5.96,0.0225,15.6025,15.6816,31.3066


## Correlation between Historical and Modern

In [34]:
# Pearson correlation between the historical and the modern ratings, computed
# separately for each dimension and printed rounded to two decimals.
for d in ("Valence", "Arousal", "Dominance"):
    historical_scores = df.loc[:, ("historical", d)]
    modern_scores = df.loc[:, ("modern", d)]
    # pearsonr returns (coefficient, p-value); only the coefficient is used.
    corr = st.pearsonr(historical_scores, modern_scores)[0]
    print(d, round(corr, 2))

Valence 0.66
Arousal 0.51
Dominance 0.31


## Building the Table

In [30]:
# Hand-picked example words, shown with their historical and modern ratings
# (the "diff" columns are left out).
examples = ["daughter", "divine", "study", "strange"]
rating_groups = ["historical", "modern"]
df_examples = df.loc[examples, rating_groups].round(3)

# Print the LaTeX source with one decimal per number, then display the frame
# itself as the cell output.
latex_table = df_examples.to_latex(float_format="%.1f")
print(latex_table)
df_examples

\begin{tabular}{lrrrrrr}
\toprule
{} & \multicolumn{3}{l}{historical} & \multicolumn{3}{l}{modern} \\
{} &    Valence & Arousal & Dominance & Valence & Arousal & Dominance \\
\midrule
daughter &        3.5 &     4.0 &       4.0 &     6.7 &     5.0 &       5.1 \\
divine   &        7.0 &     7.0 &       2.0 &     7.2 &     3.0 &       6.0 \\
study    &        5.5 &     6.5 &       5.5 &     5.8 &     2.5 &       5.3 \\
strange  &        2.0 &     6.5 &       1.0 &     4.7 &     3.5 &       5.3 \\
\bottomrule
\end{tabular}



Unnamed: 0_level_0,historical,historical,historical,modern,modern,modern
Unnamed: 0_level_1,Valence,Arousal,Dominance,Valence,Arousal,Dominance
daughter,3.5,4.0,4.0,6.73,5.0,5.06
divine,7.0,7.0,2.0,7.15,3.05,5.96
study,5.5,6.5,5.5,5.78,2.55,5.33
strange,2.0,6.5,1.0,4.72,3.5,5.28
