In [3]:
import pandas as pd
import re
import numpy as np
import fasttext as ft
from sklearn.neural_network import MLPRegressor
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import  train_test_split

In [None]:
# Parameter and Paths
fasttext_path = "models/oscar_ft_model_dim1536_ws2.bin.bin"
affects_path = "resources/affective_norms.txt"

In [2]:
# cleaning function

def to_text(x):
    
    x = re.sub("\"|\'|\[|\]|\,","",x)
    
    return x

In [3]:
# Load Resources
ft_model = ft.load_model(fasttext_path)
data = pd.read_csv(affects_path, sep="\t")



In [12]:
# train test split

X_text = [ft_model[to_text(str(x))] for x in data["Word"]][:10000]
Y_scores = np.array(data.loc[:len(X_text)-1,["AbstConc","Arou","IMG","Val"]])
X_train, X_test, y_train, y_test = train_test_split(X_text, Y_scores, test_size=0.1, random_state=42)

In [13]:
# init model

mlp = MLPRegressor(hidden_layer_sizes=(200,), random_state=1, max_iter=1000)

In [14]:
mlp.fit(X_train, y_train)

MLPRegressor(hidden_layer_sizes=(200,), max_iter=1000, random_state=1)

In [15]:
mlp.score(X_test, y_test)

0.8191755260582454

In [9]:
import pickle as pkl

In [16]:
from joblib import dump, load

In [10]:
mlp = load('aff_fasttext.joblib')

EOFError: 

In [17]:
dump(mlp, 'aff_fasttext.bin')

['aff_fasttext.bin']

In [21]:
from tqdm.notebook import tqdm

In [56]:
frame2 = pd.read_csv("Feedback Voting.csv",sep="\t")

In [57]:
frame2 = frame2.drop(columns=["Metapher_sw","Metapher_fj","Metapher_mk","Metonymie_sw","Metonymie_fj","Metonymie_mk","Metapher Summe","Metonymie Summe"])

In [58]:
frame2 = frame2.rename(columns={"Metapher Konsens":"Metapher","Metonymie Konsens":"Metonymie"})

In [59]:
frame2["Titel"] = ""
frame2["Metonymie"] = ""
frame2["Name"] = ""
frame2["quelle"] = "Konsens 2"
frame2["w1"] = frame2["genitiv_konstruktion"].apply(lambda x: x.split(" ")[0])
frame2["w2"] = frame2["genitiv_konstruktion"].apply(lambda x: x.split(" ")[-1])

In [60]:
frame1 = pd.read_csv("full+sense+affect.tsv", sep="\t", index_col=0)

In [61]:
frame = pd.concat([frame1,frame2], axis=0)

In [18]:
frame = pd.read_csv("gen_mlk_fig_spr_predict+super.tsv", sep="\t", index_col=0)

In [19]:
frame

Unnamed: 0,Unnamed: 0.1,genitiv_konstruktion,kontext,ID,w1,w2,supersenser_0,supersenser_1,supersenser_2,supersenser_3,...,supersenser_0.1,supersenser_1.1,supersenser_2.1,supersenser_3.1,supersenser_4.1,supersenser_5.1,supersenser_6.1,supersenser_7.1,supersenser_8.1,supersenser_9.1
0,0,Mund des röchelnden Banditen,###Wie aus dem Hohlweg Hülferufen klang;\nWie ...,l0020169.txt,Mund,Banditen,-12.449068,16.281939,-11.241486,6.675114,...,20.622215,7.387614,-10.765052,1.777130,-22.567417,31.093182,6.734971,-13.364339,-1.992805,-16.592121
1,1,Hand des Mörders,"###weißt, wie wir in der Campagna ritten,\nWie...",l0020169.txt,Hand,Mörders,-16.334610,12.250860,-15.578655,11.949934,...,21.126543,6.256617,-19.244587,4.333963,-29.366735,19.278040,-13.377811,-1.048816,-4.060271,-11.961693
2,2,Werkzeug dieser Höllenthat,"###Mutter Schoos, der dich getragen,\nBei ihre...",l0020169.txt,Werkzeug,Höllenthat,-12.462503,-11.280316,4.226968,9.851651,...,5.453100,-5.921415,3.839832,-5.545960,-24.702826,18.958019,-37.999504,-0.435136,-13.231165,-25.817455
3,3,Hochgefühl der Lust,"###Leib umfangen,\nSie küßte mich, als ich von...",l0020169.txt,Hochgefühl,Lust,-3.039791,3.775304,-1.387659,-8.354455,...,-6.868844,-5.031555,-0.897522,-1.763056,-8.427630,14.740567,1.650468,3.950479,0.034862,1.751651
4,4,Gebuscht der Brauen,###mir stand —\nDen weißen Arm erhoben — das G...,l0020169.txt,Gebuscht,Brauen,7.708554,-2.983548,4.824840,7.563175,...,1.884007,10.970037,15.535578,-8.435513,-6.866509,-19.579123,12.671397,-8.429327,3.754439,9.436239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5295,5295,Silberglanz der Sterne,"###noch in uns der Keim der Erde lag,\nVon Ewi...",l00270580.txt,Silberglanz,Sterne,-6.775403,-1.854375,5.722345,0.381913,...,9.991711,10.789911,-1.942554,-8.009349,0.954613,7.369104,20.038372,-2.520092,0.035562,-16.285530
5296,5296,Klang der Sphären,###her:\nDie ew'gen Sterne sprechen mit dem Me...,l00270580.txt,Klang,Sphären,4.175107,2.952331,-7.502730,-9.138515,...,0.316299,6.120955,2.812529,-19.207472,-4.779900,-0.233042,-1.488213,0.508939,-6.146500,0.715690
5297,5297,Zaubersang der Wogen,###Da jauchzen wir und hüpfen in die Höh';\nDe...,l00270580.txt,Zaubersang,Wogen,10.389238,10.857550,8.099211,-9.128633,...,13.617758,14.678090,10.333622,5.958462,4.303414,-8.409082,4.303331,-2.417928,-6.520422,-5.629939
5298,5298,Welt der Menschen,"###alte Windesbraut redet auch darein,\nWill a...",l00270580.txt,Welt,Menschen,4.943958,-0.502833,-11.905156,1.247485,...,-8.522528,15.735208,-7.292505,8.613425,-7.906993,-1.654968,-4.604799,-16.602583,4.681283,13.645138


In [22]:
w1_abs = []
w1_aro = []
w1_img = []
w1_val = []

w2_abs = []
w2_aro = []
w2_img = []
w2_val = []

for index, row in tqdm(frame.iterrows(),total=frame.shape[0]):
    
    info = mlp.predict([ft_model[row["w1"]]])[0]
    
    w1_abs.append(info[0])
    w1_aro.append(info[1])
    w1_img.append(info[2])
    w1_val.append(info[3])
    
    info = mlp.predict([ft_model[row["w2"]]])[0]
    
    w2_abs.append(info[0])
    w2_aro.append(info[1])
    w2_img.append(info[2])
    w2_val.append(info[3])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5300.0), HTML(value='')))




In [23]:
frame["1_abs"] = w1_abs
frame["w1_aro"] = w1_aro
frame["w1_img"] = w1_img
frame["w1_val"] = w1_val

frame["w2_abs"] = w2_abs
frame["w2_aro"] = w2_aro
frame["w2_img"] = w2_img
frame["w2_val"] = w2_val

frame["abs_diff"] = frame["1_abs"]-frame["w2_abs"]
frame["aro_diff"] = frame["w1_aro"]-frame["w2_aro"]
frame["img_diff"] = frame["w1_img"]-frame["w2_img"]
frame["val_diff"] = frame["w1_val"]-frame["w2_val"]

In [32]:
frame.to_csv("predictionbase.tsv", sep="\t")

In [27]:
frame = frame.drop(columns=[ 'supersenser_0', 'supersenser_1', 'supersenser_2', 'supersenser_3',
       'supersenser_4', 'supersenser_5', 'supersenser_6', 'supersenser_7',
       'supersenser_8', 'supersenser_9'])

In [8]:
from collections import Counter

In [31]:
frame = frame.rename(columns={'supersenser_0.1':"supersenser_0", 
                              'supersenser_1.1':"supersenser_1", 
                              'supersenser_2.1':"supersenser_2",
                              'supersenser_3.1':"supersenser_3",
                              'supersenser_4.1':"supersenser_4",
                              'supersenser_5.1':"supersenser_5",
                              'supersenser_6.1':"supersenser_6",
                              'supersenser_7.1':'supersenser_7', 
                              'supersenser_8.1':'supersenser_8',
                               'supersenser_9.1':'supersenser_9'})

In [4]:
data = pd.read_csv("gen_mlk+all.tsv",sep="\t")

In [35]:
frame = frame.drop(columns=[ 'supersenser_0', 'supersenser_1', 'supersenser_2', 'supersenser_3',
       'supersenser_4', 'supersenser_5', 'supersenser_6', 'supersenser_7',
       'supersenser_8', 'supersenser_9'])

In [10]:
Counter(data["quelle"])

Counter({'fotis1': 78,
         'simone1': 135,
         'merten2': 100,
         'Simone2': 133,
         'merten1': 100,
         'marbach': 285,
         'Konsens 2': 99})

In [12]:
len(data)-285

645