In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import calinski_harabaz_score
from sklearn.preprocessing import StandardScaler

In [2]:
df_results_LM = pd.read_csv('../FilesToUploadToColab/wikipedia_predictions_LM.csv')
df_results_concat = pd.read_csv('../FilesToUploadToColab/wikipedia_predictions_concat.csv')
df_tuples = pd.read_csv('../FilesToUploadToColab/NovelTuples.csv', sep = '\t', header = None)
df_tuples.columns = ('rel','head','tail','misc','misc2')

In [3]:
def score(tail_cond, tail_marg, head_cond, head_marg, lam = 1):
    return ((lam * (tail_cond + head_cond) - (tail_marg + head_marg)))/2
    # return lam * tail_cond + (k - 1) * head_marg - tail_marg

In [4]:
best_lams = []
for df in [df_results_LM, df_results_concat]:  
    sils = []
    lams = []
    for lam in np.arange(1, 10, 0.05):
        ss = StandardScaler()
        df_tuples['temp'] = score(
            df.tail_conditional,
            df.tail_marginal, 
            df.head_conditional, 
            df.head_marginal, 
            lam = lam
        )
        gm = GaussianMixture(5).fit(df_tuples[['temp']])
        dat = ss.fit_transform(df_tuples[['temp']])
        pred = gm.fit_predict(dat)
        sil = gm.aic(dat)
        sils.append(sil)
        lams.append(lam)
        #accs.append(acc)
    sils = np.array(sils)
    #accs = np.array(accs)
    best_lam = lams[sils.argmax()]
    best_lams.append(best_lam)

In [5]:
best_lams

[1.4500000000000004, 1.4000000000000004]

In [7]:
best_lams = []
for df in [df_results_LM, df_results_concat]:  
    sils = []
    lams = []
    for lam in np.arange(1, 10, 0.05):
        df_tuples['temp'] = score(
            df.tail_conditional,
            df.tail_marginal, 
            df.head_conditional, 
            df.head_marginal, 
            lam = lam
        )
        gm = GaussianMixture(5).fit(df_tuples[['temp']])
        sil = calinski_harabaz_score(df_tuples[['temp']], gm.predict(df_tuples[['temp']]))
        #acc = np.mean((gm.cluster_centers_.argmax() == df.label) == gm.predict(df_tuples[['score']]))
        sils.append(sil)
        lams.append(lam)
        #accs.append(acc)
    sils = np.array(sils)
    #accs = np.array(accs)
    best_lam = lams[sils.argmax()]
    best_lams.append(best_lam)

In [20]:
for lam in np.arange(1, 11, 1):
    df_tuples[f'score_LM_{lam}'] = score(
        df_results_LM.tail_conditional,
        df_results_LM.tail_marginal, 
        df_results_LM.head_conditional, 
        df_results_LM.head_marginal, 
        lam = lam
    )
    df_tuples[f'score_concat_{lam}'] = score(
        df_results_concat.tail_conditional,
        df_results_concat.tail_marginal, 
        df_results_concat.head_conditional, 
        df_results_concat.head_marginal, 
        lam = lam
    )

In [21]:
df_tuples['sent_LM'] = df_results_LM['sent']
df_tuples['sent_concat'] = df_results_concat['sent']

In [22]:
df_tuples['score_LM_cluster'] = score(
    df.tail_conditional,
    df.tail_marginal, 
    df.head_conditional, 
    df.head_marginal, 
    lam = best_lams[0]
)
df_tuples['score_concat_cluster'] = score(
    df.tail_conditional,
    df.tail_marginal, 
    df.head_conditional, 
    df.head_marginal, 
    lam = best_lams[1]
)

In [23]:
df_tuples['top_100'] = False

In [24]:
for clm in df_tuples.columns:
    if 'score' in clm:
        df_tuples[clm.replace('score', 'rank')] = df_tuples[clm].rank(ascending=False)        

In [25]:
rank_clms = [clm for clm in df_tuples.columns if re.match('rank_LM_[0-9]', clm)]
df_tuples['score_LM_ensemble'] = - df_tuples[rank_clms].mean(axis = 1)

rank_clms = [clm for clm in df_tuples.columns if re.match('rank_concat_[0-9]', clm)]
df_tuples['score_concat_ensemble'] = - df_tuples[rank_clms].mean(axis = 1)

In [26]:
for clm in df_tuples.columns:
    if 'score' in clm:
        df_tuples.loc[df_tuples[clm].rank(ascending=False) <= 100, 'top_100'] = True

In [27]:
for clm in df_tuples.columns:
    if 'score' in clm:
        df_tuples[clm.replace('score', 'rank')] = df_tuples[clm].rank(ascending=False)        

In [28]:
df_tuples[df_tuples.top_100]

Unnamed: 0,rel,head,tail,misc,misc2,temp,score_LM_1,score_concat_1,score_LM_2,score_concat_2,...,rank_LM_9,rank_concat_9,rank_LM_10,rank_concat_10,rank_LM_cluster,rank_concat_cluster,score_LM_ensemble,score_concat_ensemble,rank_LM_ensemble,rank_concat_ensemble
8,HasSubevent,wind power,energy,17,0.335372,-64.409269,3.671586,3.675419,-7.367223,-3.931808,...,1318.0,78.0,1329.0,77.0,224.0,239.0,-1125.8,-156.2,1092.0,108.0
13,HasSubevent,dopamine,norepinephrine transporter,7,0.220842,-192.816688,4.098280,7.844059,-26.461197,-14.576137,...,2993.0,2956.0,2993.0,2958.0,1832.0,1680.0,-2731.1,-2583.7,2805.0,2653.0
26,HasSubevent,beta decay,emission,16,0.102018,-60.663568,4.156948,5.881988,-9.817300,-1.553270,...,2276.0,53.0,2283.0,53.0,67.0,72.0,-1903.6,-67.9,1925.0,38.0
50,HasSubevent,set theory,logic,31,0.044019,-36.489515,2.925434,7.268706,-7.026707,2.379519,...,984.0,4.0,994.0,5.0,11.0,11.0,-906.4,-12.4,874.0,3.0
51,HasSubevent,köppen climate,cwa,7,0.043962,-241.158432,6.957806,-2.736583,-8.504061,-29.375896,...,2511.0,2990.0,2523.0,2990.0,2991.0,2991.0,-1994.0,-2981.2,1997.0,2992.0
58,HasSubevent,airport,air base,11,0.035322,-61.173613,3.476409,10.509255,-7.632346,2.499996,...,1349.0,52.0,1364.0,58.0,5.0,5.0,-1163.5,-31.3,1129.0,12.0
66,HasSubevent,signal,phase shift,7,0.021044,-67.582112,4.893082,8.604543,-5.099000,0.092068,...,915.0,109.0,933.0,114.0,23.0,24.0,-733.0,-71.9,690.0,44.0
84,HasSubevent,mating system,polygyny,7,0.013174,-129.421728,5.310981,7.262086,-16.378161,-8.009849,...,2958.0,2158.0,2961.0,2189.0,507.0,467.0,-2646.1,-1636.7,2721.0,1656.0
94,HasSubevent,reconnaissance,ground attack,8,0.010487,-95.391164,2.633332,8.234715,-9.380363,-3.343596,...,1734.0,712.0,1742.0,742.0,116.0,109.0,-1539.6,-487.6,1538.0,437.0
104,HasSubevent,trade deadline,draft pick,7,0.008709,-115.892033,5.041932,10.743823,-8.149487,-3.405435,...,2019.0,1582.0,2037.0,1634.0,80.0,67.0,-1616.5,-1022.9,1614.0,1009.0


In [29]:
df_tuples[df_tuples.top_100].to_csv(f'../FilesToUploadToColab/wikipedia_top.csv')