In [None]:
import pandas as pd
import numpy as np
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Notebook-wide plot styling: scale all fonts by 1.5 and turn the axes grid off.
sns.set(font_scale=1.5)
sns.set_style({"axes.grid": False})

In [None]:
# Index the files in "love_permut". The first two characters of a file name
# are used as its group tag; the ID is the file name with "shuffle_" and the
# trailing ".tsv" removed.
# Sorted because os.listdir returns names in arbitrary order, which would make
# any seeded sampling from these lists irreproducible.
files = sorted(os.listdir("love_permut"))

result = pd.DataFrame()
result["fname"] = files
result["type"] = [name[:2] for name in files]
# Raw string: the same pattern as before without invalid escape sequences.
result["ID"] = result["fname"].apply(lambda name: re.sub(r"shuffle\_|\.+tsv", "", name))

# File-name lists per group tag; the "A0" files are the ones called a1 below.
a1 = result.loc[result["type"] == "A0", "fname"].tolist()
b1 = result.loc[result["type"] == "B1", "fname"].tolist()
b2 = result.loc[result["type"] == "B2", "fname"].tolist()
b3 = result.loc[result["type"] == "B3", "fname"].tolist()

In [None]:
# Load the four tab-separated matrices; the first column becomes the index.
dist1, dist2, dist3, dist4 = (
    pd.read_csv(path, sep="\t", index_col=0)
    for path in ("permut_1.tsv", "permut_2.tsv", "permut_3.tsv", "permut_4.tsv")
)

In [None]:
def get_distances(dist_matrix, group1, group2, size, n_docs=10, rng=None):
    """Estimate the distance between two groups by repeated resampling.

    On every draw, ``n_docs`` labels are sampled from each group (using the
    sampler's defaults, i.e. with replacement), the corresponding sub-matrix
    ``dist_matrix.loc[sample1, sample2]`` is selected, and the mean of its
    column means is recorded.

    Parameters
    ----------
    dist_matrix : pandas.DataFrame
        Matrix whose index contains the labels of ``group1`` and whose
        columns contain the labels of ``group2``.
    group1, group2 : sequence
        Row labels and column labels to sample from. Must be non-empty.
    size : int
        Number of draws, i.e. length of the returned list.
    n_docs : int, optional
        Number of labels sampled from each group per draw (default 10).
    rng : numpy.random.Generator, optional
        Random generator to sample with. When omitted, the global
        ``np.random`` state is used, so ``np.random.seed`` still applies.

    Returns
    -------
    list
        ``size`` mean distances, one per draw.
    """
    choose = np.random.choice if rng is None else rng.choice

    distances = []
    while len(distances) < size:
        sample1 = choose(group1, size=n_docs)
        sample2 = choose(group2, size=n_docs)
        # Mean of the column means of the sampled sub-matrix.
        distances.append(dist_matrix.loc[sample1, sample2].mean().mean())

    return distances

In [None]:
# Resample group distances for every matrix and collect them in one
# long-format frame with the columns Distance, Model and Group.
np.random.seed(0)  # fixed seed so the sampled distances can be reproduced

size = 500
case = ["base", "+topics", "+relevance", "+topics\n+relevance"]

# Every comparison pairs the a1 files with one group. Label and file list are
# kept together so a label can never end up on another group's distances
# (previously the B1 and B2 distances were concatenated in swapped order).
comparisons = [("A1", a1), ("B1", b1), ("B2", b2), ("B3", b3)]

frames = []
for model, mat in zip(case, [dist1, dist2, dist3, dist4]):
    for group, members in comparisons:
        frames.append(pd.DataFrame({
            "Distance": get_distances(mat, a1, members, size),
            "Model": model,
            "Group": group,
        }))

# Concatenate once instead of growing the frame inside the loop.
full_frame = pd.concat(frames, ignore_index=True)
    

In [None]:
# One box-plot panel per model, saved as plots/eval3.png.
fig, ax = plt.subplots(1, 4, figsize=(16, 9))

models = ["base", "+topics", "+relevance", "+topics\n+relevance"]
group_order = ["A1", "B2", "B1", "B3"]

for position, (panel, model) in enumerate(zip(ax, models)):
    pdata = full_frame[full_frame["Model"] == model]
    sns.boxplot(data=pdata, x="Group", y="Distance", ax=panel, fliersize=0, order=group_order)
    # The caption is taken from the model that is actually plotted, so panel
    # and data cannot disagree (the "+topics" and "+relevance" captions were
    # swapped before).
    panel.set_xlabel(model)
    if position > 0:
        # Only the left-most panel keeps the "Distance" axis label.
        panel.set_ylabel("")
    # Hide the y tick labels on every panel.
    panel.set_yticklabels([])

plt.tight_layout()
os.makedirs("plots", exist_ok=True)  # savefig does not create directories
plt.savefig("plots/eval3.png", dpi=300)

Task 2

In [None]:
def type_attr(x):
    """Return "fake" for a name that begins with "s", otherwise "real"."""
    return "fake" if x.startswith("s") else "real"
    
# Index the files in "slices": real/fake flag from the file name, genre looked
# up in the metadata table by ID (file name without "shuffle_" and ".tsv").
# Sorted because os.listdir returns names in arbitrary order.
files = sorted(os.listdir("slices"))
# NOTE(review): hard-coded absolute path; consider moving it to a
# configurable location.
meta = pd.read_csv("/mnt2/data/collections/corpora/Heftromane/meta.tsv", sep="\t", index_col=0)

# Raw string: the same pattern as before without invalid escape sequences.
# The IDs are computed once and reused for the genre lookup and the ID column.
slice_ids = [re.sub(r"shuffle\_|\.+tsv", "", name) for name in files]

result = pd.DataFrame()
result["fname"] = files
result["type"] = [type_attr(name) for name in files]
result["genre"] = list(meta.loc[slice_ids, "genre"])
result["ID"] = slice_ids

# File-name lists per (type, genre) combination.
lr = result.loc[(result.type == "real") & (result.genre == "liebe"), "fname"].tolist()
lf = result.loc[(result.type == "fake") & (result.genre == "liebe"), "fname"].tolist()
hr = result.loc[(result.type == "real") & (result.genre == "horror"), "fname"].tolist()
hf = result.loc[(result.type == "fake") & (result.genre == "horror"), "fname"].tolist()

groups = [lr, lf, hr, hf]
group_labels = ["love_real", "love_fake", "horror_real", "horror_fake"]

In [None]:
# Load the four tab-separated matrices; the first column becomes the index.
dist1, dist2, dist3, dist4 = (
    pd.read_csv(path, sep="\t", index_col=0)
    for path in ("shuffle_1.tsv", "shuffle_2.tsv", "shuffle_3.tsv", "shuffle_4.tsv")
)

In [None]:
# Read every tab-separated file in "shatter_matrix" into its own frame.
# Sorted so the frames are always read in the same order.
a = [
    pd.read_csv(os.path.join("shatter_matrix", fname), sep="\t", index_col=0)
    for fname in sorted(os.listdir("shatter_matrix"))
]
    

In [None]:
# Stack the frames in `a` row-wise; this replaces the dist3 that was read
# from "shuffle_3.tsv".
dist3 = pd.concat(a, axis=0)

In [None]:
# Resample real-vs-real and real-vs-shuffled distances per genre for every
# matrix and collect them in one long-format frame (Distance, Model, Group).
np.random.seed(0)  # fixed seed so the sampled distances can be reproduced

size = 500
case = ["base", "+topics", "+relevance", "+topics\n+relevance"]

# (group label, row files, column files) for each comparison.
comparisons = [
    ("Romance", lr, lr),
    ("Romance\nshuffle", lr, lf),
    ("Horror", hr, hr),
    ("Horror\nshuffle", hr, hf),
]

frames = []
# NOTE(review): the matrices are paired with `case` in the order
# dist2, dist1, dist4, dist3 (unchanged) - confirm this mapping is intended.
for model, mat in zip(case, [dist2, dist1, dist4, dist3]):
    for group, rows, cols in comparisons:
        frames.append(pd.DataFrame({
            "Distance": get_distances(mat, rows, cols, size),
            "Model": model,
            "Group": group,
        }))

# Concatenate once instead of growing the frame inside the loop.
full_frame = pd.concat(frames, ignore_index=True)

In [None]:
# One box-plot panel per model, saved as plots/eval2.png.
fig, ax = plt.subplots(1, 4, figsize=(16, 9))

xlabels = ["Romance", "Romance\nshuffle", "Horror", "Horror\nshuffle"]
models = ["base", "+topics", "+relevance", "+topics\n+relevance"]

for position, (panel, model) in enumerate(zip(ax, models)):
    pdata = full_frame[full_frame["Model"] == model]
    # `order` pins the category order, so the tick labels come from the data
    # instead of being overwritten positionally.
    sns.boxplot(data=pdata, x="Group", y="Distance", ax=panel, fliersize=0, order=xlabels)
    # The caption is taken from the model that is actually plotted (the
    # "+topics" and "+relevance" captions were swapped before).
    panel.set_xlabel(model)
    if position > 0:
        # Only the left-most panel keeps the "Distance" axis label.
        panel.set_ylabel("")
    # Hide the y tick labels on every panel.
    panel.set_yticklabels([])
    panel.tick_params(axis="x", labelrotation=45)

plt.tight_layout()
os.makedirs("plots", exist_ok=True)  # savefig does not create directories
plt.savefig("plots/eval2.png", dpi=300)

In [None]:
# Two-panel comparison: "base" next to "+topics\n+relevance" (captioned
# "Topic Filter"). Only two axes are created; the figure width is halved so
# each panel keeps the proportions it has in the four-panel figures.
fig, ax = plt.subplots(1, 2, figsize=(8, 9))

xlabels = ["Romance", "Romance\nshuffle", "Horror", "Horror\nshuffle"]
panels = [("base", "base"), ("+topics\n+relevance", "Topic Filter")]

for position, (panel, (model, caption)) in enumerate(zip(ax, panels)):
    pdata = full_frame[full_frame["Model"] == model]
    # `order` pins the category order, so the tick labels come from the data
    # instead of being overwritten positionally.
    sns.boxplot(data=pdata, x="Group", y="Distance", ax=panel, fliersize=0, order=xlabels)
    panel.set_xlabel(caption)
    if position > 0:
        # Only the left-most panel keeps the "Distance" axis label.
        panel.set_ylabel("")
    # Hide the y tick labels on every panel.
    panel.set_yticklabels([])
    panel.tick_params(axis="x", labelrotation=45)

plt.tight_layout()
plt.show()

In [None]:
def _with_tsv_labels(matrix):
    """Relabel `matrix` in place: drop every "." from each column name, append
    ".tsv", and reuse the new column labels as the index."""
    # Raw string: the same pattern as before without an invalid escape sequence.
    matrix.columns = [re.sub(r"\.", "", label) + ".tsv" for label in matrix.columns]
    # NOTE(review): assumes the rows are in the same order as the columns - confirm.
    matrix.index = matrix.columns
    return matrix


# Load the four matrices that are compared below.
dist1 = pd.read_csv("dtgw_degrees_output.tsv", sep="\t", index_col=0)
dist2 = _with_tsv_labels(pd.read_csv("tfidf_matrix.tsv", sep="\t", index_col=0))
dist3 = pd.read_csv("shuffle_4.tsv", sep="\t", index_col=0)
dist4 = _with_tsv_labels(pd.read_csv("global.tsv", sep="\t", index_col=0))

In [None]:
# Preview the relabelled matrix; only the first rows are shown rather than
# the whole frame.
dist4.head()

In [None]:
# Keep the files in "slices" whose name does not start with "s" and that
# occur among the columns of dist1, then look up each file's genre by ID.
# Sorted because os.listdir returns names in arbitrary order.
files = sorted(
    name for name in os.listdir("slices")
    if not name.startswith("s") and name in dist1.columns
)

# Raw string: the same pattern as before without invalid escape sequences.
# The IDs are computed once and reused for the genre lookup and the ID column.
slice_ids = [re.sub(r"shuffle\_|\.+tsv", "", name) for name in files]

result = pd.DataFrame()
result["fname"] = files
result["genre"] = list(meta.loc[slice_ids, "genre"])
result["ID"] = slice_ids

# h: files with genre "horror"; m: files with genre "liebe".
h = result.loc[result.genre == "horror", "fname"].tolist()
m = result.loc[result.genre == "liebe", "fname"].tolist()


In [None]:
# Resample within-genre and cross-genre distances for every matrix and
# collect them in one long-format frame (Distance, Model, Group).
np.random.seed(0)  # fixed seed so the sampled distances can be reproduced

size = 500
case = ["tfidf", "global", "dtw", "dtgw"]

# (group label, row files, column files). m holds the "liebe" files and h the
# "horror" files, so "Romance" is built from m and "Horror" from h (these two
# labels were attached to each other's distances before).
comparisons = [("Romance", m, m), ("Horror", h, h), ("Both", m, h)]

frames = []
for model, mat in zip(case, [dist2, dist4, dist3, dist1]):
    print(model)
    for group, rows, cols in comparisons:
        frames.append(pd.DataFrame({
            "Distance": get_distances(mat, rows, cols, size),
            "Model": model,
            "Group": group,
        }))

# Concatenate once instead of growing the frame inside the loop.
full_frame = pd.concat(frames, ignore_index=True)

In [None]:
# One box-plot panel per method, saved as plots/eval1.png.
fig, ax = plt.subplots(1, 4, figsize=(16, 9))

xlabels = ["Romance", "Horror", "Both"]
# (value in the "Model" column, caption shown under the panel)
panels = [
    ("tfidf", "tf-idf"),
    ("global", "Global\nCharacteristics"),
    ("dtw", "Time Series"),
    ("dtgw", "Temporal Graph"),
]

for position, (panel, (model, caption)) in enumerate(zip(ax, panels)):
    pdata = full_frame[full_frame["Model"] == model]
    # `order` pins the category order, so the tick labels come from the data
    # instead of being overwritten positionally.
    sns.boxplot(data=pdata, x="Group", y="Distance", ax=panel, fliersize=0, order=xlabels)
    panel.set_xlabel(caption)
    if position > 0:
        # Only the left-most panel keeps the "Distance" axis label.
        panel.set_ylabel("")
    # Hide the y tick labels on every panel.
    panel.set_yticklabels([])
    panel.tick_params(axis="x", labelrotation=45)

os.makedirs("plots", exist_ok=True)  # savefig does not create directories
plt.savefig("plots/eval1.png", dpi=300, bbox_inches="tight")