In [3]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from dtaidistance import dtw
from dtaidistance import dtw_ndim
from dtaidistance import dtw_visualisation as dtwvis
from tqdm.notebook import tqdm
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from umap import UMAP
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import euclidean
from sklearn.metrics import pairwise_distances

numpy.ndarray size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject


In [1]:
# silhouette scores:
1
Horror:0.01088312162833432
Liebe:0.008217209253881742
2 
Horror:0.009584810473050489
Liebe:0.003108224844360406
3 
Horror:0.009350265181363011
Liebe:0.006739503172550692
4 
Horror:0.00794170947674884
Liebe:0.002448520975266826

Essential Functions

In [63]:
def filter_matching(data1, data2, filter_krit, top_percent=20):
    """Return the row labels of both frames that take part in the closest row pairs.

    All pairwise distances between the rows of ``data2`` and ``data1`` are
    computed on the ``filter_krit`` columns (``pairwise_distances`` with its
    default metric). The pairs are ordered by ascending distance and, for each
    frame, the labels occurring in the first
    ``floor(len(frame) / 100 * top_percent) + 1`` pairs are collected.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        Frames that contain the ``filter_krit`` columns. Their row labels are
        cast to ``int``.
    filter_krit : list
        Column labels the distances are computed on.
    top_percent : float, default 20
        Share (in percent of each frame's length) of closest pairs to inspect.

    Returns
    -------
    tuple of (list, list)
        Sorted, de-duplicated row labels of ``data1`` and of ``data2``. A row can
        occur in several of the closest pairs, so each list may be shorter than
        the number of inspected pairs.
    """
    filter_dist = pairwise_distances(data2.loc[:, filter_krit], data1.loc[:, filter_krit])

    # Long format in row-major order (data2 outer, data1 inner). This replaces the
    # nested iterrows()/iteritems() loops; Series.iteritems no longer exists in
    # pandas >= 2.0.
    vframe = pd.DataFrame({
        "d2": np.repeat(np.asarray(data2.index), len(data1)),
        "d1": np.tile(np.asarray(data1.index), len(data2)),
        "distance": np.asarray(filter_dist).ravel(),
    }).sort_values("distance")

    def _closest_labels(column, n_rows):
        # The original loop appended while ``len(collected) <= n_rows/100*20``,
        # i.e. it kept exactly floor(cap) + 1 pairs. Its early exit at 30 % could
        # not trigger before that cap was reached, so slicing is equivalent.
        cap = n_rows / 100 * top_percent
        limit = max(0, int(np.floor(cap)) + 1)
        return sorted({int(label) for label in vframe[column].iloc[:limit]})

    return _closest_labels("d1", len(data1)), _closest_labels("d2", len(data2))

def type_attr(x):
    """Classify a file name: names starting with "s" are "fake", all others "real"."""
    return "fake" if x.startswith("s") else "real"
    
def measure_distance_sep(d1, d2):
    """Average of two DTW distances: structural columns and text/speech columns.

    Columns whose name starts with "text" or "speech" are compared as they are.
    All other columns are standardised per frame (``StandardScaler``) before the
    comparison. The column split is derived from ``d1.columns`` and applied to
    both frames.

    Unlike the previous version, the input frames are not modified: the scaled
    values are kept in separate arrays instead of being written back into
    ``d1`` / ``d2``.

    Parameters
    ----------
    d1, d2 : pandas.DataFrame
        Frames sharing the same column labels.

    Returns
    -------
    float
        ``(dist_struc + dist_sem) / 2``
    """
    prefixes = ("text", "speech")
    toscale = [col for col in d1.columns if not col.startswith(prefixes)]
    noscale = [col for col in d1.columns if col.startswith(prefixes)]

    def _as_double(values):
        # The DTW call receives contiguous double arrays.
        return np.ascontiguousarray(values, dtype=np.double)

    struc1 = _as_double(StandardScaler().fit_transform(d1.loc[:, toscale]))
    struc2 = _as_double(StandardScaler().fit_transform(d2.loc[:, toscale]))

    dist_struc = dtw_ndim.distance_fast(struc1, struc2, use_pruning=True)
    dist_sem = dtw_ndim.distance_fast(
        _as_double(d1.loc[:, noscale]), _as_double(d2.loc[:, noscale]), use_pruning=True
    )

    return (dist_struc + dist_sem) / 2
    
def measure_distance_alt(d1, d2):
    """DTW distance over all columns, standardising only the non-text/speech ones.

    Columns whose name does not start with "text" or "speech" are standardised
    per frame (``StandardScaler``); the remaining columns are used unchanged.
    The column selection is derived from ``d1.columns`` and applied to both
    frames.

    Unlike the previous version, the input frames are not modified: scaling is
    done on float copies.

    Parameters
    ----------
    d1, d2 : pandas.DataFrame
        Frames sharing the same column labels.

    Returns
    -------
    The value returned by ``dtw_ndim.distance_fast`` for the two prepared arrays.
    """
    toscale = [col for col in d1.columns if not col.startswith(("text", "speech"))]

    def _prepare(frame):
        # astype() hands back a new float frame, so the caller's data stays
        # untouched and writing scaled floats cannot clash with integer columns.
        prepared = frame.astype(np.double)
        prepared.loc[:, toscale] = StandardScaler().fit_transform(frame.loc[:, toscale])
        return np.ascontiguousarray(prepared, dtype=np.double)

    return dtw_ndim.distance_fast(_prepare(d1), _prepare(d2), use_pruning=True)

def measure_distance(d1, d2):
    """DTW distance between two frames after standardising each one separately.

    Every column of ``d1`` and of ``d2`` is standardised with its own
    ``StandardScaler``; the scaled arrays are then passed to
    ``dtw_ndim.distance_fast`` with pruning enabled. The inputs are not modified.
    """
    scaled_first, scaled_second = (
        np.array(StandardScaler().fit_transform(frame), dtype=np.double)
        for frame in (d1, d2)
    )
    return dtw_ndim.distance_fast(scaled_first, scaled_second, use_pruning=True)

def get_heatmap(dist_matrix, groups, group_labels, samplesize=500, draw_size=20, random_state=None):
    """Plot heatmaps of the mean sampled distance between groups.

    For every ordered pair of groups, ``samplesize`` random draws are made. Each
    draw picks ``draw_size`` members (with replacement) from both groups, drops
    from the first pick every member that also occurs in the second pick, and
    records the mean of the corresponding ``dist_matrix`` entries. The draw means
    are averaged per group pair.

    Two heatmaps are shown: one for the first two labels and one for the
    remaining labels.

    Parameters
    ----------
    dist_matrix : pandas.DataFrame
        Square distance matrix indexed (rows and columns) by the group members.
    groups : list of list
        Member labels of each group.
    group_labels : list of str
        Display label for each group, in the same order as ``groups``.
    samplesize : int, default 500
        Number of random draws per group pair.
    draw_size : int, default 20
        Number of members drawn from each group per draw.
    random_state : int, optional
        Seed for a private random generator. When omitted, numpy's global random
        state is used, exactly as before.

    Notes
    -----
    If the first pick of a draw ends up empty after removing shared members, that
    draw's mean is NaN and the whole group-pair average becomes NaN. This
    behaviour is kept unchanged.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    pair_means = []
    for group1 in groups:
        for group2 in groups:
            draw_means = []
            for _ in range(samplesize):
                picked1 = rng.choice(group1, size=draw_size)
                picked2 = rng.choice(group2, size=draw_size)
                in_second = set(picked2)
                picked1 = [member for member in picked1 if member not in in_second]

                draw_means.append(dist_matrix.loc[picked1, picked2].mean().mean())
            pair_means.append(np.mean(draw_means))

    super_dist = pd.DataFrame(
        np.array(pair_means).reshape(len(groups), len(groups)),
        index=group_labels,
        columns=group_labels,
    )

    # One heatmap for the first two labels, one for the remaining labels.
    for labels in (group_labels[:2], group_labels[2:]):
        sns.heatmap(super_dist.loc[labels, labels], cmap="Reds")
        plt.show()
    
def scatter_svd(dist_matrix, result):
    """Show a TruncatedSVD scatter of all files and a UMAP scatter per genre.

    The overview projects ``dist_matrix`` with ``TruncatedSVD`` and stores the two
    components in ``result["v1"]`` / ``result["v2"]`` (``result`` is modified in
    place; its rows must be in the same order as the rows of ``dist_matrix``).
    For the genres "horror" and "liebe" a separate UMAP embedding of the
    corresponding sub-matrix is plotted.

    Parameters
    ----------
    dist_matrix : pandas.DataFrame
        Square distance matrix labelled by file name on both axes.
    result : pandas.DataFrame
        Needs the columns ``fname``, ``type``, ``genre`` and ``ID``.
    """
    vecs = TruncatedSVD().fit_transform(dist_matrix)
    result["v1"] = vecs[:, 0]
    result["v2"] = vecs[:, 1]

    fig = px.scatter(result, x="v1", y="v2", symbol="type", color="genre", hover_data=["ID"])
    fig.show()

    for genre in ("horror", "liebe"):
        # .copy(): the coordinates are written to an independent frame rather
        # than to a filtered slice of ``result`` (avoids SettingWithCopyWarning).
        focus = result[result.genre == genre].copy()
        ids = list(focus["fname"])
        dimred = UMAP(n_neighbors=15, min_dist=0.9, metric="euclidean")
        vecs = dimred.fit_transform(dist_matrix.loc[ids, ids])
        focus["v1"] = vecs[:, 0]
        focus["v2"] = vecs[:, 1]

        fig = px.scatter(focus, x="v1", y="v2", color="type", hover_data=["fname"])
        fig.show()

def scatter_umap(dist_matrix, result):
    """Show a UMAP scatter of all files and a separate UMAP scatter per genre.

    The overview embeds ``dist_matrix`` with UMAP and stores the two coordinates
    in ``result["v1"]`` / ``result["v2"]`` (``result`` is modified in place; its
    rows must be in the same order as the rows of ``dist_matrix``). For the
    genres "horror" and "liebe" a separate embedding of the corresponding
    sub-matrix is plotted.

    Parameters
    ----------
    dist_matrix : pandas.DataFrame
        Square distance matrix labelled by file name on both axes.
    result : pandas.DataFrame
        Needs the columns ``fname``, ``type``, ``genre`` and ``ID``.
    """
    def _embed(matrix):
        # A fresh reducer per embedding, configured as in every original call.
        return UMAP(n_neighbors=15, min_dist=0.9, metric="euclidean").fit_transform(matrix)

    vecs = _embed(dist_matrix)
    result["v1"] = vecs[:, 0]
    result["v2"] = vecs[:, 1]

    fig = px.scatter(result, x="v1", y="v2", symbol="type", color="genre", hover_data=["ID"])
    fig.show()

    for genre in ("horror", "liebe"):
        # .copy(): the coordinates are written to an independent frame rather
        # than to a filtered slice of ``result`` (avoids SettingWithCopyWarning).
        focus = result[result.genre == genre].copy()
        ids = list(focus["fname"])
        vecs = _embed(dist_matrix.loc[ids, ids])
        focus["v1"] = vecs[:, 0]
        focus["v2"] = vecs[:, 1]

        fig = px.scatter(focus, x="v1", y="v2", color="type", hover_data=["fname"])
        fig.show()
    
def sil(dist_matrix, result):
    """Print one silhouette score per genre ("horror", then "liebe").

    For each genre the sub-matrix of ``dist_matrix`` belonging to that genre's
    files is scored as a precomputed distance matrix, with the ``type`` column of
    ``result`` serving as cluster labels.
    """
    for heading, genre in (("Horror:", "horror"), ("Liebe:", "liebe")):
        subset = result[result.genre == genre]
        names = list(subset["fname"])
        score = silhouette_score(
            dist_matrix.loc[names, names], metric="precomputed", labels=list(subset["type"])
        )
        print(heading + str(score))
    
    

0. Data loading

In [4]:
# Collect the slice files and attach type, genre and ID to each of them.
# The order returned by os.listdir is kept as is: the plotting helpers align
# `result` with the distance matrix by row position.
files = os.listdir("slices")
meta = pd.read_csv("/mnt2/data/collections/corpora/Heftromane/meta.tsv", sep="\t", index_col=0)

# Raw string: the previous non-raw literal relied on the invalid escape
# sequences "\_" and "\.". The pattern strips the "shuffle_" prefix and the
# ".tsv" / "..tsv" suffix from a file name.
id_pattern = r"shuffle_|\.+tsv"
file_ids = [re.sub(id_pattern, "", fname) for fname in files]

result = pd.DataFrame()
result["fname"] = files
result["type"] = [type_attr(fname) for fname in files]
result["genre"] = list(meta.loc[file_ids, "genre"])
result["ID"] = file_ids

# File names per (genre, type) combination.
lr = list(result[(result.type == "real") & (result.genre == "liebe")]["fname"])
lf = list(result[(result.type == "fake") & (result.genre == "liebe")]["fname"])
hr = list(result[(result.type == "real") & (result.genre == "horror")]["fname"])
hf = list(result[(result.type == "fake") & (result.genre == "horror")]["fname"])

groups = [lr, lf, hr, hf]
group_labels = ["love_real", "love_fake", "horror_real", "horror_fake"]

NameError: name 'type_attr' is not defined

In [7]:
# List the contents of the "slices" directory again and display the names.
files = os.listdir("slices")
files

['shuffle_k00300000852..tsv',
 'shuffle_1025686527.tsv',
 'k00300000740.tsv',
 'shuffle_k00300000755..tsv',
 'k00300000772.tsv',
 'shuffle_k00300000068.tsv',
 'k00300000053.tsv',
 'k00300000028..tsv',
 'k00300000088..tsv',
 'shuffle_k00300000048..tsv',
 'k00300000864.tsv',
 'shuffle_k00300000767.tsv',
 'shuffle_102568611X.tsv',
 'k00300000775.tsv',
 'shuffle_k00300000870.tsv',
 'k00300000845.tsv',
 'shuffle_k00300000760..tsv',
 'shuffle_k00300000753.tsv',
 'k00300000751.tsv',
 'k00300000851..tsv',
 'k00300000729..tsv',
 'k00300001153.tsv',
 'k00300000084..tsv',
 '1025686357..tsv',
 'shuffle_k00300000855..tsv',
 'k00300001172..tsv',
 'shuffle_k00300000058..tsv',
 'shuffle_k00300000744.tsv',
 'shuffle_k00300000095..tsv',
 'shuffle_k00300000867.tsv',
 'k00300000052..tsv',
 'shuffle_k00300000004..tsv',
 'shuffle_k00300000772.tsv',
 'shuffle_k00300001181.tsv',
 'shuffle_k00300000067..tsv',
 'k00300000747..tsv',
 'shuffle_k00300001153.tsv',
 'k00300000741.tsv',
 'shuffle_k00300000089.tsv',
 

In [10]:
# Load a single slice file for inspection; missing values are replaced by the constant 3.
# NOTE(review): the rationale for the fill value 3 is not visible here — confirm.
data = pd.read_csv("slices/shuffle_k00300000852..tsv", sep="\t", index_col=0).fillna(3)
# Turn the index (first TSV column) back into a regular column.
data = data.reset_index()

1. All Scenes no topic model

In [None]:
# Number of entries in `files`; the pairwise loops below run len(files)**2 comparisons.
len(files)

In [4]:
# Experiment 1: pairwise DTW distances over all scenes, structural features only.
distances = []
krit = ['pers_count', 'speech', 'fight', 'erotic', 'males', 'females', 'personal_weight', 'action', 'valence', 'arousal']

# Read every slice file once. Previously each file was re-read inside the nested
# loop, i.e. len(files)**2 reads for identical content.
frames = {
    fname: pd.read_csv("slices/" + fname, sep="\t", index_col=0).fillna(3).loc[:, krit]
    for fname in files
}

for fname1 in tqdm(files):
    for fname2 in files:
        distances.append(measure_distance(frames[fname1], frames[fname2]))

dist_matrix = pd.DataFrame(
    np.array(distances).reshape(len(files), len(files)), index=files, columns=files
)

# Replace infinite distances by (largest finite distance + 1).
# np.inf instead of the alias np.Inf, which no longer exists in NumPy 2.
a = np.array(dist_matrix).flatten()
maximum = a[np.where(np.isinf(a), -np.inf, a).argmax()] + 1
dist_matrix = dist_matrix.replace([np.inf, -np.inf], maximum)
dist_matrix.to_csv("shuffle_1.tsv", sep="\t")

NameError: name 'files' is not defined

In [7]:
# Print the per-genre silhouette scores for the distance matrix currently held in `dist_matrix`.
sil(dist_matrix, result)

Horror:0.01088312162833432
Liebe:0.008217209253881742


2. Relevant Scenes no topic model

In [7]:
# Experiment 2: pairwise DTW distances over the relevant scenes only
# (rows with arousal_relevance < num_scenes), structural features only.
distances = []
krit = ['pers_count', 'speech', 'fight', 'erotic', 'males', 'females', 'personal_weight', 'action', 'valence', 'arousal']
num_scenes = 20

# Read and filter every slice file once. Previously each file was re-read and
# re-filtered inside the nested loop, i.e. len(files)**2 times.
frames = {}
for fname in files:
    frame = pd.read_csv("slices/" + fname, sep="\t", index_col=0).fillna(0)
    frames[fname] = frame.loc[frame.arousal_relevance < num_scenes, krit]

for fname1 in tqdm(files):
    for fname2 in files:
        distances.append(measure_distance(frames[fname1], frames[fname2]))

dist_matrix = pd.DataFrame(
    np.array(distances).reshape(len(files), len(files)), index=files, columns=files
)

# Replace infinite distances by (largest finite distance + 1).
# np.inf instead of the alias np.Inf, which no longer exists in NumPy 2.
a = np.array(dist_matrix).flatten()
maximum = a[np.where(np.isinf(a), -np.inf, a).argmax()] + 1
dist_matrix = dist_matrix.replace([np.inf, -np.inf], maximum)
dist_matrix.to_csv("shuffle_2.tsv", sep="\t")

  0%|          | 0/384 [00:00<?, ?it/s]

In [11]:
# Print the per-genre silhouette scores for the distance matrix currently held in `dist_matrix`.
sil(dist_matrix, result)

Horror:0.009584810473050489
Liebe:0.003108224844360406


3. All Scenes with topic model

In [None]:
# Experiment 3: pairwise DTW distances with topic-model features. For each pair
# of files only the scenes selected by filter_matching are compared.
distances = []
# NOTE(review): 'speech_1' is part of filter_krit but absent from krit — confirm
# whether the omission is intentional.
krit = ['pers_count', 'speech', 'fight', 
        'erotic', 'males', 'females', 'personal_weight', 
        'action', 'valence', 'arousal',"text_1","text_2","text_3","text_4","text_5",
        'speech_2','speech_3', 'speech_4',"speech_5"]

filter_krit = ["text_1","text_2","text_3","text_4","text_5",
               'speech_1', 'speech_2','speech_3', 'speech_4',"speech_5"]

# Read every slice file once. Previously each file was re-read inside the nested
# loop, i.e. len(files)**2 reads for identical content.
frames = {
    fname: pd.read_csv("slices/" + fname, sep="\t", index_col=0).fillna(3)
    for fname in files
}

for fname1 in tqdm(files):
    data1 = frames[fname1]

    for fname2 in files:
        data2 = frames[fname2]

        d1_rel, d2_rel = filter_matching(data1, data2, filter_krit)

        distance = measure_distance(data1.loc[d1_rel, krit], data2.loc[d2_rel, krit])
        distances.append(distance)

dist_matrix = pd.DataFrame(
    np.array(distances).reshape(len(files), len(files)), index=files, columns=files
)

# Replace infinite distances by (largest finite distance + 1).
# np.inf instead of the alias np.Inf, which no longer exists in NumPy 2.
a = np.array(dist_matrix).flatten()
maximum = a[np.where(np.isinf(a), -np.inf, a).argmax()] + 1
dist_matrix = dist_matrix.replace([np.inf, -np.inf], maximum)
dist_matrix.to_csv("shuffle_3_filtered.tsv", sep="\t")

  0%|          | 0/384 [00:00<?, ?it/s]

In [17]:
# Independent copy: d1 is scaled in place further down, which must not write
# into a slice of data1 (avoids SettingWithCopyWarning).
d1 = data1.loc[:, krit].copy()

In [4]:
# Load the first two slice files for interactive inspection; missing values are replaced by 3.
data1 = pd.read_csv("slices/"+files[0], sep="\t", index_col=0).fillna(3)
data2 = pd.read_csv("slices/"+files[1], sep="\t", index_col=0).fillna(3)

In [64]:
# Display the column labels available in a slice file.
data1.columns

Index(['pers_count', 'speech', 'fight', 'erotic', 'males', 'females',
       'personal_weight', 'action', 'valence', 'arousal', 'arousal_relevance',
       'text_1', 'text_2', 'speech_1', 'speech_2', 'first_meeting', 'text_3',
       'text_4', 'text_5', 'speech_3', 'speech_4', 'speech_5'],
      dtype='object')

In [18]:
# Scratch check of the scaling step: standardise every column of d1 whose name
# does not start with "text" or "speech", writing the result back into d1.
toscale = [x for x in d1.columns if not x.startswith("text") and not x.startswith("speech")]
d1.loc[:,toscale] = StandardScaler().fit_transform(d1.loc[:,toscale])
d1

Unnamed: 0,pers_count,speech,fight,erotic,males,females,personal_weight,action,valence,arousal,...,text_2,text_3,text_4,text_5,speech_1,speech_2,speech_2.1,speech_3,speech_4,speech_5
0,0.474123,0.537313,-0.677185,-0.834983,-0.100931,1.425366,0.666041,-1.121601,-1.519222,-0.701404,...,-1.783077,3.239870,0.589199,3.303059,16.001453,0.640682,0.640682,21.748215,7.642618,8.302791
1,-0.023128,0.041860,-0.677185,-0.834983,-0.100931,0.211688,0.164717,-0.304596,0.854158,-0.203462,...,-0.872190,2.173133,0.355640,3.430898,12.093566,0.719631,0.719631,21.501808,9.254362,10.093770
2,-0.520379,0.258427,-0.310909,0.895343,-0.720937,0.211688,-0.143791,0.308158,0.276977,-0.368915,...,2.835088,-0.212994,1.269150,2.150037,14.217569,2.038256,2.038256,20.161423,5.945496,10.127579
3,-0.520379,0.120370,-0.677185,-0.834983,-0.720937,0.211688,-0.143791,-0.713098,-2.248539,-2.468891,...,-0.641760,2.094001,-1.125703,2.216228,12.058609,-1.042899,-1.042899,22.032434,10.011757,8.726249
4,0.474123,0.696429,-0.677185,0.030180,0.519075,0.211688,0.820295,-0.815224,1.052163,0.997997,...,-1.162044,3.050787,1.006104,3.901462,12.884089,5.249876,5.249876,19.554700,11.556478,9.703283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,-0.023128,0.208556,1.154195,0.030180,-0.100931,0.211688,0.190426,-0.304596,0.094948,1.703576,...,-2.447695,4.740943,-0.089112,4.995480,13.320568,-1.068243,-1.068243,22.020758,9.399398,9.088406
82,1.468625,0.268493,2.253023,-0.834983,1.139081,1.425366,1.668690,0.359221,-0.459077,0.784179,...,-2.928221,7.621860,2.168294,4.785818,14.332829,-0.907717,-0.907717,23.624376,9.542114,8.488934
83,-1.017630,0.319298,1.520471,0.895343,-0.720937,-1.001990,-0.837932,0.103907,0.264121,2.538694,...,-3.122350,8.137480,2.375418,4.946149,18.197577,0.306296,0.306296,21.048723,11.074277,7.638268
84,1.965876,0.347222,1.886747,0.030180,1.759087,1.425366,1.295910,0.103907,-1.413992,-0.003028,...,-1.740657,5.175779,-0.663463,4.625773,11.642310,-2.171463,-2.171463,23.378730,8.101710,8.532567


In [15]:
# Print the per-genre silhouette scores for the distance matrix currently held in `dist_matrix`.
sil(dist_matrix, result)

Horror:0.009350265181363011
Liebe:0.006739503172550692


4. Relevant Scenes with topic model

In [9]:
# Experiment 4: pairwise DTW distances over the relevant scenes only
# (rows with arousal_relevance < num_scenes), including topic-model features.
distances = []
krit = ['pers_count', 'speech', 'fight', 
        'erotic', 'males', 'females', 'personal_weight', 
        'action', 'valence', 'arousal',"text_1","text_2","speech_1","speech_2"]

num_scenes = 20

# Read and filter every slice file once. Previously each file was re-read and
# re-filtered inside the nested loop, i.e. len(files)**2 times.
frames = {}
for fname in files:
    frame = pd.read_csv("slices/" + fname, sep="\t", index_col=0).fillna(0)
    frames[fname] = frame.loc[frame.arousal_relevance < num_scenes, krit]

for fname1 in tqdm(files):
    for fname2 in files:
        distances.append(measure_distance(frames[fname1], frames[fname2]))

dist_matrix = pd.DataFrame(
    np.array(distances).reshape(len(files), len(files)), index=files, columns=files
)

# Replace infinite distances by (largest finite distance + 1).
# np.inf instead of the alias np.Inf, which no longer exists in NumPy 2.
a = np.array(dist_matrix).flatten()
maximum = a[np.where(np.isinf(a), -np.inf, a).argmax()] + 1
dist_matrix = dist_matrix.replace([np.inf, -np.inf], maximum)
dist_matrix.to_csv("shuffle_4.tsv", sep="\t")

  0%|          | 0/384 [00:00<?, ?it/s]