In [None]:
# Colab-only helper: gives the notebook access to the user's Google Drive.
from google.colab import drive

# Mount the drive at /content/gdrive; force_remount=True requests a fresh
# mount even if one already exists from an earlier run of this cell.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Switch the working directory to the project folder on the mounted drive.
# The path is relative, so this cell only works from the directory that
# contains `gdrive` (the recorded output shows it resolving under /content).
# The dataset file names and the `pictures` folder used later are relative to it.
%cd gdrive/MyDrive/3_year/parallel_corpora

/content/gdrive/MyDrive/3_year/parallel_corpora


In [None]:
import os, sys
import numpy as np
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
from matplotlib.collections import LineCollection

from sklearn.manifold import MDS
from sklearn.metrics import euclidean_distances

import numpy as np
from scipy import ndimage
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches

from sklearn import manifold, datasets
import pandas as pd
from tqdm import tqdm

In [None]:
import csv
from collections import defaultdict
import os
from itertools import combinations

In [None]:
from pprint import pprint

In [None]:
from collections import Counter

In [None]:
def read_the_data(fname):
    """Build a list of tag lists, one inner list per translation unit.

    The file is read as tab-separated values. Rows are grouped by column 1
    (the translation-unit id); within each unit the rows are ordered by
    column 2 (presumably the language code -- confirm against the data) and
    the column-4 value (the tag) of each row is collected.

    Parameters
    ----------
    fname : str
        Path to a UTF-8 encoded TSV file with at least five columns per row.

    Returns
    -------
    list[list[str]]
        One list of tags per translation unit, in order of the unit's first
        appearance in the file.

    Raises
    ------
    IndexError
        If a non-blank row has fewer than five columns.

    Notes
    -----
    Blank rows are skipped. No header handling is done here: a header row,
    if the file has one, simply becomes the first "unit" of the result.
    """
    transl_units = defaultdict(list)
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires for files passed to csv.reader.
    with open(fname, 'r', encoding='utf-8', newline='') as tsvfile:
        for line in csv.reader(tsvfile, delimiter='\t'):
            if not line:
                # csv.reader yields an empty list for a blank line; indexing
                # it would raise IndexError, so skip it.
                continue
            transl_units[line[1]].append((line[2], line[4]))

    tags = []
    for sents in transl_units.values():
        # sorted() is stable, so rows sharing the same column-2 value keep
        # their file order.
        ordered = sorted(sents, key=lambda sent: sent[0])
        tags.append([tag for _, tag in ordered])
    return tags

def dissimilarity_matrix(data, n_lang=7):
    """Compute a pairwise dissimilarity matrix between tag lists.

    The dissimilarity of two rows is ``1 - matches / n_lang``, where
    ``matches`` is the number of positions at which both rows hold the same
    value.

    Parameters
    ----------
    data : sequence of sequences
        One row of tags per item; position k of every row is compared with
        position k of every other row.
    n_lang : int, default 7
        Denominator of the similarity ratio (the callers in this notebook
        pass the number of languages).

    Returns
    -------
    list[list[float]]
        Square, symmetric matrix of size ``len(data)``; an empty list for
        empty input.

    Notes
    -----
    Rows of unequal length are compared over their common prefix only, so a
    missing position never counts as a match and the result stays symmetric.
    """
    size = len(data)
    matrix = [[0.0] * size for _ in range(size)]
    for i in range(size):
        # Equality is symmetric, so only the upper triangle (including the
        # diagonal) is computed and each value is mirrored.
        for j in range(i, size):
            matches = sum(1 for left, right in zip(data[i], data[j]) if left == right)
            distance = 1 - matches / n_lang
            matrix[i][j] = distance
            matrix[j][i] = distance
    return matrix

def mds_stuff(matrix, ndim):
    """Embed a precomputed dissimilarity matrix with MDS and report its stress.

    Parameters
    ----------
    matrix : array-like
        Square matrix of pairwise dissimilarities; converted with np.array.
    ndim : int
        Number of embedding dimensions, passed to MDS as n_components.

    Returns
    -------
    tuple
        ``(pos, (stress, stress1))`` where ``pos`` is the output of
        ``fit_transform``, ``stress`` is half the sum of squared differences
        between the embedded Euclidean distances and the input
        dissimilarities, and ``stress1`` is
        ``sqrt(stress / (0.5 * sum(matrix ** 2)))``.

    The normalised stress is also printed together with a rule-of-thumb
    quality scale.
    """
    dissimilarities = np.array(matrix)
    embedder = MDS(n_components=ndim, dissimilarity='precomputed', random_state=42)
    pos = embedder.fit_transform(dissimilarities)

    # Raw stress: how far the distances in the embedding are from the
    # dissimilarities that were asked for.
    residuals = euclidean_distances(pos) - dissimilarities
    stress = 0.5 * np.sum(residuals ** 2)

    # Kruskal's stress (stress formula 1): raw stress normalised by the
    # magnitude of the input dissimilarities.
    stress1 = np.sqrt(stress / (0.5 * np.sum(dissimilarities ** 2)))

    print(
        "Kruskal's Stress :",
        "[Poor > 0.2 > Fair > 0.1 > Good > 0.05 > Excellent > 0.025 > Perfect > 0.0]",
        stress1,
        sep='\n',
    )
    return pos, (stress, stress1)

def make_plot(pos, lang_ind, langs, dim_x, dim_y, tagset):
    """Plot one 2-D projection of the embedding, labelled with one language's tags.

    Parameters
    ----------
    pos : 2-D array
        Embedding coordinates, one row per translation unit.
    lang_ind : int
        Index of the language, used both into `langs` and into each unit's
        tag list.
    langs : list of str
        Language names; ``langs[lang_ind]`` goes into the title and file name.
    dim_x, dim_y : int
        Columns of `pos` drawn on the x and y axes.
    tagset : list
        Distinct tags of this language; a tag's position in the list selects
        its colour from the 'gist_rainbow' colormap unless the tag has a
        fixed colour in `special_tags_colors`.

    Notes
    -----
    Reads the notebook globals `tags`, `special_tags_colors`, `selected_dim`
    and `selected_dataset`. Units tagged 'missing' are not drawn. The figure
    is saved to ``pictures/<selected_dim>/`` and then closed.
    """
    fig, ax = plt.subplots(dpi=700, figsize=(9, 8))

    # plt.get_cmap works on old and new Matplotlib; plt.cm.get_cmap was
    # removed in Matplotlib 3.9.
    tag_cmap = plt.get_cmap('gist_rainbow', len(tagset))

    # `pos` is computed from a trailing slice of `tags` (the notebook embeds
    # tags[1:]), so row i of `pos` belongs to the i-th unit of the *last*
    # len(pos) units. Indexing tags[i] directly would label every point with
    # the previous unit's tag.
    offset = max(len(tags) - len(pos), 0)
    for point, unit_tags in zip(pos, tags[offset:]):
        tag = unit_tags[lang_ind]
        if tag == 'missing':
            continue
        if tag in special_tags_colors:
            color = special_tags_colors[tag]
        else:
            color = tag_cmap(tagset.index(tag))
        x, y = point[dim_x], point[dim_y]
        ax.scatter(x, y, color=color)
        ax.text(x, y, tag, fontsize=6)

    ax.set_title(langs[lang_ind])
    ax.set_xlabel('DIMENSION ' + str(dim_x))
    ax.set_ylabel('DIMENSION ' + str(dim_y))

    pic_dir = os.path.join('pictures', str(selected_dim))
    os.makedirs(pic_dir, exist_ok=True)
    pic_name = selected_dataset + '_' + langs[lang_ind] + '_' + str(dim_x) + '_' + str(dim_y) + '.png'
    fig.savefig(os.path.join(pic_dir, pic_name))
    # Many high-dpi figures are produced per run; release each one once it
    # has been written to disk.
    plt.close(fig)

In [28]:
# CONSTS

# Dataset id -> TSV file name (relative to the working directory).
# NOTE(review): this rebinds the name `datasets`, which the import cell also
# binds to `sklearn.datasets`; after this cell the sklearn module is no longer
# reachable under that name.
datasets = {
    '1': 'csv_version_5columns.tsv', 
    '2': 'selected_lang_only_5columns.tsv',
    '3': 'csv_version_5columns_2.tsv'
}
# Fixed colours for specific tags; make_plot falls back to a colormap for any
# tag not listed here. The 'missing' entry is never used for drawing because
# make_plot skips units tagged 'missing'. The last three entries share one
# colour.
# NOTE(review): 'pour  + v' contains two spaces -- verify that this matches
# the tag spelling in the data.
special_tags_colors = {
    'imp': '#004b23',
    'missing': '#e9ebf8',
    'lexical': '#e9ebf8',
    'inter': '#007200',  
    'jus': '#38b000',
    'cond': '#70e000',
    'fut': '#22577a', 
    'other': '#e0e1dd', 
    'щоб + v': '#1982c4', 
    'чтобы + v': '#1982c4', 
    'pour  + v': '#1982c4'
}

In [29]:
# Experimental setup

# Key into `datasets` selecting the TSV file to analyse; also used as the
# prefix of the saved picture file names.
selected_dataset = '3'
# Embedding dimensionalities for which the MDS stress is computed.
n_dim = [2, 3, 5, 7, 10, 15, 20]

# langs = sorted(['ru', 'en', 'de', 'it', 'es', 'fr', 'uk'])
# Sorted alphabetically; a language's index here is also used as the index
# into each unit's tag list (presumably sorted by the same language codes in
# read_the_data -- confirm against the data).
langs = sorted(['ru', 'en', 'fr', 'uk'])

In [30]:
# Resolve the selected dataset id to its file name and load one tag list per
# translation unit.
filename = datasets[selected_dataset]
tags = read_the_data(filename)

In [31]:
# The dissimilarity matrix does not depend on the embedding dimensionality,
# so build it once instead of once per loop iteration.
# tags[1:] drops the first unit (presumably the TSV header row -- confirm).
matrix = dissimilarity_matrix(tags[1:], n_lang=len(langs))

# dim -> (raw stress, Kruskal's stress-1) for every dimensionality in n_dim.
stresses_dict = {}
for dim in n_dim:
    pos, stresses = mds_stuff(matrix, dim)
    stresses_dict[dim] = stresses

!!!!
Kruskal's Stress :
[Poor > 0.2 > Fair > 0.1 > Good > 0.05 > Excellent > 0.025 > Perfect > 0.0]
0.34178401122822333
!!!!
Kruskal's Stress :
[Poor > 0.2 > Fair > 0.1 > Good > 0.05 > Excellent > 0.025 > Perfect > 0.0]
0.24084991794586788
!!!!
Kruskal's Stress :
[Poor > 0.2 > Fair > 0.1 > Good > 0.05 > Excellent > 0.025 > Perfect > 0.0]
0.15181399904631412
!!!!
Kruskal's Stress :
[Poor > 0.2 > Fair > 0.1 > Good > 0.05 > Excellent > 0.025 > Perfect > 0.0]
0.11177265969039721
!!!!
Kruskal's Stress :
[Poor > 0.2 > Fair > 0.1 > Good > 0.05 > Excellent > 0.025 > Perfect > 0.0]
0.08130052850542216
!!!!
Kruskal's Stress :
[Poor > 0.2 > Fair > 0.1 > Good > 0.05 > Excellent > 0.025 > Perfect > 0.0]
0.06107381774150653
!!!!
Kruskal's Stress :
[Poor > 0.2 > Fair > 0.1 > Good > 0.05 > Excellent > 0.025 > Perfect > 0.0]
0.05488299183816553


In [32]:
# Close every matplotlib figure that is still open before the plotting cell
# below creates new ones.
plt.close('all')

In [33]:
# Dimensionality of the MDS embedding used for the plots; make_plot also uses
# it as the name of the output sub-folder under `pictures`.
selected_dim = 20

In [34]:
# Embed the units once at the selected dimensionality.
# tags[1:] drops the first unit (presumably the TSV header row -- confirm).
matrix = dissimilarity_matrix(tags[1:], n_lang=len(langs))
pos, stresses = mds_stuff(matrix, selected_dim)

# One tagset per language, built once. Sorting makes the tag -> colour
# assignment identical on every run; a bare list(set(...)) depends on string
# hashing and changes between sessions.
lang_tagsets = [sorted({unit[lang_ind] for unit in tags}) for lang_ind in range(len(langs))]

# Always plot dimensions (0, 1), then a few randomly chosen pairs of adjacent
# dimensions. A seeded generator keeps the choice reproducible; drawing from a
# permutation of 2..selected_dim-1 avoids repeating a pair or re-drawing
# (0, 1), and simply yields no extra pairs when selected_dim is 2.
N_RANDOM_PROJECTIONS = 3
rng = np.random.default_rng(42)
extra_dims = rng.permutation(np.arange(2, selected_dim))[:N_RANDOM_PROJECTIONS]
projections = [(0, 1)] + [(int(dim) - 1, int(dim)) for dim in extra_dims]

for dim_x, dim_y in projections:
    for lang_ind, lang_tagset in enumerate(lang_tagsets):
        make_plot(pos, lang_ind, langs, dim_x, dim_y, lang_tagset)

!!!!
Kruskal's Stress :
[Poor > 0.2 > Fair > 0.1 > Good > 0.05 > Excellent > 0.025 > Perfect > 0.0]
0.05488299183816553
