# Create Embedding

In [1]:
import warnings
warnings.filterwarnings("ignore")

import __init__
import models.cross_validator as cross_validator
import models.datahandler as datahandler
from models.datamodel import DataModel
import models.outputmaker as outputmaker
import evaluator.metrics as metrics
from models.cmm import CMM

trial = datahandler.load_train('../data/dataset/tsd_trial.csv', verbose=True)
train = datahandler.load_train('../data/dataset/tsd_train.csv', verbose=True)

Unnamed: 0,spans,text
0,"[15, 16, 17, 18, 19, 27, 28, 29, 30, 31]",Because he's a moron and a bigot. It's not any...
1,"[29, 30, 31, 32, 33, 34]",How about we stop protecting idiots and let na...
2,"[166, 167, 168, 169, 170, 171]","If people were smart, they would Boycott th..."


Unnamed: 0,spans,text
0,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...",Another violent and aggressive immigrant killi...
1,"[33, 34, 35, 36, 37, 38, 39]","I am 56 years old, I am not your fucking junio..."
2,"[0, 1, 2, 3]","Damn, a whole family. Sad indeed."


Initialize `Models` Methods

### Train on train data, test on trial data

In [2]:
model = CMM()

datamodel = DataModel(model='cmm', remove_stop_words=True)

output_maker = outputmaker.crf_output

evaluator = metrics

In [3]:
X_train, y_train, train_taboo_words, train_texts = cross_validator.load_data(train, [i for i in range(train.shape[0])],
                                                                             datamodel, logger=True)

X_test, y_test, test_taboo_words, test_texts = cross_validator.load_data(trial, [i for i in range(trial.shape[0])], 
                                                                   datamodel, logger=True)

100%|██████████| 7939/7939 [02:11<00:00, 60.39it/s]
100%|██████████| 690/690 [00:11<00:00, 61.51it/s]


# Before doing some preprocessing, only using stopwordremoval
* CNS Train: `{'normal': 158417, 'toxic': 18451}`
* CNS Test: `{'normal': 13283, 'toxic': 1405}`

# After doing some preprocessing and using stopwordremoval
* CNS Train: `{'normal': 131143, 'toxic': 17168}`
* CNS Test: `{'normal': 11197, 'toxic': 1335}`

In [5]:
train_words = []
for X in X_train:
    for x in X:
        train_words.append(x[0])

test_words = []
for X in X_test:
    for x in X:
        test_words.append(x[0])

In [8]:
print("size of train words:", len(train_words))
print("size of test words:", len(test_words))

print("size of train words after set:", len(set(train_words)))
print("size of test words after set:", len(set(test_words)))

size of train words: 148311
size of test words: 12532
size of train words after set: 22682
size of test words after set: 5088


In [9]:
train_word_set = set(train_words)
test_word_set = set(test_words)

In [11]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

In [39]:
train_word_set_stem = list(set([stemmer.stem(word) for word in train_word_set]))
test_word_set_stem = list(set([stemmer.stem(word) for word in test_word_set]))

print("size of train words after stemming:", len(set(train_word_set_stem)))
print("size of test words after stemming:", len(set(test_word_set_stem)))

size of train words after stemming: 13148
size of test words after stemming: 3570


In [40]:
combined_words = list(set([ word for word in train_word_set_stem] + [word for word in test_word_set_stem]))

print("Combined lenght:", len(combined_words))

print(combined_words[:10])

Combined lenght: 13673
['hypocrisi', 'pace', 'preview', 'seth', 'africa', 'kiddi', 'ador', 'heir', 'moan', 'suppos']


In [53]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
vader = SentimentIntensityAnalyzer()

def sia(word):
    return vader.polarity_scores(word)['compound']

def get_vader(X):
    return [[sia(word)] for word in X]

In [54]:
import numpy as np

sia_arr = np.array(get_vader(combined_words))

In [58]:
sia_arr[:10], sia_arr.shape

(array([[ 0.    ],
        [ 0.    ],
        [ 0.    ],
        [ 0.    ],
        [ 0.    ],
        [ 0.    ],
        [ 0.    ],
        [ 0.    ],
        [-0.1531],
        [ 0.    ]]),
 (13673, 1))

In [55]:
np.std(sia_arr), np.mean(sia_arr)

(0.10140381286297358, -0.005290777444598845)

In [49]:
import numpy as np
from sklearn.utils.graph import graph_shortest_path
from tqdm import tqdm

def distance_mat(X_emo, n_neighbors=6, verbose=False):
    """
    Compute the square distance matrix using Euclidean distance
    :param X: Input data, a numpy array of shape (img_height, img_width)
    :param n_neighbors: Number of nearest neighbors to consider, int
    :return: numpy array of shape (img_height, img_height), numpy array of shape (img_height, n_neighbors)
    """
    def dist1(a, b):
        if a == 0 and b == 0:
            return 0.0
        return np.sqrt(sum((a - b)**2))
    
    def dist2(a, b):
        return np.sqrt(sum((a - b)**2))
    if verbose:
        print("Compute full distance matrix\n------------------\n")
        
    distances = np.array([[dist1(p1, p2) for p2 in X_emo] for p1 in tqdm(X_emo)])
    #distances = np.array([[dist2(p1, p2) for p2 in X] for p1 in tqdm(X)])
    #distances = (distances + distances_emo)/2
    if verbose:
        print("DISTANCE.SHAPE:",distances.shape)
        print("DISTANCE::\n------------------\n",distances)
        print("Keep only the {} nearest neighbors, others set to 0 (= unreachable)".format(n_neighbors))
        
    neighbors = np.zeros_like(distances)
    sort_distances = np.argsort(distances, axis=1)[:, 1:n_neighbors+1] #distances_emo
    for k,i in enumerate(sort_distances):
        neighbors[k,i] = distances[k,i]
    if verbose:
        print("NEIGHBORS::\n------------------\n",neighbors)
        print("SORTED  DISTANCES:\n--------------------\n",sort_distances)
    return neighbors, sort_distances

In [50]:
def center(K):
    """
    Method to center the distance matrix
    :param K: numpy array of shape mxm
    :return: numpy array of shape mxm
    """
    n_samples = K.shape[0]

    # Mean for each row/column
    meanrows = np.sum(K, axis=0) / n_samples
    meancols = (np.sum(K, axis=1)/n_samples)[:, np.newaxis]

    # Mean across all rows (entire matrix)
    meanall = meanrows.sum() / n_samples

    K -= meanrows
    K -= meancols
    K += meanall
    return K

In [51]:
def mds(data, n_components=2):
    """
    Apply multidimensional scaling (aka Principal Coordinates Analysis)
    :param data: nxn square distance matrix
    :param n_components: number of components for projection
    :return: projected output of shape (n_components, n)
    """

    # Center distance matrix
    data = center(data)
    
    # Make a list of (eigenvalue, eigenvector) tuples
    eig_val_cov, eig_vec_cov = np.linalg.eig(data)
    eig_pairs = [
        (np.abs(eig_val_cov[i]), eig_vec_cov[:, i]) for i in range(len(eig_val_cov))
    ]
    # Select n_components eigenvectors with largest eigenvalues, obtain subspace transform matrix
    eig_pairs.sort(key=lambda x: x[0], reverse=True)
    eig_pairs = np.array(eig_pairs)
    matrix_w = np.hstack(
        [eig_pairs[i, 1].reshape(data.shape[1], 1) for i in range(n_components)]
    )

    # Return samples in new subspace
    return matrix_w

In [52]:
def isomap(data, n_components=2, n_neighbors=6, verbose=False):
    """
    Dimensionality reduction with isomap algorithm
    :param data: input image matrix of shape (n,m) if dist=False, square distance matrix of size (n,n) if dist=True
    :param n_components: number of components for projection
    :param n_neighbors: number of neighbors for distance matrix computation
    :return: Projected output of shape (n_components, n)
    """
    # Compute distance matrix
    data, _ = distance_mat(data, n_neighbors, verbose=verbose)

    # Compute shortest paths from distance matrix
    #from sklearn.utils.graph import graph_shortest_path
    graph = graph_shortest_path(data, directed=False)
    graph = -0.5 * (graph ** 2)
    
    # Return the MDS projection on the shortest paths graph
    return mds(graph, n_components)

In [59]:
train_words_lsts = train_word_set_stem
train_vader_lsts = np.array(get_vader(train_word_set_stem))

In [None]:
isomap = isomap(train_vader_lsts, n_components=30, n_neighbors=6)

 96%|█████████▌| 12584/13148 [10:42<00:25, 22.00it/s]