# Preprocessing and Feature Creation

In this notebook we import the data, preprocess it, and create features for supervised and unsupervised cross-lingual information retrieval models.

## I. Import Data

In this section we import the English and Italian Europarl datasets and combine them into a parallel-sentence translation dataframe.

In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes, so edits to project modules are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
# Put the parent of the current working directory on the import path;
# presumably this is what lets the `src` package be imported from the notebook.
sys.path.append(os.path.dirname(os.path.abspath("")))

from src.data import create_data_subset

In [None]:
# Build the English/Italian subset: the call receives the two Europarl files,
# a sample size of 25,000 and the pickle path that the preprocessing section
# reads back later.
create_data_subset(
    sentence_data_source_path="../data/external/europarl-v7.it-en.en",
    sentence_data_target_path="../data/external/europarl-v7.it-en.it",
    sample_size=25000,
    sentence_data_sampled_path="../data/interim/europarl_en_it.pkl",
)

## II. Preprocess data

In this section we preprocess the parallel sentence data for the feature generation.

In [None]:
# Imports for the preprocessing section. The commented-out spaCy models are the
# German and Polish alternatives to the Italian model used in this run.
import spacy
from nltk.corpus import stopwords
from textblob import TextBlob as textblob_source
# NOTE(review): TextBlobDE is the German TextBlob, but the target language
# configured in this notebook is Italian, and neither TextBlob alias is used in
# the visible cells -- confirm whether these two imports are still needed.
from textblob_de import TextBlobDE as textblob_target
import en_core_web_sm
# import de_core_news_sm
import it_core_news_sm
# import pl_core_news_sm
import time
from src.data import PreprocessingEuroParl

In [4]:
# Source language (English): spaCy pipeline and NLTK stopword list.
nlp_source = en_core_web_sm.load()
stopwords_source = stopwords.words("english")

# Target language (Italian is active). To switch language, enable the matching
# pair of lines below together with the corresponding model import.
nlp_target = it_core_news_sm.load()  # Italian pipeline
stopwords_target = stopwords.words("italian")  # Italian stopwords
# nlp_target = de_core_news_sm.load()  # German pipeline
# stopwords_target = stopwords.words("german")  # German stopwords
# nlp_target = pl_core_news_sm.load()  # Polish pipeline
# stopwords_target = stopwords.words("polish")  # Polish stopwords

In [5]:
# Load the sampled sentence pairs for the chosen language pair.
# English-Italian is active; the German and Polish pickles are alternatives.
parallel_sentences = PreprocessingEuroParl(
    df_sampled_path="../data/interim/europarl_en_it.pkl"
)
# parallel_sentences = PreprocessingEuroParl(df_sampled_path="../data/interim/europarl_en_de.pkl")  # German
# parallel_sentences = PreprocessingEuroParl(df_sampled_path="../data/interim/europarl_en_pol.pkl")  # Polish

Finished function: 'import_data' in 0.02 seconds.


In [6]:
# Run the text preprocessing for both languages. The recorded log shows the
# steps spacy, remove_punctuation, remove_numbers, lemmatize, lowercase_spacy,
# remove_stopwords, tokenize_sentence, strip_whitespace and lowercase; the
# spaCy pass dominates the runtime (about three minutes per language).
parallel_sentences.preprocess_sentences(
    nlp_source,
    nlp_target,
    stopwords_source,
    stopwords_target,
)

100%|██████████| 25000/25000 [03:14<00:00, 128.46it/s]
100%|██████████| 25000/25000 [00:00<00:00, 175210.04it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'spacy' in 194.62 seconds.
Finished function: 'remove_punctuation' in 0.15 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 179839.54it/s]
 38%|███▊      | 9392/25000 [00:00<00:00, 93910.69it/s]

Finished function: 'remove_numbers' in 0.14 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 92326.90it/s]
100%|██████████| 25000/25000 [00:00<00:00, 182429.89it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'lemmatize' in 0.27 seconds.
Finished function: 'lowercase_spacy' in 0.14 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 26668.52it/s]
  0%|          | 16/25000 [00:00<02:40, 155.87it/s]

Finished function: 'remove_stopwords' in 0.94 seconds.
Finished function: 'create_cleaned_token_embedding' in 196.45 seconds.


100%|██████████| 25000/25000 [03:03<00:00, 136.41it/s]
100%|██████████| 25000/25000 [00:00<00:00, 151451.94it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'spacy' in 183.27 seconds.
Finished function: 'remove_punctuation' in 0.17 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 140138.43it/s]
 30%|██▉       | 7474/25000 [00:00<00:00, 74738.47it/s]

Finished function: 'remove_numbers' in 0.18 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 80638.85it/s]
100%|██████████| 25000/25000 [00:00<00:00, 171634.91it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'lemmatize' in 0.31 seconds.
Finished function: 'lowercase_spacy' in 0.15 seconds.


100%|██████████| 25000/25000 [00:01<00:00, 15804.86it/s]
  1%|          | 221/25000 [00:00<00:11, 2209.52it/s]

Finished function: 'remove_stopwords' in 1.58 seconds.
Finished function: 'create_cleaned_token_embedding' in 185.98 seconds.


100%|██████████| 25000/25000 [00:04<00:00, 5178.40it/s]
 16%|█▌        | 4050/25000 [00:00<00:01, 20210.90it/s]

Finished function: 'tokenize_sentence' in 4.83 seconds.


100%|██████████| 25000/25000 [00:01<00:00, 22145.61it/s]
100%|██████████| 25000/25000 [00:00<00:00, 180539.32it/s]
 35%|███▌      | 8866/25000 [00:00<00:00, 34186.08it/s]

Finished function: 'remove_stopwords' in 1.13 seconds.
Finished function: 'strip_whitespace' in 0.14 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 70089.64it/s]
  8%|▊         | 2047/25000 [00:00<00:01, 20460.75it/s]

Finished function: 'lowercase' in 0.36 seconds.


100%|██████████| 25000/25000 [00:01<00:00, 23229.21it/s]
  2%|▏         | 465/25000 [00:00<00:05, 4644.80it/s]

Finished function: 'remove_stopwords' in 1.08 seconds.
Finished function: 'create_cleaned_text' in 7.58 seconds.


100%|██████████| 25000/25000 [00:05<00:00, 4385.32it/s]
  4%|▍         | 960/25000 [00:00<00:02, 9599.80it/s]

Finished function: 'tokenize_sentence' in 5.7 seconds.


100%|██████████| 25000/25000 [00:01<00:00, 13169.30it/s]
100%|██████████| 25000/25000 [00:00<00:00, 180647.56it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'remove_stopwords' in 1.9 seconds.
Finished function: 'strip_whitespace' in 0.14 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 63038.95it/s]
 11%|█▏        | 2815/25000 [00:00<00:01, 14460.47it/s]

Finished function: 'lowercase' in 0.4 seconds.


100%|██████████| 25000/25000 [00:01<00:00, 14186.16it/s]

Finished function: 'remove_stopwords' in 1.76 seconds.
Finished function: 'create_cleaned_text' in 9.97 seconds.





In [7]:
# Compute the per-sentence features for both languages. The recorded log lists
# number_punctuations_total, number_words, number_unique_words,
# number_characters, average_characters, number_punctuation_marks, number_pos,
# number_times and named_numbers, plus another spaCy pass per language.
parallel_sentences.extract_sentence_information(
    nlp_source,
    nlp_target,
)

100%|██████████| 25000/25000 [00:00<00:00, 103231.09it/s]
100%|█████████▉| 24964/25000 [00:00<00:00, 118303.99it/s]

Finished function: 'number_punctuations_total' in 0.24 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 121369.14it/s]
100%|██████████| 25000/25000 [00:00<00:00, 285705.41it/s]
100%|██████████| 25000/25000 [00:00<00:00, 244303.15it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuations_total' in 0.21 seconds.
Finished function: 'number_words' in 0.09 seconds.
Finished function: 'number_words' in 0.1 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 38348.08it/s]
 19%|█▉        | 4735/25000 [00:00<00:00, 47332.44it/s]

Finished function: 'number_unique_words' in 0.65 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 45282.13it/s]
 36%|███▌      | 8963/25000 [00:00<00:00, 43989.38it/s]

Finished function: 'number_unique_words' in 0.55 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 53651.41it/s]
 55%|█████▍    | 13707/25000 [00:00<00:00, 65546.35it/s]

Finished function: 'number_characters' in 0.47 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 70954.39it/s]
  return (character_vector / word_vector).replace(np.nan, 0).replace(np.inf, 0).replace(np.log(0), 0)
100%|██████████| 25000/25000 [00:00<00:00, 358402.98it/s]
100%|██████████| 25000/25000 [00:00<00:00, 308715.24it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_characters' in 0.35 seconds.
Finished function: 'average_characters' in 0.02 seconds.
Finished function: 'average_characters' in 0.0 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.
Finished function: 'number_punctuation_marks' in 0.08 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 309646.94it/s]
100%|██████████| 25000/25000 [00:00<00:00, 267027.94it/s]
100%|██████████| 25000/25000 [00:00<00:00, 313786.14it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.08 seconds.
Finished function: 'number_punctuation_marks' in 0.1 seconds.
Finished function: 'number_punctuation_marks' in 0.08 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 298821.33it/s]
100%|██████████| 25000/25000 [00:00<00:00, 316475.77it/s]
100%|██████████| 25000/25000 [00:00<00:00, 308150.93it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.09 seconds.
Finished function: 'number_punctuation_marks' in 0.08 seconds.
Finished function: 'number_punctuation_marks' in 0.08 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 343283.30it/s]
100%|██████████| 25000/25000 [00:00<00:00, 325199.34it/s]
100%|██████████| 25000/25000 [00:00<00:00, 351255.85it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.08 seconds.
Finished function: 'number_punctuation_marks' in 0.08 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 271921.62it/s]
100%|██████████| 25000/25000 [00:00<00:00, 320401.87it/s]
100%|██████████| 25000/25000 [00:00<00:00, 273505.41it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.09 seconds.
Finished function: 'number_punctuation_marks' in 0.08 seconds.
Finished function: 'number_punctuation_marks' in 0.09 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 334059.49it/s]
100%|██████████| 25000/25000 [00:00<00:00, 237566.01it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.08 seconds.
Finished function: 'number_punctuation_marks' in 0.11 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 272153.11it/s]
100%|██████████| 25000/25000 [00:00<00:00, 253223.57it/s]
100%|██████████| 25000/25000 [00:00<00:00, 306470.34it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.09 seconds.
Finished function: 'number_punctuation_marks' in 0.1 seconds.
Finished function: 'number_punctuation_marks' in 0.08 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 296271.52it/s]
100%|██████████| 25000/25000 [00:00<00:00, 339654.96it/s]
100%|██████████| 25000/25000 [00:00<00:00, 306211.69it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.09 seconds.
Finished function: 'number_punctuation_marks' in 0.08 seconds.
Finished function: 'number_punctuation_marks' in 0.08 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 333989.27it/s]
100%|██████████| 25000/25000 [00:00<00:00, 257092.77it/s]
100%|██████████| 25000/25000 [00:00<00:00, 287053.03it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.08 seconds.
Finished function: 'number_punctuation_marks' in 0.1 seconds.
Finished function: 'number_punctuation_marks' in 0.09 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 262007.76it/s]
100%|██████████| 25000/25000 [00:00<00:00, 321975.01it/s]
100%|██████████| 25000/25000 [00:00<00:00, 315800.00it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.1 seconds.
Finished function: 'number_punctuation_marks' in 0.08 seconds.
Finished function: 'number_punctuation_marks' in 0.08 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 358916.99it/s]
100%|██████████| 25000/25000 [00:00<00:00, 383131.03it/s]
100%|██████████| 25000/25000 [00:00<00:00, 482438.84it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.07 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.
Finished function: 'number_punctuation_marks' in 0.05 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 277229.44it/s]
100%|██████████| 25000/25000 [00:00<00:00, 291884.88it/s]
100%|██████████| 25000/25000 [00:00<00:00, 314019.18it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.09 seconds.
Finished function: 'number_punctuation_marks' in 0.09 seconds.
Finished function: 'number_punctuation_marks' in 0.08 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 384461.33it/s]
100%|██████████| 25000/25000 [00:00<00:00, 382110.44it/s]
100%|██████████| 25000/25000 [00:00<00:00, 488787.37it/s]
100%|██████████| 25000/25000 [00:00<00:00, 370309.58it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.07 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.
Finished function: 'number_punctuation_marks' in 0.05 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 307529.15it/s]
100%|██████████| 25000/25000 [00:00<00:00, 259870.14it/s]
100%|██████████| 25000/25000 [00:00<00:00, 308497.26it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.08 seconds.
Finished function: 'number_punctuation_marks' in 0.1 seconds.
Finished function: 'number_punctuation_marks' in 0.08 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 337618.65it/s]
100%|██████████| 25000/25000 [00:00<00:00, 386415.09it/s]
100%|██████████| 25000/25000 [00:00<00:00, 347173.81it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.08 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 421770.38it/s]
100%|██████████| 25000/25000 [00:00<00:00, 380754.85it/s]
100%|██████████| 25000/25000 [00:00<00:00, 351105.31it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 332623.62it/s]
100%|██████████| 25000/25000 [00:00<00:00, 473879.11it/s]
100%|██████████| 25000/25000 [00:00<00:00, 346837.34it/s]
100%|██████████| 25000/25000 [00:00<00:00, 394732.76it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.08 seconds.
Finished function: 'number_punctuation_marks' in 0.05 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 353972.41it/s]
100%|██████████| 25000/25000 [00:00<00:00, 458865.28it/s]
100%|██████████| 25000/25000 [00:00<00:00, 434842.97it/s]
100%|██████████| 25000/25000 [00:00<00:00, 407006.92it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.07 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 255788.92it/s]
100%|██████████| 25000/25000 [00:00<00:00, 296197.87it/s]
100%|██████████| 25000/25000 [00:00<00:00, 377359.36it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.1 seconds.
Finished function: 'number_punctuation_marks' in 0.09 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 372506.50it/s]
100%|██████████| 25000/25000 [00:00<00:00, 373747.94it/s]
100%|██████████| 25000/25000 [00:00<00:00, 431466.59it/s]
100%|██████████| 25000/25000 [00:00<00:00, 372172.00it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.07 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 294752.47it/s]
100%|██████████| 25000/25000 [00:00<00:00, 257846.98it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.09 seconds.
Finished function: 'number_punctuation_marks' in 0.1 seconds.


100%|██████████| 25000/25000 [03:16<00:00, 127.19it/s]
  0%|          | 18/25000 [00:00<02:26, 170.66it/s]

Finished function: 'spacy' in 196.56 seconds.


100%|██████████| 25000/25000 [02:45<00:00, 150.71it/s]
  4%|▍         | 1083/25000 [00:00<00:02, 10827.51it/s]

Finished function: 'spacy' in 165.88 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 25724.53it/s]
 91%|█████████ | 22743/25000 [00:00<00:00, 58682.65it/s]

Finished function: 'number_pos' in 0.97 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 115269.93it/s]
100%|██████████| 25000/25000 [00:00<00:00, 166892.30it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_pos' in 0.22 seconds.
Finished function: 'number_pos' in 0.15 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 172084.74it/s]
100%|██████████| 25000/25000 [00:00<00:00, 143444.83it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_pos' in 0.15 seconds.
Finished function: 'number_pos' in 0.18 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 143855.74it/s]
  4%|▍         | 1044/25000 [00:00<00:02, 10432.00it/s]

Finished function: 'number_pos' in 0.18 seconds.


100%|██████████| 25000/25000 [00:02<00:00, 10038.41it/s]
  4%|▍         | 1053/25000 [00:00<00:02, 10527.78it/s]

Finished function: 'number_times' in 2.49 seconds.


100%|██████████| 25000/25000 [00:02<00:00, 9947.71it/s] 
  4%|▎         | 928/25000 [00:00<00:02, 9278.97it/s]

Finished function: 'number_times' in 2.52 seconds.


100%|██████████| 25000/25000 [00:02<00:00, 10206.21it/s]
  4%|▎         | 883/25000 [00:00<00:02, 8788.50it/s]

Finished function: 'number_times' in 2.45 seconds.


100%|██████████| 25000/25000 [00:02<00:00, 9730.21it/s] 
  7%|▋         | 1656/25000 [00:00<00:02, 7977.93it/s]

Finished function: 'number_times' in 2.57 seconds.


100%|██████████| 25000/25000 [00:02<00:00, 9422.72it/s]
  3%|▎         | 839/25000 [00:00<00:02, 8386.99it/s]

Finished function: 'number_times' in 2.66 seconds.


100%|██████████| 25000/25000 [00:03<00:00, 8151.00it/s] 
100%|██████████| 25000/25000 [00:00<00:00, 155480.26it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'number_times' in 3.07 seconds.
Finished function: 'named_numbers' in 0.16 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 155235.12it/s]

Finished function: 'named_numbers' in 0.16 seconds.





In [8]:
# Add the embedding-based features for the "proc_5k" embeddings (en-it).
# The resulting columns carry the embedding name, e.g.
# word_embedding_proc_5k_source / tf_idf_proc_5k_source in the displayed frame.
parallel_sentences.create_embedding_information(
    "proc_5k",
    language_pair="en_it",
)

Finished function: 'load_embeddings' in 1.47 seconds.


  0%|          | 103/25000 [00:00<00:51, 485.13it/s]

Finished function: 'load_embeddings' in 1.49 seconds.


100%|██████████| 25000/25000 [00:38<00:00, 650.44it/s]
  0%|          | 59/25000 [00:00<00:42, 580.99it/s]

Finished function: 'word_embeddings' in 38.44 seconds.


100%|██████████| 25000/25000 [00:53<00:00, 465.09it/s]


Finished function: 'word_embeddings' in 53.76 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 171492.59it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'create_translation_dictionary' in 74.17 seconds.
Finished function: 'translate_words' in 0.15 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 151826.05it/s]


Finished function: 'translate_words' in 0.17 seconds.


100%|██████████| 25000/25000 [00:09<00:00, 2511.07it/s]


Finished function: 'tf_idf_vector' in 10.31 seconds.


100%|██████████| 25000/25000 [00:12<00:00, 1993.72it/s]
  1%|          | 129/25000 [00:00<00:19, 1286.59it/s]

Finished function: 'tf_idf_vector' in 12.96 seconds.


100%|██████████| 25000/25000 [00:16<00:00, 1497.79it/s]
  1%|          | 167/25000 [00:00<00:14, 1667.18it/s]

Finished function: 'sentence_embedding_average' in 16.69 seconds.


100%|██████████| 25000/25000 [00:23<00:00, 1081.09it/s]
  0%|          | 19/25000 [00:00<02:11, 189.54it/s]

Finished function: 'sentence_embedding_average' in 23.13 seconds.


  return [pd.Series(embedding_dataframe.values.mean(axis=1))]
100%|██████████| 25000/25000 [01:47<00:00, 232.70it/s]
  0%|          | 17/25000 [00:00<02:28, 168.22it/s]

Finished function: 'sentence_embedding_tf_idf' in 107.49 seconds.


100%|██████████| 25000/25000 [02:25<00:00, 171.92it/s]

Finished function: 'sentence_embedding_tf_idf' in 145.5 seconds.





In [9]:
# Add the embedding-based features for the "proc_b_1k" embeddings (en-it).
# The resulting columns carry the embedding name, e.g.
# sentence_embedding_average_proc_b_1k_source in the displayed frame.
parallel_sentences.create_embedding_information(
    "proc_b_1k",
    language_pair="en_it",
)

Finished function: 'load_embeddings' in 1.05 seconds.


  0%|          | 39/25000 [00:00<01:05, 383.66it/s]

Finished function: 'load_embeddings' in 0.96 seconds.


100%|██████████| 25000/25000 [00:57<00:00, 433.18it/s]
  0%|          | 45/25000 [00:00<00:55, 449.48it/s]

Finished function: 'word_embeddings' in 57.73 seconds.


100%|██████████| 25000/25000 [00:55<00:00, 447.37it/s]


Finished function: 'word_embeddings' in 55.88 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 219421.07it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'create_translation_dictionary' in 66.05 seconds.
Finished function: 'translate_words' in 0.12 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 189350.55it/s]


Finished function: 'translate_words' in 0.13 seconds.


100%|██████████| 25000/25000 [00:09<00:00, 2669.25it/s]


Finished function: 'tf_idf_vector' in 9.67 seconds.


100%|██████████| 25000/25000 [00:13<00:00, 1911.45it/s]
  0%|          | 106/25000 [00:00<00:23, 1057.51it/s]

Finished function: 'tf_idf_vector' in 13.39 seconds.


100%|██████████| 25000/25000 [00:12<00:00, 2009.17it/s]
  1%|          | 181/25000 [00:00<00:13, 1807.00it/s]

Finished function: 'sentence_embedding_average' in 12.44 seconds.


100%|██████████| 25000/25000 [00:13<00:00, 1851.43it/s]
  0%|          | 30/25000 [00:00<01:23, 298.54it/s]

Finished function: 'sentence_embedding_average' in 13.51 seconds.


100%|██████████| 25000/25000 [01:35<00:00, 260.75it/s]
  0%|          | 17/25000 [00:00<02:29, 167.66it/s]

Finished function: 'sentence_embedding_tf_idf' in 95.89 seconds.


100%|██████████| 25000/25000 [01:51<00:00, 224.33it/s]

Finished function: 'sentence_embedding_tf_idf' in 111.46 seconds.





In [10]:
# Add the embedding-based features for the "vecmap" embeddings (en-it).
# The resulting columns carry the embedding name, e.g.
# sentence_embedding_average_vecmap_source in the displayed frame.
parallel_sentences.create_embedding_information(
    "vecmap",
    language_pair="en_it",
)

Finished function: 'load_embeddings' in 0.79 seconds.


  0%|          | 44/25000 [00:00<00:58, 425.29it/s]

Finished function: 'load_embeddings' in 0.64 seconds.


100%|██████████| 25000/25000 [00:45<00:00, 546.68it/s]
  0%|          | 55/25000 [00:00<00:45, 547.77it/s]

Finished function: 'word_embeddings' in 45.73 seconds.


100%|██████████| 25000/25000 [00:44<00:00, 556.65it/s]


Finished function: 'word_embeddings' in 44.91 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 216966.66it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]

Finished function: 'create_translation_dictionary' in 75.22 seconds.
Finished function: 'translate_words' in 0.12 seconds.


100%|██████████| 25000/25000 [00:00<00:00, 152191.19it/s]


Finished function: 'translate_words' in 0.17 seconds.


100%|██████████| 25000/25000 [00:09<00:00, 2585.05it/s]


Finished function: 'tf_idf_vector' in 10.0 seconds.


100%|██████████| 25000/25000 [00:12<00:00, 1945.22it/s]
  0%|          | 111/25000 [00:00<00:22, 1108.37it/s]

Finished function: 'tf_idf_vector' in 13.25 seconds.


100%|██████████| 25000/25000 [00:18<00:00, 1346.49it/s]
  1%|          | 140/25000 [00:00<00:17, 1398.27it/s]

Finished function: 'sentence_embedding_average' in 18.57 seconds.


100%|██████████| 25000/25000 [00:14<00:00, 1705.18it/s]
  0%|          | 26/25000 [00:00<01:38, 253.04it/s]

Finished function: 'sentence_embedding_average' in 14.66 seconds.


100%|██████████| 25000/25000 [02:02<00:00, 203.67it/s]
  0%|          | 19/25000 [00:00<02:17, 181.31it/s]

Finished function: 'sentence_embedding_tf_idf' in 122.77 seconds.


100%|██████████| 25000/25000 [01:51<00:00, 224.39it/s]

Finished function: 'sentence_embedding_tf_idf' in 111.5 seconds.





In [11]:
# Persist the preprocessed feature frame as JSON in the interim data folder.
parallel_sentences.preprocessed.to_json(
    "../data/interim/preprocessed_data_en_it.json"
)

In [12]:
# Show the preprocessed feature frame (token lists, count features, translated
# tokens and sentence embeddings per embedding name) as the cell's rich output.
parallel_sentences.preprocessed

Unnamed: 0,id_source,id_target,token_preprocessed_embedding_source,token_preprocessed_embedding_target,Translation,number_punctuations_total_source,number_punctuations_total_target,number_words_source,number_words_target,number_unique_words_source,...,sentence_embedding_average_proc_b_1k_source,sentence_embedding_average_proc_b_1k_target,sentence_embedding_tf_idf_proc_b_1k_source,sentence_embedding_tf_idf_proc_b_1k_target,translated_to_target_vecmap_source,translated_to_source_vecmap_target,sentence_embedding_average_vecmap_source,sentence_embedding_average_vecmap_target,sentence_embedding_tf_idf_vecmap_source,sentence_embedding_tf_idf_vecmap_target
0,0,0,"[finally, mr, president, although, fall, withi...","[ultimare, signore, presidente, competere, com...",1,10,9,38,39,35,...,"[[0.008693028401467018, 0.009497930798234473, ...","[[0.008830122081288957, 0.004547863627094022, ...","[[0.0018075439090904097, 0.0013963363123755275...","[[0.0015286852722734012, 0.0007927902201681273...","[finalmente, signor, presidente, tuttavia, cad...","[finalize, lord, president, compete, commissio...","[[-0.2608074678984635, 0.035465999670764976, 0...","[[-0.2317512307439328, 0.043051783598082906, 0...","[[-0.04178289919088128, 0.003289124538910825, ...","[[-0.03635919295923167, 0.0053623071707199, 0...."
1,1,1,[applause],[applausi],1,2,2,1,1,1,...,"[[0.0347091443836689, 0.08055263757705688, 0.0...","[[-0.027894562110304832, 0.10238229483366013, ...","[[0.0347091443836689, 0.08055263757705688, 0.0...","[[-0.027894562110304832, 0.10238229483366013, ...",[applausi],[applause],"[[-0.10786788910627365, 0.14334604144096375, -...","[[-0.14058560132980347, 0.10015597939491272, -...","[[-0.10786788910627365, 0.14334604144096375, -...","[[-0.14058560132980347, 0.10015597939491272, -..."
2,2,2,"[lisbon, continuation]","[lisbona, proseguimento]",1,2,2,2,2,2,...,"[[-0.020988833159208298, 0.04784630052745342, ...","[[-0.0201482642441988, 0.029058500658720732, 0...","[[-0.005021797963031628, 0.031505242847675384,...","[[-0.012601990050846867, 0.01683720292953982, ...","[lisbona, prosecuzione]","[lisbon, continuation]","[[-0.02069856971502304, -0.1291940463706851, 0...","[[-0.04886355251073837, -0.13083997648209333, ...","[[-0.04082566372040908, -0.07944126290264207, ...","[[-0.07017921368832104, -0.07720816915820333, ..."
3,3,3,"[identical, murder, attempt, recently, town, m...","[tentare, omicidio, identico, essere, stare, p...",1,0,1,14,18,14,...,"[[0.03894698836042413, 0.013754402486873525, 0...","[[0.021405176129466033, 0.032778284019407106, ...","[[0.010250564928934999, 0.0027849285443413904,...","[[0.005357049436194455, 0.0063608610116189495,...","[identico, omicidio, tentativo, recentemente, ...","[attempt, murder, identical, therefore, stay, ...","[[-0.18444362449060595, -0.007816310356637197,...","[[-0.18242603102151086, 0.03894012695287957, 0...","[[-0.04227790194857881, -0.00697155921507797, ...","[[-0.0341803192431627, 0.004996396428697362, 0..."
4,4,4,"[reconsider, soon, practically, possible]","[verrà, riconsiderare, appena, essere, pratica...",1,0,0,4,5,4,...,"[[0.003843584709102288, 0.06975685886573046, 0...","[[0.021365371843179066, 0.017269847448915243, ...","[[0.0025937276066553857, 0.037731961813923534,...","[[0.011209410370540865, 0.0070219193348595695,...","[riconsiderare, presto, praticamente, possibile]","[gets, reconsider, immediately, therefore, vir...","[[-0.3457260988652706, 0.11624429188668728, 0....","[[-0.32295674333969754, 0.08559711432705323, 0...","[[-0.16844998753889942, 0.057748788697695064, ...","[[-0.11822048330145368, 0.03690763003978965, 0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,24995,24995,"[discussion, underway, establish, help, would,...","[correre, discussione, stabilire, forma, aiuta...",1,0,0,6,7,6,...,"[[0.027733889951681096, 0.045572447086063526, ...","[[-0.0012786872684955597, 0.01170461175206583,...","[[0.009949558184369987, 0.01824453850351808, 0...","[[0.002030095925314283, 0.002094960642065595, ...","[discussione, avviato, stabilire, aiuto, dovut...","[overtake, discussion, determine, form, help, ...","[[-0.345364381869634, 0.04808976097653309, 0.2...","[[-0.2989570051431656, 0.041093380481470376, 0...","[[-0.13079217395478768, 0.01262385431881695, 0...","[[-0.1001236591809881, 0.01070314706811008, 0...."
24996,24996,24996,"[need, large, majority]","[adesso, servire, maggioranza, ampio]",1,0,0,3,4,3,...,"[[-0.024014365548888843, 0.013163011521100998,...","[[0.013197294436395168, 0.04709733300842345, 0...","[[-0.007652249980452337, 0.005586281052024325,...","[[0.005589276491623362, 0.023743840989073752, ...","[bisogna, piccole, maggioranza]","[anyway, serve, majority, broad]","[[-0.32679545879364014, -0.02295448196431001, ...","[[-0.2798319607973099, -0.015366747509688139, ...","[[-0.1793727918531777, -0.01970709230942802, 0...","[[-0.14012105415330461, -0.007039710147790793,..."
24997,24997,24997,"[list, publicly, available, intend, debate, to...","[tali, elenco, essere, pubblico, dovere, esser...",1,0,0,8,10,8,...,"[[-0.02947679255157709, 0.0017494010244263336,...","[[0.026079811376985163, -0.0184820672031492, 0...","[[-0.011494453582837134, -0.001609825355740220...","[[0.008537422638065442, -0.008387655900818757,...","[lista, pubblicamente, disponibile, permetterm...","[certain, list, therefore, public, obligation,...","[[-0.2680252157151699, 0.010025895680882968, 0...","[[-0.26727680712938306, 0.001577067608013749, ...","[[-0.09347213943120848, 0.0061355563743250244,...","[[-0.08303151566023889, -0.0011806212530717536..."
24998,24998,24998,"[however, ring, friend, different, project, pr...","[e', però, altro, discorrere, riguardare, anel...",1,4,6,29,29,24,...,"[[0.027616000079433434, 0.023816229969573516, ...","[[0.01671971460261072, 0.010999239597974034, 0...","[[0.004424109903798761, 0.004199808899204925, ...","[[0.002870340549292681, 0.0009921893565988488,...","[tuttavia, anello, amico, diverse, progetto, p...","[however, another, relate, ring, friend, relat...","[[-0.2592687489038023, 0.021379880510115374, 0...","[[-0.2660550732786457, 0.040747578954324126, 0...","[[-0.05026916672191941, 0.005777651995093183, ...","[[-0.04288726434918977, 0.006473223171571389, ..."


In [13]:
# Show the intermediate frame that keeps the raw text, the spaCy token lists
# and the per-embedding word-embedding / tf-idf columns.
parallel_sentences.dataframe

Unnamed: 0,id_source,text_source,text_target,id_target,text_preprocessed_source,text_preprocessed_target,text_source_spacy,text_target_spacy,word_embedding_proc_5k_source,word_embedding_proc_5k_target,tf_idf_proc_5k_source,tf_idf_proc_5k_target,word_embedding_proc_b_1k_source,word_embedding_proc_b_1k_target,tf_idf_proc_b_1k_source,tf_idf_proc_b_1k_target,word_embedding_vecmap_source,word_embedding_vecmap_target,tf_idf_vecmap_source,tf_idf_vecmap_target
0,0,"And, finally, Mr President, although it does n...","Da ultimo, signor Presidente, per quanto non c...",0,"[,, finally, ,, mr, president, ,, although, fa...","[ultimo, ,, signor, presidente, ,, competa, co...","[And, ,, finally, ,, Mr, President, ,, althoug...","[Da, ultimo, ,, signor, Presidente, ,, per, qu...",finally mr president although ...,ultimare signore presidente competere...,"{'finally': 0.15004748845222704, 'mr': 0.09151...","{'ultimare': 0.15208587310966074, 'signore': 0...",finally mr president although ...,ultimare signore presidente competere...,"{'finally': 0.15004748845222704, 'mr': 0.09151...","{'ultimare': 0.15208587310966074, 'signore': 0...",finally mr president although ...,ultimare signore presidente competere...,"{'finally': 0.15004748845222704, 'mr': 0.09151...","{'ultimare': 0.15208587310966074, 'signore': 0..."
1,1,(Applause),(Applausi),1,"[(, applause, )]","[(, applausi, )]","[(, Applause, )]","[(, Applausi, )]",applause 0 -0.003095 1 0.091659 2 ...,applausi 0 -0.027895 1 0.102382 2 ...,{'applause': 1.0},{'applausi': 1.0},applause 0 0.034709 1 0.080553 2 ...,applausi 0 -0.027895 1 0.102382 2 ...,{'applause': 1.0},{'applausi': 1.0},applause 0 -0.107868 1 0.143346 2 ...,applausi 0 -0.140586 1 0.100156 2 ...,{'applause': 1.0},{'applausi': 1.0}
2,2,Lisbon (continuation),Lisbona (proseguimento),2,"[lisbon, (, continuation, )]","[lisbona, (, proseguimento, )]","[Lisbon, (, continuation, )]","[Lisbona, (, proseguimento, )]",lisbon continuation 0 -0.067397 ...,lisbona proseguimento 0 -0.017876 ...,"{'lisbon': 0.5995482209311499, 'continuation':...","{'lisbona': 0.5696539880474568, 'proseguimento...",lisbon continuation 0 -0.070327 ...,lisbona proseguimento 0 -0.017876 ...,"{'lisbon': 0.5995482209311499, 'continuation':...","{'lisbona': 0.5696539880474568, 'proseguimento...",lisbon continuation 0 0.144877 ...,lisbona proseguimento 0 0.135594 ...,"{'lisbon': 0.5995482209311499, 'continuation':...","{'lisbona': 0.5696539880474568, 'proseguimento..."
3,3,An identical murder was attempted very recentl...,Un tentato omicidio identico a questo è stato ...,3,"[identical, murder, attempted, recently, town,...","[tentato, omicidio, identico, stato, perpetrat...","[An, identical, murder, was, attempted, very, ...","[Un, tentato, omicidio, identico, a, questo, è...",identical murder attempt recently ...,tentare omicidio identico essere ...,"{'identical': 0.29620584974051617, 'murder': 0...","{'tentare': 0.20705547145212297, 'omicidio': 0...",identical murder attempt recently ...,tentare omicidio identico essere ...,"{'identical': 0.29620584974051617, 'murder': 0...","{'tentare': 0.20705547145212297, 'omicidio': 0...",identical murder attempt recently ...,tentare omicidio identico essere ...,"{'identical': 0.29620584974051617, 'murder': 0...","{'tentare': 0.20705547145212297, 'omicidio': 0..."
4,4,It will be reconsidered as soon as is practica...,Verrà riconsiderata non appena sarà praticamen...,4,"[reconsidered, soon, practically, possible, .]","[verrà, riconsiderata, appena, praticamente, p...","[It, will, be, reconsidered, as, soon, as, is,...","[Verrà, riconsiderata, non, appena, sarà, prat...",reconsider soon practically possib...,verrà riconsiderare appena esse...,"{'reconsider': 0.5963543599059239, 'soon': 0.4...","{'verrà': 0.5690474904743331, 'riconsiderare':...",reconsider soon practically possib...,verrà riconsiderare appena esse...,"{'reconsider': 0.5963543599059239, 'soon': 0.4...","{'verrà': 0.5690474904743331, 'riconsiderare':...",reconsider soon practically possib...,verrà riconsiderare appena esse...,"{'reconsider': 0.5963543599059239, 'soon': 0.4...","{'verrà': 0.5690474904743331, 'riconsiderare':..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,24995,Discussions are underway to establish what hel...,Sono in corso discussioni per stabilire quali ...,24995,"[discussions, underway, establish, help, would...","[corso, discussioni, stabilire, quali, forme, ...","[Discussions, are, underway, to, establish, wh...","[Sono, in, corso, discussioni, per, stabilire,...",discussion underway establish help...,correre discussione stabilire form...,"{'discussion': 0.37636434315557893, 'underway'...","{'correre': 0.3674241790185676, 'discussione':...",discussion underway establish help...,correre discussione stabilire form...,"{'discussion': 0.37636434315557893, 'underway'...","{'correre': 0.3674241790185676, 'discussione':...",discussion underway establish help...,correre discussione stabilire form...,"{'discussion': 0.37636434315557893, 'underway'...","{'correre': 0.3674241790185676, 'discussione':..."
24996,24996,We now need that large majority.,Adesso ci serve una maggioranza ampia.,24996,"[need, large, majority, .]","[adesso, serve, maggioranza, ampia, .]","[We, now, need, that, large, majority, .]","[Adesso, ci, serve, una, maggioranza, ampia, .]",need large majority 0 -0.04593...,adesso servire maggioranza ampio...,"{'need': 0.43167925904307497, 'large': 0.61841...","{'adesso': 0.5096849295725233, 'servire': 0.51...",need large majority 0 -0.04532...,adesso servire maggioranza ampio...,"{'need': 0.43167925904307497, 'large': 0.61841...","{'adesso': 0.5096849295725233, 'servire': 0.51...",need large majority 0 -0.18137...,adesso servire maggioranza ampio...,"{'need': 0.43167925904307497, 'large': 0.61841...","{'adesso': 0.5096849295725233, 'servire': 0.51..."
24997,24997,These lists are publicly available and are int...,Tali elenchi sono pubblici e dovrebbero essere...,24997,"[lists, publicly, available, intended, debate,...","[tali, elenchi, pubblici, dovrebbero, essere, ...","[These, lists, are, publicly, available, and, ...","[Tali, elenchi, sono, pubblici, e, dovrebbero,...",list publicly available intend ...,tali elenco essere pubblico ...,"{'list': 0.356935026424723, 'publicly': 0.4921...","{'tali': 0.37609964295086695, 'elenco': 0.4827...",list publicly available intend ...,tali elenco essere pubblico ...,"{'list': 0.356935026424723, 'publicly': 0.4921...","{'tali': 0.37609964295086695, 'elenco': 0.4827...",list publicly available intend ...,tali elenco essere pubblico ...,"{'list': 0.356935026424723, 'publicly': 0.4921...","{'tali': 0.37609964295086695, 'elenco': 0.4827..."
24998,24998,"However, the ring of friends is a different pr...",E' però un altro discorso quello che riguarda ...,24998,"[however, ,, ring, friends, different, project...","[', però, altro, discorso, riguarda, l'anello,...","[However, ,, the, ring, of, friends, is, a, di...","[E', però, un, altro, discorso, quello, che, r...",however ring friend different ...,però altro riguardare anello ...,"{'however': 0.11754323865835557, 'ring': 0.235...","{'e'': 0.12661952741185759, 'però': 0.14103306...",however ring friend different ...,però altro riguardare anello ...,"{'however': 0.11754323865835557, 'ring': 0.235...","{'e'': 0.12661952741185759, 'però': 0.14103306...",however ring friend different ...,però altro riguardare anello ...,"{'however': 0.11754323865835557, 'ring': 0.235...","{'e'': 0.12661952741185759, 'però': 0.14103306..."


In [None]:
# Reload preprocessed data from a JSON file and attach it to a fresh
# PreprocessingEuroParl object (presumably so the preprocessing step does not
# have to be re-run — confirm against the cells above).
import pandas as pd
preprocessed_data = pd.read_json("../data/interim/preprocessed_data_en_it.json")
parallel_sentences = PreprocessingEuroParl(df_sampled_path="../data/interim/europarl_en_it.pkl")
# Overwrite the object's `preprocessed` attribute with the frame loaded from JSON.
parallel_sentences.preprocessed = preprocessed_data

In [None]:
# Show the first 20 column labels of the preprocessed data.
parallel_sentences.preprocessed.columns[:20]

## III. Create data set

In this section we create the datasets for the training of the supervised model and the data for the supervised and unsupervised retrieval.

In [14]:
from src.data import DataSet

In [15]:
# Sizes used for the data split and index creation in the cells below.
# n_model is passed to split_model_retrieval and create_model_index.
n_model = 20000
# Only the first n_queries rows of the retrieval subset supply query (source) ids.
n_queries = 100
# n_retrieval is passed to split_model_retrieval; n_queries * n_retrieval = 500,000,
# which matches the pair count shown in the progress bars of the retrieval feature cells.
n_retrieval = 5000
# k and sample_size_k are passed to create_model_index; their exact meaning is
# defined in src.data.DataSet and is not visible in this notebook — TODO confirm.
k = 10
sample_size_k = 100

In [16]:
# Wrap the preprocessed sentence data in a DataSet for splitting and index creation.
dataset = DataSet(parallel_sentences.preprocessed)
# Alternative when the data was reloaded from JSON: pass that frame directly.
#dataset = DataSet(preprocessed_data)

Finished function: '__init__' in 0.0 seconds.


In [17]:
# Split the data into a model part and a retrieval part using the sizes set above
# (presumably this populates dataset.retrieval_subset, which is read further down — confirm in src.data).
dataset.split_model_retrieval(n_model, n_retrieval)

Finished function: 'split_model_retrieval' in 0.0 seconds.


In [None]:
# Build the index of sentence pairs for the supervised model, using the
# tf-idf "proc_5k" sentence-embedding columns for the source and target side.
dataset.create_model_index(
    n_model,
    k,
    sample_size_k,
    "sentence_embedding_tf_idf_proc_5k_source",
    "sentence_embedding_tf_idf_proc_5k_target",
)

In [None]:
# to_feather requires a default index, hence reset_index(drop=True) before writing.
dataset.model_dataset_index.reset_index(drop=True).to_feather("../data/processed/dataset_model_index_en_it.feather")

In [None]:
# import pandas as pd
# pd.read_feather("../data/processed/dataset_model_index_en_it.feather")

In [18]:
#dataset.create_retrieval_index(n_queries)
import pandas as pd
# Manual replacement for the call above, intended for older pandas versions:
# pair each of the first n_queries source ids with every target id of the
# retrieval subset (full cross product).
retrieval_subset = dataset.retrieval_subset
query_ids = retrieval_subset.iloc[:n_queries]["id_source"]
document_ids = retrieval_subset["id_target"]
# `query` and `documents` are not used again in this cell; kept as in the original.
query = pd.DataFrame({"id_source": query_ids})
documents = pd.DataFrame({"id_target": document_ids})
index = pd.MultiIndex.from_product(
    [query_ids, document_ids],
    names=["id_source", "id_target"],
)
# Turn the two index levels back into ordinary columns.
dataset.retrieval_dataset_index = pd.DataFrame(index=index).reset_index()

In [19]:
# to_feather requires a default index, hence reset_index(drop=True) before writing.
dataset.retrieval_dataset_index.reset_index(drop=True).to_feather("../data/processed/dataset_retrieval_index_en_it.feather")

In [None]:
# import pandas as pd
# pd.read_feather("../data/processed/dataset_retrieval_index_en_it.feather")

## IV. Create features

In this section we create features for our model that are sentence-based and should be created before the text is preprocessed.

In [20]:
#%autoreload 2
from src.features import feature_generation_class

In [None]:
# import pickle
# with open(r"../data/processed/correlated_features.pkl", "rb") as file:
#    chosen_features = pickle.load(file)

Generation of the training data for the supervised classification model.

In [None]:
# Feature generator for the supervised training pairs: it receives the model
# index and the preprocessed sentence data.
features_model = feature_generation_class.FeatureGeneration(
    dataset.model_dataset_index,
    parallel_sentences.preprocessed,
)

In [None]:
# Create the feature dataframe for the model index pairs
# (read further down as features_model.feature_dataframe when saving).
features_model.create_feature_dataframe()

In [None]:
# Add the sentence-level features. In the recorded run of the same method on the retrieval data, the log
# shows difference_numerical, relative_difference_numerical, normalized_difference_numerical and jaccard steps.
features_model.create_sentence_features()

In [None]:
# Add embedding-based features for the "proc_5k" embedding variant. In the recorded retrieval run this
# method logs cosine_similarity_vector, euclidean_distance_vector and jaccard steps.
features_model.create_embedding_features("proc_5k")

In [None]:
# Same embedding-based features, computed for the "proc_b_1k" embedding variant.
features_model.create_embedding_features("proc_b_1k")

In [None]:
# Same embedding-based features, computed for the "vecmap" embedding variant.
features_model.create_embedding_features("vecmap")

In [None]:
# to_feather requires a default index, hence reset_index(drop=True) before writing.
features_model.feature_dataframe.reset_index(drop=True).to_feather("../data/processed/feature_model_en_it.feather")

In [None]:
# import pandas as pd
# pd.read_feather("../data/processed/feature_model_en_it.feather")

Generation of the data for the cross-lingual information retrieval task.

In [21]:
# Feature generator for the retrieval task: it receives the query/document
# pair index and the preprocessed sentence data.
features_retrieval = feature_generation_class.FeatureGeneration(
    dataset.retrieval_dataset_index,
    parallel_sentences.preprocessed,
)

In [22]:
# Create the feature dataframe for the retrieval index pairs
# (read further down as features_retrieval.feature_dataframe when saving).
features_retrieval.create_feature_dataframe()

Finished function: 'create_feature_dataframe' in 0.03 seconds.


In [23]:
# Add the sentence-level features for the retrieval pairs (about 18 s in the recorded run).
# NOTE(review): the recorded output contains warning fragments pointing at an `np.log(0)` expression
# in the feature code — worth checking in src.features.
features_retrieval.create_sentence_features()

Finished function: 'difference_numerical' in 0.11 seconds.
Finished function: 'relative_difference_numerical' in 0.02 seconds.
Finished function: 'normalized_difference_numerical' in 0.03 seconds.


  return abs(target_array - source_array).replace(np.nan, 0).replace(np.inf, 0).replace(np.log(0), 0)
  0), 0)
  np.log(0), 0)


Finished function: 'difference_numerical' in 0.01 seconds.
Finished function: 'relative_difference_numerical' in 0.02 seconds.
Finished function: 'normalized_difference_numerical' in 0.04 seconds.
Finished function: 'difference_numerical' in 0.02 seconds.
Finished function: 'relative_difference_numerical' in 0.03 seconds.
Finished function: 'normalized_difference_numerical' in 0.03 seconds.
Finished function: 'difference_numerical' in 0.01 seconds.
Finished function: 'relative_difference_numerical' in 0.03 seconds.
Finished function: 'normalized_difference_numerical' in 0.03 seconds.
Finished function: 'difference_numerical' in 0.01 seconds.
Finished function: 'relative_difference_numerical' in 0.02 seconds.
Finished function: 'normalized_difference_numerical' in 0.03 seconds.
Finished function: 'difference_numerical' in 0.02 seconds.
Finished function: 'relative_difference_numerical' in 0.03 seconds.
Finished function: 'normalized_difference_numerical' in 0.03 seconds.
Finished functi

  0%|          | 0/500000 [00:00<?, ?it/s]

Finished function: 'relative_difference_numerical' in 0.02 seconds.
Finished function: 'normalized_difference_numerical' in 0.02 seconds.


100%|██████████| 500000/500000 [00:11<00:00, 41836.95it/s]

Finished function: 'jaccard' in 12.1 seconds.
Finished function: 'create_sentence_features' in 18.0 seconds.





In [24]:
# Add embedding-based features for the "proc_5k" embedding variant: the log shows cosine_similarity_vector,
# euclidean_distance_vector and jaccard steps. Slow: about 830 s for 500,000 pairs in the recorded run.
features_retrieval.create_embedding_features("proc_5k")

100%|██████████| 500000/500000 [04:26<00:00, 1878.83it/s]
  0%|          | 0/500000 [00:00<?, ?it/s]

Finished function: 'cosine_similarity_vector' in 266.2 seconds.


100%|██████████| 500000/500000 [03:31<00:00, 2362.75it/s]
  0%|          | 302/500000 [00:00<02:45, 3018.43it/s]

Finished function: 'cosine_similarity_vector' in 211.71 seconds.


100%|██████████| 500000/500000 [02:51<00:00, 2923.89it/s]
  0%|          | 0/500000 [00:00<?, ?it/s]

Finished function: 'euclidean_distance_vector' in 171.08 seconds.


100%|██████████| 500000/500000 [02:29<00:00, 3348.84it/s]
  0%|          | 0/500000 [00:00<?, ?it/s]

Finished function: 'euclidean_distance_vector' in 149.41 seconds.


100%|██████████| 500000/500000 [00:12<00:00, 39142.67it/s]
  0%|          | 1995/500000 [00:00<00:24, 19944.79it/s]

Finished function: 'jaccard' in 12.87 seconds.


100%|██████████| 500000/500000 [00:19<00:00, 26076.36it/s]

Finished function: 'jaccard' in 19.26 seconds.
Finished function: 'create_embedding_features' in 830.61 seconds.





In [25]:
# Same embedding-based features for the "proc_b_1k" embedding variant
# (about 710 s for 500,000 pairs in the recorded run).
features_retrieval.create_embedding_features("proc_b_1k")

100%|██████████| 500000/500000 [03:27<00:00, 2404.07it/s]
  0%|          | 0/500000 [00:00<?, ?it/s]

Finished function: 'cosine_similarity_vector' in 208.07 seconds.


100%|██████████| 500000/500000 [03:24<00:00, 2444.37it/s]
  0%|          | 320/500000 [00:00<02:36, 3194.76it/s]

Finished function: 'cosine_similarity_vector' in 204.65 seconds.


100%|██████████| 500000/500000 [02:13<00:00, 3756.44it/s]
  0%|          | 694/500000 [00:00<02:28, 3356.21it/s]

Finished function: 'euclidean_distance_vector' in 133.16 seconds.


100%|██████████| 500000/500000 [02:20<00:00, 3567.88it/s]
  1%|          | 3426/500000 [00:00<00:14, 34222.66it/s]

Finished function: 'euclidean_distance_vector' in 140.19 seconds.


100%|██████████| 500000/500000 [00:12<00:00, 41363.17it/s]
  1%|          | 3563/500000 [00:00<00:13, 35625.02it/s]

Finished function: 'jaccard' in 12.17 seconds.


100%|██████████| 500000/500000 [00:11<00:00, 41815.80it/s]

Finished function: 'jaccard' in 12.04 seconds.
Finished function: 'create_embedding_features' in 710.35 seconds.





In [26]:
# Same embedding-based features for the "vecmap" embedding variant
# (about 688 s for 500,000 pairs in the recorded run).
features_retrieval.create_embedding_features("vecmap")

100%|██████████| 500000/500000 [03:15<00:00, 2555.87it/s]
  0%|          | 175/500000 [00:00<04:45, 1748.69it/s]

Finished function: 'cosine_similarity_vector' in 195.71 seconds.


100%|██████████| 500000/500000 [03:16<00:00, 2539.70it/s]
  0%|          | 699/500000 [00:00<02:29, 3345.99it/s]

Finished function: 'cosine_similarity_vector' in 196.97 seconds.


100%|██████████| 500000/500000 [02:13<00:00, 3745.78it/s]
  0%|          | 700/500000 [00:00<02:29, 3350.54it/s]

Finished function: 'euclidean_distance_vector' in 133.53 seconds.


100%|██████████| 500000/500000 [02:13<00:00, 3756.71it/s]
  1%|          | 3630/500000 [00:00<00:13, 36293.11it/s]

Finished function: 'euclidean_distance_vector' in 133.15 seconds.


100%|██████████| 500000/500000 [00:15<00:00, 32559.28it/s]
  0%|          | 2465/500000 [00:00<00:20, 24645.32it/s]

Finished function: 'jaccard' in 15.43 seconds.


100%|██████████| 500000/500000 [00:12<00:00, 39087.99it/s]

Finished function: 'jaccard' in 12.89 seconds.
Finished function: 'create_embedding_features' in 687.71 seconds.





In [27]:
# to_feather requires a default index, hence reset_index(drop=True) before writing.
features_retrieval.feature_dataframe.reset_index(drop=True).to_feather("../data/processed/feature_retrieval_en_it.feather")

In [None]:
# import pandas as pd
# pd.read_feather("../data/processed/feature_retrieval_en_it.feather")