# Preprocessing and Feature Creation

In this notebook we import the data, preprocess it, and create features for supervised and unsupervised cross-lingual information retrieval models.

## I. Import Data

In this section we import the English and German europarl datasets and combine them into a parallel sentence translation dataframe.

In [None]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so edits to the project's `src` package
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
import os
import sys
# Make the parent of the current working directory importable so that the
# `src` package used below can be resolved. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate sys.path entry.
project_root = os.path.dirname(os.path.abspath(''))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data import create_data_subset

In [None]:
# Build the parallel-sentence subset (sample_size=25000) from the two aligned
# Europarl files and store the result under data/interim.
# NOTE(review): these are the Italian-English files, while the section text and
# the active configuration further down refer to German - confirm which
# language pair is intended here.
europarl_dir = "../data/external"
create_data_subset(
    sentence_data_source_path=f"{europarl_dir}/europarl-v7.it-en.en",
    sentence_data_target_path=f"{europarl_dir}/europarl-v7.it-en.it",
    sample_size=25000,
    sentence_data_sampled_path="../data/interim/europarl_en_it.pkl",
)

## II. Preprocess data

In this section we preprocess the parallel sentence data for feature generation.

In [6]:
import spacy
from nltk.corpus import stopwords
from textblob import TextBlob as textblob_source
from textblob_de import TextBlobDE as textblob_target
import en_core_web_sm
import de_core_news_sm
# import it_core_news_sm
# import pl_core_news_sm
import time
from src.data import PreprocessingEuroParl

In [7]:
# Source side (English): spaCy pipeline and NLTK stop-word list.
nlp_source = en_core_web_sm.load()
stopwords_source = stopwords.words('english')

# Target side: German is the active configuration. To switch language, enable
# the matching commented line in both groups (and the corresponding import).
nlp_target = de_core_news_sm.load() # German pipeline
# nlp_target = it_core_news_sm.load() # Italian pipeline
# nlp_target = pl_core_news_sm.load() # Polish pipeline
stopwords_target = stopwords.words('german') # German stopwords
# stopwords_target = stopwords.words('italian') # Italian stopwords
# stopwords_target = stopwords.words('polish') # Polish stopwords

In [8]:
# Load the sampled sentence dataframe from disk (the run logs 'import_data').
# Exactly one of the three lines should be active, matching the target language.
parallel_sentences = PreprocessingEuroParl(df_sampled_path="../data/interim/feature_retrieval_doc.pickle") # German
# parallel_sentences = PreprocessingEuroParl(df_sampled_path="../data/interim/europarl_en_it.pkl") # Italian
# parallel_sentences = PreprocessingEuroParl(df_sampled_path="../data/interim/europarl_en_pol.pkl") # Polish

Finished function: 'import_data' in 0.03 seconds.


In [43]:
# Sanity check: number of single-space-separated tokens in the target text
# stored under index label 0.
first_target_text = parallel_sentences.dataframe["text_target"][0]
len(first_target_text.split(" "))

313

In [12]:
import numpy as np

# Give every row a running integer id (0 .. n-1). Source and target receive the
# same numbering, so both sides of one row share an id.
sampled_frame = parallel_sentences.dataframe
row_count = len(sampled_frame)
for id_column in ("id_source", "id_target"):
    sampled_frame[id_column] = np.arange(row_count)

In [14]:
# Run the text-cleaning pipeline for source and target. According to the log
# below it covers spaCy parsing, stop-word / punctuation / number removal,
# lemmatization, lowercasing, tokenization and whitespace stripping.
# The spaCy passes dominate the runtime (about 39 s and 209 s in this run).
parallel_sentences.preprocess_sentences(nlp_source, nlp_target, stopwords_source, stopwords_target)

100%|██████████| 5000/5000 [00:39<00:00, 128.12it/s]
100%|██████████| 5000/5000 [00:00<00:00, 230130.04it/s]
100%|██████████| 5000/5000 [00:00<00:00, 130959.05it/s]
100%|██████████| 5000/5000 [00:00<00:00, 143538.30it/s]
100%|██████████| 5000/5000 [00:00<00:00, 57258.87it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'spacy' in 39.02 seconds.
Finished function: 'remove_stopwords' in 0.02 seconds.
Finished function: 'remove_punctuation' in 0.04 seconds.
Finished function: 'remove_numbers' in 0.04 seconds.
Finished function: 'lemmatize' in 0.1 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 145179.85it/s]
  0%|          | 3/5000 [00:00<04:18, 19.30it/s]

Finished function: 'lowercase_spacy' in 0.04 seconds.
Finished function: 'create_cleaned_token_embedding' in 39.32 seconds.


100%|██████████| 5000/5000 [03:28<00:00, 23.97it/s]
100%|██████████| 5000/5000 [00:00<00:00, 43803.92it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'spacy' in 208.61 seconds.
Finished function: 'remove_stopwords' in 0.12 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 20139.01it/s]
 45%|████▍     | 2240/5000 [00:00<00:00, 22396.18it/s]

Finished function: 'remove_punctuation' in 0.25 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 19154.89it/s]
 13%|█▎        | 655/5000 [00:00<00:00, 6548.49it/s]

Finished function: 'remove_numbers' in 0.26 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 8066.27it/s]
 61%|██████    | 3042/5000 [00:00<00:00, 15765.87it/s]

Finished function: 'lemmatize' in 0.62 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 16413.20it/s]
 15%|█▌        | 774/5000 [00:00<00:01, 3665.89it/s]

Finished function: 'lowercase_spacy' in 0.31 seconds.
Finished function: 'create_cleaned_token_embedding' in 210.55 seconds.


100%|██████████| 5000/5000 [00:01<00:00, 4331.66it/s]
100%|██████████| 5000/5000 [00:00<00:00, 148919.01it/s]
100%|██████████| 5000/5000 [00:00<00:00, 183291.85it/s]
100%|██████████| 5000/5000 [00:00<00:00, 184741.80it/s]
  1%|          | 60/5000 [00:00<00:08, 595.98it/s]

Finished function: 'tokenize_sentence' in 1.16 seconds.
Finished function: 'remove_stopwords' in 0.04 seconds.
Finished function: 'strip_whitespace' in 0.03 seconds.
Finished function: 'lowercase' in 0.03 seconds.
Finished function: 'create_cleaned_text' in 1.25 seconds.


100%|██████████| 5000/5000 [00:06<00:00, 778.48it/s]
100%|██████████| 5000/5000 [00:00<00:00, 51586.27it/s]
 31%|███       | 1547/5000 [00:00<00:00, 15468.61it/s]

Finished function: 'tokenize_sentence' in 6.42 seconds.
Finished function: 'remove_stopwords' in 0.1 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 24269.53it/s]
 77%|███████▋  | 3826/5000 [00:00<00:00, 19919.52it/s]

Finished function: 'strip_whitespace' in 0.21 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 19602.78it/s]

Finished function: 'lowercase' in 0.26 seconds.
Finished function: 'create_cleaned_text' in 7.04 seconds.





In [15]:
# Derive per-sentence statistics for both languages. The log below lists the
# steps: punctuation counts, word / unique-word / character counts, average
# characters, part-of-speech counts (number_pos), number_times and named_numbers.
# This step runs spaCy again (about 34 s and 204 s in this run).
parallel_sentences.extract_sentence_information(nlp_source, nlp_target)

100%|██████████| 5000/5000 [00:00<00:00, 86479.42it/s]
 16%|█▌        | 807/5000 [00:00<00:00, 8062.49it/s]

Finished function: 'number_punctuations_total' in 0.06 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 9190.79it/s]
100%|██████████| 5000/5000 [00:00<00:00, 176410.64it/s]
 47%|████▋     | 2368/5000 [00:00<00:00, 23674.77it/s]

Finished function: 'number_punctuations_total' in 0.55 seconds.
Finished function: 'number_words' in 0.03 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 24808.97it/s]
100%|██████████| 5000/5000 [00:00<00:00, 32148.39it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'number_words' in 0.2 seconds.
Finished function: 'number_unique_words' in 0.16 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 7387.41it/s]
100%|██████████| 5000/5000 [00:00<00:00, 71167.10it/s]
 23%|██▎       | 1141/5000 [00:00<00:00, 11402.37it/s]

Finished function: 'number_unique_words' in 0.68 seconds.
Finished function: 'number_characters' in 0.07 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 13578.26it/s]
  return (character_vector / word_vector).replace(np.nan, 0).replace(np.inf, 0).replace(np.log(0), 0)
100%|██████████| 5000/5000 [00:00<00:00, 427170.73it/s]
100%|██████████| 5000/5000 [00:00<00:00, 97405.13it/s]
100%|██████████| 5000/5000 [00:00<00:00, 435310.53it/s]
100%|██████████| 5000/5000 [00:00<00:00, 100333.08it/s]
100%|██████████| 5000/5000 [00:00<00:00, 432375.73it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'number_characters' in 0.37 seconds.
Finished function: 'average_characters' in 0.01 seconds.
Finished function: 'average_characters' in 0.0 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.05 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.05 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 94580.48it/s]
100%|██████████| 5000/5000 [00:00<00:00, 397835.87it/s]
100%|██████████| 5000/5000 [00:00<00:00, 85257.36it/s]
100%|██████████| 5000/5000 [00:00<00:00, 368924.62it/s]
100%|██████████| 5000/5000 [00:00<00:00, 86445.56it/s]
100%|██████████| 5000/5000 [00:00<00:00, 376164.01it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.05 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 88288.13it/s]
100%|██████████| 5000/5000 [00:00<00:00, 344875.27it/s]
100%|██████████| 5000/5000 [00:00<00:00, 82601.79it/s]
100%|██████████| 5000/5000 [00:00<00:00, 367947.22it/s]
100%|██████████| 5000/5000 [00:00<00:00, 71698.98it/s]
100%|██████████| 5000/5000 [00:00<00:00, 314302.50it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.02 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.02 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.
Finished function: 'number_punctuation_marks' in 0.02 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 75258.18it/s]
100%|██████████| 5000/5000 [00:00<00:00, 357961.29it/s]
100%|██████████| 5000/5000 [00:00<00:00, 85483.60it/s]
100%|██████████| 5000/5000 [00:00<00:00, 414694.59it/s]
100%|██████████| 5000/5000 [00:00<00:00, 91989.63it/s]
100%|██████████| 5000/5000 [00:00<00:00, 321876.17it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.07 seconds.
Finished function: 'number_punctuation_marks' in 0.02 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.02 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 88286.64it/s]
100%|██████████| 5000/5000 [00:00<00:00, 428558.70it/s]
100%|██████████| 5000/5000 [00:00<00:00, 95520.04it/s]
100%|██████████| 5000/5000 [00:00<00:00, 432259.87it/s]
100%|██████████| 5000/5000 [00:00<00:00, 98360.86it/s]
100%|██████████| 5000/5000 [00:00<00:00, 432902.32it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.05 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.05 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 91164.67it/s]
100%|██████████| 5000/5000 [00:00<00:00, 392990.03it/s]
100%|██████████| 5000/5000 [00:00<00:00, 82112.13it/s]
100%|██████████| 5000/5000 [00:00<00:00, 366059.00it/s]
100%|██████████| 5000/5000 [00:00<00:00, 71044.86it/s]
100%|██████████| 5000/5000 [00:00<00:00, 400151.12it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.02 seconds.
Finished function: 'number_punctuation_marks' in 0.07 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 92053.83it/s]
100%|██████████| 5000/5000 [00:00<00:00, 399236.99it/s]
100%|██████████| 5000/5000 [00:00<00:00, 88969.82it/s]
100%|██████████| 5000/5000 [00:00<00:00, 380456.44it/s]
100%|██████████| 5000/5000 [00:00<00:00, 88542.72it/s]
100%|██████████| 5000/5000 [00:00<00:00, 406243.73it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.02 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 87865.69it/s]
100%|██████████| 5000/5000 [00:00<00:00, 372913.21it/s]
100%|██████████| 5000/5000 [00:00<00:00, 86852.26it/s]
100%|██████████| 5000/5000 [00:00<00:00, 375604.83it/s]
100%|██████████| 5000/5000 [00:00<00:00, 90943.67it/s]
100%|██████████| 5000/5000 [00:00<00:00, 388836.73it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.02 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 93564.80it/s]
100%|██████████| 5000/5000 [00:00<00:00, 401184.53it/s]
100%|██████████| 5000/5000 [00:00<00:00, 88494.15it/s]
100%|██████████| 5000/5000 [00:00<00:00, 391815.26it/s]
100%|██████████| 5000/5000 [00:00<00:00, 94716.75it/s]
100%|██████████| 5000/5000 [00:00<00:00, 390654.77it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.05 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.05 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 97468.06it/s]
100%|██████████| 5000/5000 [00:00<00:00, 412192.30it/s]
100%|██████████| 5000/5000 [00:00<00:00, 92531.07it/s]
100%|██████████| 5000/5000 [00:00<00:00, 390102.49it/s]
100%|██████████| 5000/5000 [00:00<00:00, 94188.41it/s]
100%|██████████| 5000/5000 [00:00<00:00, 393883.14it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.05 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.05 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 93176.11it/s]
100%|██████████| 5000/5000 [00:00<00:00, 404894.68it/s]
100%|██████████| 5000/5000 [00:00<00:00, 93771.47it/s]
100%|██████████| 5000/5000 [00:00<00:00, 386821.36it/s]
100%|██████████| 5000/5000 [00:00<00:00, 90221.82it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'number_punctuation_marks' in 0.06 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.05 seconds.
Finished function: 'number_punctuation_marks' in 0.01 seconds.
Finished function: 'number_punctuation_marks' in 0.06 seconds.


100%|██████████| 5000/5000 [00:34<00:00, 145.28it/s]
  0%|          | 3/5000 [00:00<02:50, 29.37it/s]

Finished function: 'spacy' in 34.42 seconds.


100%|██████████| 5000/5000 [03:24<00:00, 24.48it/s]
100%|██████████| 5000/5000 [00:00<00:00, 161144.60it/s]
 47%|████▋     | 2353/5000 [00:00<00:00, 23526.43it/s]

Finished function: 'spacy' in 204.28 seconds.
Finished function: 'number_pos' in 0.03 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 26443.83it/s]
100%|██████████| 5000/5000 [00:00<00:00, 158172.96it/s]
 47%|████▋     | 2363/5000 [00:00<00:00, 23628.84it/s]

Finished function: 'number_pos' in 0.19 seconds.
Finished function: 'number_pos' in 0.03 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 27044.01it/s]
100%|██████████| 5000/5000 [00:00<00:00, 168405.36it/s]
 48%|████▊     | 2416/5000 [00:00<00:00, 24155.65it/s]

Finished function: 'number_pos' in 0.19 seconds.
Finished function: 'number_pos' in 0.03 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 26977.91it/s]
 50%|█████     | 2517/5000 [00:00<00:00, 12373.90it/s]

Finished function: 'number_pos' in 0.19 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 13133.73it/s]
  2%|▏         | 114/5000 [00:00<00:04, 1135.67it/s]

Finished function: 'number_times' in 0.38 seconds.


100%|██████████| 5000/5000 [00:03<00:00, 1369.01it/s]
 25%|██▌       | 1259/5000 [00:00<00:00, 12584.10it/s]

Finished function: 'number_times' in 3.65 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 13418.40it/s]
  2%|▏         | 114/5000 [00:00<00:04, 1137.66it/s]

Finished function: 'number_times' in 0.37 seconds.


100%|██████████| 5000/5000 [00:03<00:00, 1375.42it/s]
 52%|█████▏    | 2593/5000 [00:00<00:00, 12615.25it/s]

Finished function: 'number_times' in 3.64 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 13077.94it/s]
  2%|▏         | 106/5000 [00:00<00:04, 1052.56it/s]

Finished function: 'number_times' in 0.38 seconds.


100%|██████████| 5000/5000 [00:03<00:00, 1376.39it/s]
100%|██████████| 5000/5000 [00:00<00:00, 166140.00it/s]
 43%|████▎     | 2167/5000 [00:00<00:00, 21661.76it/s]

Finished function: 'number_times' in 3.63 seconds.
Finished function: 'named_numbers' in 0.03 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 22844.52it/s]

Finished function: 'named_numbers' in 0.22 seconds.





In [16]:
# Build the embedding-based columns for the "proc_5k" embedding set. Logged
# steps: load_embeddings, word_embeddings, create_translation_dictionary,
# translate_words, tf_idf_vector, sentence_embedding_average and
# sentence_embedding_tf_idf (each for source and target).
parallel_sentences.create_embedding_information("proc_5k")

Finished function: 'load_embeddings' in 0.93 seconds.


  3%|▎         | 148/5000 [00:00<00:06, 718.01it/s]

Finished function: 'load_embeddings' in 0.57 seconds.


100%|██████████| 5000/5000 [00:06<00:00, 732.75it/s]
  0%|          | 8/5000 [00:00<01:08, 73.40it/s]

Finished function: 'word_embeddings' in 6.83 seconds.


100%|██████████| 5000/5000 [00:58<00:00, 86.11it/s] 


Finished function: 'word_embeddings' in 58.07 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 147858.57it/s]
 31%|███       | 1533/5000 [00:00<00:00, 15325.70it/s]

Finished function: 'create_translation_dictionary' in 50.15 seconds.
Finished function: 'translate_words' in 0.04 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 17067.76it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'translate_words' in 0.29 seconds.


100%|██████████| 5000/5000 [00:02<00:00, 1729.43it/s]


Finished function: 'tf_idf_vector' in 3.04 seconds.


100%|██████████| 5000/5000 [00:35<00:00, 139.56it/s]
 12%|█▏        | 586/5000 [00:00<00:01, 2836.72it/s]

Finished function: 'tf_idf_vector' in 36.79 seconds.


100%|██████████| 5000/5000 [00:01<00:00, 2943.32it/s]
  4%|▎         | 177/5000 [00:00<00:05, 894.39it/s]

Finished function: 'sentence_embedding_average' in 1.7 seconds.


100%|██████████| 5000/5000 [00:04<00:00, 1103.99it/s]
  1%|          | 33/5000 [00:00<00:15, 328.47it/s]

Finished function: 'sentence_embedding_average' in 4.53 seconds.


100%|██████████| 5000/5000 [00:14<00:00, 347.19it/s]
  0%|          | 5/5000 [00:00<02:07, 39.11it/s]

Finished function: 'sentence_embedding_tf_idf' in 14.4 seconds.


100%|██████████| 5000/5000 [02:31<00:00, 32.96it/s]


Finished function: 'sentence_embedding_tf_idf' in 151.71 seconds.


In [17]:
# Same embedding pipeline as for "proc_5k", run for the "proc_b_1k" embedding set
# (the log shows the identical sequence of steps).
parallel_sentences.create_embedding_information("proc_b_1k")

Finished function: 'load_embeddings' in 1.04 seconds.


  1%|          | 62/5000 [00:00<00:08, 617.06it/s]

Finished function: 'load_embeddings' in 0.79 seconds.


100%|██████████| 5000/5000 [00:07<00:00, 665.98it/s]
  0%|          | 8/5000 [00:00<01:04, 77.59it/s]

Finished function: 'word_embeddings' in 7.51 seconds.


100%|██████████| 5000/5000 [01:14<00:00, 67.12it/s] 


Finished function: 'word_embeddings' in 74.49 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 151805.83it/s]
 30%|██▉       | 1491/5000 [00:00<00:00, 14903.44it/s]

Finished function: 'create_translation_dictionary' in 65.33 seconds.
Finished function: 'translate_words' in 0.04 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 16913.21it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'translate_words' in 0.3 seconds.


100%|██████████| 5000/5000 [00:02<00:00, 1720.12it/s]


Finished function: 'tf_idf_vector' in 3.07 seconds.


100%|██████████| 5000/5000 [00:37<00:00, 133.82it/s]
  5%|▌         | 272/5000 [00:00<00:01, 2716.80it/s]

Finished function: 'tf_idf_vector' in 38.32 seconds.


100%|██████████| 5000/5000 [00:04<00:00, 1112.15it/s]
  1%|▏         | 70/5000 [00:00<00:07, 691.14it/s]

Finished function: 'sentence_embedding_average' in 4.5 seconds.


100%|██████████| 5000/5000 [00:10<00:00, 478.46it/s]
  0%|          | 16/5000 [00:00<00:31, 155.97it/s]

Finished function: 'sentence_embedding_average' in 10.45 seconds.


100%|██████████| 5000/5000 [00:19<00:00, 253.59it/s]
  0%|          | 3/5000 [00:00<04:27, 18.70it/s]

Finished function: 'sentence_embedding_tf_idf' in 19.73 seconds.


100%|██████████| 5000/5000 [02:41<00:00, 30.99it/s]

Finished function: 'sentence_embedding_tf_idf' in 161.37 seconds.





In [18]:
# Same embedding pipeline as for "proc_5k", run for the "vecmap" embedding set
# (the log shows the identical sequence of steps).
parallel_sentences.create_embedding_information("vecmap")

Finished function: 'load_embeddings' in 0.88 seconds.


  1%|          | 49/5000 [00:00<00:10, 485.97it/s]

Finished function: 'load_embeddings' in 0.54 seconds.


100%|██████████| 5000/5000 [00:08<00:00, 599.94it/s]
  0%|          | 6/5000 [00:00<01:38, 50.80it/s]

Finished function: 'word_embeddings' in 8.34 seconds.


100%|██████████| 5000/5000 [01:15<00:00, 66.20it/s] 


Finished function: 'word_embeddings' in 75.53 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 140734.29it/s]
 29%|██▉       | 1454/5000 [00:00<00:00, 14538.14it/s]

Finished function: 'create_translation_dictionary' in 71.19 seconds.
Finished function: 'translate_words' in 0.04 seconds.


100%|██████████| 5000/5000 [00:00<00:00, 15830.55it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

Finished function: 'translate_words' in 0.32 seconds.


100%|██████████| 5000/5000 [00:03<00:00, 1420.06it/s]


Finished function: 'tf_idf_vector' in 3.69 seconds.


100%|██████████| 5000/5000 [00:42<00:00, 118.35it/s]
  5%|▍         | 235/5000 [00:00<00:02, 2347.18it/s]

Finished function: 'tf_idf_vector' in 43.54 seconds.


100%|██████████| 5000/5000 [00:02<00:00, 1818.78it/s]
  2%|▏         | 94/5000 [00:00<00:05, 935.46it/s]

Finished function: 'sentence_embedding_average' in 2.75 seconds.


100%|██████████| 5000/5000 [00:05<00:00, 991.11it/s] 
  1%|          | 35/5000 [00:00<00:14, 344.49it/s]

Finished function: 'sentence_embedding_average' in 5.05 seconds.


100%|██████████| 5000/5000 [00:19<00:00, 250.69it/s]
  0%|          | 4/5000 [00:00<02:07, 39.08it/s]

Finished function: 'sentence_embedding_tf_idf' in 19.95 seconds.


100%|██████████| 5000/5000 [02:21<00:00, 35.22it/s]

Finished function: 'sentence_embedding_tf_idf' in 142.01 seconds.





In [19]:
# Persist the preprocessed dataframe as JSON so the time-consuming steps above
# do not have to be repeated; the next cell reads this file back.
parallel_sentences.preprocessed.to_json("../data/interim/preprocessed_data_doc.json")

In [20]:
import pandas as pd

# Restart point: read the stored preprocessing result and attach it to a newly
# constructed PreprocessingEuroParl object, replacing the in-memory one.
interim_dir = "../data/interim"
preprocessed_data = pd.read_json(f"{interim_dir}/preprocessed_data_doc.json")
parallel_sentences = PreprocessingEuroParl(
    df_sampled_path=f"{interim_dir}/feature_retrieval_doc.pickle"
)
parallel_sentences.preprocessed = preprocessed_data

Finished function: 'import_data' in 0.03 seconds.


## III. Create data set

In this section we create the dataset for training the supervised model and the dataset for supervised and unsupervised retrieval.

In [21]:
from src.data import DataSet

In [22]:
# Parameters for building the model and retrieval datasets.
n_model = 0  # passed to split_model_retrieval and create_model_index (presumably the size of the model subset - confirm)
n_queries = 100  # number of leading rows of the retrieval subset whose id_source is used as a query
n_retrieval = 5000  # passed to split_model_retrieval (presumably the size of the retrieval subset - confirm)
k = 10  # passed to create_model_index; its meaning is defined there
sample_size_k = 100  # passed to create_model_index; its meaning is defined there

In [23]:
# Wrap the preprocessed dataframe in the project's DataSet helper.
dataset = DataSet(parallel_sentences.preprocessed)
# Alternative: build it from the frame read back from JSON.
#dataset = DataSet(preprocessed_data)

Finished function: '__init__' in 0.0 seconds.


In [26]:
# Split the data into the part used for the supervised model and the part used
# for retrieval, using the n_model and n_retrieval values set above.
dataset.split_model_retrieval(n_model, n_retrieval)

Finished function: 'split_model_retrieval' in 0.01 seconds.


In [None]:
# Build the index of sentence pairs for the supervised model, based on the
# tf-idf weighted "proc_5k" sentence embeddings of source and target.
# NOTE(review): n_model is 0 in the parameter cell and this cell has no
# execution count - confirm whether the model index is meant to be built here.
dataset.create_model_index(n_model, k, sample_size_k,
     "sentence_embedding_tf_idf_proc_5k_source", "sentence_embedding_tf_idf_proc_5k_target")

In [None]:
# Store the model index; reset_index(drop=True) restores a default RangeIndex,
# which pandas requires for to_feather.
dataset.model_dataset_index.reset_index(drop=True).to_feather("../data/processed/dataset_model_index_en_de.feather")

In [None]:
# Optional check: read the saved model index back (same file as written above).
# import pandas as pd
# pd.read_feather("../data/processed/dataset_model_index_en_de.feather")

In [31]:
# Display the retrieval index (one row per query/document id pair).
# NOTE(review): the attribute is assigned in the following cell, which carries
# a lower execution count - on a fresh top-to-bottom run this cell may need to
# come after it.
dataset.retrieval_dataset_index

Unnamed: 0,id_source,id_target
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
499995,99,4995
499996,99,4996
499997,99,4997
499998,99,4998


In [28]:
import pandas as pd
#dataset.create_retrieval_index(n_queries)

# If your pandas version is old, use this instead
# Manual construction of the retrieval index: pair the id_source of each of the
# first `n_queries` rows of the retrieval subset with every id_target in it
# (Cartesian product).
query_ids = dataset.retrieval_subset.iloc[:n_queries]["id_source"]
document_ids = dataset.retrieval_subset["id_target"]
index = pd.MultiIndex.from_product([query_ids, document_ids], names=["id_source", "id_target"])
# reset_index() turns the two index levels into ordinary columns.
dataset.retrieval_dataset_index = pd.DataFrame(index=index).reset_index()

In [29]:
# Store the retrieval index; reset_index(drop=True) restores a default
# RangeIndex, which pandas requires for to_feather.
dataset.retrieval_dataset_index.reset_index(drop=True).to_feather("../data/processed/dataset_retrieval_index_en_de.feather")

In [None]:
# Optional check: read the saved retrieval index back (same file as written above).
# import pandas as pd
# pd.read_feather("../data/processed/dataset_retrieval_index_en_de.feather")

## IV. Create features

In this section we create features for our model that are sentence-based and should be created before the text is preprocessed.

In [30]:
#%autoreload 2
from src.features import feature_generation_class

In [None]:
# import pickle
# with open(r"../data/processed/correlated_features.pkl", "rb") as file:
#    chosen_features = pickle.load(file)

Generation of the training data for the supervised classification model.

In [None]:
# Feature generator for the supervised model: combines the model pair index
# with the preprocessed sentence data.
features_model = feature_generation_class.FeatureGeneration(dataset.model_dataset_index, 
                                                             parallel_sentences.preprocessed)

In [None]:
# Set up the feature dataframe for the model pairs (presumably one row per
# pair in the index - confirm in FeatureGeneration).
features_model.create_feature_dataframe()

In [None]:
# Add the sentence-based features for the model pairs (same call as made for
# the retrieval pairs further down, where its logged steps are visible).
features_model.create_sentence_features()

In [None]:
# Add the embedding-based features for the "proc_5k" embedding set.
features_model.create_embedding_features("proc_5k")

In [None]:
# Add the embedding-based features for the "proc_b_1k" embedding set.
features_model.create_embedding_features("proc_b_1k")

In [None]:
# Add the embedding-based features for the "vecmap" embedding set.
features_model.create_embedding_features("vecmap")

In [None]:
# Store the model features; reset_index(drop=True) restores a default
# RangeIndex, which pandas requires for to_feather.
features_model.feature_dataframe.reset_index(drop=True).to_feather("../data/processed/feature_model_en_de.feather")

In [None]:
# Optional check: read the saved model features back (same file as written above).
# import pandas as pd
# pd.read_feather("../data/processed/feature_model_en_de.feather")

Generation of the data for the cross-lingual information retrieval task.

In [32]:
# Feature generator for the retrieval task: combines the query/document pair
# index with the preprocessed sentence data.
features_retrieval = feature_generation_class.FeatureGeneration(dataset.retrieval_dataset_index, 
                                                            parallel_sentences.preprocessed)

In [34]:
# Set up the feature dataframe for the retrieval pairs (presumably one row per
# pair in the index - confirm in FeatureGeneration).
features_retrieval.create_feature_dataframe()

Finished function: 'create_feature_dataframe' in 0.01 seconds.


In [35]:
# Add the sentence-based features for all retrieval pairs. Logged steps:
# difference_numerical, relative_difference_numerical,
# normalized_difference_numerical (repeated per base feature) and jaccard.
features_retrieval.create_sentence_features()

  return abs(target_array - source_array).replace(np.nan, 0).replace(np.inf, 0).replace(np.log(0), 0)
  0), 0)
  np.log(0), 0)


Finished function: 'difference_numerical' in 0.02 seconds.
Finished function: 'relative_difference_numerical' in 0.01 seconds.
Finished function: 'normalized_difference_numerical' in 0.01 seconds.
Finished function: 'difference_numerical' in 0.0 seconds.
Finished function: 'relative_difference_numerical' in 0.01 seconds.
Finished function: 'normalized_difference_numerical' in 0.02 seconds.
Finished function: 'difference_numerical' in 0.0 seconds.
Finished function: 'relative_difference_numerical' in 0.01 seconds.
Finished function: 'normalized_difference_numerical' in 0.02 seconds.
Finished function: 'difference_numerical' in 0.0 seconds.
Finished function: 'relative_difference_numerical' in 0.02 seconds.
Finished function: 'normalized_difference_numerical' in 0.02 seconds.
Finished function: 'difference_numerical' in 0.01 seconds.
Finished function: 'relative_difference_numerical' in 0.01 seconds.
Finished function: 'normalized_difference_numerical' in 0.02 seconds.
Finished function:

  0%|          | 0/500000 [00:00<?, ?it/s]

Finished function: 'difference_numerical' in 0.0 seconds.
Finished function: 'relative_difference_numerical' in 0.01 seconds.
Finished function: 'normalized_difference_numerical' in 0.02 seconds.


100%|██████████| 500000/500000 [00:13<00:00, 37651.01it/s]

Finished function: 'jaccard' in 13.4 seconds.
Finished function: 'create_sentence_features' in 16.18 seconds.





In [36]:
# Add the embedding-based features for the "proc_5k" embedding set. Logged
# steps: cosine_similarity_vector, euclidean_distance_vector and jaccard, two
# passes each over the 500,000 pairs. Slow: roughly 12 minutes in this run.
features_retrieval.create_embedding_features("proc_5k")

100%|██████████| 500000/500000 [03:25<00:00, 2436.26it/s]
  0%|          | 261/500000 [00:00<07:27, 1117.74it/s]

Finished function: 'cosine_similarity_vector' in 205.33 seconds.


100%|██████████| 500000/500000 [03:11<00:00, 2616.60it/s]
  0%|          | 371/500000 [00:00<02:14, 3704.97it/s]

Finished function: 'cosine_similarity_vector' in 191.16 seconds.


100%|██████████| 500000/500000 [02:09<00:00, 3848.98it/s]
  0%|          | 285/500000 [00:00<02:55, 2847.98it/s]

Finished function: 'euclidean_distance_vector' in 129.95 seconds.


100%|██████████| 500000/500000 [02:08<00:00, 3890.15it/s]
  0%|          | 0/500000 [00:00<?, ?it/s]

Finished function: 'euclidean_distance_vector' in 128.58 seconds.


100%|██████████| 500000/500000 [00:31<00:00, 15949.33it/s]
  0%|          | 0/500000 [00:00<?, ?it/s]

Finished function: 'jaccard' in 31.46 seconds.


100%|██████████| 500000/500000 [00:29<00:00, 16816.77it/s]

Finished function: 'jaccard' in 29.87 seconds.
Finished function: 'create_embedding_features' in 716.38 seconds.





In [37]:
# Same embedding features for the "proc_b_1k" embedding set
# (roughly 12 minutes in this run).
features_retrieval.create_embedding_features("proc_b_1k")

100%|██████████| 500000/500000 [03:14<00:00, 2575.93it/s]
  0%|          | 157/500000 [00:00<05:18, 1569.37it/s]

Finished function: 'cosine_similarity_vector' in 194.19 seconds.


100%|██████████| 500000/500000 [03:30<00:00, 2372.52it/s]
  0%|          | 141/500000 [00:00<05:54, 1408.09it/s]

Finished function: 'cosine_similarity_vector' in 210.83 seconds.


100%|██████████| 500000/500000 [02:13<00:00, 3758.93it/s]
  0%|          | 311/500000 [00:00<02:40, 3108.94it/s]

Finished function: 'euclidean_distance_vector' in 133.09 seconds.


100%|██████████| 500000/500000 [02:00<00:00, 4152.09it/s]
  0%|          | 1239/500000 [00:00<00:40, 12384.20it/s]

Finished function: 'euclidean_distance_vector' in 120.48 seconds.


100%|██████████| 500000/500000 [00:25<00:00, 19320.67it/s]
  0%|          | 1355/500000 [00:00<00:36, 13549.30it/s]

Finished function: 'jaccard' in 25.97 seconds.


100%|██████████| 500000/500000 [00:25<00:00, 19679.13it/s]

Finished function: 'jaccard' in 25.5 seconds.
Finished function: 'create_embedding_features' in 710.1 seconds.





In [38]:
# Same embedding features for the "vecmap" embedding set
# (roughly 11 minutes in this run).
features_retrieval.create_embedding_features("vecmap")

100%|██████████| 500000/500000 [03:01<00:00, 2747.36it/s]
  0%|          | 151/500000 [00:00<05:31, 1506.67it/s]

Finished function: 'cosine_similarity_vector' in 182.11 seconds.


100%|██████████| 500000/500000 [02:52<00:00, 2903.38it/s]
  0%|          | 363/500000 [00:00<02:17, 3623.14it/s]

Finished function: 'cosine_similarity_vector' in 172.29 seconds.


100%|██████████| 500000/500000 [01:49<00:00, 4553.49it/s]
  0%|          | 154/500000 [00:00<05:24, 1539.96it/s]

Finished function: 'euclidean_distance_vector' in 109.86 seconds.


100%|██████████| 500000/500000 [02:06<00:00, 3955.79it/s]
  0%|          | 1063/500000 [00:00<00:46, 10625.32it/s]

Finished function: 'euclidean_distance_vector' in 126.46 seconds.


100%|██████████| 500000/500000 [00:23<00:00, 21659.24it/s]
  0%|          | 1153/500000 [00:00<00:43, 11523.33it/s]

Finished function: 'jaccard' in 23.16 seconds.


100%|██████████| 500000/500000 [00:27<00:00, 17869.69it/s]

Finished function: 'jaccard' in 28.05 seconds.
Finished function: 'create_embedding_features' in 641.98 seconds.





In [39]:
# Store the retrieval features; reset_index(drop=True) restores a default
# RangeIndex, which pandas requires for to_feather.
features_retrieval.feature_dataframe.reset_index(drop=True).to_feather("../data/processed/feature_retrieval_doc.feather")

In [None]:
# Optional check: read the saved retrieval features back (same file as written above).
# import pandas as pd
# pd.read_feather("../data/processed/feature_retrieval_doc.feather")