# Preprocessing and Feature Creation

In this notebook we import the data, preprocess it, and create features for supervised and unsupervised cross-lingual information retrieval models.

## I. Import Data

In this section we import the English and German Europarl datasets and combine them into a dataframe of parallel sentences (each English sentence paired with its German translation).

In [1]:
from src.data.preprocessing_class import PreprocessingEuroParl

In [2]:
# Point the Europarl preprocessing helper at the two halves of the de-en
# corpus: the English file is the source side, the German file the target side.
parallel_sentences = PreprocessingEuroParl(
    sentence_data_source='../data/external/europarl-v7.de-en.en',
    sentence_data_target='../data/external/europarl-v7.de-en.de',
)

In [3]:
# Inspect the paired raw sentences (columns: text_source, text_target).
parallel_sentences.dataframe

Unnamed: 0,text_source,text_target
1352789,I am therefore very pleased that the Commissio...,"Ich freue mich daher sehr, dass die Kommission..."
1522032,An energy policy based on centralized producti...,"Eine Politik, die auf einer zentralen, mit ein..."
1217998,The fact that we have successfully achieved th...,"Die Tatsache, dass es uns gelungen ist, die Zu..."
131843,That is why I specifically welcome the deadlin...,"Deshalb begrüße ich ausdrücklich die Fristen, ..."
194652,Mr Lannoye and several others made a most impo...,Herr Lannoye hat - ebenso wie einige andere - ...
...,...,...
611639,"Why not, since it is the truth; it was a day w...","Warum nicht, denn das ist die Wahrheit; es ist..."
748901,"Mr President, I would like to thank the rap...","Herr Präsident, ich möchte dem Berichtersta..."
1053079,The European Union that we have now and that r...,"Die Europäische Union, die wir haben und die d..."
68661,"Like the ECHO case, the Fléchard case is an ex...",Die Fälle Fléchard und ECHO sind Beispiele für...


## II. Preprocess data

In this section we preprocess the parallel sentence data.

In [4]:
import spacy
from nltk.corpus import stopwords
from textblob import TextBlob as textblob_source
from textblob_de import TextBlobDE as textblob_target
import en_core_web_sm
import de_core_news_sm

In [5]:
# Resources handed to the preprocessing steps, grouped per language.
# Note: the embedding_* names hold file *paths* (strings), not loaded objects.
# NOTE(review): the ".p" files are presumably pickles; only load them from a
# trusted location.

# --- Source side (English) ---
nlp_source = en_core_web_sm.load()
stopwords_source = stopwords.words('english')
embedding_matrix_source = "../data/interim/proc_b_src_emb.p"
embedding_dictionary_source = "../data/interim/proc_b_src_word.p"

# --- Target side (German) ---
nlp_target = de_core_news_sm.load()
stopwords_target = stopwords.words('german')
embedding_matrix_target = "../data/interim/proc_b_trg_emb.p"
embedding_dictionary_target = "../data/interim/proc_b_trg_word.p"

In [6]:
# Preprocess the source-side (English) sentences, passing the English stopword
# list, spaCy pipeline, TextBlob class and the source embedding file paths.
parallel_sentences.preprocess_sentences_source(
    stopwords_source,
    nlp_source,
    textblob_source,
    embedding_matrix_source,
    embedding_dictionary_source,
)

In [7]:
# Preprocess the target-side (German) sentences, passing the German stopword
# list, spaCy pipeline, TextBlob class and the target embedding file paths.
parallel_sentences.preprocess_sentences_target(
    stopwords_target,
    nlp_target,
    textblob_target,
    embedding_matrix_target,
    embedding_dictionary_target,
)

In [8]:
# Combine the source- and target-side preprocessing results.
# NOTE(review): presumably this populates `preprocessed_dataframe` with the
# *_source and *_target columns side by side — confirm in PreprocessingEuroParl.
parallel_sentences.combine_source_target()

In [9]:
# Add the label to the preprocessed sentence pairs.
# NOTE(review): presumably this is the `Translation` column shown in the output
# below (1 for every aligned pair) — confirm in PreprocessingEuroParl.add_label.
parallel_sentences.add_label()

In [12]:
# Inspect the preprocessed frame: per-sentence *_source / *_target columns
# plus the Translation label.
parallel_sentences.preprocessed_dataframe

Unnamed: 0,number_stopwords_source,number_punctuations_total_source,number_words_source,number_unique_words_source,number_characters_source,characters_avg_source,number_!_source,"number_""_source",number_#_source,number_$_source,...,number_VERB_target,number_X_target,number_Pres_target,number_Past_target,number__target,score_polarity_target,score_subjectivity_target,list_named_entities_target,sentence_embedding_target,Translation
1352789,44,0,8,8,62.0,7.750000,0,0,0,0,...,2,0,0,0,0,0.00,0.0,[(Kommission)],"[[-0.03287947177886963, 0.10836920142173767, -...",1
1522032,62,0,16,16,115.0,7.187500,0,0,0,0,...,2,0,0,0,0,0.00,0.0,[],"[[-0.06837829947471619, 0.04314956068992615, -...",1
1217998,69,0,16,16,120.0,7.500000,0,0,0,0,...,3,0,0,0,0,1.00,0.0,"[(Europäische, Parlament)]","[[0.04801248386502266, -0.014479025267064571, ...",1
131843,36,0,6,6,49.0,8.166667,0,0,0,0,...,2,0,0,0,0,0.70,0.0,[],"[[-0.023463550955057144, -0.030060242861509323...",1
194652,71,1,16,15,106.0,6.625000,0,0,0,0,...,3,0,0,0,0,0.00,0.0,[(Wichtiges)],"[[-0.04381972923874855, 0.13668182492256165, -...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611639,40,2,8,8,48.0,6.000000,0,0,0,0,...,1,0,0,0,0,0.00,0.0,"[(Warum), (Sowjetarmee)]","[[-0.0044578853994607925, -0.03775882720947265...",1
748901,28,1,9,9,49.0,5.444444,0,0,0,0,...,1,0,0,0,0,0.15,0.0,[],"[[-0.04381972923874855, 0.13668182492256165, -...",1
1053079,60,2,12,12,86.0,7.166667,1,0,0,0,...,1,0,0,0,0,0.00,0.0,"[(Europäische, Union)]","[[-0.05865674838423729, 0.0236191563308239, -0...",1
68661,33,2,11,10,69.0,6.272727,0,0,0,0,...,0,0,0,0,0,-1.00,0.0,"[(ECHO), (europäischen)]","[[0.07210885733366013, -0.01837921142578125, -...",1


## III. Create data set

In [13]:
from src.data.dataset_class import DataSet

In [14]:
# Wrap the preprocessed parallel sentences in the DataSet helper.
# NOTE(review): presumably DataSet reads `parallel_sentences.preprocessed_dataframe`
# — confirm against src.data.dataset_class.
dataset = DataSet(parallel_sentences)

In [15]:
# Size passed to DataSet.get_sample, kept as a named constant so it is easy to
# find and tune instead of being a bare magic number.
# NOTE(review): the dataset displayed below contains rows labelled
# Translation=1 and Translation=0, so this is presumably the number of sentence
# pairs drawn per class — confirm against DataSet.get_sample.
# NOTE(review): if get_sample draws randomly, fix a seed so that
# "Restart Kernel -> Run All" reproduces the same sample.
SAMPLE_SIZE = 50
dataset.get_sample(SAMPLE_SIZE)

In [16]:
# Inspect the sampled dataset; the Translation column holds the label
# (both 1 and 0 appear in the output).
dataset.dataset

Unnamed: 0,number_stopwords_source,number_punctuations_total_source,number_words_source,number_unique_words_source,number_characters_source,characters_avg_source,number_!_source,"number_""_source",number_#_source,number_$_source,...,number_VERB_target,number_X_target,number_Pres_target,number_Past_target,number__target,score_polarity_target,score_subjectivity_target,list_named_entities_target,sentence_embedding_target,Translation
0,74,3,17,17,125.0,7.352941,0,0,0,0,...,1,0,0,0,0,0.0,0.0,"[(Europäische, Union), (Vereinigten, Staaten),...","[[-0.023463550955057144, -0.030060242861509323...",1
1,49,1,10,10,55.0,5.500000,0,0,0,0,...,1,0,0,0,0,0.0,0.0,[],"[[-0.029286062344908714, 0.11295177787542343, ...",1
2,24,1,10,10,64.0,6.400000,0,0,0,0,...,2,0,0,0,0,0.7,0.0,"[(Jean, Charles, de, Menezes), (Londoner, Metro)]","[[0.06748821586370468, 0.059450697153806686, -...",1
3,40,2,8,8,48.0,6.000000,0,0,0,0,...,1,0,0,0,0,0.0,0.0,"[(Warum), (Sowjetarmee)]","[[-0.0044578853994607925, -0.03775882720947265...",1
4,21,0,5,5,33.0,6.600000,0,0,0,0,...,1,0,0,0,0,0.0,0.0,[],"[[-0.03112952411174774, -0.05413830280303955, ...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,11,1,1,1,4.0,4.000000,0,0,0,0,...,0,0,0,0,0,-1.0,0.0,"[(ECHO), (europäischen)]","[[0.07210885733366013, -0.01837921142578125, -...",0
96,74,3,17,17,125.0,7.352941,0,0,0,0,...,0,0,0,0,0,0.0,0.0,[(EU)],"[[0.032416995614767075, 0.012670580297708511, ...",0
97,173,4,37,34,272.0,7.351351,0,0,0,0,...,1,0,0,0,0,0.0,0.0,"[(Europäische, Union)]","[[-0.05865674838423729, 0.0236191563308239, -0...",0
98,25,1,6,6,44.0,7.333333,0,0,0,0,...,2,0,0,0,0,0.0,0.0,[],"[[-0.025180837139487267, -0.0368436835706234, ...",0


## IV. Create sentence-based features

In this section we create the sentence-based features for our model, i.e. features that should be computed on the raw sentences before the text is preprocessed.

In [17]:
from src.features.feature_generation_class import FeatureGeneration

In [18]:
# Build the feature generator on top of the sampled dataset frame.
features = FeatureGeneration(dataset.dataset)

In [19]:
# Run the feature generation; the result is read from
# `features.feature_dataframe` in the next cell.
features.feature_generation()

In [20]:
# Inspect the generated features: for each statistic there is a *_difference,
# *_difference_relative and *_difference_normalized column, plus the
# Translation label.
features.feature_dataframe

Unnamed: 0,number_punctuations_total_difference,number_punctuations_total_difference_relative,number_punctuations_total_difference_normalized,number_words_difference,number_words_difference_relative,number_words_difference_normalized,number_unique_words_difference,number_unique_words_difference_relative,number_unique_words_difference_normalized,number_!_difference,...,score_polarity_difference,score_polarity_difference_relative,score_polarity_difference_normalized,score_subjectivity_difference,score_subjectivity_difference_relative,score_subjectivity_difference_normalized,number_stopwords_difference,number_stopwords_difference_relative,number_stopwords_difference_normalized,Translation
0,-3,-1.000000,0.150000,2,0.117647,-0.150000,2,0.117647,-0.150000,0,...,-0.187500,-1.0,0.009375,-0.500000,-1.0,0.025000,-74,-1.0,3.700000,1
1,-1,-1.000000,0.090909,-3,-0.300000,-0.090909,-3,-0.300000,-0.090909,0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,-49,-1.0,4.454545,1
2,0,0.000000,0.007576,1,0.100000,-0.007576,1,0.100000,-0.007576,0,...,0.692857,97.0,-0.057684,-0.285714,-1.0,0.025974,-24,-1.0,2.181818,1
3,1,0.500000,-0.133333,-2,-0.250000,0.133333,-2,-0.250000,0.133333,0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,-40,-1.0,4.000000,1
4,1,0.000000,-0.111111,3,0.600000,0.111111,3,0.600000,0.111111,0,...,-0.300000,-1.0,0.060000,-0.450000,-1.0,0.090000,-21,-1.0,4.200000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1,-1.000000,0.500000,8,8.000000,-0.500000,8,8.000000,-0.500000,0,...,-0.687500,2.2,-0.045139,-0.687500,-1.0,0.343750,-11,-1.0,5.500000,0
96,-2,-0.666667,0.059091,-7,-0.411765,-0.059091,-7,-0.411765,-0.059091,0,...,-0.187500,-1.0,0.009375,-0.500000,-1.0,0.025000,-74,-1.0,3.700000,0
97,-1,-0.250000,-0.152439,-28,-0.756757,0.152439,-25,-0.735294,0.079268,1,...,-0.293452,-1.0,0.007157,-0.468452,-1.0,0.011426,-173,-1.0,4.219512,0
98,0,0.000000,-0.107143,-3,-0.500000,0.107143,-3,-0.500000,0.107143,0,...,0.500000,-1.0,-0.071429,-0.900000,-1.0,0.128571,-25,-1.0,3.571429,0


## V. Create token-based features