# Preprocessing and Feature Creation

In this notebook we import the data, preprocess it, and create features for supervised and unsupervised cross-lingual information retrieval models.

## I. Import Data

In this section we import the English and German Europarl datasets and combine them into a dataframe of parallel (translated) sentences.

In [1]:
from src.data.preprocessing_class import PreprocessingEuroParl

In [2]:
# Load the English (source) and German (target) sides of the Europarl corpus.
# number_datapoints=100 keeps the notebook fast; presumably it caps the number
# of sentence pairs that are loaded -- confirm against PreprocessingEuroParl.
parallel_sentences = PreprocessingEuroParl(
    sentence_data_source='../data/external/europarl-v7.de-en.en',
    sentence_data_target='../data/external/europarl-v7.de-en.de',
    number_datapoints=100,
)

In [3]:
# Show the loaded sentence pairs (columns: text_source, text_target).
parallel_sentences.dataframe

Unnamed: 0,text_source,text_target
665025,This is obviously a falsehood: the more we let...,"Dies ist ganz offensichtlich eine Unwahrheit, ..."
1134737,"But, if the honourable Member wishes, I could ...","Aber wenn es der Herr Abgeordnete wünscht, kön..."
1627566,"Thank you, Mr Posselt.","Danke, Herr Posselt."
1427234,I may be taking a minute more than I usually d...,Ich brauche vielleicht eine Minute länger als ...
909689,"Finally, there must be coordination of immigra...",Letztendlich ist eine Koordinierung der Einwan...
...,...,...
1025399,"The farmer usually pays too, because for it to...","Der Landwirt zahlt in der Regel auch, denn dam..."
732337,I think that figure has probably decreased ove...,Dieser Anteil hat sich möglicherweise im Verla...
1811271,"Mr President, Mr President-in-Office, Commissi...","Herr Präsident, Herr Ratspräsident, Herr Kommi..."
471457,"What is happening in Europe, with the national...","Das, was in Europa mit den nationalen Gesetzen..."


In [4]:
# Disabled cell ("duc" data set): kept for reference only, nothing here is executed.
# If re-enabled it would
#   1. label the parallel sentences with Translation=1,
#   2. load a second sample and pair its source sentences with a shuffled copy
#      of its target sentences, labelled Translation=0,
#   3. concatenate both frames and pickle them to ../data/processed/dataset_duc.pkl.
# NOTE(review): shuffling with sample(frac=1) can leave a row paired with its
# true translation by chance, and the file handle is never closed.
# #duc dataset
# parallel_sentences.dataframe['Translation']=1
# wrong= PreprocessingEuroParl(sentence_data_source='../data/external/europarl-v7.de-en.en',
#                  sentence_data_target='../data/external/europarl-v7.de-en.de',number_datapoints=10000)
# import pandas as pd
# wrong_data=pd.concat([wrong.dataframe.drop(columns='text_target').reset_index(drop=True),wrong.dataframe['text_target'].sample(frac=1).reset_index(drop=True)],axis=1)
# wrong_data['Translation']=0
# data=pd.concat([parallel_sentences.dataframe.reset_index(drop=True),wrong_data.reset_index(drop=True)])
# import pickle
# filehandler = open('../data/processed/dataset_duc.pkl', 'wb')
# pickle.dump(data, filehandler)

## II. Preprocess data

In this section we preprocess the parallel sentence data.

In [5]:
import spacy
from nltk.corpus import stopwords
from textblob import TextBlob as textblob_source
from textblob_de import TextBlobDE as textblob_target
import en_core_web_sm
import de_core_news_sm

In [6]:
# --- Source language (English) resources ---
stopwords_source = stopwords.words('english')
nlp_source = en_core_web_sm.load()
# Paths to the source-side embedding matrix and its word dictionary
# (presumably pickled files, judging by the .p extension).
embedding_matrix_source = "../data/interim/proc_b_src_emb.p"
embedding_dictionary_source = "../data/interim/proc_b_src_word.p"

# --- Target language (German) resources ---
stopwords_target = stopwords.words('german')
nlp_target = de_core_news_sm.load()
# Same two files for the target side.
embedding_matrix_target = "../data/interim/proc_b_trg_emb.p"
embedding_dictionary_target = "../data/interim/proc_b_trg_word.p"

In [7]:
# Run the preprocessing pipeline. All arguments are positional: the five
# source-language resources first, then the five matching target-language ones.
# NOTE(review): the meaning of the trailing `1` is not visible in this notebook;
# confirm it against the signature of preprocess_sentences.
parallel_sentences.preprocess_sentences(
    stopwords_source,
    nlp_source,
    textblob_source,
    embedding_matrix_source,
    embedding_dictionary_source,
    stopwords_target,
    nlp_target,
    textblob_target,
    embedding_matrix_target,
    embedding_dictionary_target,
    1,
)

  return (character_vector / word_vector).replace(np.nan, 0).replace(np.inf, 0).replace(np.log(0), 0)


In [9]:
# Inspect the preprocessed frame: token lists, count features, word-level
# translations, tf-idf weights, sentence embeddings and the Translation label.
parallel_sentences.preprocessed

Unnamed: 0,token_preprocessed_embedding_source,token_preprocessed_embedding_target,number_stopwords_source,number_punctuations_total_source,number_words_source,number_unique_words_source,number_characters_source,characters_avg_source,number_!_source,"number_""_source",...,pca_word_embedding_target,translated_to_target_source,translated_to_source_target,tf_idf_source,tf_idf_target,sentence_embedding_average_source,sentence_embedding_average_target,sentence_embedding_tf_idf_source,sentence_embedding_tf_idf_target,Translation
0,"[obviously, falsehood, let, legal, immigrant, ...","[ganz, offensichtlich, unwahrheit, je, mehr, l...",88,4,19,18,120,6.315789,0,0,...,ganz offensichtlich unwahrheit ...,"[funktioniert, volkspark, firma, gründen, plac...","[lister, rowohlt, auszutragen, short, anlass, ...","{'': 0.0, ''s': 0.0, '...': 0.0, 'able': 0.0, ...","{'': 0.0, '.': 0.0, '...': 0.0, '``': 0.0, 'ab...","[[-0.05045865563100532, 0.029355353073162192, ...","[[-0.0340845690054052, 0.022295029532602605, -...","[[-0.011220808422775097, 0.007381170066389235,...","[[-0.008989865219432267, 0.005986128855707191,...",1
1,"[honourable, member, wish, could, hand, table,...","[herr, abgeordnete, wünschen, tabelle, relevan...",51,3,10,10,64,6.400000,0,0,...,herr abgeordnete wünschen tabelle ...,"[adeligen, johann, fuhr, feb, gründete, ganzen...","[ffc, fehler, wien, klarinette, solicitor]","{'': 0.0, ''s': 0.0, '...': 0.0, 'able': 0.0, ...","{'': 0.0, '.': 0.0, '...': 0.0, '``': 0.0, 'ab...","[[-0.013026780862128362, 0.03791332170367241, ...","[[-0.017970938959883317, 0.06054798764590588, ...","[[-0.0036713532780875122, 0.01193993382544565,...","[[-0.005682782823479085, 0.019241104158050814,...",1
2,"[thank, mr, posselt]","[dank, herr, posselt]",7,1,3,3,14,4.666667,0,0,...,dank herr 0 0.091766 -0.047958 1 ...,"[stattdessen, position]","[ausgerichtete, ffc]","{'': 0.0, ''s': 0.0, '...': 0.0, 'able': 0.0, ...","{'': 0.0, '.': 0.0, '...': 0.0, '``': 0.0, 'ab...","[[-0.048036038875579834, 0.12062796205282211, ...","[[-0.06873552314937115, 0.10405069217085838, -...","[[-0.02636874734505317, 0.06177639793488936, -...","[[-0.03912050882732528, 0.05055658843291692, -...",1
3,"[may, take, minute, usually, mr, ai, weiwei, t...","[brauche, vielleicht, minute, lang, gewöhnlich...",38,3,12,12,60,5.000000,0,0,...,brauche vielleicht minute lang g...,"[so, quelle, weshalb, spiele, position, atlant...","[weshalb, größerer, ffc, atlanta, ausbleibende...","{'': 0.0, ''s': 0.0, '...': 0.0, 'able': 0.0, ...","{'': 0.0, '.': 0.0, '...': 0.0, '``': 0.0, 'ab...","[[-0.025614650073376568, 0.06095614851537076, ...","[[-0.016375068257647483, 0.034354716539382935,...","[[-0.00822455278971191, 0.01791740571102809, -...","[[-0.00408848766640528, 0.008938716271000586, ...",1
4,"[finally, must, coordination, immigration, pol...","[letztendlich, koordinierung, einwanderungspol...",65,2,15,15,115,7.666667,0,0,...,letztendlich koordinierung europäisch ...,"[px, auswahl, kommend, jahrhunderte, bischof, ...",[esslinger],"{'': 0.0, ''s': 0.0, '...': 0.0, 'able': 0.0, ...","{'': 0.0, '.': 0.0, '...': 0.0, '``': 0.443390...","[[-0.031561963501319824, -0.01573363205950175,...","[[-0.06436680792830884, -0.015493096318095922,...","[[-0.00825286159765296, -0.0046753939600710365...","[[-0.016702526663641006, -0.004208891823181344...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"[farmer, usually, pay, profitable, transport, ...","[landwirt, zahlen, regel, rentabel, tier, weit...",52,2,10,9,65,6.500000,0,0,...,landwirt zahlen regel rentabel ...,"[namentlich, spiele, tübingen, realisieren, ge...","[verwandt, döhren]","{'': 0.0, ''s': 0.0, '...': 0.0, 'able': 0.0, ...","{'': 0.0, '.': 0.0, '...': 0.0, '``': 0.0, 'ab...","[[-0.03010354160020749, 0.03250550370042523, -...","[[-0.03930665057321841, 0.018983304923908276, ...","[[-0.013375775390273055, 0.01236822314046332, ...","[[-0.0140145629951614, 0.006758162131192769, -...",1
96,"[think, figure, probably, decrease, time, dram...","[anteil, möglicherweise, verlauf, zeit, verrin...",37,1,8,8,53,6.625000,0,0,...,anteil möglicherweise verlauf zei...,"[denen, royal, linie, ausserdem, kann, torschü...","[ideologische, short, zugesandt]","{'': 0.0, ''s': 0.0, '...': 0.0, 'able': 0.0, ...","{'': 0.0, '.': 0.0, '...': 0.0, '``': 0.0, 'ab...","[[-0.016857205773703754, 0.024118124973028898,...","[[-0.023646995292178223, 0.011523882315876628,...","[[-0.0058122972311169824, 0.007263560007680223...","[[-0.00960614818298269, 0.004667006088578905, ...",1
97,"[mr, president, mr, president, office, commiss...","[herr, präsident, herr, ratspräsident, herr, k...",71,5,16,15,132,8.250000,0,0,...,herr präsident kommissar lassen n...,"[position, heinrich, position, heinrich, dorf,...","[ffc, ffc, ffc, diepold, cunha, lister, bin, z...","{'': 0.0, ''s': 0.0, '...': 0.0, 'able': 0.0, ...","{'': 0.0, '.': 0.0, '...': 0.0, '``': 0.0, 'ab...","[[-0.0677268068306148, 0.006006998320420583, -...","[[-0.05274518303728352, 0.011208405925167931, ...","[[-0.016352633218081804, 0.0019238354876513059...","[[-0.011004737131303193, 0.00367659001130145, ...",1
98,"[happen, europe, national, law, member, states...","[europa, national, gesetz, mitgliedstaaten, ge...",101,4,21,19,144,6.857143,0,0,...,europa national gesetz mitgliedstaat...,"[au, •, team, meter, johann, zwischen, arts, k...","[local, team, mosbacher, einem, nebengewässer,...","{'': 0.0, ''s': 0.0, '...': 0.0, 'able': 0.0, ...","{'': 0.0, '.': 0.0, '...': 0.0, '``': 0.0, 'ab...","[[-0.03946517260843202, 0.015531172506407051, ...","[[-0.03433662308897409, 0.006303445610683411, ...","[[-0.009339824808378245, 0.0026040922806207277...","[[-0.007456995616521982, 0.0003984499595708473...",1


In [31]:
# Tokens of the first row's source sentence after word-level translation to the target language.
# NOTE(review): the tokens shown below share nothing with the row's real target
# tokens, and the jaccard_translation_* features are 0.0 in every displayed row,
# so the word-level translation looks broken -- verify the embedding dictionaries.
parallel_sentences.preprocessed.translated_to_target_source[0]

['funktioniert',
 'volkspark',
 'firma',
 'gründen',
 'places',
 'dann',
 'norm',
 'sondern',
 'anlass',
 'places',
 'rein',
 'allein',
 'einer',
 'gab',
 'georg',
 'einzelnen',
 'heinz',
 'deine']

In [29]:
# Preprocessed target-language tokens of the first row, for comparison with the translated tokens.
parallel_sentences.preprocessed.token_preprocessed_embedding_target[0]

['ganz',
 'offensichtlich',
 'unwahrheit',
 'je',
 'mehr',
 'legale',
 'einwanderer',
 'aufnehmen',
 'desto',
 'stark',
 'wachsen',
 'zahl',
 'illegal',
 'einwanderer',
 'hoffnung',
 'tag',
 'spanien',
 'italien',
 'regularisiert']

In [None]:
# def save_object(obj, filename):
#     with open(filename, 'wb') as output:  # Overwrites any existing file.
#        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

# sample usage
# save_object(parallel_sentences, '../data/processed/processed_data_2505.pkl')

In [None]:
# import pickle 
# filehandler = open('../data/processed/processed_data_2505_2.pkl', 'wb') 
# pickle.dump(parallel_sentences, filehandler)

In [None]:
# import pickle 
# file = open("../data/processed/processed_data_2505_2.pkl",'rb')
# df = pickle.load(file)
# file.close()

## III. Create data set

In [32]:
from src.data.dataset_class import DataSet

In [33]:
# Wrap the preprocessed parallel sentences in a DataSet object for sampling.
dataset = DataSet(parallel_sentences)

In [34]:
# Sample sizes passed to DataSet.get_sample: training examples, test queries
# and test documents.
n_training, n_test_queries, n_test_documents = 50, 5, 10

In [35]:
# Draw the sample using the sizes configured in the previous cell.
# NOTE(review): this call emits pandas' SettingWithCopyWarning (see the output
# below); the chained assignment should be fixed inside DataSet.get_sample.
dataset.get_sample(n_training, n_test_queries, n_test_documents)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


## II. Create sentence based features

In this section we create the sentence-based features for our model, i.e. features that should be computed before the text is preprocessed.

In [36]:
from src.features.feature_generation_class import FeatureGeneration

In [37]:
# Hand the sampled data (dataset.dataset) to the feature generator.
features = FeatureGeneration(dataset.dataset)

In [38]:
# Compute the features; the result is read from features.feature_dataframe in the next cell.
# NOTE(review): the output below echoes library lines containing np.log(0),
# which presumably trigger NumPy divide-by-zero RuntimeWarnings -- confirm and
# clean up in FeatureGeneration.
features.feature_generation()

  return (target_array - source_array).replace(np.nan, 0).replace(np.inf, 0).replace(np.log(0), 0)
  return ((target_array - source_array) / source_array).replace(np.nan, 0).replace(np.inf, 0).replace(np.log(0), 0)
  return ((source_array / source_sentence_length) - (target_array / target_sentence_length)).replace(np.nan, 0).replace(np.inf, 0).replace(np.log(0), 0)


In [39]:
# Inspect the generated features: difference / relative / normalized variants of
# the count features, the Translation label, cosine similarities and Jaccard scores.
features.feature_dataframe

Unnamed: 0,number_punctuations_total_difference,number_punctuations_total_difference_relative,number_punctuations_total_difference_normalized,number_words_difference,number_words_difference_relative,number_words_difference_normalized,number_unique_words_difference,number_unique_words_difference_relative,number_unique_words_difference_normalized,number_!_difference,...,score_subjectivity_difference_relative,score_subjectivity_difference_normalized,number_stopwords_difference,number_stopwords_difference_relative,number_stopwords_difference_normalized,Translation,cosine_similarity_average,cosine_similarity_tfidf,jaccard_translation_source,jaccard_translation_target
0,0,0.0,0.000000,0,0.000000,0.000000,0,0.000000,0.000000,0,...,0.0,0.000000,-29,-1.0,4.833333,1,0.711407,0.722653,0.0,0.0
1,3,1.0,-0.083333,3,0.200000,0.083333,3,0.200000,0.083333,0,...,0.0,0.000000,-50,-1.0,2.777778,1,0.769030,0.764048,0.0,0.0
2,1,0.0,-0.090909,0,0.000000,0.090909,0,0.000000,0.090909,0,...,-1.0,0.055000,-60,-1.0,6.000000,1,0.886636,0.878452,0.0,0.0
3,0,0.0,0.007143,1,0.058824,-0.007143,1,0.066667,-0.011905,0,...,0.0,0.000000,-75,-1.0,3.750000,1,0.864285,0.817990,0.0,0.0
4,0,0.0,0.000000,-1,-0.333333,0.000000,-1,-0.333333,0.000000,0,...,0.0,0.000000,-18,-1.0,6.000000,1,0.481609,0.476647,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0.0,-0.033333,-3,-0.200000,0.033333,-4,-0.266667,0.100000,0,...,0.0,0.000000,-64,-1.0,3.555556,0,0.649919,0.643039,0.0,0.0
96,1,0.0,-0.142857,-16,-0.727273,0.142857,-16,-0.727273,0.142857,0,...,-1.0,0.028788,-105,-1.0,4.772727,0,0.619254,0.618318,0.0,0.0
97,-2,-1.0,0.111111,-11,-0.687500,-0.111111,-10,-0.666667,-0.166667,0,...,-1.0,0.027146,-68,-1.0,3.777778,0,0.567012,0.565174,0.0,0.0
98,-4,-1.0,0.210526,-8,-0.533333,-0.210526,-8,-0.533333,-0.210526,0,...,-1.0,0.025146,-46,-1.0,2.421053,0,0.594827,0.589755,0.0,0.0


In [None]:
import pickle

# Persist the generated feature frame so the modelling cells below can reload it
# without re-running the preprocessing.
# The context manager guarantees the file is flushed and closed; the original
# left the handle open, so the following load could read a truncated file.
with open('../data/processed/processed_data.pkl', 'wb') as filehandler:
    pickle.dump(features.feature_dataframe, filehandler)

In [None]:
import pickle

# Reload the persisted feature frame into `df` for feature selection and modelling.
# NOTE: pickle.load can execute arbitrary code -- only load files written by this notebook.
# The context manager closes the file even if unpickling raises.
with open("../data/processed/processed_data.pkl", 'rb') as file:
    df = pickle.load(file)

In [None]:
# Display the reloaded feature frame.
df

## III. Create token based features

## IV. Feature selection

In [None]:
import seaborn as sn
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Look at the correlation matrix of all columns as a heat map.
# (The original also called np.cov(df, bias=True) and discarded the result;
# that dead computation has been removed.)
corrMatrix = df.corr()
f, ax = plt.subplots(figsize=(14, 9))
sn.heatmap(corrMatrix, annot=False, ax=ax)
ax.set_title("Feature correlation matrix")
# plt.show() instead of f.show(): Figure.show() warns under the non-GUI inline backend.
plt.show()

In [None]:
# Correlation filter: find feature pairs with |correlation| > 0.8 and mark the
# later column of each pair for removal.
# The label column is excluded from the scan. Otherwise the target itself (or a
# feature that merely correlates with the target) could be flagged, and dropping
# 'Translation' would break the later `df['Translation']` lookup.
correlated_features = set()
correlation_matrix = df.drop(columns=['Translation'], errors='ignore').corr()
column_names = correlation_matrix.columns

for i in range(len(column_names)):
    # Only look below the diagonal so every pair is visited exactly once.
    for j in range(i):
        correlation = abs(correlation_matrix.iloc[i, j])
        if correlation > 0.8:
            print(f"The following features are correlated: {column_names[i]} and {column_names[j]}. Correlation = {round(correlation,2)}")
            correlated_features.add(column_names[i])

print(f"Drop the following features: {correlated_features}")

In [None]:
# Remove the features flagged as correlated (only sensible when looking at a big data set).
df = df.drop(labels=list(correlated_features), axis=1)

In [None]:
# Display the frame after the correlated features were dropped.
df

## V. Train Model


### Create Train Test Split

In [None]:
# Separate the label from the features: `target` holds the Translation column,
# `df` keeps every other column.
target = df['Translation']
df = df.drop(labels='Translation', axis=1)
df

In [None]:
# Number of feature columns that contain at least one missing value.
int(df.isna().any().sum())

In [None]:
# Replace the remaining missing values with 0.
df = df.fillna(value=0)

In [None]:
# Scale the data, otherwise logistic regression does not converge.
# NOTE(review): the scaler is fitted on the full data set before the train/test
# split further down, so test-set statistics leak into training. Consider
# fitting it on the training split only.
from sklearn import preprocessing

feature_columns = df.columns
scaler = preprocessing.MinMaxScaler().fit(df[feature_columns])
df[feature_columns] = scaler.transform(df[feature_columns])


In [None]:
# Display the scaled features.
df

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(df, target, test_size=0.2, random_state=42)
data_train, data_test, target_train, target_test = split_parts

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Class-balanced logistic regression with a generous iteration budget.
lr = LogisticRegression(class_weight='balanced', max_iter=10000)
lr.fit(data_train, target_train)
prediction = lr.predict(data_test)

# Score the predictions on the held-out test split.
acc = accuracy_score(target_test, prediction)
f1 = f1_score(target_test, prediction)
pr = precision_score(target_test, prediction)
re = recall_score(target_test, prediction)

for template, score in (
    ("The Accuracy on test set: {:.4f}", acc),
    ("The F1-Score on test set: {:.4f}", f1),
    ("The Precision-Score on test set: {:.4f}", pr),
    ("The Recall-Score on test set: {:.4f}", re),
):
    print(template.format(score))

In [None]:
# Feature importance: use the fitted logistic regression coefficients as scores.
importance = lr.coef_[0]

# Summarise: one line per feature with its position, name and coefficient.
for position, (name, score) in enumerate(zip(data_train.columns, importance)):
    print(f'Feature: {position} {name}, Score: {score}')

# Bar chart of the coefficients, in feature order.
plt.bar(range(len(importance)), importance)
plt.show()

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Gaussian naive Bayes baseline with default settings.
nb = GaussianNB()
nb.fit(data_train, target_train)
prediction = nb.predict(data_test)

# Evaluate all four metrics on the held-out test split in one pass.
acc, f1, pr, re = (
    metric(target_test, prediction)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score on test set: {:.4f}".format(f1))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Support vector classifier with default settings.
svc = SVC()
svc.fit(data_train, target_train)
prediction = svc.predict(data_test)

# Score the predictions on the held-out test split.
acc = accuracy_score(y_true=target_test, y_pred=prediction)
f1 = f1_score(y_true=target_test, y_pred=prediction)
pr = precision_score(y_true=target_test, y_pred=prediction)
re = recall_score(y_true=target_test, y_pred=prediction)

for template, score in (
    ("The Accuracy on test set: {:.4f}", acc),
    ("The F1-Score on test set: {:.4f}", f1),
    ("The Precision-Score on test set: {:.4f}", pr),
    ("The Recall-Score on test set: {:.4f}", re),
):
    print(template.format(score))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Multi-layer perceptron with default architecture.
# The weights are initialised randomly, so without a seed the scores below change
# on every run; random_state=42 matches the seed used for the train/test split.
mlp = MLPClassifier(random_state=42).fit(data_train, target_train)
prediction = mlp.predict(data_test)

# Score the predictions on the held-out test split.
acc = accuracy_score(target_test, prediction)
f1 = f1_score(target_test, prediction)
pr = precision_score(target_test, prediction)
re = recall_score(target_test, prediction)
print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score on test set: {:.4f}".format(f1))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))