# Preprocessing and Feature Creation

In this notebook we import the data, preprocess it, and create features for supervised and unsupervised cross-lingual information retrieval models.

## I. Import Data

In this section we import the English and German Europarl datasets and combine them into a dataframe of parallel (translated) sentences.

In [1]:
# from src.data.import_data import create_data_subset

In [2]:
# create_data_subset(sentence_data_source='../data/external/europarl-v7.de-en.en',
#                        sentences_data_target='../data/external/europarl-v7.de-en.de',
#                        sample_size=200000)

In [26]:
from src.data.preprocessing_class import PreprocessingEuroParl

In [27]:
# Build the preprocessing object from the pickled English-German sample.
# NOTE(review): the pickle is presumably produced by the commented-out
# create_data_subset call at the top of the notebook - confirm.
parallel_sentences = PreprocessingEuroParl(df_sampled_path="../data/interim/europarl_english_german.pkl")

In [28]:
# Restrict the working set to the first 5000 sentence pairs.
parallel_sentences.dataframe = parallel_sentences.dataframe.iloc[:5000]

In [29]:
parallel_sentences.dataframe

Unnamed: 0,id,text_source,text_target
0,0,"If it is legal, we do not need a debate.","Falls es legal ist, dann brauchen wir keine De..."
1,1,14. Further macro-financial assistance for Geo...,14. Weitere Makrofinanzhilfe für Georgien (
2,2,"The request came to nothing, firstly because t...","Diese Forderung verlief im Sande, zum einen, w..."
3,3,"That is how the judicial system works, and Lor...","So funktioniert der Rechtsstaat, und Lord Beth..."
4,4,The Council's refusal to make the Charter of F...,"Wir sind daher durch die Weigerung des Rates, ..."
...,...,...,...
4995,4995,"May I end, Madam President, by expressing my s...","Gestatten Sie mir, Frau Präsidentin, abschließ..."
4996,4996,That is my appeal - to reflect overnight and t...,Das ist mein Appell: Überdenken Sie das Ganze ...
4997,4997,"Secondly, I speak as a European, to say that E...",In zweiter Linie spreche ich als Europäer und ...
4998,4998,The new approach being taken here should hopef...,"Der neue Anlauf, den wir hiermit nehmen, soll ..."


In [30]:
# #duc dataset
# parallel_sentences.dataframe['Translation']=1
# wrong= PreprocessingEuroParl(sentence_data_source='../data/external/europarl-v7.de-en.en',
#                  sentence_data_target='../data/external/europarl-v7.de-en.de',number_datapoints=10000)
# import pandas as pd
# wrong_data=pd.concat([wrong.dataframe.drop(columns='text_target').reset_index(drop=True),wrong.dataframe['text_target'].sample(frac=1).reset_index(drop=True)],axis=1)
# wrong_data['Translation']=0
# data=pd.concat([parallel_sentences.dataframe.reset_index(drop=True),wrong_data.reset_index(drop=True)])
# import pickle 
# filehandler = open('../data/processed/dataset_duc.pkl', 'wb') 
# pickle.dump(data, filehandler)

## II. Preprocess data

In this section we preprocess the parallel sentence data.

In [31]:
import spacy
from nltk.corpus import stopwords
from textblob import TextBlob as textblob_source
from textblob_de import TextBlobDE as textblob_target
import en_core_web_sm
import de_core_news_sm

In [32]:
# --- Language resources: source = English, target = German -----------------
# NLTK stopword lists, one per language.
stopwords_source, stopwords_target = stopwords.words('english'), stopwords.words('german')
# spaCy pipelines, one per language.
nlp_source, nlp_target = en_core_web_sm.load(), de_core_news_sm.load()

# --- Pickled embedding files (array + word dictionary per language) --------
embedding_array_source_path = "../data/interim/proc_5k_src_emb.pkl"
embedding_dictionary_source_path = "../data/interim/proc_5k_src_word.pkl"
embedding_array_target_path = "../data/interim/proc_5k_trg_emb.pkl"
embedding_dictionary_target_path = "../data/interim/proc_5k_trg_word.pkl"

# --- Preprocessing parameters ----------------------------------------------
number_translations = 1
# Also passed to FeatureGeneration later in the notebook.
# NOTE(review): presumably the number of PCA components (the feature table
# shows pca_* columns 0-9) - confirm in the preprocessing class.
number_pc = 10

In [None]:
import time

# Time the full preprocessing run. perf_counter() is a monotonic,
# high-resolution clock, so the measured duration cannot be distorted by
# system clock adjustments the way a difference of time.time() values can.
start = time.perf_counter()
parallel_sentences.preprocess_sentences(
    stopwords_source, nlp_source, textblob_source,
    embedding_array_source_path, embedding_dictionary_source_path,
    stopwords_target, nlp_target, textblob_target,
    embedding_array_target_path, embedding_dictionary_target_path,
    number_translations, number_pc,
)
end = time.perf_counter()
# Elapsed time in seconds.
print(end - start)

In [None]:
parallel_sentences.preprocessed

In [None]:
# Write the preprocessed dataframe to disk in CSV format.
# NOTE(review): the target name "safe.com" has no directory and no .csv
# extension; it looks like a placeholder or typo (e.g. for a .csv file under
# ../data/) - confirm the intended path before relying on this file.
parallel_sentences.preprocessed.to_csv("safe.com")

In [12]:
# def save_object(obj, filename):
#     with open(filename, 'wb') as output:  # Overwrites any existing file.
#        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

# sample usage
# save_object(parallel_sentences, '../data/processed/processed_data_2505.pkl')

In [13]:
# import pickle 
# filehandler = open('../data/processed/processed_data_2505_2.pkl', 'wb') 
# pickle.dump(parallel_sentences, filehandler)

In [14]:
# import pickle 
# file = open("../data/processed/processed_data_2505_2.pkl",'rb')
# df = pickle.load(file)
# file.close()

## III. Create data set

In [15]:
from src.data.dataset_class import DataSet

In [16]:
dataset = DataSet(parallel_sentences)

In [18]:
# Arguments for dataset.get_sample (passed positionally, in this order, in the
# next cell).
# NOTE(review): their exact semantics are defined in src.data.dataset_class -
# presumably the sizes of the training sample and of the test query/document
# sets, with k controlling the negative sampling; confirm there.
n_training = 90
n_test_queries = 1
n_test_documents = 10
k = 1

In [19]:
dataset.get_sample(n_training, n_test_queries, n_test_documents,k)

In [21]:
dataset.dataset

Unnamed: 0,id_source,token_preprocessed_embedding_source,token_preprocessed_embedding_target,number_stopwords_source,number_stopwords_target,number_punctuations_total_source,number_punctuations_total_target,number_words_source,number_words_target,number_unique_words_source,...,sentence_embedding_average_source,sentence_embedding_average_target,sentence_embedding_tf_idf_source,sentence_embedding_tf_idf_target,pca_sentence_embedding_average_source,pca_sentence_embedding_average_target,pca_sentence_embedding_tf_idf_source,pca_sentence_embedding_tf_idf_target,Translation,id_target
0,117,"[disease, figure, show, clearly]","[krankheitsdaten, zeigen, deutlich]",3,3,0,0,4,3,4,...,"[[0.010017333086580038, 0.04874441109132022, -...","[[-0.004746901104226708, -0.06784963794052601,...","[[0.004067703506351096, 0.024935910994378894, ...","[[-0.0029277145638999247, -0.03933923996773458...","[[-0.12949643761385232, 0.1388400699943304, 0....","[[0.26272866129875183, -0.03103486355394125, -...","[[-0.06441677273812196, 0.06838918978053031, 0...","[[0.1496176470976494, -0.017388465353272466, -...",1,117
1,85,"[last, week, council, work, party, responsible...","[letzter, woche, zuständig, arbeitsgruppe, rat...",14,16,2,3,21,15,21,...,"[[-0.07326376087786186, 0.04503160322617207, -...","[[-0.07079099017816286, 0.03427593677285282, -...","[[-0.016079724419953166, 0.009913920003582708,...","[[-0.01883112217359222, 0.008916389397625055, ...","[[-0.06141113793654811, 0.09360336490152847, 0...","[[0.13937350587608913, 0.000491696848863891, 0...","[[-0.01507173788268915, 0.02049019557942409, 0...","[[0.03820079045448178, -0.0008272818679564057,...",1,85
2,149,"[madam, president, welcome, mr, langen, 's, re...","[frau, präsidentin, begrüßen, bericht, herrn, ...",5,8,1,1,15,13,15,...,"[[-0.03869397675485483, 0.03579077340795526, -...","[[-0.05850736474773536, 0.03301882650703192, -...","[[-0.009665046811339483, 0.008924232385536473,...","[[-0.017341574364950343, 0.008830848670690616,...","[[-0.07742600954536881, 0.052053494195986004, ...","[[0.08310952534278233, 0.03424447542056441, 0....","[[-0.02239219137628126, 0.012529661851771649, ...","[[0.025677220316657062, 0.010224607017932963, ...",1,149
3,129,"[therefore, fully, support, approach, report, ...","[sinn, unterstützen, generell, ansatz, bericht...",9,11,2,1,12,11,12,...,"[[-0.029104450872788828, 0.0033055458140249052...","[[-0.055084214028384954, 0.0014666318893432617...","[[-0.008622773546289321, 0.0009145870283218041...","[[-0.016315173101892493, -1.695980891757127e-0...","[[-0.14916455737936, 0.06393573126600434, 0.02...","[[0.22662464601712096, -0.04656096011038042, 0...","[[-0.04286179085424419, 0.019926089500480325, ...","[[0.06923584030865106, -0.01478614797449512, 0...",1,129
4,95,"[welcome, strengthening, trade, relation, prov...","[ausbau, handelsbeziehungen, begrüßen, sofern,...",14,11,1,2,12,10,12,...,"[[-0.05008048461362099, -0.045622791008402906,...","[[-0.03224296417708198, -0.03528159980972608, ...","[[-0.014451369339695508, -0.013126734759606613...","[[-0.010332043053378426, -0.011383004184929412...","[[-0.1551031737277905, 0.06398595481490095, 0....","[[0.24881377898984486, 0.022551155933696363, 0...","[[-0.04456170613949181, 0.01879428920192543, 0...","[[0.07767272363372349, 0.008039208965487389, 0...",1,95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,38,"[prove, impossible, get, council, commission, ...","[ergebnis, vorliegend, bericht, rücktritt, füh...",20,13,4,2,27,12,23,...,"[[-0.043347116979851344, 0.014273610826500732,...","[[-0.03156926203519106, 0.011037592416290532, ...","[[-0.009548701972919585, 0.002391840311229906,...","[[-0.008647833917607725, 0.0026701877388164634...","[[-0.15462185416127677, 0.07005518468339807, 0...","[[0.1398266918787902, 0.011092400855638763, 0....","[[-0.0317242380272315, 0.014733091576071637, -...","[[0.040598269927101915, 0.0029293871129557835,...",0,45
176,190,"[see, love, one, kill, war, yet, unable, reuni...","[erreichen, müssen, mitgliedstaaten, zusammena...",17,6,0,0,16,4,16,...,"[[-0.0388611298520118, 0.02212741502250234, -0...","[[-0.052134337835013866, 0.01858279202133417, ...","[[-0.010399794452965848, 0.005030973529368764,...","[[-0.02966943357626567, 0.008171631325840676, ...","[[-0.029443515402575334, 0.0987393864740928, 0...","[[0.22432024404406548, -0.05152263538911939, -...","[[-0.006774260985673793, 0.026174854559322492,...","[[0.10443982004797905, -0.027859116010950695, ...",0,68
177,158,"[exploit, much, ukrainian, people, profit]","[aussprache, schließen]",9,3,2,0,5,2,5,...,"[[-0.08924220204353332, 0.03766511082649231, -...","[[-0.06861083768308163, 0.006088372319936752, ...","[[-0.04001307538063078, 0.017034866933223033, ...","[[-0.04905906628671981, 0.0022598898558911645,...","[[-0.11853625131770969, 0.04598630052059889, 0...","[[0.15244074910879135, 0.009229101240634918, 0...","[[-0.05084606099253095, 0.014791298133105018, ...","[[0.10918291983873842, 0.008282805324573143, 0...",0,88
178,175,"[midterm, review, envisage, case, believe, rig...","[lieben, kollegin, kollege, estland, lettland,...",21,5,1,1,16,9,16,...,"[[-0.052550282853189856, 0.03253021911950782, ...","[[-0.06996113783679903, 0.04964602146355901, -...","[[-0.013521540375747054, 0.007978567522369412,...","[[-0.023408381358570637, 0.016555929249164755,...","[[-0.1434008825744968, 0.09875457649468444, 0....","[[0.037553663831204176, -0.010458275210112333,...","[[-0.03674212653322596, 0.024424341110714978, ...","[[0.011415838371322113, -0.0021621320821200917...",0,63


## II. Create sentence-based features

In this section we create sentence-based features for our model; these should be created before the text is preprocessed.

In [22]:
from src.features.feature_generation_class import FeatureGeneration

In [23]:
features_train = FeatureGeneration(dataset.dataset, number_pc)

In [24]:
features_train.feature_generation()

  return (target_array - source_array).replace(np.nan, 0).replace(np.inf, 0).replace(np.log(0), 0)
  return ((target_array - source_array) / source_array).replace(np.nan, 0).replace(np.inf, 0).replace(np.log(0), 0)
  return ((source_array / source_sentence_length) - (target_array / target_sentence_length)).replace(np.nan, 0).replace(np.inf, 0).replace(np.log(0), 0)


In [25]:
features_train.feature_dataframe

Unnamed: 0,number_punctuations_total_difference,number_punctuations_total_difference_relative,number_punctuations_total_difference_normalized,number_words_difference,number_words_difference_relative,number_words_difference_normalized,number_unique_words_difference,number_unique_words_difference_relative,number_unique_words_difference_normalized,number_!_difference,...,pca_embeddding_tf_idf_diff_5,pca_embeddding_average_diff_6,pca_embeddding_tf_idf_diff_6,pca_embeddding_average_diff_7,pca_embeddding_tf_idf_diff_7,pca_embeddding_average_diff_8,pca_embeddding_tf_idf_diff_8,pca_embeddding_average_diff_9,pca_embeddding_tf_idf_diff_9,Translation
0,0,0.000000,0.000000,-1,-0.250000,0.000000,-1,-0.250000,0.000000,0,...,-0.053989,0.001882,-0.000616,0.101184,0.057296,0.094184,0.054041,-0.013247,-0.005876,1
1,1,0.500000,-0.079710,-6,-0.285714,0.079710,-6,-0.285714,0.079710,0,...,0.001370,-0.032987,-0.005607,-0.019100,-0.001928,0.054891,0.013568,-0.009564,-0.000839,1
2,0,0.000000,-0.008929,-2,-0.133333,0.008929,-2,-0.133333,0.008929,0,...,-0.011399,-0.064867,-0.017260,0.002975,0.004078,-0.046337,-0.016028,-0.019219,-0.004022,1
3,-1,-0.500000,0.059524,-1,-0.083333,-0.059524,-1,-0.083333,-0.059524,0,...,-0.043491,-0.041101,-0.009994,0.027708,0.008766,0.057870,0.015263,-0.042500,-0.013060,1
4,1,1.000000,-0.089744,-2,-0.166667,0.089744,-2,-0.166667,0.089744,0,...,-0.029210,-0.082700,-0.024513,0.033305,0.012570,0.127930,0.039707,-0.076313,-0.022338,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,-2,-0.500000,-0.013825,-15,-0.555556,0.013825,-11,-0.478261,-0.115207,0,...,-0.017916,-0.019759,-0.000520,0.027083,0.009870,0.009747,0.001344,0.015695,0.008348,0
176,0,0.000000,0.000000,-12,-0.750000,0.000000,-12,-0.750000,0.000000,0,...,0.017273,0.010295,0.000567,-0.000344,0.011901,0.066914,0.022881,-0.007793,-0.007881,0
177,-2,-1.000000,0.285714,-3,-0.600000,-0.285714,-3,-0.600000,-0.285714,0,...,-0.033186,-0.024890,-0.005404,-0.119780,-0.061739,0.059441,0.044727,0.006044,0.005996,0
178,0,0.000000,-0.041176,-7,-0.437500,0.041176,-7,-0.437500,0.041176,1,...,-0.017414,-0.058098,-0.014064,-0.001037,0.003987,-0.028255,-0.007734,-0.037849,-0.012464,0


In [None]:
# import pickle 
# filehandler = open('../data/processed/processed_data.pkl', 'wb') 
# pickle.dump(features.feature_dataframe, filehandler)

In [None]:
# import pickle 
# file = open("../data/processed/processed_data.pkl",'rb')
# df = pickle.load(file)
# file.close()

In [None]:
# df

## IV. Unsupervised Classification

## IV. Feature selection

In [None]:
import seaborn as sn
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Look at the correlation matrix of all columns in `df`.
# NOTE(review): no active cell visible in this notebook assigns `df`; the only
# assignment is the commented-out pickle load above, so this cell fails on a
# fresh kernel until `df` is defined.
corrMatrix = df.corr()

f = plt.figure(figsize=(14, 9))
sn.heatmap(corrMatrix, annot=False)
plt.title("Feature correlation matrix")
# plt.show() also renders with inline (non-GUI) backends, whereas
# Figure.show() only works with GUI backends and warns otherwise.
plt.show()

In [None]:
# Correlation filter: for every pair of columns whose absolute correlation
# exceeds 0.8, mark the later column of the pair for removal.
#
# The label column 'Translation' is still part of `df` at this point (it is
# only split off in the train/test section) and everything collected in
# `correlated_features` is dropped in the next cell. It is therefore excluded
# here so that a feature strongly correlated with the label can never cause
# the label itself to be dropped. errors='ignore' keeps the cell working if
# the column is absent.
correlated_features = set()
correlation_matrix = df.drop(columns=['Translation'], errors='ignore').corr()

for i in range(len(correlation_matrix.columns)):
    # Only the lower triangle (j < i) is visited, so each pair is checked once.
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            print(f"The following features are correlated: {correlation_matrix.columns[i]} and {correlation_matrix.columns[j]}. Correlation = {round(abs(correlation_matrix.iloc[i, j]),2)}")
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

print(f"Drop the following features: {correlated_features}")

In [None]:
# Drop correlated features, but only when looking at a big dataset.
# The label column is never dropped here because later cells still read
# df['Translation'].
columns_to_drop = [column for column in correlated_features if column != 'Translation']
# errors='ignore' keeps the cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise a KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
df

## V. Train Model


### Create Train Test Split

In [None]:
# Separate the label column from the feature matrix and show the features.
target = df['Translation']
df = df.drop('Translation', axis=1)
df

In [None]:
# Number of columns that contain at least one missing value.
df.isna().any().sum()

In [None]:
# Replace every missing value with 0.
df = df.fillna(value=0)

In [None]:
# Scale the data; otherwise logistic regression does not converge.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test-set minima/maxima influence the scaling - consider fitting it
# on the training split only.
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
feature_names = df.columns
scaled_values = scaler.fit_transform(df[feature_names])
df[feature_names] = scaled_values


In [None]:
df

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
split = train_test_split(df, target, test_size=0.2, random_state=42)
data_train, data_test, target_train, target_test = split

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Fit a class-balanced logistic regression on the training split.
lr = LogisticRegression(class_weight='balanced', max_iter=10000)
lr.fit(data_train, target_train)

# Evaluate on the held-out test split.
prediction = lr.predict(data_test)
acc = accuracy_score(y_true=target_test, y_pred=prediction)
f1 = f1_score(y_true=target_test, y_pred=prediction)
pr = precision_score(y_true=target_test, y_pred=prediction)
re = recall_score(y_true=target_test, y_pred=prediction)

print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score on test set: {:.4f}".format(f1))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))

In [None]:
# Feature importance of the logistic regression: the first row of the fitted
# coefficient matrix, in the column order of data_train.
importance = lr.coef_[0]

# List every feature with its index, name and coefficient.
for i, v in enumerate(importance):
    print(f'Feature: {i} {data_train.columns[i]}, Score: {v}')

# Bar chart of the coefficients, indexed by feature position.
plt.bar(range(len(importance)), importance)
plt.show()

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Fit a Gaussian naive Bayes classifier on the training split.
nb = GaussianNB()
nb.fit(data_train, target_train)

# Evaluate on the held-out test split.
prediction = nb.predict(data_test)
acc = accuracy_score(y_true=target_test, y_pred=prediction)
f1 = f1_score(y_true=target_test, y_pred=prediction)
pr = precision_score(y_true=target_test, y_pred=prediction)
re = recall_score(y_true=target_test, y_pred=prediction)

print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score on test set: {:.4f}".format(f1))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Fit a support vector classifier with default settings on the training split.
svc = SVC()
svc.fit(data_train, target_train)

# Evaluate on the held-out test split.
prediction = svc.predict(data_test)
acc = accuracy_score(y_true=target_test, y_pred=prediction)
f1 = f1_score(y_true=target_test, y_pred=prediction)
pr = precision_score(y_true=target_test, y_pred=prediction)
re = recall_score(y_true=target_test, y_pred=prediction)

print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score on test set: {:.4f}".format(f1))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Fit a multi-layer perceptron on the training split. Weight initialisation
# and batch shuffling are random, so the seed is fixed (same value as the
# train/test split) to make the reported scores reproducible across runs.
mlp = MLPClassifier(random_state=42).fit(data_train, target_train)

# Evaluate on the held-out test split.
prediction = mlp.predict(data_test)
acc = accuracy_score(target_test, prediction)
f1 = f1_score(target_test, prediction)
pr = precision_score(target_test, prediction)
re = recall_score(target_test, prediction)
print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score on test set: {:.4f}".format(f1))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))