# Preprocessing and Feature Creation

In this notebook we import the data, preprocess it, and create features for supervised and unsupervised cross-lingual information retrieval models.

## I. Import Data

In this section we import the English and German europarl datasets and combine them into a parallel sentence translation dataframe.

In [1]:
# from src.data.import_data import create_data_subset

In [2]:
# create_data_subset(sentence_data_source='../data/external/europarl-v7.de-en.en',
#                        sentences_data_target='../data/external/europarl-v7.de-en.de',
#                        sample_size=200000)

In [3]:
from src.data.preprocessing_class import PreprocessingEuroParl

In [4]:
parallel_sentences = PreprocessingEuroParl(df_sampled_path="../data/interim/europarl_english_german.pkl")

In [5]:
# Restrict the working frame to its first 1,000 rows (positional slice, all columns).
parallel_sentences.dataframe = parallel_sentences.dataframe.iloc[:1000]

In [6]:
parallel_sentences.dataframe

Unnamed: 0,id,text_source,text_target
0,0,"If it is legal, we do not need a debate.","Falls es legal ist, dann brauchen wir keine De..."
1,1,14. Further macro-financial assistance for Geo...,14. Weitere Makrofinanzhilfe für Georgien (
2,2,"The request came to nothing, firstly because t...","Diese Forderung verlief im Sande, zum einen, w..."
3,3,"That is how the judicial system works, and Lor...","So funktioniert der Rechtsstaat, und Lord Beth..."
4,4,The Council's refusal to make the Charter of F...,"Wir sind daher durch die Weigerung des Rates, ..."
...,...,...,...
995,995,"The results that have been achieved, over many...","Die Ergebnisse, die über viele Jahre hinweg du..."
996,996,It is our hope that the men and women in Afgha...,"Wir wünschen uns, dass auch in Afghanistan die..."
997,997,Members will therefore receive written answers...,Die Mitglieder werden daher schriftliche Antwo...
998,998,It will apply not only to products within all ...,Die neue Regelung wird sich nicht nur auf Prod...


In [7]:
# #duc dataset
# parallel_sentences.dataframe['Translation']=1
# wrong= PreprocessingEuroParl(sentence_data_source='../data/external/europarl-v7.de-en.en',
#                  sentence_data_target='../data/external/europarl-v7.de-en.de',number_datapoints=10000)
# import pandas as pd
# wrong_data=pd.concat([wrong.dataframe.drop(columns='text_target').reset_index(drop=True),wrong.dataframe['text_target'].sample(frac=1).reset_index(drop=True)],axis=1)
# wrong_data['Translation']=0
# data=pd.concat([parallel_sentences.dataframe.reset_index(drop=True),wrong_data.reset_index(drop=True)])
# import pickle 
# filehandler = open('../data/processed/dataset_duc.pkl', 'wb') 
# pickle.dump(data, filehandler)

## II. Preprocess data

In this section we preprocess the parallel sentence data.

In [8]:
import spacy
from nltk.corpus import stopwords
from textblob import TextBlob as textblob_source
from textblob_de import TextBlobDE as textblob_target
import en_core_web_sm
import de_core_news_sm

In [9]:
# --- Language resources --------------------------------------------------
# spaCy pipelines: English model for the source side, German model for the target side.
nlp_source = en_core_web_sm.load()
nlp_target = de_core_news_sm.load()

# NLTK stop-word lists for the same two languages.
stopwords_source = stopwords.words('english')
stopwords_target = stopwords.words('german')

# --- Pickled embedding files (array + word dictionary per language) -------
embedding_array_source_path = "../data/interim/proc_5k_src_emb.pkl"
embedding_dictionary_source_path =  "../data/interim/proc_5k_src_word.pkl"
embedding_array_target_path = "../data/interim/proc_5k_trg_emb.pkl"
embedding_dictionary_target_path =  "../data/interim/proc_5k_trg_word.pkl"

# --- Tunable parameters ----------------------------------------------------
# Handed to preprocess_sentences and FeatureGeneration further down;
# presumably the number of principal components to keep (TODO confirm).
number_pc = 10
# Not referenced by any other cell visible in this notebook.
number_translations = 1

In [10]:
import time

# Time the complete preprocessing run.  perf_counter() is a monotonic,
# high-resolution clock, so the measured duration cannot be distorted by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
parallel_sentences.preprocess_sentences(
    stopwords_source, nlp_source, textblob_source,
    embedding_array_source_path, embedding_dictionary_source_path,
    stopwords_target, nlp_target, textblob_target,
    embedding_array_target_path, embedding_dictionary_target_path,
    number_pc)
end = time.perf_counter()

# Elapsed time in seconds.
print(end - start)

Finished 'lemmatize' in 5.354 secs
Finished 'tokenize_sentence' in 0.126 secs
Finished 'strip_whitespace' in 0.001 secs
Finished 'lowercase' in 0.002 secs
Finished 'remove_punctuation' in 0.003 secs
Finished 'remove_stopwords' in 0.034 secs
Finished 'remove_numbers' in 0.005 secs
Finished 'create_cleaned_token_embedding' in 5.527 secs
Finished 'lemmatize' in 4.951 secs
Finished 'tokenize_sentence' in 0.135 secs
Finished 'strip_whitespace' in 0.002 secs
Finished 'lowercase' in 0.003 secs
Finished 'remove_punctuation' in 0.003 secs
Finished 'remove_stopwords' in 0.042 secs
Finished 'remove_numbers' in 0.006 secs
Finished 'create_cleaned_token_embedding' in 5.142 secs
Finished 'tokenize_sentence' in 0.124 secs
Finished 'strip_whitespace' in 0.001 secs
Finished 'lowercase' in 0.002 secs
Finished 'create_cleaned_text' in 0.129 secs
Finished 'tokenize_sentence' in 0.139 secs
Finished 'strip_whitespace' in 0.001 secs
Finished 'lowercase' in 0.065 secs
Finished 'create_cleaned_text' in 0.206 s

  return (character_vector / word_vector).replace(np.nan, 0).replace(np.inf, 0).replace(np.log(0), 0)



Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs
Finished 'number_punctuation_marks' in 0.001 secs

  return [pd.Series(embedding_dataframe.values.mean(axis=1))]


Finished 'sentence_embedding_tf_idf' in 1.341 secs
Finished 'sentence_embedding_tf_idf' in 1.111 secs
Finished 'sentence_embedding_average' in 0.114 secs
Finished 'sentence_embedding_average' in 0.111 secs
Finished 'sentence_embedding_tf_idf' in 1.246 secs
Finished 'sentence_embedding_tf_idf' in 1.149 secs
94.1337890625


In [11]:
parallel_sentences.preprocessed

Unnamed: 0,id,token_preprocessed_embedding_source,token_preprocessed_embedding_target,number_stopwords_source,number_stopwords_target,number_punctuations_total_source,number_punctuations_total_target,number_words_source,number_words_target,number_unique_words_source,...,tf_idf_target,sentence_embedding_average_source,sentence_embedding_average_target,sentence_embedding_tf_idf_source,sentence_embedding_tf_idf_target,pca_sentence_embedding_average_source,pca_sentence_embedding_average_target,pca_sentence_embedding_tf_idf_source,pca_sentence_embedding_tf_idf_target,Translation
0,0,"[legal, need, debate]","[fall, legal, brauchen, debatte, fahren]",7,6,1,1,3,5,3,...,"{'': 0.0, ''s': 0.0, ',': 0.0, '--': 0.0, '-b'...","[[-0.0547864210481445, -0.001046886279558142, ...","[[-0.0004642516374588013, -0.00480425700079649...","[[-0.03485798347327795, -0.0002601149029797123...","[[-0.0008558102416588422, -0.00275132547459795...","[[-0.18665869534015656, 0.0541328601539135, 0....","[[0.1868635393679142, -0.10517097525298595, 0....","[[-0.1032800996159342, 0.030580838367026447, 0...","[[0.07970107030309546, -0.04937941786176913, 0...",1
1,1,"[, macro, financial, assistance, georgia]","[, weitere, makrofinanzhilfe, georgien]",2,1,1,1,4,4,4,...,"{'': 0.25233564621573196, ''s': 0.0, ',': 0.0,...","[[-0.07823536987416446, 0.028520435327664018, ...","[[-0.04885647725313902, -0.017179030925035477,...","[[-0.03417419916351019, 0.014746991299733562, ...","[[-0.026505450088098993, -0.010296839819223786...","[[-0.05651350598782301, -0.003383892122656107,...","[[-0.0351869510486722, 0.030359416268765926, -...","[[-0.028320199597242593, -0.003322719201683925...","[[-0.018461613968162153, 0.0168677438071605, -...",1
2,2,"[request, come, nothing, firstly, agreement, c...","[forderung, verlaufen, sand, abkomme, juli, , ...",23,22,3,5,22,16,22,...,"{'': 0.12676620105263722, ''s': 0.0, ',': 0.0,...","[[-0.045134014490759, 0.01751163163135061, -0....","[[-0.05232225576764904, 0.020760945742949843, ...","[[-0.010365411129523827, 0.0038327608980117133...","[[-0.013106617638670265, 0.004686814372174176,...","[[-0.1328783824108541, 0.09318970788735896, 0....","[[0.14227077613274255, 0.0006305481074377894, ...","[[-0.029045820329685612, 0.02108844935256146, ...","[[0.03488908218309481, 0.0008203417196038076, ...",1
3,3,"[judicial, system, work, lord, bethell, certai...","[funktionieren, rechtsstaat, lord, bethell, si...",17,12,1,3,14,12,13,...,"{'': 0.0, ''s': 0.0, ',': 0.0, '--': 0.0, '-b'...","[[-0.06474494597373101, 0.01477035337414306, -...","[[-0.04374217469659117, -0.018645388384660084,...","[[-0.017277284154384016, 0.005058955207043385,...","[[-0.012080231561605363, -0.005438625653106758...","[[-0.09408186576687373, 0.034677622553247675, ...","[[0.1254079987605413, -0.02755207723627488, 0....","[[-0.023637155804179286, 0.008096865210966515,...","[[0.04454561886093693, -0.013183248965263078, ...",1
4,4,"[council, 's, refusal, make, charter, fundamen...","[daher, weigerung, rat, grundrechtecharta, rec...",24,27,6,5,31,21,27,...,"{'': 0.0, ''s': 0.0, ',': 0.0, '--': 0.0, '-b'...","[[-0.025971028398579128, 0.013925570925214114,...","[[-0.03412883120795919, 0.008240846121528497, ...","[[-0.0044109827176865655, 0.001966324529453451...","[[-0.007807887274272604, 0.0016770821514765677...","[[-0.14225149118842986, 0.12023432431026147, 0...","[[0.2531268526282575, -0.032746767702822886, 0...","[[-0.027294689589584895, 0.0233857555393358, 0...","[[0.055980738282802846, -0.008415893602827644,...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,"[result, achieve, many, year, european, form, ...","[ergebnis, jahr, hinweg, europäische, form, so...",19,20,5,5,24,22,22,...,"{'': 0.0, ''s': 0.0, ',': 0.0, '--': 0.0, '-b'...","[[-0.04323588777333498, 0.01629018190890617, -...","[[-0.04303964310222202, 0.02331589128718608, -...","[[-0.009646008540982617, 0.0034379211438927766...","[[-0.00941331191963146, 0.005418964724541407, ...","[[-0.12084609662581768, 0.07701705311912865, 0...","[[0.1528394222776923, -0.004250000515538786, 0...","[[-0.025705371857393317, 0.01730794604275638, ...","[[0.03410446276745788, -0.001262826572331346, ...",1
996,996,"[hope, man, woman, afghanistan, believe, freed...","[wünschen, afghanistan, mann, frau, freiheit, ...",18,20,0,4,20,19,20,...,"{'': 0.0, ''s': 0.0, ',': 0.0, '--': 0.0, '-b'...","[[-0.055408882246793884, 0.03557439512683471, ...","[[-0.035435031442081225, 0.028897757121526143,...","[[-0.012152234091674813, 0.007647233675488792,...","[[-0.007682745401818995, 0.0064577388111917615...","[[-0.049277301216007846, 0.09290142847519171, ...","[[0.06168329710250392, -0.01276528722305289, 0...","[[-0.0105165061482805, 0.018708888845933053, 0...","[[0.01446977716742693, -0.003531519313800009, ...",1
997,997,"[member, therefore, receive, write, answer, qu...","[mitglied, daher, schriftlich, antwort, anfrag...",3,4,0,0,6,6,6,...,"{'': 0.0, ''s': 0.0, ',': 0.0, '--': 0.0, '-b'...","[[-0.0073543175822123885, 0.04231263898933927,...","[[-0.031383881655832134, 0.025362386057774227,...","[[-0.0028732507984971078, 0.016673778483697062...","[[-0.012763970955682755, 0.012213428779897206,...","[[-0.1528842349847158, 0.09079383251567681, 0....","[[0.14628264432152113, 0.009530164301395416, 0...","[[-0.061989143552626434, 0.039891886342773096,...","[[0.058890583197122814, 0.003327974210690777, ...",1
998,998,"[apply, product, within, eu, member, states, a...","[neue, regelung, produkt, sämtlich, eu-mitglie...",9,10,1,1,10,9,8,...,"{'': 0.0, ''s': 0.0, ',': 0.0, '--': 0.0, '-b'...","[[-0.018103930982761085, -0.015288953320123255...","[[-0.0181253104071532, -0.045000554860702584, ...","[[-0.004551185450077001, -0.007122507993248681...","[[-0.005375928984644692, -0.012707365399049776...","[[-0.1138174117077142, 0.011841099592857063, -...","[[0.1764306053519249, 0.03697663013424192, -0....","[[-0.05131239923001699, 0.008111775647292764, ...","[[0.06409916711630012, 0.005045074107518778, -...",1


In [12]:
parallel_sentences.preprocessed.translated_to_source_target

0                [case, illegal, need, criticism, train]
1                                     [several, armenia]
2      [precondition, connect, soil, june, end, view,...
3      [configure, democracy, exchequer, secure, dani...
4      [therefore, refusal, council, absurd, situatio...
                             ...                        
995    [outcome, year, across, european, form, solida...
996    [willing, afghanistan, man, woman, freedom, de...
997    [member, therefore, document, answer, reply, r...
998    [new, provision, product, almost, extend, vari...
999    [obviously, exclude, satisfaction, union, addr...
Name: translated_to_source_target, Length: 1000, dtype: object

In [None]:
# Load, for each language, an embedding array and its accompanying dictionary
# from the pickle paths defined in the configuration cell.
# NOTE(review): `load_embeddings` is neither imported nor defined in any cell
# visible in this notebook, so this cell raises NameError on a fresh kernel.
# Import it from the project module that provides it (TODO confirm the module path).
embedding_array_normalized_source, embedding_dictionary_source = load_embeddings(
            embedding_array_source_path, embedding_dictionary_source_path)
embedding_array_normalized_target, embedding_dictionary_target = load_embeddings(
            embedding_array_target_path, embedding_dictionary_target_path)

In [None]:
import numpy as np

In [None]:
token_vector_source = parallel_sentences.preprocessed.token_preprocessed_embedding_source
token_vector_target = parallel_sentences.preprocessed.token_preprocessed_embedding_target

In [None]:
def create_translation_dictionary(token_vector_source, token_vector_target, 
                                  embedding_array_normalized_source, embedding_dictionary_source, 
                                  embedding_array_normalized_target, embedding_dictionary_target):
    """Build word-level translation dictionaries in both directions.

    Every distinct corpus token that has an embedding is mapped to the token of
    the other language whose embedding has the largest dot product with it.
    Candidates are restricted to tokens that occur in the other corpus and have
    an embedding themselves.  If the embedding rows are unit-length (as the
    parameter names suggest) the dot product equals the cosine similarity.

    Args:
        token_vector_source: Iterable of token lists for the source language.
        token_vector_target: Iterable of token lists for the target language.
        embedding_array_normalized_source: Array whose rows are the source
            embeddings, addressed by the values of ``embedding_dictionary_source``.
        embedding_dictionary_source: Mapping ``token -> row index`` into the
            source embedding array.
        embedding_array_normalized_target: Same as above for the target language.
        embedding_dictionary_target: Same as above for the target language.

    Returns:
        Tuple ``(translation_to_target_source, translation_to_source_target)``:
        the first dict maps source tokens to their closest target token, the
        second maps target tokens to their closest source token.  Tokens that
        are missing from their embedding dictionary are left out.  A direction
        is an empty dict when the other language has no embedded token at all.
    """
    def collect_embedded_tokens(token_vector, embedding_array, embedding_dictionary):
        """Return the distinct tokens that have an embedding and their vectors."""
        tokens = []
        vectors = []
        # dict.fromkeys removes duplicates but keeps first-seen order, so the
        # candidate order (and therefore tie-breaking) is reproducible.
        distinct_tokens = dict.fromkeys(
            item for sublist in token_vector for item in sublist)
        for token in distinct_tokens:
            row = embedding_dictionary.get(token)
            # Compare with None explicitly: row index 0 is a valid position but
            # falsy, so a plain truthiness test would drop that token.
            if row is not None:
                tokens.append(token)
                vectors.append(embedding_array[row])
        return tokens, np.array(vectors, dtype=float)

    def closest_tokens(query_tokens, query_vectors, candidate_tokens, candidate_vectors):
        """Map every query token to the candidate with the highest similarity."""
        if not candidate_tokens:
            # Nothing to translate into; avoid a shape error in np.dot.
            return {}
        translations = {}
        # One query at a time keeps memory at O(number of candidates).
        for token, vector in zip(query_tokens, query_vectors):
            similarity = np.dot(candidate_vectors, vector)
            # Only the single best match is needed, so argmax replaces a full sort.
            translations[token] = candidate_tokens[int(np.argmax(similarity))]
        return translations

    tokens_source, embedding_subset_source = collect_embedded_tokens(
        token_vector_source, embedding_array_normalized_source, embedding_dictionary_source)
    tokens_target, embedding_subset_target = collect_embedded_tokens(
        token_vector_target, embedding_array_normalized_target, embedding_dictionary_target)

    translation_to_target_source = closest_tokens(
        tokens_source, embedding_subset_source, tokens_target, embedding_subset_target)
    translation_to_source_target = closest_tokens(
        tokens_target, embedding_subset_target, tokens_source, embedding_subset_source)

    return translation_to_target_source, translation_to_source_target
             

In [None]:
def translate_words(token_vector, translation_dictionary):
    """Translate every token list in ``token_vector`` word by word.

    Args:
        token_vector: Object with an ``apply`` method (e.g. a pandas Series)
            whose elements are lists of tokens.
        translation_dictionary: Mapping from a token to its translation.

    Returns:
        The result of ``token_vector.apply``: one list of translations per
        input list.  Tokens without an entry in ``translation_dictionary``
        are skipped, so an output list can be shorter than its input.
    """
    def known_translations(tokens):
        # Yield translations lazily; a missing key simply produces nothing.
        for token in tokens:
            try:
                yield translation_dictionary[token]
            except KeyError:
                pass

    return token_vector.apply(lambda tokens: list(known_translations(tokens)))

In [None]:
translation_to_target_source, translation_to_source_target = create_translation_dictionary(token_vector_source, token_vector_target, 
                                  embedding_array_normalized_source, embedding_dictionary_source, 
                                  embedding_array_normalized_target, embedding_dictionary_target)

In [None]:
parallel_sentences.preprocessed

In [None]:
# def save_object(obj, filename):
#     with open(filename, 'wb') as output:  # Overwrites any existing file.
#        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

# sample usage
# save_object(parallel_sentences, '../data/processed/processed_data_2505.pkl')

In [None]:
# import pickle 
# filehandler = open('../data/processed/processed_data_2505_2.pkl', 'wb') 
# pickle.dump(parallel_sentences, filehandler)

In [None]:
# import pickle 
# file = open("../data/processed/processed_data_2505_2.pkl",'rb')
# df = pickle.load(file)
# file.close()

## III. Create data set

In [None]:
from src.data.dataset_class import DataSet

In [None]:
dataset = DataSet(parallel_sentences)

In [None]:
n_training = 90
n_test_queries = 1
n_test_documents = 10
k = 1

In [None]:
dataset.get_sample(n_training, n_test_queries, n_test_documents,k)

In [None]:
dataset.dataset

## II. Create sentence based features

In this section we create features for our model that are sentence-based and should be created before the text is preprocessed.

In [None]:
from src.features.feature_generation_class import FeatureGeneration

In [None]:
features_train = FeatureGeneration(dataset.dataset, number_pc)

In [None]:
features_train.feature_generation()

In [None]:
features_train.feature_dataframe

In [None]:
# import pickle 
# filehandler = open('../data/processed/processed_data.pkl', 'wb') 
# pickle.dump(features.feature_dataframe, filehandler)

In [None]:
# import pickle 
# file = open("../data/processed/processed_data.pkl",'rb')
# df = pickle.load(file)
# file.close()

In [None]:
# df

## IV. Unsupervised Classification

## IV. Feature selection

In [None]:
import seaborn as sn
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Look at the pairwise feature correlations as a heat map.
# NOTE(review): `df` is not assigned by any active cell above (the cell that
# would load it from a pickle is commented out), so it has to be provided
# before this cell can run on a fresh kernel.
# The earlier `np.cov(df, bias=True)` call was removed: its result was never
# used, and with the default rowvar=True it treats every row as a variable,
# i.e. it builds a (rows x rows) matrix only to throw it away.
corrMatrix = df.corr()
f = plt.figure(figsize=(14, 9))
sn.heatmap(corrMatrix, annot=False)
# plt.show() renders with every backend; Figure.show() emits a warning on
# non-GUI backends such as the notebook's inline backend.
plt.show()

In [None]:
# Correlation filter: for every pair of features whose absolute correlation
# exceeds 0.8, mark the later column of the pair for removal.
correlated_features = set()
# The label column 'Translation' is still part of `df` at this point and is only
# split off further down.  It is excluded here so that neither the label itself
# nor a feature that merely correlates with the label ends up in the drop set
# (dropping the label would make the later `df['Translation']` lookup fail).
# errors='ignore' keeps the cell usable when the column is already gone.
correlation_matrix = df.drop(columns=['Translation'], errors='ignore').corr()

for i in range(len(correlation_matrix.columns)):
    # Only the lower triangle is visited, so every pair is reported once.
    for j in range(i):
        correlation = abs(correlation_matrix.iloc[i, j])
        if correlation > 0.8:
            colname = correlation_matrix.columns[i]
            print(f"The following features are correlated: {colname} and {correlation_matrix.columns[j]}. Correlation = {round(correlation,2)}")
            correlated_features.add(colname)

print(f"Drop the following features: {correlated_features}")

In [None]:
#drop correlated features, but only when looking at a big dataset
df=df.drop(columns=correlated_features)

In [None]:
df

## V. Train Model


### Create Train Test Split

In [None]:
target=df['Translation']
df=df.drop(columns=['Translation'])
df

In [None]:
sum(df.isna().any())

In [None]:
df=df.fillna(0)

In [None]:
# Scale the data; otherwise logistic regression does not converge.
# NOTE(review): the scaler is fitted on the complete frame, and train_test_split
# is only called in a later cell.  The minima/maxima of the eventual test rows
# therefore influence how the training rows are scaled (data leakage).
# Consider splitting first and fitting the scaler on the training part only.
from sklearn import preprocessing

# Fit the scaler and overwrite every column of `df` with its scaled values.
scaler = preprocessing.MinMaxScaler()
df[df.columns] = scaler.fit_transform(df[df.columns])


In [None]:
df

In [None]:
from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test = train_test_split(
    df,target,test_size=0.2, random_state = 42)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Fit a class-balanced logistic regression on the training split.
lr = LogisticRegression(class_weight = 'balanced', max_iter=10000)
lr.fit(data_train, target_train)

# Predict the held-out split and score it with the four metrics, in the
# order accuracy, F1, precision, recall.
prediction = lr.predict(data_test)
acc, f1, pr, re = (
    metric(target_test, prediction)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score on test set: {:.4f}".format(f1))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))

In [None]:
# Feature importance: use the coefficients of the fitted logistic regression
# (first row of coef_) as importance scores.
importance = lr.coef_[0]
positions = range(len(importance))

# Print one line per feature: position, column name and coefficient.
for i in positions:
    v = importance[i]
    print(f'Feature: {i} {data_train.columns[i]}, Score: {v}')

# Bar chart of the coefficients, one bar per feature position.
plt.bar(positions, importance)
plt.show()

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Fit a Gaussian naive Bayes classifier on the training split.
nb = GaussianNB()
nb.fit(data_train, target_train)

# Predict the held-out split and score it with the four metrics, in the
# order accuracy, F1, precision, recall.
prediction = nb.predict(data_test)
acc, f1, pr, re = (
    metric(target_test, prediction)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score on test set: {:.4f}".format(f1))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Fit a support vector classifier with default settings on the training split.
svc = SVC()
svc.fit(data_train, target_train)

# Predict the held-out split and score it with the four metrics, in the
# order accuracy, F1, precision, recall.
prediction = svc.predict(data_test)
acc, f1, pr, re = (
    metric(target_test, prediction)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score on test set: {:.4f}".format(f1))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Fit a multi-layer perceptron on the training split.  Weight initialisation
# and batch shuffling are random, so the seed is fixed (same value as the
# train/test split) to make the reported scores reproducible between runs.
mlp = MLPClassifier(random_state=42).fit(data_train, target_train)

# Predict the held-out split and score it.
prediction = mlp.predict(data_test)
acc = accuracy_score(target_test,prediction) 
f1= f1_score(target_test,prediction) 
pr= precision_score(target_test,prediction) 
re= recall_score(target_test,prediction) 
print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score on test set: {:.4f}".format(f1))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))