# Preprocessing and Feature Creation

In this notebook we import the data, preprocess it, and create features for supervised and unsupervised cross-lingual information retrieval models.

## I. Import Data

In this section we import the English and German Europarl datasets and combine them into a dataframe of parallel (translated) sentences.

In [None]:
from src.data.preprocessing_class import PreprocessingEuroParl

In [None]:
# Build the parallel-sentence object from the English (.en) and German (.de)
# Europarl files. number_datapoints presumably caps how many sentence pairs
# are read — confirm in PreprocessingEuroParl.
parallel_sentences = PreprocessingEuroParl(
    sentence_data_source='../data/external/europarl-v7.de-en.en',
    sentence_data_target='../data/external/europarl-v7.de-en.de',
    number_datapoints=100,
)

In [None]:
# Inspect the dataframe held by the parallel-sentence object.
parallel_sentences.dataframe

In [None]:
# #duc dataset
# parallel_sentences.dataframe['Translation']=1
# wrong= PreprocessingEuroParl(sentence_data_source='../data/external/europarl-v7.de-en.en',
#                  sentence_data_target='../data/external/europarl-v7.de-en.de',number_datapoints=10000)
# import pandas as pd
# wrong_data=pd.concat([wrong.dataframe.drop(columns='text_target').reset_index(drop=True),wrong.dataframe['text_target'].sample(frac=1).reset_index(drop=True)],axis=1)
# wrong_data['Translation']=0
# data=pd.concat([parallel_sentences.dataframe.reset_index(drop=True),wrong_data.reset_index(drop=True)])
# import pickle 
# filehandler = open('../data/processed/dataset_duc.pkl', 'wb') 
# pickle.dump(data, filehandler)

## II. Preprocess data

In this section we preprocess the parallel sentence data.

In [None]:
import spacy
from nltk.corpus import stopwords
from textblob import TextBlob as textblob_source
from textblob_de import TextBlobDE as textblob_target
import en_core_web_sm
import de_core_news_sm

In [None]:
# Language resources and settings handed to `preprocess_sentences` in the next cell.
# "source" is English and "target" is German throughout.
#
# NLTK stopword lists for each language.
stopwords_source = stopwords.words('english')
stopwords_target = stopwords.words('german')
# spaCy pipelines loaded from the small English / German model packages.
nlp_source = en_core_web_sm.load()
nlp_target = de_core_news_sm.load()
# Paths to the embedding files for each language — presumably pickled
# matrix / vocabulary pairs; verify against the preprocessing class.
embedding_matrix_source = "../data/interim/proc_b_src_emb.p"
embedding_dictionary_source =  "../data/interim/proc_b_src_word.p"
embedding_matrix_target = "../data/interim/proc_b_trg_emb.p"
embedding_dictionary_target =  "../data/interim/proc_b_trg_word.p"
# Numeric settings for the preprocessing call; `number_pc` is also passed to
# FeatureGeneration further down. Their exact meaning is defined in the project classes.
number_translations = 1
number_pc = 10

In [None]:
# Run the project's sentence preprocessing. Arguments are positional, so the
# order below (source block, target block, shared settings) must not change.
parallel_sentences.preprocess_sentences(
    # source-language (English) resources
    stopwords_source,
    nlp_source,
    textblob_source,
    embedding_matrix_source,
    embedding_dictionary_source,
    # target-language (German) resources
    stopwords_target,
    nlp_target,
    textblob_target,
    embedding_matrix_target,
    embedding_dictionary_target,
    # shared settings
    number_translations,
    number_pc,
)

In [None]:
# def save_object(obj, filename):
#     with open(filename, 'wb') as output:  # Overwrites any existing file.
#        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

# sample usage
# save_object(parallel_sentences, '../data/processed/processed_data_2505.pkl')

In [None]:
# import pickle 
# filehandler = open('../data/processed/processed_data_2505_2.pkl', 'wb') 
# pickle.dump(parallel_sentences, filehandler)

In [None]:
# import pickle 
# file = open("../data/processed/processed_data_2505_2.pkl",'rb')
# df = pickle.load(file)
# file.close()

## III. Create data set

In [None]:
from src.data.dataset_class import DataSet

In [None]:
# Wrap the preprocessed parallel sentences in the project's DataSet class.
dataset = DataSet(parallel_sentences)

In [None]:
# Sample sizes passed to `dataset.get_sample` in the next cell — presumably the
# number of training pairs, test queries and test documents; confirm in DataSet.
n_training = 50
n_test_queries = 5
n_test_documents = 10

In [None]:
# Draw the sample using the three size settings defined in the previous cell.
dataset.get_sample(n_training, n_test_queries, n_test_documents)

## II. Create sentence based features

In this section we create the sentence-based features for our model. These features should be created before the text is preprocessed.

In [None]:
from src.features.feature_generation_class import FeatureGeneration

In [None]:
# Set up feature generation on the sampled dataset; number_pc is the same
# setting that was passed to the preprocessing step.
features = FeatureGeneration(dataset.dataset, number_pc)

In [None]:
# Run feature generation — presumably this fills features.feature_dataframe,
# which is displayed in the next cell; confirm in FeatureGeneration.
features.feature_generation()

In [None]:
# Inspect the generated feature table.
features.feature_dataframe

In [None]:
# import pickle 
# filehandler = open('../data/processed/processed_data.pkl', 'wb') 
# pickle.dump(features.feature_dataframe, filehandler)

In [None]:
# import pickle 
# file = open("../data/processed/processed_data.pkl",'rb')
# df = pickle.load(file)
# file.close()

In [None]:
# Working frame for feature selection and model training.
# The cell that reloads it from '../data/processed/processed_data.pkl' is
# commented out, so without this assignment `df` is undefined on a fresh
# kernel and every cell below fails with NameError. That pickle is written
# from `features.feature_dataframe`, so the in-memory frame is used directly.
# .copy() (assumes feature_dataframe is a pandas DataFrame) keeps later edits
# to `df` from touching the FeatureGeneration object.
df = features.feature_dataframe.copy()
df.head()

## IV. Feature selection

In [None]:
import seaborn as sn
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Look at the correlation matrix: heatmap of pairwise column correlations,
# used to spot redundant features before model training.
# The former `np.cov(df, bias=True)` call was removed: its result was never
# used, and np.cov treats each row as a variable by default, so it built a
# rows-by-rows matrix for nothing.
corrMatrix = df.corr()
f, ax = plt.subplots(figsize=(14, 9))
sn.heatmap(corrMatrix, annot=False, ax=ax)
ax.set_title("Feature correlation matrix")
# plt.show() instead of f.show(): Figure.show() emits a warning on non-GUI
# backends such as the notebook's inline backend.
plt.show()

In [None]:
# Correlation screening: walk the lower triangle of the correlation matrix and
# flag the later column of every pair whose absolute correlation exceeds 0.8.
correlated_features = set()
correlation_matrix = df.corr()
column_names = correlation_matrix.columns

for row_pos, row_name in enumerate(column_names):
    for col_pos in range(row_pos):
        strength = abs(correlation_matrix.iloc[row_pos, col_pos])
        # `not >` rather than `<=` so NaN correlations are skipped, as before.
        if not strength > 0.8:
            continue
        print(f"The following features are correlated: {row_name} and {column_names[col_pos]}. Correlation = {round(strength,2)}")
        correlated_features.add(row_name)

print(f"Drop the following features: {correlated_features}")

In [None]:
# Drop correlated features, but only when looking at a big dataset.
# The correlation scan runs over every column, including the label
# 'Translation'. If the label is flagged it must stay, because the train/test
# section reads df['Translation'].
columns_to_drop = [name for name in correlated_features if name != 'Translation']
# errors='ignore' keeps the cell re-runnable: columns already removed by an
# earlier run are skipped instead of raising KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Inspect the frame after the correlated-feature drop.
df

## V. Train Model


### Create Train Test Split

In [None]:
# Separate the label column 'Translation' from the feature columns.
# NOTE(review): this cell is not re-runnable — after the first run `df` no
# longer contains 'Translation', so a second run raises KeyError.
target=df['Translation']
df=df.drop(columns=['Translation'])
df

In [None]:
# Number of columns that contain at least one missing value.
sum(df.isna().any())

In [None]:
# Replace every missing value with 0.
df=df.fillna(0)

In [None]:
#scale data otherwise logistic regression does not converge
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split further down, so test rows influence the min/max used for scaling.
from sklearn import preprocessing

# Rescale every column with MinMaxScaler (default range is [0, 1]).
scaler = preprocessing.MinMaxScaler()
df[df.columns] = scaler.fit_transform(df[df.columns])


In [None]:
# Inspect the scaled feature frame.
df

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
data_train, data_test, target_train, target_test = train_test_split(
    df,target,test_size=0.2, random_state = 42)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Fit a class-balanced logistic regression on the training split.
lr = LogisticRegression(class_weight='balanced', max_iter=10000)
lr.fit(data_train, target_train)

# Predict the held-out split and compute the four scores in the original order.
prediction = lr.predict(data_test)
acc, f1, pr, re = (
    metric(target_test, prediction)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

# Report each score with four decimals.
for template, score in (
    ("The Accuracy on test set: {:.4f}", acc),
    ("The F1-Score on test set: {:.4f}", f1),
    ("The Precision-Score on test set: {:.4f}", pr),
    ("The Recall-Score on test set: {:.4f}", re),
):
    print(template.format(score))

In [None]:
# Feature importance: read the fitted logistic-regression coefficients
# (first row of coef_) as per-feature weights.
importance = lr.coef_[0]

# List every coefficient next to its position and training-column name.
for position, weight in enumerate(importance):
    print(f'Feature: {position} {data_train.columns[position]}, Score: {weight}')

# Bar chart of the coefficients, one bar per feature position.
plt.bar(range(len(importance)), importance)
plt.show()

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Fit a Gaussian naive Bayes classifier on the training split.
nb = GaussianNB()
nb.fit(data_train, target_train)

# Predict the held-out split and compute the four scores in the original order.
prediction = nb.predict(data_test)
acc, f1, pr, re = (
    metric(target_test, prediction)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

# Report each score with four decimals.
for template, score in (
    ("The Accuracy on test set: {:.4f}", acc),
    ("The F1-Score on test set: {:.4f}", f1),
    ("The Precision-Score on test set: {:.4f}", pr),
    ("The Recall-Score on test set: {:.4f}", re),
):
    print(template.format(score))

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Fit a support vector classifier with default settings on the training split.
svc = SVC()
svc.fit(data_train, target_train)

# Predict the held-out split and compute the four scores in the original order.
prediction = svc.predict(data_test)
acc, f1, pr, re = (
    metric(target_test, prediction)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

# Report each score with four decimals.
for template, score in (
    ("The Accuracy on test set: {:.4f}", acc),
    ("The F1-Score on test set: {:.4f}", f1),
    ("The Precision-Score on test set: {:.4f}", pr),
    ("The Recall-Score on test set: {:.4f}", re),
):
    print(template.format(score))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Fit a multi-layer perceptron on the training split.
# MLPClassifier draws its initial weights (and, for the default solver, its
# batches) at random. Without a seed the scores below change from run to run,
# so the seed is pinned to the same value used for the train/test split.
mlp = MLPClassifier(random_state=42).fit(data_train, target_train)

# Predict the held-out split and report accuracy, F1, precision and recall.
prediction = mlp.predict(data_test)
acc = accuracy_score(target_test, prediction)
f1 = f1_score(target_test, prediction)
pr = precision_score(target_test, prediction)
re = recall_score(target_test, prediction)
print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score on test set: {:.4f}".format(f1))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))