# Preprocessing and Feature Creation

In this notebook we import the data, preprocess it, and create features for supervised and unsupervised cross-lingual information retrieval models.

## I. Import Data

In this section we import the English and German Europarl datasets and combine them into a dataframe of parallel (translated) sentences.

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all modules before each
# cell is executed, so edits to imported project code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
# Make the parent of the current working directory importable so that the
# `src` package used below can be resolved. os.path.abspath('') yields the
# working directory; os.path.dirname() steps one level up from it.
project_root = os.path.dirname(os.path.abspath(''))
# Guard the append so that re-running this cell does not add the same entry again.
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data import create_data_subset

In [None]:
# Build the sampled English/German sentence subset from the two Europarl files.
# The result is stored at `sentence_data_sampled_path`; the same pickle path is
# handed to PreprocessingEuroParl further down.
create_data_subset(
    sentence_data_source_path='../data/external/europarl-v7.de-en.en',
    sentence_data_target_path='../data/external/europarl-v7.de-en.de',
    sample_size=25000,
    sentence_data_sampled_path="../data/interim/europarl_en_de.pkl",
)

## II. Preprocess data

In this section we preprocess the parallel sentence data for feature generation.

In [None]:
import spacy
from nltk.corpus import stopwords
from textblob import TextBlob as textblob_source
from textblob_de import TextBlobDE as textblob_target
import en_core_web_sm
import de_core_news_sm
# import it_core_news_sm
# import pl_core_news_sm
import time
from src.data import PreprocessingEuroParl

In [None]:
# Source language (English): NLTK stopword list and spaCy pipeline.
stopwords_source = stopwords.words('english')
nlp_source = en_core_web_sm.load()

# Target language (German by default): NLTK stopword list and spaCy pipeline.
stopwords_target = stopwords.words('german') # German stopwords
nlp_target = de_core_news_sm.load() # German pipeline

# To use another target language, replace the two target assignments above with
# one of the pairs below and enable the matching (commented-out) model import.
# stopwords_target = stopwords.words('italian') # Italian stopwords
# nlp_target = it_core_news_sm.load() # Italian pipeline
# stopwords_target = stopwords.words('polish') # Polish stopwords
# nlp_target = pl_core_news_sm.load() # Polish pipeline

In [None]:
# Create the preprocessing object from the sampled sentence-pair pickle
# (the path given as `sentence_data_sampled_path` to create_data_subset).
parallel_sentences = PreprocessingEuroParl(
    df_sampled_path="../data/interim/europarl_en_de.pkl"
)  # German
# Alternative language pairs:
# parallel_sentences = PreprocessingEuroParl(df_sampled_path="../data/interim/europarl_en_it.pkl") # Italian
# parallel_sentences = PreprocessingEuroParl(df_sampled_path="../data/interim/europarl_en_pol.pkl") # Polish

In [None]:
# Preprocess both sides of the sentence pairs, handing over the spaCy pipelines
# and the NLTK stopword lists for the source and target language.
parallel_sentences.preprocess_sentences(
    nlp_source,
    nlp_target,
    stopwords_source,
    stopwords_target,
)

In [None]:
# Pass both spaCy pipelines to extract_sentence_information; which pieces of
# information are extracted is defined in src.data (not visible in this notebook).
parallel_sentences.extract_sentence_information(nlp_source, nlp_target)

In [None]:
# Add embedding information for the "proc_5k" embedding variant.
# NOTE(review): the identifier is interpreted inside src.data; presumably a
# Procrustes-aligned embedding space built from a 5k dictionary - confirm.
parallel_sentences.create_embedding_information("proc_5k")

In [None]:
# Add embedding information for the "proc_b_1k" embedding variant.
# NOTE(review): the identifier is interpreted inside src.data; its exact meaning
# is not visible in this notebook - confirm.
parallel_sentences.create_embedding_information("proc_b_1k")

In [None]:
# Add embedding information for the "vecmap" embedding variant.
# NOTE(review): presumably embeddings aligned with VecMap - confirm in src.data.
parallel_sentences.create_embedding_information("vecmap")

In [None]:
# Persist the preprocessed frame as JSON so it can be reloaded later
# without repeating the preprocessing and embedding steps.
parallel_sentences.preprocessed.to_json("../data/interim/preprocessed_data_en_de.json")

In [None]:
import pandas as pd

# Shortcut for a fresh session: rebuild the preprocessing object from the sampled
# pickle and attach the previously exported JSON frame as its `preprocessed`
# attribute, instead of recomputing it.
parallel_sentences = PreprocessingEuroParl(
    df_sampled_path="../data/interim/europarl_en_de.pkl"
)
preprocessed_data = pd.read_json("../data/interim/preprocessed_data_en_de.json")
parallel_sentences.preprocessed = preprocessed_data

In [None]:
# Display the preprocessed frame for visual inspection.
parallel_sentences.preprocessed

In [None]:
# Display the object's `dataframe` attribute for visual inspection.
# NOTE(review): presumably the sampled, unprocessed sentence pairs - confirm in src.data.
parallel_sentences.dataframe

## III. Create data set

In this section we create the datasets for the training of the supervised model and the data for the supervised and unsupervised retrieval.

In [None]:
from src.data import DataSet

In [None]:
# Size parameters for the data sets built in this section.
# n_model and n_retrieval are passed to split_model_retrieval.
n_model = 20000
n_retrieval = 5000

# Number of leading rows of the retrieval subset whose source ids are used as queries.
n_queries = 100

# Passed to create_model_index together with n_model.
# NOTE(review): exact semantics of k / sample_size_k are defined in src.data - confirm.
k = 10
sample_size_k = 100

In [None]:
# Build the DataSet wrapper from the preprocessed frame held by parallel_sentences.
dataset = DataSet(parallel_sentences.preprocessed)
# Alternative: build it directly from the frame reloaded from JSON.
#dataset = DataSet(preprocessed_data)

In [None]:
# Split the data using the configured sizes n_model and n_retrieval.
# NOTE(review): presumably yields a model subset and a retrieval subset
# (`dataset.retrieval_subset` is read further down) - confirm in src.data.
dataset.split_model_retrieval(n_model, n_retrieval)

In [None]:
# Build the index of sentence pairs for the supervised model. The two column
# names select the source/target tf-idf "proc_5k" sentence embeddings.
dataset.create_model_index(
    n_model,
    k,
    sample_size_k,
    "sentence_embedding_tf_idf_proc_5k_source",
    "sentence_embedding_tf_idf_proc_5k_target",
)

In [None]:
# Export the model index; the index is reset (old index dropped) before
# writing it in Feather format.
(
    dataset.model_dataset_index
    .reset_index(drop=True)
    .to_feather("../data/processed/dataset_model_index_en_de.feather")
)

In [None]:
# import pandas as pd
# pd.read_feather("../data/processed/dataset_model_index.feather")

In [None]:
import pandas as pd
#dataset.create_retrieval_index(n_queries)

# If your pandas version is old, use this instead
# Source ids of the first n_queries rows act as queries; every target id of the
# retrieval subset is a candidate document.
query_ids = dataset.retrieval_subset.iloc[:n_queries]["id_source"]
document_ids = dataset.retrieval_subset["id_target"]

# NOTE(review): `query` and `documents` are not referenced again in the cells
# visible in this notebook; kept so any external use keeps working.
query = pd.DataFrame({"id_source": query_ids})
documents = pd.DataFrame({"id_target": document_ids})

# Cartesian product of query ids and document ids: one row per (query, document) pair.
index = pd.MultiIndex.from_product(
    [query_ids, document_ids],
    names = ["id_source", "id_target"],
)
dataset.retrieval_dataset_index = pd.DataFrame(index = index).reset_index()

In [None]:
# Export the retrieval index; the index is reset (old index dropped) before
# writing it in Feather format.
(
    dataset.retrieval_dataset_index
    .reset_index(drop=True)
    .to_feather("../data/processed/dataset_retrieval_index_en_de.feather")
)

In [None]:
# import pandas as pd
# pd.read_feather("../data/processed/dataset_retrieval_index.feather")

## IV. Create features

In this section we create the features for our model. These features are sentence-based and should be created before the text is preprocessed.

In [None]:
#%autoreload 2
from src.features import feature_generation_class

In [None]:
# import pickle
# with open(r"../data/processed/correlated_features.pkl", "rb") as file:
#    chosen_features = pickle.load(file)

Generation of the training data for the supervised classification model.

In [None]:
# Feature generator for the supervised model: combines the model pair index
# with the preprocessed sentence data.
features_model = feature_generation_class.FeatureGeneration(
    dataset.model_dataset_index,
    parallel_sentences.preprocessed,
)

In [None]:
# Create the feature dataframe for the model pairs (exported later via
# `features_model.feature_dataframe`).
features_model.create_feature_dataframe()

In [None]:
# Compute the sentence-based features for the model pairs.
features_model.create_sentence_features()

In [None]:
# Compute embedding-based features for the "proc_5k" embedding variant
# (same identifier as used in create_embedding_information).
features_model.create_embedding_features("proc_5k")

In [None]:
# Compute embedding-based features for the "proc_b_1k" embedding variant
# (same identifier as used in create_embedding_information).
features_model.create_embedding_features("proc_b_1k")

In [None]:
# Compute embedding-based features for the "vecmap" embedding variant
# (same identifier as used in create_embedding_information).
features_model.create_embedding_features("vecmap")

In [None]:
# Export the model feature frame; the index is reset (old index dropped)
# before writing it in Feather format.
(
    features_model.feature_dataframe
    .reset_index(drop=True)
    .to_feather("../data/processed/feature_model_en_de.feather")
)

In [None]:
# import pandas as pd
# pd.read_feather("../data/processed/feature_model.feather")

Generation of the data for the cross-lingual information retrieval task.

In [None]:
# Feature generator for the retrieval task: combines the query/document pair
# index with the preprocessed sentence data.
features_retrieval = feature_generation_class.FeatureGeneration(
    dataset.retrieval_dataset_index,
    parallel_sentences.preprocessed,
)

In [None]:
# Create the feature dataframe for the retrieval pairs (exported later via
# `features_retrieval.feature_dataframe`).
features_retrieval.create_feature_dataframe()

In [None]:
# Compute the sentence-based features for the retrieval pairs.
features_retrieval.create_sentence_features()

In [None]:
# Compute embedding-based features for the "proc_5k" embedding variant
# (same identifier as used in create_embedding_information).
features_retrieval.create_embedding_features("proc_5k")

In [None]:
# Compute embedding-based features for the "proc_b_1k" embedding variant
# (same identifier as used in create_embedding_information).
features_retrieval.create_embedding_features("proc_b_1k")

In [None]:
# Compute embedding-based features for the "vecmap" embedding variant
# (same identifier as used in create_embedding_information).
features_retrieval.create_embedding_features("vecmap")

In [None]:
# Export the retrieval feature frame; the index is reset (old index dropped)
# before writing it in Feather format.
(
    features_retrieval.feature_dataframe
    .reset_index(drop=True)
    .to_feather("../data/processed/feature_retrieval_en_de.feather")
)

In [None]:
import pandas as pd

# Sanity check: read back the retrieval feature file exported by this notebook
# ("feature_retrieval_en_de.feather").
# NOTE(review): this cell previously read "feature_retrieval_en_de_testset.feather",
# which no cell in this notebook writes, so a fresh Restart & Run All failed here.
# If a separately produced test-set file is intended, load it explicitly instead.
pd.read_feather("../data/processed/feature_retrieval_en_de.feather")