# Unsupervised Retrieval

In this notebook we use the generated features for a unsupervised crosslingual information retrieval task.

## I. Import Data

In this section we import the feature dataframe for the retrieval task.

In [1]:
import pickle5 as pickle
import pandas as pd
import sys, os
sys.path.append(os.path.dirname((os.path.abspath(''))))

from src.models.predict_model import MAP_score 

feature_retrieval = pd.read_feather("../data/processed/feature_retrieval_en_de.feather")
feature_retrieval = feature_retrieval.rename(columns={"id_source": "source_id", "id_target": "target_id"})

feature_retrieval_de = pd.read_feather("../data/processed/feature_retrieval_en_de_testset.feather")
feature_retrieval_de = feature_retrieval_de.rename(columns={"id_source": "source_id", "id_target": "target_id"})

feature_retrieval_pl = pd.read_feather("../data/processed/feature_retrieval_en_pl.feather")
feature_retrieval_pl = feature_retrieval_pl.rename(columns={"id_source": "source_id", "id_target": "target_id"})

feature_retrieval_it = pd.read_feather("../data/processed/feature_retrieval_en_it.feather")
feature_retrieval_it = feature_retrieval_it.rename(columns={"id_source": "source_id", "id_target": "target_id"})

feature_retrieval_doc = pd.read_feather("../data/processed/feature_retrieval_doc.feather")
feature_retrieval_doc = feature_retrieval_doc.rename(columns={"id_source": "source_id", "id_target": "target_id"})

## II. Unsupervised Retrieval

For Unsupervised Classification we use the distance measure features defined in the feature generation. Therefore we have four unsupervised models. Three crosslingual embedding based and one sentence encoder based. The three crosslingual embedding based models work with euclidean, cosine and word mover distance.

# Unsupervised: Proc 5k

In [2]:
print("Unsupervised for EN-DE: Validation Set")
unsupervised_prediction = feature_retrieval["cosine_similarity_average_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval["source_id"], feature_retrieval["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval["cosine_similarity_tf_idf_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval["source_id"], feature_retrieval["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tf-idf: {}".format(map_score))

unsupervised_prediction = feature_retrieval["jaccard_translation_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval["source_id"], feature_retrieval["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))

print("\nUnsupervised for EN-DE: Test Set")
unsupervised_prediction = feature_retrieval_de["cosine_similarity_average_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_de["source_id"], feature_retrieval_de["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval_de["cosine_similarity_tf_idf_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_de["source_id"], feature_retrieval_de["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tf-idf: {}".format(map_score))

unsupervised_prediction = feature_retrieval_de["jaccard_translation_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_de["source_id"], feature_retrieval_de["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))

Unsupervised for EN-DE: Validation Set
Map Score for Cosine Similarity Average: 0.4833301160883595
Map Score for Cosine Similarity tf-idf: 0.5509129715986113
Map Score for Jaccard Translation: 0.7515430182419874

Unsupervised for EN-DE: Test Set
Map Score for Cosine Similarity Average: 0.5008120315596996
Map Score for Cosine Similarity tf-idf: 0.5666612981721634
Map Score for Jaccard Translation: 0.8153442621756715


In [3]:
print("Unsupervised for EN-IT")
unsupervised_prediction = feature_retrieval_it["cosine_similarity_average_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_it["source_id"], feature_retrieval_it["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval_it["cosine_similarity_tf_idf_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_it["source_id"], feature_retrieval_it["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tf-idf: {}".format(map_score))

unsupervised_prediction = feature_retrieval_it["jaccard_translation_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_it["source_id"], feature_retrieval_it["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))

Unsupervised for EN-IT
Map Score for Cosine Similarity Average: 0.5571193171063424
Map Score for Cosine Similarity tf-idf: 0.5789370497247365
Map Score for Jaccard Translation: 0.7834398160814157


In [4]:
print("Unsupervised for EN-PL")
unsupervised_prediction = feature_retrieval_pl["cosine_similarity_average_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_pl["source_id"], feature_retrieval_pl["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval_pl["cosine_similarity_tf_idf_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_pl["source_id"], feature_retrieval_pl["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tf-idf: {}".format(map_score))

unsupervised_prediction = feature_retrieval_pl["jaccard_translation_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_pl["source_id"], feature_retrieval_pl["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))

Unsupervised for EN-PL
Map Score for Cosine Similarity Average: 0.4817124126905841
Map Score for Cosine Similarity tf-idf: 0.537818318817475
Map Score for Jaccard Translation: 0.7906280126497955


# Unsupervised: Proc-B 1k

In [5]:
print("Unsupervised for EN-DE: Validation Set")
unsupervised_prediction = feature_retrieval["cosine_similarity_average_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval["source_id"], feature_retrieval["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval["cosine_similarity_tf_idf_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval["source_id"], feature_retrieval["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tf-idf: {}".format(map_score))

unsupervised_prediction = feature_retrieval["jaccard_translation_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval["source_id"], feature_retrieval["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))


print("\nUnsupervised for EN-DE: Test Set")
unsupervised_prediction = feature_retrieval_de["cosine_similarity_average_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_de["source_id"], feature_retrieval_de["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval_de["cosine_similarity_tf_idf_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_de["source_id"], feature_retrieval_de["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tf-idf: {}".format(map_score))

unsupervised_prediction = feature_retrieval_de["jaccard_translation_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_de["source_id"], feature_retrieval_de["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))

Unsupervised for EN-DE: Validation Set
Map Score for Cosine Similarity Average: 0.441681530287522
Map Score for Cosine Similarity tf-idf: 0.5309456115988597
Map Score for Jaccard Translation: 0.7376443751786019

Unsupervised for EN-DE: Test Set
Map Score for Cosine Similarity Average: 0.5056690744651726
Map Score for Cosine Similarity tf-idf: 0.5587336694223508
Map Score for Jaccard Translation: 0.7946034589406027


In [6]:
print("Unsupervised for EN-IT")
unsupervised_prediction = feature_retrieval_it["cosine_similarity_average_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_it["source_id"], feature_retrieval_it["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval_it["cosine_similarity_tf_idf_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_it["source_id"], feature_retrieval_it["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tf-idf: {}".format(map_score))

unsupervised_prediction = feature_retrieval_it["jaccard_translation_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_it["source_id"], feature_retrieval_it["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))

Unsupervised for EN-IT
Map Score for Cosine Similarity Average: 0.5344506426258512
Map Score for Cosine Similarity tf-idf: 0.5751505222592489
Map Score for Jaccard Translation: 0.7870918525705566


In [7]:
print("Unsupervised for EN-PL")
unsupervised_prediction = feature_retrieval_pl["cosine_similarity_average_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_pl["source_id"], feature_retrieval_pl["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval_pl["cosine_similarity_tf_idf_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_pl["source_id"], feature_retrieval_pl["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tf-idf: {}".format(map_score))

unsupervised_prediction = feature_retrieval_pl["jaccard_translation_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_pl["source_id"], feature_retrieval_pl["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))

Unsupervised for EN-PL
Map Score for Cosine Similarity Average: 0.4653768805594804
Map Score for Cosine Similarity tf-idf: 0.567114306360162
Map Score for Jaccard Translation: 0.7721187995073494


# Unsupervised: VecMap

In [8]:
print("Unsupervised for EN-DE: Validation Set")
unsupervised_prediction = feature_retrieval["cosine_similarity_average_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval["source_id"], feature_retrieval["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval["cosine_similarity_tf_idf_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval["source_id"], feature_retrieval["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tf-idf: {}".format(map_score))

unsupervised_prediction = feature_retrieval["jaccard_translation_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval["source_id"], feature_retrieval["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))

print("\nUnsupervised for EN-DE: Test Set")
unsupervised_prediction = feature_retrieval_de["cosine_similarity_average_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_de["source_id"], feature_retrieval_de["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval_de["cosine_similarity_tf_idf_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_de["source_id"], feature_retrieval_de["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tf-idf: {}".format(map_score))

unsupervised_prediction = feature_retrieval_de["jaccard_translation_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_de["source_id"], feature_retrieval_de["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))

Unsupervised for EN-DE: Validation Set
Map Score for Cosine Similarity Average: 0.572114424544042
Map Score for Cosine Similarity tf-idf: 0.6234430179604064
Map Score for Jaccard Translation: 0.7366128879231537

Unsupervised for EN-DE: Test Set
Map Score for Cosine Similarity Average: 0.5854638386260453
Map Score for Cosine Similarity tf-idf: 0.6317861300961769
Map Score for Jaccard Translation: 0.7777866976792339


In [9]:
print("Unsupervised for EN-IT")
unsupervised_prediction = feature_retrieval_it["cosine_similarity_average_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_it["source_id"], feature_retrieval_it["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval_it["cosine_similarity_tf_idf_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_it["source_id"], feature_retrieval_it["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tf-idf: {}".format(map_score))

unsupervised_prediction = feature_retrieval_it["jaccard_translation_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_it["source_id"], feature_retrieval_it["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))

Unsupervised for EN-IT
Map Score for Cosine Similarity Average: 0.5794129831597857
Map Score for Cosine Similarity tf-idf: 0.6349631765333856
Map Score for Jaccard Translation: 0.7945411716464752


In [10]:
print("Unsupervised for EN-PL")
unsupervised_prediction = feature_retrieval_pl["cosine_similarity_average_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_pl["source_id"], feature_retrieval_pl["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval_pl["cosine_similarity_tf_idf_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_pl["source_id"], feature_retrieval_pl["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tf-idf: {}".format(map_score))

unsupervised_prediction = feature_retrieval_pl["jaccard_translation_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_pl["source_id"], feature_retrieval_pl["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))

Unsupervised for EN-PL
Map Score for Cosine Similarity Average: 0.5873575441611578
Map Score for Cosine Similarity tf-idf: 0.6404432949480776
Map Score for Jaccard Translation: 0.7751963922254527


# Document Corpus

In [11]:
unsupervised_prediction = feature_retrieval_doc["cosine_similarity_average_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_doc["source_id"], feature_retrieval_doc["Translation"], unsupervised_prediction)
print("Document Corpus Evaluation:")

print("\n------Proc-5K-----")
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval_doc["cosine_similarity_tf_idf_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_doc["source_id"], feature_retrieval_doc["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tfidf: {}".format(map_score))

unsupervised_prediction = feature_retrieval_doc["jaccard_translation_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_doc["source_id"], feature_retrieval_doc["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))


print("\n------Proc-b-1K-----")
unsupervised_prediction = feature_retrieval_doc["cosine_similarity_average_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_doc["source_id"], feature_retrieval_doc["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval_doc["cosine_similarity_tf_idf_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_doc["source_id"], feature_retrieval_doc["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tfidf: {}".format(map_score))

unsupervised_prediction = feature_retrieval_doc["jaccard_translation_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_doc["source_id"], feature_retrieval_doc["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))


print("\n------VecMap-----")
unsupervised_prediction = feature_retrieval_doc["cosine_similarity_average_proc_b_1k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_doc["source_id"], feature_retrieval_doc["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity Average: {}".format(map_score))

unsupervised_prediction = feature_retrieval_doc["cosine_similarity_tf_idf_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_doc["source_id"], feature_retrieval_doc["Translation"], unsupervised_prediction)
print("Map Score for Cosine Similarity tfidf: {}".format(map_score))

unsupervised_prediction = feature_retrieval_doc["jaccard_translation_vecmap"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_doc["source_id"], feature_retrieval_doc["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))

Document Corpus Evaluation:

------Proc-5K-----
Map Score for Cosine Similarity Average: 0.0025894709139908183
Map Score for Cosine Similarity tfidf: 0.03989460734110285
Map Score for Jaccard Translation: 0.0781507614256782

------Proc-b-1K-----
Map Score for Cosine Similarity Average: 0.0031360568153484696
Map Score for Cosine Similarity tfidf: 0.027318884289896022
Map Score for Jaccard Translation: 0.10946524326150824

------VecMap-----
Map Score for Cosine Similarity Average: 0.0031360568153484696
Map Score for Cosine Similarity tfidf: 0.02312668484141111
Map Score for Jaccard Translation: 0.1181369487198122


# Analysis: Why Jaccard Translation works well

In [15]:
unsupervised_prediction = feature_retrieval_de["jaccard_translation_proc_5k"].to_numpy()
unsupervised_prediction = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction]
map_score = MAP_score(feature_retrieval_de["source_id"], feature_retrieval_de["Translation"], unsupervised_prediction)
print("Map Score for Jaccard Translation: {}".format(map_score))

Map Score for Jaccard Translation: 0.8153442621756715


In [21]:
import pandas as pd
from src.data import PreprocessingEuroParl
preprocessed_data = pd.read_json("../data/interim/preprocessed_data_en_de.json")
parallel_sentences = PreprocessingEuroParl(df_sampled_path="../data/interim/europarl_en_de.pkl")
parallel_sentences.preprocessed = preprocessed_data

Finished function: 'import_data' in 0.19 seconds.


In [95]:
unsupervised_prediction_jaccard = feature_retrieval_de["jaccard_translation_proc_5k"].to_numpy()
unsupervised_prediction_jaccard = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction_jaccard]

unsupervised_prediction_cos = feature_retrieval_de["cosine_similarity_average_proc_5k"].to_numpy()
unsupervised_prediction_cos = [[1- pos_prediction_prob, pos_prediction_prob] for pos_prediction_prob in unsupervised_prediction_cos]

result_jaccard = pd.DataFrame()
result_jaccard['source_id'] = feature_retrieval["source_id"]
result_jaccard['target_id'] = feature_retrieval["target_id"]
result_jaccard['Translation'] = feature_retrieval["Translation"]
result_jaccard['probabilities'] = [x[1] for x in unsupervised_prediction_jaccard]
# rank by the source_id and get the ranking for each of the queries for all the documents
result_jaccard['rank'] = result_jaccard.groupby('source_id')['probabilities'].rank(method='average', ascending=False)

result_cos = pd.DataFrame()
result_cos['source_id'] = feature_retrieval["source_id"]
result_cos['target_id'] = feature_retrieval["target_id"]
result_cos['Translation'] = feature_retrieval["Translation"]
result_cos['probabilities'] = [x[1] for x in unsupervised_prediction_cos]
# rank by the source_id and get the ranking for each of the queries for all the documents
result_cos['rank'] = result_cos.groupby('source_id')['probabilities'].rank(method='average', ascending=False)

In [129]:
result_jaccard[result_jaccard["rank"] == 1.0].head(n=40)

Unnamed: 0,source_id,target_id,Translation,probabilities,rank
0,20000,20000,1,0.140867,1.0
10002,20002,20002,1,0.25817,1.0
15003,20003,20003,1,0.352564,1.0
20004,20004,20004,1,0.2,1.0
26440,20005,21440,0,0.183333,1.0
30006,20006,20006,1,0.222527,1.0
44287,20008,24287,0,0.190909,1.0
45009,20009,20009,1,0.201923,1.0
50010,20010,20010,1,0.176923,1.0
55011,20011,20011,1,0.10101,1.0


In [128]:
result_cos[result_cos["rank"] == 1.0].head(n=43)

Unnamed: 0,source_id,target_id,Translation,probabilities,rank
1427,20000,21427,0,0.834856,1.0
7065,20001,22065,0,0.773552,1.0
13767,20002,23767,0,0.827207,1.0
18158,20003,23158,0,0.842784,1.0
22067,20004,22067,0,0.777219,1.0
25500,20005,20500,0,0.769433,1.0
34184,20006,24184,0,0.83258,1.0
38825,20007,23825,0,0.661724,1.0
42943,20008,22943,0,0.815305,1.0
47602,20009,22602,0,0.8012,1.0


In [130]:
parallel_sentences.preprocessed.iloc[20036, :]["token_preprocessed_embedding_source"]

['mr',
 'president',
 'parliament',
 'must',
 'demand',
 'financial',
 'sanction',
 'iran']

In [131]:
parallel_sentences.preprocessed.iloc[20036, :]["translated_to_source_proc_5k_target"]

['lord', 'president', 'even', 'parliament', 'iran', 'insist']

In [132]:
parallel_sentences.preprocessed.iloc[20036, :]["token_preprocessed_embedding_target"]

['herr',
 'präsident',
 'deswegen',
 'parlament',
 'finanzsanktionen',
 'iran',
 'fordern']

In [133]:
parallel_sentences.preprocessed.iloc[20036, :]["translated_to_target_proc_5k_source"]

['herrn',
 'präsident',
 'parlament',
 'müssen',
 'nachfrage',
 'finanzielle',
 'sanktion',
 'iran']

In [134]:
a = parallel_sentences.preprocessed.apply(lambda x: "iran"  in x['token_preprocessed_embedding_source'], axis=1)

In [135]:
parallel_sentences.preprocessed[a]

Unnamed: 0,id_source,id_target,token_preprocessed_embedding_source,token_preprocessed_embedding_target,Translation,number_punctuations_total_source,number_punctuations_total_target,number_words_source,number_words_target,number_unique_words_source,...,sentence_embedding_average_proc_b_1k_source,sentence_embedding_average_proc_b_1k_target,sentence_embedding_tf_idf_proc_b_1k_source,sentence_embedding_tf_idf_proc_b_1k_target,translated_to_target_vecmap_source,translated_to_source_vecmap_target,sentence_embedding_average_vecmap_source,sentence_embedding_average_vecmap_target,sentence_embedding_tf_idf_vecmap_source,sentence_embedding_tf_idf_vecmap_target
560,560,560,"[answer, question, ask, believe, moment, give,...","[gestellt, frage, mögen, sagen, glauben, momen...",1,3,2,16,16,16,...,"[{'0': -0.0338814413, '1': 0.0354762134, '2': ...","[{'0': -0.035925401600000004, '1': 0.008561023...","[{'0': -0.0088409046, '1': 0.0080620772, '2': ...","[{'0': -0.0078660312, '1': 0.00144232550000000...","[beantworten, frage, nachfragen, glaube, momen...","[reinstate, question, seem, say, belief, momen...","[{'0': 0.2261654194, '1': -0.0543461666, '2': ...","[{'0': 0.2381045153, '1': -0.0347649602, '2': ...","[{'0': 0.0475400905, '1': -0.0177192261, '2': ...","[{'0': 0.0460326173, '1': -0.012391419, '2': 0..."
1006,1006,1006,"[government, middle, east, guilty, crime, with...","[geben, regierung, mittlere, osten, sowohl, in...",1,2,1,15,17,12,...,"[{'0': -0.0659756302, '1': 0.0076356419, '2': ...","[{'0': -0.037557847000000005, '1': -0.00320089...","[{'0': -0.0184350538, '1': 0.0012163304, '2': ...","[{'0': -0.010634973300000001, '1': -0.00219909...","[regierung, mittlere, osten, schuldig, krimina...","[give, government, lower, east, whereas, inlan...","[{'0': 0.14762316920000002, '1': -0.0954585072...","[{'0': 0.1889197909, '1': -0.0689704435, '2': ...","[{'0': 0.035546432, '1': -0.0283922482, '2': 0...","[{'0': 0.0321491277, '1': -0.0217574369, '2': ..."
2582,2582,2582,"[eu, want, play, part, also, peace, process, w...","[wolle, eu, gesamt, friedensprozess, rolle, sp...",1,4,3,21,18,21,...,"[{'0': -0.0231469287, '1': 0.0169041753, '2': ...","[{'0': -0.0049211674, '1': 0.0171680721, '2': ...","[{'0': -0.0049069481, '1': 0.00377902200000000...","[{'0': -0.0010809339, '1': 0.0039806293, '2': ...","[eu, brauche, spiel, teil, außerdem, frieden, ...","[promised, eu, total, role, games, must, immed...","[{'0': 0.2538638762, '1': -0.0586264494, '2': ...","[{'0': 0.18360828210000002, '1': -0.0320432409...","[{'0': 0.0515910297, '1': -0.0133230698, '2': ...","[{'0': 0.0385608583, '1': -0.0087558462, '2': ..."
2605,2605,2605,"[propose, track, approach, iran, balance, nucl...","[vorgeschlagen, zweigleisig, atomfrage, mensch...",1,2,0,15,14,13,...,"[{'0': -0.0601459747, '1': 0.0005476464, '2': ...","[{'0': -0.048398552500000004, '1': -0.01091387...","[{'0': -0.0165308101, '1': 0.00021615300000000...","[{'0': -0.0115629336, '1': -0.0018585433000000...","[vorschlagen, strecke, herangehensweise, iran,...","[suggested, railway, rights, approach, contras...","[{'0': 0.2166708289, '1': -0.0781058002, '2': ...","[{'0': 0.1334333442, '1': -0.07891708480000001...","[{'0': 0.0582081834, '1': -0.0213193977, '2': ...","[{'0': 0.031206574100000002, '1': -0.021030716..."
3789,3789,3789,"[eu, currently, seek, good, way, support, poli...","[europäisch, union, suchen, derzeit, gut, poli...",1,1,1,17,17,17,...,"[{'0': -0.0545399798, '1': -0.0013566494, '2':...","[{'0': -0.037755502, '1': 0.0088946736, '2': -...","[{'0': -0.0128811503, '1': 0.0002411454, '2': ...","[{'0': -0.0081406958, '1': 0.0018874549, '2': ...","[eu, zurzeit, bemühen, gute, schwerlich, unter...","[european, union, find, currently, nicely, pol...","[{'0': 0.19295601880000002, '1': -0.0508079009...","[{'0': 0.1603794111, '1': -0.05704538860000000...","[{'0': 0.0449711401, '1': -0.0124173743, '2': ...","[{'0': 0.0404443806, '1': -0.0138655671, '2': ..."
5380,5380,5380,"[sl, mr, president, lady, gentleman, tragedy, ...","[sl, herr, präsident, dame, herr, tragödie, ir...",1,4,4,8,8,8,...,"[{'0': -0.0739803032, '1': 0.04399285030000000...","[{'0': -0.0550682905, '1': 0.0378284797, '2': ...","[{'0': -0.025772079200000002, '1': 0.013164897...","[{'0': -0.0172542619, '1': 0.01488646770000000...","[sl, herrn, präsident, lady, gentleman, tragöd...","[sl, lord, president, blanche, lord, tragedy, ...","[{'0': 0.012268020400000001, '1': 0.0933581277...","[{'0': -0.0214098152, '1': 0.0266604785, '2': ...","[{'0': 0.0058199950000000005, '1': 0.015102593...","[{'0': -0.0072529487, '1': 0.0063962094, '2': ..."
6866,6866,6866,"[mr, president, eu, member, states, consistent...","[en, herr, präsident, sämtliche, eu-mitgliedst...",1,2,6,22,22,20,...,"[{'0': -0.0487260011, '1': 0.0180227854, '2': ...","[{'0': -0.0536121707, '1': 0.02192540940000000...","[{'0': -0.0092015705, '1': 0.0034159934, '2': ...","[{'0': -0.0097151662, '1': 0.0034546661, '2': ...","[herrn, präsident, eu, mitglied, staaten, unei...","[en, lord, president, virtually, rigorously, p...","[{'0': 0.16141358190000002, '1': -0.0348538215...","[{'0': 0.18882264540000002, '1': -0.0120969903...","[{'0': 0.0376373435, '1': -0.0091431376, '2': ...","[{'0': 0.035687700100000004, '1': -0.004813639..."
7470,7470,7470,"[question, external, security, strengthen, acq...","[frage, äußern, sicherheit, stärken, grenze, p...",1,5,6,13,12,13,...,"[{'0': -0.0571057824, '1': -0.006179617, '2': ...","[{'0': -0.0589173681, '1': -0.0123203633, '2':...","[{'0': -0.014943863000000002, '1': -0.00035343...","[{'0': -0.0164493499, '1': -0.0032799733, '2':...","[frage, externe, sicherheit, stärken, erwerben...","[question, respond, security, strengthen, bord...","[{'0': 0.10028302330000001, '1': -0.1929156441...","[{'0': 0.08359525100000001, '1': -0.1835875061...","[{'0': 0.0229004085, '1': -0.0551931971, '2': ...","[{'0': 0.015446182400000001, '1': -0.057719820..."
7481,7481,7481,"[unfortunately, however, iran, remain, evasive]","[bedauerlicherweise, weichen, iran]",1,2,0,5,3,5,...,"[{'0': -0.0415429842, '1': 0.0182899129, '2': ...","[{'0': 0.007015325100000001, '1': 0.0037637341...","[{'0': -0.0209128643, '1': 0.0113226416, '2': ...","[{'0': 0.0061622797, '1': 0.000278936500000000...","[glücklicherweise, jedoch, iran, bleiben, able...","[obviously, thinner, iran]","[{'0': 0.29619136300000004, '1': -0.0583100244...","[{'0': 0.1625753542, '1': -0.13618460670000002...","[{'0': 0.1203051341, '1': -0.0246413209, '2': ...","[{'0': 0.1020227905, '1': -0.07387812440000001..."
8260,8260,8260,"[think, clear, distinction, draw, cuba, primar...","[finden, jedoch, dabei, genau, unterscheiden, ...",1,1,3,15,21,14,...,"[{'0': -0.0414714942, '1': 0.0094620564, '2': ...","[{'0': -0.0461263517, '1': -0.014386136, '2': ...","[{'0': -0.011515571700000001, '1': 0.001568478...","[{'0': -0.0105933567, '1': -0.0033382647000000...","[wirklich, klare, unterscheidung, weiterkommen...","[find, however, thus, exactly, differ, would, ...","[{'0': 0.15744178050000002, '1': -0.0989353918...","[{'0': 0.1884903316, '1': -0.0808391775, '2': ...","[{'0': 0.037566315100000004, '1': -0.028355077...","[{'0': 0.0355212149, '1': -0.0186719005, '2': ..."


In [136]:
result_cos[(result_cos["source_id"] == 20036) & (result_cos["target_id"] == 20036)]

Unnamed: 0,source_id,target_id,Translation,probabilities,rank
180036,20036,20036,1,0.6828,504.0


In [137]:
parallel_sentences.preprocessed.iloc[20036, :]["token_preprocessed_embedding_target"]

['herr',
 'präsident',
 'deswegen',
 'parlament',
 'finanzsanktionen',
 'iran',
 'fordern']

In [139]:
parallel_sentences.preprocessed.iloc[20510, :]["token_preprocessed_embedding_target"]

['ebenfalls',
 'verbesserung',
 'verfahrensgarantien',
 'gelegen',
 'schlagen',
 'deshalb',
 'verringerung',
 'frist',
 'beantwortung',
 'antrag',
 'zulassung',
 'verlängerung',
 'tag']