# Apply metrics to results

In [8]:
!pip install jiwer
!pip install gensim
!pip install -U nltk
!pip install pandas



In [9]:
# Mount Google Drive (Colab only) at /content/drive; the result and embedding
# paths used in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Everything related to VoxForge is commented out because only the Mozilla
# dataset is being re-run.
# Adjust this to wherever the results are stored.
results_path = "/content/drive/MyDrive/JIDM/Results/"

# Folder names inside results_path: one folder per API/model.
models_apis = [
    "Wit",
    "Azure",
    "Google",
    "Wav2Vec",
    "AWS",
    "Jasper",
]

# Files holding the transcription results used to compute the metrics.
# They are paired with models_apis by position, so keep both lists in the
# same order.
results_mozilla = [
    "mozilla_wit_api.tsv",
    "mozilla_azure_api.tsv",
    "mozilla_gcloud_api.tsv",
    "transcribed_w2v2_metrics_mozilla.tsv",
    "generation_mozilla.tsv",
    "mozilla_result.tsv",
]

# results_voxforge = ["voxforge_wit_api.tsv.tsv", "voxforge_azure_api.tsv", "voxforge_gcloud_api.tsv",
#                     "transcribed_w2v2_metrics_voxforge.tsv", "generation_voxforge.tsv", "voxforge_result.tsv"]

# The computed metrics are written under results_path, inside each folder
# listed in models_apis.
# The file is named "mozilla_metrics.tsv" for Mozilla and
# "voxforge_metrics.tsv" for VoxForge (dataset name + "_metrics.tsv").

In [11]:
from gensim.models import KeyedVectors

# (metric name, embedding file) pairs; the name becomes part of the
# "cos_sim_<name>" column produced by the metric functions.
embedding_files = (
    ('word2vec_cbow_s50', '/content/drive/MyDrive/JIDM/embeddings/cbow_s50.txt'),
    ('word2vec_skip_s50', '/content/drive/MyDrive/JIDM/embeddings/skip_s50.txt'),
)

# Load every embedding once, up front, so the metric functions can look a
# model up by name.
emb_models = {
    name: KeyedVectors.load_word2vec_format(path)
    for name, path in embedding_files
}

In [14]:
import re
from gensim import corpora
from gensim.matutils import softcossim
import nltk
nltk.download('rslp')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.translate import bleu_score, meteor_score
import pandas as pd
from jiwer import wer

def clean_str(x):
    """Normalise a sentence before computing metrics on it.

    Every non-word character (anything that is not a letter, digit or
    underscore) is replaced by a single space and the text is lower-cased.
    Runs of such characters are not collapsed, so the output may contain
    consecutive spaces.

    Args:
        x: Text to normalise. Non-string values (for example the float NaN
            that pandas uses for an empty cell) are tolerated.

    Returns:
        The normalised string, or "" when ``x`` is not a string.
    """
    try:
        # Raw string: '\W' inside a plain literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        return re.sub(r'\W', ' ', x).lower()
    except TypeError:
        # re.sub raises TypeError for non-string input; treat it as empty
        # text instead of hiding every possible exception.
        return ""


def cosine_similarity(reference, hypothesis, model):
    """Return the soft cosine similarity between two sentences.

    Both sentences are split on whitespace, turned into bag-of-words vectors
    over a dictionary built from just these two token lists, and compared
    with ``softcossim`` using the similarity matrix of the chosen embedding.

    Args:
        reference: Reference sentence.
        hypothesis: Sentence to compare against the reference.
        model: Key into ``emb_models`` selecting the embedding to use.

    Returns:
        The value returned by ``softcossim`` for the two sentences.
    """
    ref_tokens = reference.split()
    hyp_tokens = hypothesis.split()

    # The dictionary sees the hypothesis first, then the reference.
    vocabulary = corpora.Dictionary([hyp_tokens, ref_tokens])
    term_similarities = emb_models[model].similarity_matrix(vocabulary)

    hyp_bow = vocabulary.doc2bow(hyp_tokens)
    ref_bow = vocabulary.doc2bow(ref_tokens)
    return softcossim(hyp_bow, ref_bow, term_similarities)


def bleu(reference, hypothesis):
    """Return the sentence-level BLEU score of ``hypothesis``.

    The n-gram weights depend on the number of whitespace-separated tokens in
    the reference: references of one, two or three tokens only put weight on
    n-gram orders up to their own length; any other length uses
    (0.4, 0.3, 0.2, 0.1).

    Args:
        reference: Reference sentence.
        hypothesis: Candidate sentence.

    Returns:
        The score returned by ``bleu_score.sentence_bleu``.
    """
    ref_tokens = reference.split()
    hyp_tokens = hypothesis.split()

    # Weights for short references, keyed by reference length.
    short_reference_weights = {
        1: (1.0, 0.0, 0.0, 0.0),
        2: (0.5, 0.5, 0.0, 0.0),
        3: (0.4, 0.3, 0.3, 0.0),
    }
    weights = short_reference_weights.get(len(ref_tokens), (0.4, 0.3, 0.2, 0.1))

    return bleu_score.sentence_bleu([ref_tokens], hyp_tokens, weights=weights)


# NLTK RSLP stemmer, created once and handed to METEOR by meteor().
pt_stemmer = nltk.stem.RSLPStemmer()


def meteor(reference, hypothesis):
    """Return the METEOR score of ``hypothesis`` against one reference.

    Both sentences are split on whitespace and scored with the module-level
    ``pt_stemmer`` as the stemmer.

    Args:
        reference: Reference sentence.
        hypothesis: Candidate sentence.

    Returns:
        The score returned by ``meteor_score.meteor_score``.
    """
    return meteor_score.meteor_score(
        [reference.split()],
        hypothesis.split(),
        stemmer=pt_stemmer,
    )


def compute_wer(result, result_column_name="translation"):
    """Add a per-row ``wer`` column to ``result``.

    Each row's reference (``sentence`` column) and transcription
    (``result_column_name`` column) are normalised with ``clean_str`` and
    scored with ``wer``.

    Args:
        result: DataFrame with a ``sentence`` column and the column named by
            ``result_column_name``. It is modified in place.
        result_column_name: Name of the column holding the transcriptions.

    Returns:
        The same DataFrame, with the ``wer`` column filled in.
    """
    sentences = result["sentence"].apply(clean_str)
    translations = result[result_column_name].apply(clean_str)

    # Score row by row and assign positionally. Selecting rows with
    # `result["sentence"] == original` gave every row that shares a sentence
    # the score of the last duplicate, never matched NaN sentences, and
    # rescanned the whole frame for each row.
    result["wer"] = [
        wer(sentence, translation)
        for sentence, translation in zip(sentences, translations)
    ]

    return result


def compute_cossine_metrics(result, save=True, output_to_save=None, name_dataset=None, result_column_name="translation"):
    """Add one ``cos_sim_<model>`` column per embedding in ``emb_models``.

    Each row's reference (``sentence`` column) and transcription
    (``result_column_name`` column) are normalised with ``clean_str`` and
    compared with ``cosine_similarity``.

    Args:
        result: DataFrame with a ``sentence`` column and the column named by
            ``result_column_name``. It is modified in place.
        save: Unused by this function; kept for signature compatibility.
        output_to_save: Unused by this function; kept for signature
            compatibility.
        name_dataset: Unused by this function; kept for signature
            compatibility.
        result_column_name: Name of the column holding the transcriptions.

    Returns:
        The same DataFrame, with the similarity columns filled in.
    """
    sentences = result["sentence"].apply(clean_str)
    translations = result[result_column_name].apply(clean_str)
    # Materialise the pairs once; they are reused for every embedding.
    pairs = list(zip(sentences, translations))

    # Cosine metrics
    for model in emb_models:
        print(f"Applying for {model}")
        # Assign positionally, one score per row. Selecting rows with
        # `result["sentence"] == original` overwrote rows that share a
        # sentence with the last duplicate's score and skipped NaN sentences.
        result[f"cos_sim_{model}"] = [
            cosine_similarity(sentence, translation, model)
            for sentence, translation in pairs
        ]

    return result


def compute_wer_bleu_meteor_metrics(result, save=True, output_to_save=None, name_dataset=None, result_column_name="translation"):
    """Add per-row ``bleu``, ``meteor`` and ``wer`` columns to ``result``.

    Each row's reference (``sentence`` column) and transcription
    (``result_column_name`` column) are normalised with ``clean_str`` before
    scoring. One progress line is printed per metric.

    Args:
        result: DataFrame with a ``sentence`` column and the column named by
            ``result_column_name``. It is modified in place.
        save: Unused by this function; kept for signature compatibility.
        output_to_save: Unused by this function; kept for signature
            compatibility.
        name_dataset: Unused by this function; kept for signature
            compatibility.
        result_column_name: Name of the column holding the transcriptions.

    Returns:
        The same DataFrame, with the three metric columns filled in.
    """
    sentences = result["sentence"].apply(clean_str)
    translations = result[result_column_name].apply(clean_str)
    pairs = list(zip(sentences, translations))

    # Columns are created in the same order as before: bleu, meteor, wer.
    for column, metric in (("bleu", bleu), ("meteor", meteor), ("wer", wer)):
        # One progress line per metric instead of three per row.
        print(f"Applying for {column}")
        # Assign positionally, one score per row. Selecting rows with
        # `result["sentence"] == original` overwrote rows that share a
        # sentence with the last duplicate's score and skipped NaN sentences.
        result[column] = [
            metric(sentence, translation) for sentence, translation in pairs
        ]

    return result


def get_metrics(result, save=True, output_to_save=None, name_dataset=None, result_column_name="translation"):
    """Compute every metric for ``result``, print the means and optionally save.

    Each row's reference (``sentence`` column) and transcription
    (``result_column_name`` column) are normalised with ``clean_str``. The
    frame then gains, in this order, one ``cos_sim_<model>`` column per entry
    of ``emb_models`` followed by ``bleu``, ``meteor`` and ``wer``. The mean
    of each metric is printed.

    Args:
        result: DataFrame with a ``sentence`` column and the column named by
            ``result_column_name``. It is modified in place.
        save: When true, write the frame as a tab-separated file.
        output_to_save: Directory to write into; required when ``save`` is
            true.
        name_dataset: Prefix of the output file, which is named
            ``<name_dataset>_metrics.tsv``.
        result_column_name: Name of the column holding the transcriptions.

    Returns:
        The same DataFrame, with all metric columns filled in.

    Raises:
        ValueError: If ``save`` is true and ``output_to_save`` is None.
    """
    # Fail before the expensive metric computation rather than after it
    # (the path concatenation below cannot work with None).
    if save and output_to_save is None:
        raise ValueError("output_to_save is required when save=True")

    sentences = result["sentence"].apply(clean_str)
    translations = result[result_column_name].apply(clean_str)
    pairs = list(zip(sentences, translations))

    # All scores are assigned positionally, one per row. Selecting rows with
    # `result["sentence"] == original` overwrote rows that share a sentence
    # with the last duplicate's score and skipped NaN sentences.

    # Cosine metrics
    for model in emb_models:
        print(f"Applying for {model}")
        result[f"cos_sim_{model}"] = [
            cosine_similarity(sentence, translation, model)
            for sentence, translation in pairs
        ]

    result["bleu"] = [bleu(sentence, translation) for sentence, translation in pairs]
    result["meteor"] = [meteor(sentence, translation) for sentence, translation in pairs]
    result["wer"] = [wer(sentence, translation) for sentence, translation in pairs]

    print(f"WER: {result['wer'].mean()}")
    print(f"bleu: {result['bleu'].mean()}")
    print(f"meteor: {result['meteor'].mean()}")
    for model in emb_models:
        print(f'{model}: {result[f"cos_sim_{model}"].mean()}')

    if save:
        result.to_csv(output_to_save +
                      f"/{name_dataset}_metrics.tsv", sep="\t", index=False)

    return result

[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


In [17]:
# Mozilla: compute and save the metrics for every API/model folder.
for idx, api_name in enumerate(models_apis):
    result_path = f"{results_path}{api_name}/"
    # The result file for this API sits at the same position in results_mozilla.
    result_file = f"{result_path}{results_mozilla[idx]}"

    print(f"Calculating for: {result_file}")
    transcriptions = pd.read_csv(result_file, sep='\t')
    result_df = get_metrics(
        transcriptions,
        output_to_save=result_path,
        name_dataset="mozilla",
    )
    
# Voxforge
# for idx in range(0, len(models_apis)):
#     result_path = f"{results_path}{models_apis[idx]}/"
#     result_file = f"{result_path}{results_voxforge[idx]}"
    
#     print(f"Calculating for: {result_file}")
#     result_df = pd.read_csv(result_file, sep='\t')
#     result_df = get_metrics(result_df, output_to_save=result_path, name_dataset="voxforge")

Calculating for: /content/drive/MyDrive/JIDM/Results/Wit/mozilla_wit_api.tsv
Applying for word2vec_cbow_s50
Applying for word2vec_skip_s50


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


WER: 0.06699888779679487
bleu: 0.8713992788332702
meteor: 0.9234176959444156
word2vec_cbow_s50: 0.9592391234675652
word2vec_skip_s50: 0.9648679727242702
Calculating for: /content/drive/MyDrive/JIDM/Results/Azure/mozilla_azure_api.tsv
Applying for word2vec_cbow_s50
Applying for word2vec_skip_s50


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


WER: 0.07971765777521167
bleu: 0.8643648492899277
meteor: 0.9209197060674728
word2vec_cbow_s50: 0.9442506607405963
word2vec_skip_s50: 0.9503021784272242
Calculating for: /content/drive/MyDrive/JIDM/Results/Google/mozilla_gcloud_api.tsv
Applying for word2vec_cbow_s50
Applying for word2vec_skip_s50


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


WER: 0.1271517288031348
bleu: 0.7793935996797233
meteor: 0.8779685379065291
word2vec_cbow_s50: 0.9137690262772099
word2vec_skip_s50: 0.9230080475960342
Calculating for: /content/drive/MyDrive/JIDM/Results/Wav2Vec/transcribed_w2v2_metrics_mozilla.tsv
Applying for word2vec_cbow_s50
Applying for word2vec_skip_s50


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


WER: 0.10801561094236037
bleu: 0.7933085453194396
meteor: 0.8884671717556847
word2vec_cbow_s50: 0.9211899788936844
word2vec_skip_s50: 0.9312614979003944
Calculating for: /content/drive/MyDrive/JIDM/Results/AWS/generation_mozilla.tsv
Applying for word2vec_cbow_s50
Applying for word2vec_skip_s50


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


WER: 0.14944453664205132
bleu: 0.7347800233767898
meteor: 0.8574449423297679
word2vec_cbow_s50: 0.9046086348019502
word2vec_skip_s50: 0.9170273635958393
Calculating for: /content/drive/MyDrive/JIDM/Results/Jasper/mozilla_result.tsv
Applying for word2vec_cbow_s50
Applying for word2vec_skip_s50


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


WER: 0.2411026042484512
bleu: 0.5775898312329616
meteor: 0.7555478814061313
word2vec_cbow_s50: 0.8136234978659692
word2vec_skip_s50: 0.835366487767901


# Metrics from all models and APIs

### Given that we have all the results, we can compute the summary of the translations in the following order:

#### Mozilla
        | WER | BLEU | METEOR | Word2Vec CBOW | Word2Vec SKIP
* Wit
* Azure
* Google Cloud      
* Wav2Vec2.0        
* AWS               
* Jasper            

#### Voxforge
        | WER | BLEU | METEOR | Word2Vec CBOW | Word2Vec SKIP
* Wit
* Azure             
* Google Cloud      
* Wav2Vec2.0        
* AWS               
* Jasper           

In [18]:
# Mozilla: load the per-API metric files and print the mean of each metric.
res_wit = f"{results_path}Wit/mozilla_metrics.tsv"
res_azure = f"{results_path}Azure/mozilla_metrics.tsv"
res_gcloud = f"{results_path}Google/mozilla_metrics.tsv"
res_wav2vec = f"{results_path}Wav2Vec/mozilla_metrics.tsv"
res_aws = f"{results_path}AWS/mozilla_metrics.tsv"
res_jasper = f"{results_path}Jasper/mozilla_metrics.tsv"

wit, azure, gcloud, w2v, aws, jasper = (
    pd.read_csv(metrics_file, sep="\t")
    for metrics_file in (res_wit, res_azure, res_gcloud, res_wav2vec, res_aws, res_jasper)
)

# Columns to average, in the order of the printed header.
metric_columns = (
    "wer",
    "bleu",
    "meteor",
    "cos_sim_word2vec_cbow_s50",
    "cos_sim_word2vec_skip_s50",
)

print("#################### MOZILLA #####################")
print(f"API | WER | BLEU | METEOR | W2V CBOW | W2V SKIP")
for label, frame in (
    ("Wit", wit),
    ("Azure", azure),
    ("Google", gcloud),
    ("Wav2Vec", w2v),
    ("AWS", aws),
    ("Jasper", jasper),
):
    # One row per API: label followed by each mean rounded to 7 decimals.
    averages = [f"{round(frame[column].mean(),7)}" for column in metric_columns]
    print(" | ".join([label, *averages]))

#################### MOZILLA #####################
API | WER | BLEU | METEOR | W2V CBOW | W2V SKIP
Wit | 0.0669989 | 0.8713993 | 0.9234177 | 0.9592391 | 0.964868
Azure | 0.0797177 | 0.8643648 | 0.9209197 | 0.9442507 | 0.9503022
Google | 0.1271517 | 0.7793936 | 0.8779685 | 0.913769 | 0.923008
Wav2Vec | 0.1080156 | 0.7933085 | 0.8884672 | 0.92119 | 0.9312615
AWS | 0.1494445 | 0.73478 | 0.8574449 | 0.9046086 | 0.9170274
Jasper | 0.2411026 | 0.5775898 | 0.7555479 | 0.8136235 | 0.8353665


In [None]:
# # Voxforge
# res_wit = f"{results_path}Wit/voxforge_metrics.tsv"
# res_azure = f"{results_path}Azure/voxforge_metrics.tsv"
# res_gcloud = f"{results_path}Google/voxforge_metrics.tsv"
# res_wav2vec = f"{results_path}Wav2Vec/voxforge_metrics.tsv"
# res_aws = f"{results_path}AWS/voxforge_metrics.tsv"
# res_jasper = f"{results_path}Jasper/voxforge_metrics.tsv"

# wit = pd.read_csv(res_wit, sep="\t")
# azure = pd.read_csv(res_azure, sep="\t")
# gcloud = pd.read_csv(res_gcloud, sep="\t")
# w2v = pd.read_csv(res_wav2vec, sep="\t")
# aws = pd.read_csv(res_aws, sep="\t")
# jasper = pd.read_csv(res_jasper, sep="\t")

# print("#################### VOXFORGE #####################")
# print(f"API | WER | BLEU | METEOR | W2V CBOW | W2V SKIP")
# print(f"Wit | {round(wit.wer.mean(),7)} | {round(wit.bleu.mean(),7)} | {round(wit.meteor.mean(),7)} | {round(wit.cos_sim_word2vec_cbow_s50.mean(),7)} | {round(wit.cos_sim_word2vec_skip_s50.mean(),7)}")
# print(f"Azure | {round(azure.wer.mean(),7)} | {round(azure.bleu.mean(),7)} | {round(azure.meteor.mean(),7)} | {round(azure.cos_sim_word2vec_cbow_s50.mean(),7)} | {round(azure.cos_sim_word2vec_skip_s50.mean(),7)}")
# print(f"Google | {round(gcloud.wer.mean(),7)} | {round(gcloud.bleu.mean(),7)} | {round(gcloud.meteor.mean(),7)} | {round(gcloud.cos_sim_word2vec_cbow_s50.mean(),7)} | {round(gcloud.cos_sim_word2vec_skip_s50.mean(),7)}")
# print(f"Wav2Vec | {round(w2v.wer.mean(),7)} | {round(w2v.bleu.mean(),7)} | {round(w2v.meteor.mean(),7)} | {round(w2v.cos_sim_word2vec_cbow_s50.mean(),7)} | {round(w2v.cos_sim_word2vec_skip_s50.mean(),7)}")
# print(f"AWS | {round(aws.wer.mean(),7)} | {round(aws.bleu.mean(),7)} | {round(aws.meteor.mean(),7)} | {round(aws.cos_sim_word2vec_cbow_s50.mean(),7)} | {round(aws.cos_sim_word2vec_skip_s50.mean(),7)}")
# print(f"Jasper | {round(jasper.wer.mean(),7)} | {round(jasper.bleu.mean(),7)} | {round(jasper.meteor.mean(),7)} | {round(jasper.cos_sim_word2vec_cbow_s50.mean(),7)} | {round(jasper.cos_sim_word2vec_skip_s50.mean(),7)}")