## Download the pre-trained ``glove.twitter`` word embedding model

In [1]:
import urllib.request
from os.path import exists
import ipywidgets as widgets
from IPython.display import display

progress = None
def show_progress(block_num, block_size, total_size):
    """Report hook for ``urllib.request.urlretrieve`` that drives a progress bar.

    The bar widget is created and displayed on the first call and stored in
    the module-level ``progress`` variable; later calls only update it.

    Args:
        block_num: Number of blocks transferred so far.
        block_size: Size of one block in bytes.
        total_size: Total size of the download in bytes. ``urlretrieve`` may
            pass a non-positive value when the server does not report a size.
    """
    global progress
    total_known = total_size > 0
    if progress is None:
        progress = widgets.FloatProgress(
            value=0,
            min=0,
            # An unknown size would otherwise give the bar a max below its min.
            max=total_size if total_known else 1,
            step=0.1,
            description='Downloading',
            bar_style='info',
            orientation='horizontal'
        )
        display(progress)

    downloaded = block_num * block_size
    if total_known:
        # The final block is usually only partly filled, so the raw product
        # can overshoot the real file size.
        downloaded = min(downloaded, total_size)
    print(downloaded, "/", total_size, "\r", end="")

    # Without a known total the bar cannot show a meaningful fraction; the
    # printed byte counter above is the only progress indicator in that case.
    if total_known:
        progress.value = downloaded

import os

# Local destination of the embedding file and the URL it is fetched from.
model_path = "models/glove.twitter.27B.100d.txt"
word_embedding_url = "https://huggingface.co/Juliano/fault_injection_mlaas/resolve/main/glove.twitter.27B.100d.txt"

file_exists = exists(model_path)

if file_exists :
    print("file ", model_path, " already exists.")
else:
    # urlretrieve does not create missing parent directories.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    # Download under a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated file at model_path that
    # the exists() check above would accept on the next run.
    partial_path = model_path + ".part"
    try:
        urllib.request.urlretrieve(word_embedding_url, partial_path, show_progress)
        os.replace(partial_path, model_path)
    finally:
        # After a successful rename the partial file is gone; this only
        # cleans up after a failed or interrupted download.
        if exists(partial_path):
            os.remove(partial_path)
    print("File downloaded!")

file  models/glove.twitter.27B.100d.txt  already exists.


## Importing and Mocking MLaaS providers

In [2]:
from mlaas_providers import providers as ml_providers

Run the next cell if you want to use mocked providers instead of the real ones.

In [3]:
# Rebind each provider attribute on ml_providers to whatever return_mock_of()
# gives back for it, so later cells that read ml_providers.<name> get that object.
# NOTE(review): re-running this cell passes the already-replaced attributes to
# return_mock_of() again -- confirm that is harmless.
ml_providers.amazon = ml_providers.return_mock_of(ml_providers.amazon)
ml_providers.google = ml_providers.return_mock_of(ml_providers.google)
ml_providers.microsoft = ml_providers.return_mock_of(ml_providers.microsoft)

# Experiment 1

## Importing code modules

In [3]:
from datetime import datetime
from typing import List
from mlaas_providers.providers import read_dataset
from noise_insertion.utils import save_data_to_file
from data_sampling.data_sampling import DataSampling
from noise_insertion.percent_insertion import noises
from noise_insertion import noise_insertion
from utils import visualization
from progress import progress_manager
from metrics import metrics
import ipywidgets as widgets
# Sampler instance used by the Experiment 1 run_evaluation(), which calls its
# get_dataset_sample() method to draw the tweets for a new run.
data_sampling = DataSampling()


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rocha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rocha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rocha\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rocha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Parameters

In [4]:
# Passed to run_evaluation() as sample_size, which forwards it to
# data_sampling.get_dataset_sample() when a new run is started.
sample_size = 99

# Noise algorithms passed to run_evaluation() as noise_algorithms.
noise_list =[
    noises.Keyboard,
    noises.OCR,
    noises.RandomCharReplace,
    noises.CharSwap,
    noises.aug.WordSwap,
    noises.aug.WordSplit,
    noises.aug.Antonym,
    noises.aug.Synonym,
    noises.aug.Spelling,
    noises.aug.TfIdfWord,
    noises.aug.WordEmbeddings,
    noises.aug.ContextualWordEmbs,
]

# Noise levels passed to run_evaluation() as noise_levels; run_evaluation()
# prepends 0.0 to a copy of this list before handing it to the plot helpers.
noise_level=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

## Running the experiment

To continue from a previously started run, enter the name of its folder under /outputs/experiment1 (leave empty to start a new run).

In [5]:
# Text box whose value the next cell reads into continue_from; an empty value
# makes run_evaluation() start a new run instead of resuming one.
continue_widget = widgets.Textarea(
    value='',
    placeholder='Type the name of a /outputs/experiment1 folder to continue from',
    description='Continue from',
    disabled=False
)
# Bare last expression so the notebook renders the widget.
continue_widget

Textarea(value='', description='Continue from', placeholder='Type the name of a /outputs/experiment1 folder to…

In [6]:
# Continue from previously ongoing progress: the name of an /outputs/experiment1 folder.
# A Textarea value can carry stray spaces or newlines; strip them so the folder
# path built from this value is not corrupted. An empty result starts a new run.
continue_from = continue_widget.value.strip()

def get_main_path(size):
    """Build the output directory path for a new Experiment 1 run.

    Args:
        size: Value embedded after ``size`` in the folder name.

    Returns:
        ``./outputs/experiment1/size<size>_<timestamp>``, where the timestamp
        is the current local time formatted as ``%m-%d-%Y %H_%M_%S``. Nothing
        is created on disk here.
    """
    stamp = datetime.now().strftime("%m-%d-%Y %H_%M_%S")
    return './outputs/experiment1/size{}_{}'.format(str(size), stamp)

def run_evaluation(sample_size: int,
                  noise_levels: List[float] = None,
                  noise_algorithms=None,
                  mlaas_providers=None,
                  continue_from=None):
    """Run Experiment 1 end to end and save its plots.

    Steps, in order: load or create the dataset and progress record, generate
    the noised data, query the providers, compute metrics, and write the RQ1,
    RQ2 and remaining plots under ``<main_path>/results``.

    Args:
        sample_size: Passed to ``data_sampling.get_dataset_sample`` when a new
            run is started; unused when continuing.
        noise_levels: Noise levels for the run. Defaults to
            ``[0.1, 0.15, 0.2, 0.25, 0.3]``.
        noise_algorithms: Noise algorithms for the run. Defaults to
            ``no_noise``, ``RandomCharReplace``, ``Keyboard`` and ``OCR``.
        mlaas_providers: Providers to query. Defaults to
            ``[ml_providers.google]``.
        continue_from: Name of a folder under ``./outputs/experiment1`` to
            resume from; a falsy value starts a new run.
    """
    # Defaults are resolved at call time rather than definition time, so the
    # default provider reflects any mock rebinding done after this cell was
    # run, and no list object is shared between calls.
    if noise_levels is None:
        noise_levels = [0.1, 0.15, 0.2, 0.25, 0.3]
    if noise_algorithms is None:
        noise_algorithms = [noises.no_noise, noises.RandomCharReplace, noises.Keyboard, noises.OCR]
    if mlaas_providers is None:
        mlaas_providers = [ml_providers.google]

    if continue_from:
        main_path = './outputs/experiment1/' + continue_from
        progress = progress_manager.load_progress(main_path)
        x_dataset = read_dataset(main_path + '/data' + "/dataset.xlsx")
        y_labels = read_dataset(main_path + '/data' + "/labels.xlsx")
    else:
        x_dataset, y_labels = data_sampling.get_dataset_sample('./Tweets_dataset.csv', sample_size)
        # The folder is named after the number of rows actually sampled.
        main_path = get_main_path(len(x_dataset))
        save_data_to_file(x_dataset, main_path + '/data', "dataset")
        save_data_to_file(y_labels, main_path + '/data', "labels")

        progress = progress_manager.init_progress(main_path, noise_algorithms, noise_levels, mlaas_providers)
    print("Results will be stored at: ", main_path)
    print('Generating noise...')
    progress = noise_insertion.generate_noised_data(x_dataset, main_path)

    print('Getting predictions from providers...')
    progress = ml_providers.get_prediction_results(main_path)

    print('Calculating metrics...')
    metrics_results = metrics.metrics(progress, y_labels, main_path)

    # The plot helpers receive the levels with 0.0 in front. A distinct local
    # name avoids shadowing the notebook-level noise_list of algorithms.
    plot_levels = [0.0]
    plot_levels.extend(noise_levels)
    visualization.save_results_plot_RQ1(metrics_results,
        main_path + '/results/rq1', plot_levels)
    visualization.save_results_plot_RQ2(metrics_results,
        main_path + '/results/rq2', plot_levels)
    visualization.plot_results(metrics_results, main_path + '/results/others_plots')

    print("Results were saved to:", main_path)

# Launch Experiment 1 with the parameters defined above, querying all three
# providers; a non-empty continue_from resumes that folder instead of sampling anew.
run_evaluation(
    sample_size,
    noise_levels=noise_level,
    noise_algorithms=noise_list,
    mlaas_providers=[ml_providers.google, ml_providers.microsoft, ml_providers.amazon],
    continue_from=continue_from
)

Results will be stored at:  ./outputs/experiment1/size99_07-12-2022 09_34_29
Generating noise...
- Keyboard
- OCR
- RandomCharReplace
- CharSwap
- WordSwap
- WordSplit
- Antonym
- Synonym
- Spelling
- TfIdfWord
- WordEmbeddings
- ContextualWordEmbs
Getting predictions from providers...
- google
-- Keyboard
-- OCR
-- RandomCharReplace
-- CharSwap
-- WordSwap
-- WordSplit
-- Antonym
-- Synonym
-- Spelling
-- TfIdfWord
-- WordEmbeddings
-- ContextualWordEmbs
- microsoft
-- Keyboard
-- OCR
-- RandomCharReplace
-- CharSwap
-- WordSwap
-- WordSplit
-- Antonym
-- Synonym
-- Spelling
-- TfIdfWord
-- WordEmbeddings
-- ContextualWordEmbs
- amazon
-- Keyboard
-- OCR
-- RandomCharReplace
-- CharSwap
-- WordSwap
-- WordSplit
-- Antonym
-- Synonym
-- Spelling
-- TfIdfWord
-- WordEmbeddings
-- ContextualWordEmbs
Calculating metrics...
Results were saved to: ./outputs/experiment1/size99_07-12-2022 09_34_29


# Experiment 2

## Importing code modules

In [7]:
from pathlib import Path
from typing import TypedDict, List
from datetime import datetime
from data_sampling.data_sampling import DataSampling
from progress import progress_manager
from noise_insertion.unit_insertion import noises as unit_noises
from noise_insertion.unit_insertion import noises
from noise_insertion import noise_insertion
from mlaas_providers.providers import read_dataset
from metrics import metrics
import ipywidgets as widgets
from utils import visualization

## Parameters

In [8]:
# NOTE: sample_size and noise_level reuse the names of the Experiment 1
# parameters, so running this cell replaces those values in the kernel.
# sample_size is passed to prepare_execution(), which uses it both as the
# requested sample size and in the output folder name.
sample_size=100
# Passed to prepare_execution() as noise_levels and to run_evaluation() as
# noise_levels_units.
noise_level = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# One entry per bucket; prepare_execution() passes min_width/max_width to
# DataSampling.get_by_word_count() and creates one "[min-max]" sub-folder each.
word_counts = [
    {"min_width": 5, "max_width": 10},
    {"min_width": 10, "max_width": 15},
    {"min_width": 15, "max_width": 20},
    {"min_width": 20, "max_width": 25},
]

# Noise algorithms passed to prepare_execution(). `noises` here is
# noise_insertion.unit_insertion.noises, as imported in this section.
noise_algo = [
    noises.Keyboard,
    noises.OCR,
    noises.RandomCharReplace,
    noises.CharSwap,
    noises.WordSwap,
    noises.WordSplit,
    noises.Antonym,
    noises.Synonym,
    noises.Spelling,
    noises.TfIdfWord,
    noises.WordEmbeddings,
    noises.ContextualWordEmbs,
]

## Running

To continue from a previously started run, enter the name of its folder under /outputs/experiment2 (leave empty to start a new run).

In [9]:
# Text box whose value the next cell reads into continue_from; an empty value
# makes prepare_execution() create a new output folder instead of reusing one.
continue_widget = widgets.Textarea(
    value='',
    placeholder='Type the name of a /outputs/experiment2 folder to continue from',
    description='Continue from',
    disabled=False
)
# Bare last expression so the notebook renders the widget.
continue_widget

Textarea(value='', description='Continue from', placeholder='Type the name of a /outputs/experiment2 folder to…

In [10]:
# A Textarea value can carry stray spaces or newlines; strip them so the folder
# path built from this value is not corrupted. An empty result starts a new run.
continue_from = continue_widget.value.strip()
print(continue_from)
class Size(TypedDict):
    """One entry of the ``sizes`` list accepted by ``prepare_execution``.

    ``prepare_execution`` passes both values to
    ``DataSampling.get_by_word_count`` and uses them to name the bucket's
    sub-folder.
    """
    # Lower bound handed to get_by_word_count.
    min_width: int
    # Upper bound handed to get_by_word_count.
    max_width: int
    
def create_main_path(timestamp, size):
    """Create the top-level output folder of an Experiment 2 run.

    Args:
        timestamp: Text appended after the size in the folder name.
        size: Value embedded after ``size`` in the folder name.

    Returns:
        The path ``./outputs/experiment2/size<size>_<timestamp>``. The folder
        and any missing parents exist when this returns.
    """
    main_dir = './outputs/experiment2/size{}_{}'.format(str(size), timestamp)
    target = Path(main_dir)
    target.mkdir(exist_ok=True, parents=True)
    return main_dir

def create_sub_path(main_path: str, min_width: int, max_width: int):
    """Create the folder for one bucket, plus its ``data`` sub-folder.

    Args:
        main_path: Folder the bucket folder is created in.
        min_width: First number in the bucket folder name.
        max_width: Second number in the bucket folder name.

    Returns:
        The bucket path ``<main_path>/[<min_width>-<max_width>]``.
    """
    path = '{}/[{}-{}]'.format(main_path, str(min_width), str(max_width))

    # Bucket folder first, then its data folder; existing folders are accepted.
    for directory in (path, path + '/data'):
        Path(directory).mkdir(parents=True, exist_ok=True)

    return path

def prepare_execution(
    continue_from: str,
    timestamp: str,
    sample_size: int,
    sizes: List[Size],
    noise_algorithms,
    noise_levels,
    mlaas_providers
):
    """Create the folder layout, datasets and progress records for Experiment 2.

    For every entry of ``sizes`` a bucket sub-folder is created; when it has no
    dataset yet, a sample is drawn and stored as ``data/dataset.xlsx`` and
    ``data/labels.xlsx``. Progress is then initialised for the sub-folder.

    Args:
        continue_from: Name of a folder under ``./outputs/experiment2`` to
            reuse; empty or ``None`` creates a new folder.
        timestamp: Text used in the name of a newly created folder.
        sample_size: Number of samples requested per bucket; also part of the
            new folder's name.
        sizes: Buckets to prepare, each with ``min_width`` and ``max_width``.
        noise_algorithms: Forwarded to ``progress_manager.init_progress``.
        noise_levels: Forwarded to ``progress_manager.init_progress``.
        mlaas_providers: Forwarded to ``progress_manager.init_progress``.

    Returns:
        The list of bucket sub-folder paths, in the order of ``sizes``.

    Raises:
        FileNotFoundError: If a bucket holds only one of the dataset/labels
            files, since a freshly drawn sample would not match the file
            already there.
    """
    dataSampling = DataSampling()
    if not continue_from:
        main_path = create_main_path(timestamp, sample_size)
    else:
        continue_from = './outputs/experiment2/'+ continue_from
        print("continue_from:", continue_from)
        main_path = continue_from

    sub_path_list = []
    for size in sizes:
        min_width = size['min_width']
        max_width = size['max_width']
        sub_path = create_sub_path(main_path, min_width, max_width)

        dataset_file = sub_path + "/data/dataset.xlsx"
        labels_file = sub_path + "/data/labels.xlsx"
        has_dataset = Path(dataset_file).is_file()
        has_labels = Path(labels_file).is_file()

        # The two files must come from the same sample. Writing only the
        # missing one from a new sample would pair it with unrelated rows.
        if has_dataset != has_labels:
            missing = labels_file if has_dataset else dataset_file
            raise FileNotFoundError(
                "Incomplete dataset/labels pair in " + sub_path +
                ": missing " + missing +
                ". Restore it or delete the other file to draw a new sample."
            )

        if not has_dataset:
            # Only sample when something has to be written.
            data, labels = dataSampling.get_by_word_count('Tweets_dataset.csv',
                                                          sample_size,
                                                          min_width,
                                                          max_width)
            # sheet_name is given by keyword: passing it positionally is
            # deprecated in pandas 2.x (assumes these are pandas objects).
            data.to_excel(dataset_file, sheet_name='data', index=False)
            labels.to_excel(labels_file, sheet_name='data', index=False)

        sub_path_list.append(sub_path)
        # NOTE(review): this also runs when continuing an existing run --
        # confirm init_progress does not discard progress already recorded.
        progress_manager.init_progress(sub_path, noise_algorithms, noise_levels, mlaas_providers)
    return sub_path_list

def run_evaluation(noise_levels_units: List[int],
                   continue_from: str,
):
    """Run Experiment 2 for one bucket folder and save its plots.

    This definition replaces the Experiment 1 function of the same name once
    the cell has been run.

    Args:
        noise_levels_units: Noise levels; given to the plot helpers with a
            ``0`` placed in front.
        continue_from: Bucket folder holding ``data/dataset.xlsx``,
            ``data/labels.xlsx`` and the progress record.
    """
    main_path = continue_from
    data_dir = main_path + '/data'
    results_dir = main_path + '/results'

    progress = progress_manager.load_progress(main_path)

    x_dataset = read_dataset(data_dir + '/dataset.xlsx')
    y_labels = read_dataset(data_dir + '/labels.xlsx')

    print('Generating noise...')
    progress = noise_insertion.generate_noised_data(
        x_dataset, main_path, noise_package=unit_noises)

    print('Getting predictions from providers...')
    progress = ml_providers.get_prediction_results(main_path)

    print('Calculating metrics...')
    metrics_results = metrics.metrics(progress, y_labels, main_path)

    # NOTE(review): if the caller's levels already start with 0 the plotted
    # list holds 0 twice -- confirm the plot helpers expect that.
    plotted_levels = [0] + list(noise_levels_units)
    visualization.save_results_plot_RQ1(
        metrics_results, results_dir + '/rq1', plotted_levels)
    visualization.save_results_plot_RQ2(
        metrics_results, results_dir + '/rq2', plotted_levels)
    visualization.plot_results(metrics_results, results_dir + '/others_plots')

    print(main_path)

# Timestamp used by prepare_execution() to name the folder of a new run.
timestamp = datetime.now().strftime("%m-%d-%Y %H_%M_%S")
# timestamp = "07-04-2022 20_19_59" uncomment with a timestamp to continue from a previous run

# Create (or reuse) the folder layout and per-bucket datasets; one path is
# returned per entry of word_counts.
path_list = prepare_execution(continue_from,
                          timestamp, 
                          sample_size,
                          word_counts,
                          noise_algo,
                          noise_level,
                          [ml_providers.google, ml_providers.amazon, ml_providers.microsoft])
# Run the full evaluation once per bucket folder.
for path in path_list:
    run_evaluation(noise_level, 
                   continue_from=path)
print(path_list)

size100_07-04-2022 20_19_59
continue_from: ./outputs/experiment2/size100_07-04-2022 20_19_59
Generating noise...
- Keyboard
- OCR
- RandomCharReplace
- CharSwap
- WordSwap
- WordSplit
- Antonym
- Synonym
- Spelling
- TfIdfWord
- WordEmbeddings
- ContextualWordEmbs
Getting predictions from providers...
- google
-- Keyboard
-- OCR
-- RandomCharReplace
-- CharSwap
-- WordSwap
-- WordSplit
-- Antonym
-- Synonym
-- Spelling
-- TfIdfWord
-- WordEmbeddings
-- ContextualWordEmbs
- amazon
-- Keyboard
-- OCR
-- RandomCharReplace
-- CharSwap
-- WordSwap
-- WordSplit
-- Antonym
-- Synonym
-- Spelling
-- TfIdfWord
-- WordEmbeddings
-- ContextualWordEmbs
- microsoft
-- Keyboard
-- OCR
-- RandomCharReplace
-- CharSwap
-- WordSwap
-- WordSplit
-- Antonym
-- Synonym
-- Spelling
-- TfIdfWord
-- WordEmbeddings
-- ContextualWordEmbs
Calculating metrics...
./outputs/experiment2/size100_07-04-2022 20_19_59/[5-10]
Generating noise...
- Keyboard
- OCR
- RandomCharReplace
- CharSwap
- WordSwap
- WordSplit
- An