# All Noise

In [None]:
import os
import sys
DIR_ROOT = '../../'
sys.path.append(DIR_ROOT)
from src.noise import *
import pandas as pd

# Dataset to process; its training split is read from <DIR_ROOT>data/<name>_train.csv.
DATASET_NAME = 'imdb'
df = pd.read_csv(f'{DIR_ROOT}data/{DATASET_NAME}_train.csv')

# Separate features from labels. The label stays a one-column DataFrame
# (not a Series) because that is the form passed to the noise functions below.
y_train = df['target'].to_frame()
X_train = df.drop(columns='target')

## 5% Label Noise

In [2]:
# Inject label noise with p=0.05 (the function's audit log reports p as a
# per-class fraction).
y_train_noisy = label_noise(y_train, p=0.05)

# Put original and noisy labels side by side and keep only the rows whose
# label actually changed. The filter is computed once: a bare expression in
# the middle of a cell is neither displayed nor stored, so repeating it only
# wastes work.
df_compare = pd.DataFrame({'orig': y_train.iloc[:, 0], 'noisy': y_train_noisy.iloc[:, 0]})
df_compare = df_compare[df_compare['orig'] != df_compare['noisy']]
df_compare

=== AUDIT: label_noise ===
p (fraction per class): 0.05, random_state: 42
Flips per class:
  - positive: 938
  - negative: 938
Total labels changed: 1876


Unnamed: 0,orig,noisy
40,negative,positive
57,negative,positive
59,negative,positive
122,negative,positive
147,positive,negative
...,...,...
37383,positive,negative
37393,positive,negative
37394,positive,negative
37398,negative,positive


## 5% Feature Noise - Tabular

In [None]:
# NOTE(review): the element-wise subtraction below needs all-numeric features,
# and the recorded output shows iris columns - this cell appears to be meant
# only for tabular datasets, not for DATASET_NAME = 'imdb'. Confirm.

# Names of the categorical features (none here); every remaining column of
# X_train is treated as numeric.
colunas_categoricas = []
colunas_numericas = X_train.columns.difference(colunas_categoricas).tolist()

X_train_noisy = data_noise_tabular(
    X_train,
    y_train,
    numerical_columns=colunas_numericas,
    categorical_columns=colunas_categoricas,
    p=0.05,
)

# Per-cell difference original - noisy, restricted to the rows where at least
# one feature was altered.
df_compare_data = X_train.sub(X_train_noisy).loc[lambda diff: diff.ne(0).any(axis=1)]
df_compare_data

=== AUDIT: data_noise_tabular ===
p: 0.05, random_state: 42
Num numeric columns: 4 | picked per instance: 1
Num categorical columns: 0 | picked per instance: 0
Instances changed per class:
  - 2: 2
  - 0: 2
  - 1: 2


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
5,0.0,-0.322311,0.0,0.0
52,0.0,0.0,0.0,-0.870415
61,0.0,0.0,0.0,0.012973
88,1.642302,0.0,0.0,0.0
90,0.0,-0.334053,0.0,0.0


## 5% Feature Noise - Images

In [6]:
# NOTE(review): the recorded output comes from the 'digits' dataset (pixel
# columns), so this cell appears to be meant only for image datasets. Confirm.
X_train_noisy = data_noise_images(X_train, y_train, DATASET_NAME, p=0.05)

# Per-pixel difference original - noisy, restricted to the rows (images) where
# at least one pixel was altered.
df_compare_data = X_train.sub(X_train_noisy).loc[lambda diff: diff.ne(0).any(axis=1)]
df_compare_data

=== AUDIT: data_noise_images (salt & pepper) ===
Dataset: digits, p: 0.05, random_state: 42
Total columns (pixels): 64, noisy columns per instance: 4
Instances changed per class:
  - 8: 7
  - 9: 7
  - 2: 7
  - 0: 7
  - 7: 7
  - 5: 7
  - 3: 7
  - 4: 7
  - 6: 7
  - 1: 7
Salt/pepper ratio per instance: 50/50 (extra to 'salt' if odd)


Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59,0.0,0.0,0.0,0.0,0.0,-16.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-15.0,0.0,0.0,0.0,0.0,0.0
109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-14.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1311,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 5% Feature Noise - Text

In [3]:
# Inject noise into the 'review' text column with p=0.05 (the function's audit
# log reports a per-word removal probability).
X_train_noisy = data_noise_text(X_train, y_train, col_name='review', p=0.05)

# Build the comparison on a derived frame instead of adding a column to `df`:
# mutating `df` here would make a later re-run of the split cell pick up
# 'review_noisy' as a feature and leak it into X_train and the saved CSV.
df_compara_data = df.assign(review_noisy=X_train_noisy['review'])

# Keep only the reviews whose text was actually changed; all columns
# (review, target, review_noisy) are displayed.
df_compara_data = df_compara_data[df_compara_data['review'] != df_compara_data['review_noisy']]
df_compara_data

=== AUDIT: data_noise_text ===
p: 0.05, random_state: 42, removal_prob_per_word: 0.2
Instances changed per class:
  - positive: 938
  - negative: 938


Unnamed: 0,review,target,review_noisy
30,I watched this film a long time ago (aprox 10 ...,negative,I watched this film a long time (aprox 10 year...
35,Watching this movie was the biggest waste of t...,negative,Watching this movie was the biggest waste of t...
96,I suppose I should be fair and point out that ...,negative,I suppose I should be fair point out that I do...
147,Barbra Streisand's first television special wa...,positive,Barbra Streisand's first television special wa...
168,It has been widely agreed that Hayao Miyazaki ...,positive,It has been widely agreed that Hayao is a mast...
...,...,...,...
37421,I must say that I am fairly disappointed by th...,negative,I must that I am fairly disappointed this movi...
37431,"""Prom Night"" is a title-only remake of the 198...",negative,"""Prom is title-only remake of the 1980 flick t..."
37432,"""Yesterday"" as a movie, is hard to rate. The c...",negative,"""Yesterday"" a movie, is hard cinematography ex..."
37440,Hollywood had a long love affair with bogus Ar...,negative,Hollywood had a long love with Arabian tales b...


## Saving the Results

In [None]:
# Make sure the destination folder exists: DataFrame.to_csv does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs(f'{DIR_ROOT}data/noise/all', exist_ok=True)

# Combine the noisy features with the noisy labels and save the result as the
# new training set.
new_df = X_train_noisy.copy()
new_df['target'] = y_train_noisy

new_df.to_csv(f'{DIR_ROOT}data/noise/all/{DATASET_NAME}_train.csv', index=False)

# The test split receives no noise; it is re-written unchanged into the same
# folder so both splits live side by side.
df_test = pd.read_csv(f'{DIR_ROOT}data/{DATASET_NAME}_test.csv')
df_test.to_csv(f'{DIR_ROOT}data/noise/all/{DATASET_NAME}_test.csv', index=False)