### Importing Required Libraries

In [3]:
import sys
assert sys.version_info >= (3, 8, 0)
assert sys.version_info < (3, 9, 0)
import json
import pandas as pd
from jenga.corruptions.generic import MissingValues
from jenga.corruptions.generic import CategoricalShift
from jenga.corruptions.generic import SwappedValues
from jenga.corruptions.numerical import Scaling
from jenga.corruptions.numerical import GaussianNoise
import numpy as np
import random
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import resample
from pandas.api.types import is_numeric_dtype
import category_encoders as ce
from sklearn.svm import SVC
import miceforest as mf
import pandas as pd
import json
import jenga
import random
import jamspell
import time
from joblib import dump
from copy import deepcopy
import warnings
import matplotlib.pyplot as plt

### Error Injections (Categorical and Numerical)

In [None]:
def missing_values(df, column, fraction=.5, missingness='MCAR'):
    """Replace part of ``column`` with missing values using jenga's MissingValues.

    Args:
        df: DataFrame to corrupt; ``column`` is overwritten in place.
        column: Name of the column to corrupt.
        fraction: Fraction passed to the jenga corruption.
        missingness: Missingness pattern passed to the jenga corruption.

    Returns:
        The same DataFrame object with ``column`` replaced by its corrupted version.
    """
    corruption = MissingValues(column=column, fraction=fraction, missingness=missingness)
    corrupted = corruption.transform(df)
    df[column] = corrupted[column]
    return df

def swapping_values(df, column1, column2, fraction=.5, missingness='MCAR'):
    """Swap values between ``column1`` and ``column2`` using jenga's SwappedValues.

    Args:
        df: DataFrame to corrupt.
        column1: Column handed to jenga as ``column``.
        column2: Column handed to jenga as ``swap_with``.
        fraction: Fraction passed to the jenga corruption.
        missingness: Value forwarded as jenga's ``sampling`` argument.

    Returns:
        The DataFrame returned by the jenga transform.
    """
    # Debug output: available columns and the pair about to be swapped.
    print(df.columns)
    print(column1, column2)
    swapper = SwappedValues(
        column=column1,
        fraction=fraction,
        sampling=missingness,
        swap_with=column2,
    )
    return swapper.transform(df)

def permute_categories(df, column, fraction=.5, missingness='MAR'):
    """Corrupt a categorical column using jenga's CategoricalShift.

    Args:
        df: DataFrame to corrupt; ``column`` is overwritten in place.
        column: Name of the categorical column.
        fraction: Fraction passed to the jenga corruption.
        missingness: Value forwarded as jenga's ``sampling`` argument.

    Returns:
        The same DataFrame object with ``column`` replaced by its corrupted version.
    """
    shift = CategoricalShift(column=column, fraction=fraction, sampling=missingness)
    shifted = shift.transform(df)
    df[column] = shifted[column]
    return df

def scale(df, column, fraction=.5, missingness='MCAR'):
    """Apply jenga's Scaling corruption to the numeric part of a (possibly textual) column.

    Each value is converted to a string and split into a non-digit prefix,
    the first number and the remaining suffix. Only the number is handed to
    jenga's Scaling; the three pieces are then glued back together.

    Args:
        df: DataFrame to corrupt; ``column`` is overwritten in place.
        column: Name of the column to corrupt.
        fraction: Fraction passed to the jenga corruption.
        missingness: Value forwarded as jenga's ``sampling`` argument.

    Returns:
        The same DataFrame object; ``column`` now holds strings (the numeric
        part is rendered via ``astype(str)`` of a float).
    """
    df[column] = df[column].astype(str)

    # Groups: 0 = leading non-digits, 1 = number, 2 = everything after it.
    pieces = df[column].str.extract(r'^(\D*?)(\d+\.*\d*)(.*)')
    pieces[1] = pieces[1].astype(float)

    scaler = Scaling(column=1, fraction=fraction, sampling=missingness)
    pieces[1] = scaler.transform(pieces)[1]

    # Re-attach the textual prefix/suffix around the (possibly scaled) number.
    df[column] = pieces[0] + pieces[1].astype(str) + pieces[2]
    return df

### Textual Error Injections

In [None]:
# Maps each lowercase letter to a string of candidate replacement characters
# (its neighbouring keys); used as the default `dic` of inject_typo_typographic.
qwerty_dic = {'a': 'qwsz', 'b': 'vghn', 'c': 'xdfv', 'd': 'serfcx', 'e': 'wsdfr', 'f': 'drtgvc', 'g': 'ftyhbv', 'h': 'gyujnb', 'i': 'uojk', 'j': 'uikmnh', 'k': 'ijlm', 'l': 'opk', 'm': 'njk,', 'n': 'bhjm', 'o': 'iklp', 'p': 'ol', 'q': 'was', 'r': 'edft', 's': 'awedxz', 't': 'rfgy', 'u': 'yhji', 'v': 'cfgb', 'w': 'qesa', 'x': 'zsdc', 'y': 'tghu', 'z': 'asx'}

def inject_typo_naive(text, prob=0.1):
    """Return ``text`` with random characters replaced by random lowercase letters.

    Args:
        text: String to corrupt.
        prob: Per-character probability of replacing a non-space character.

    Returns:
        A new string of the same length; spaces are never replaced.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    pieces = []
    for symbol in text:
        # A random number is drawn for every character, spaces included.
        replace = random.random() < prob and symbol != ' '
        pieces.append(random.choice(alphabet) if replace else symbol)
    return "".join(pieces)

def inject_typo_typographic(text, dic=qwerty_dic, typo_prob=0.1, adjac_prob = 0.8):
    """Return ``text`` with typos that favour characters listed in ``dic``.

    Args:
        text: String to corrupt.
        dic: Mapping from a character to a string of candidate replacements.
        typo_prob: Per-character probability of replacing a non-space character.
        adjac_prob: For a character found in ``dic``, probability of picking
            the replacement from ``dic[char]``; otherwise a random lowercase
            letter is used.

    Returns:
        A new string of the same length; spaces are never replaced.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    pieces = []
    for symbol in text:
        # A random number is drawn for every character, spaces included.
        if not (random.random() < typo_prob and symbol != ' '):
            pieces.append(symbol)
            continue
        # The second draw only happens for characters present in `dic`.
        if (symbol in dic) and (random.random() < adjac_prob):
            pool = dic[symbol]
        else:
            pool = alphabet
        pieces.append(random.choice(pool))
    return "".join(pieces)

def naive_typo_df(df, columns, prob):
    """Apply inject_typo_naive to every value of the given columns.

    Args:
        df: DataFrame to corrupt; the listed columns are overwritten in place.
        columns: Iterable of column names.
        prob: Per-character typo probability forwarded to inject_typo_naive.

    Returns:
        The same DataFrame object.
    """
    for name in columns:
        df[name] = df[name].apply(lambda text: inject_typo_naive(text, prob=prob))
    return df

def inject_typo_df(df, columns, prob):
    """Apply inject_typo_typographic to every value of the given columns.

    Args:
        df: DataFrame to corrupt; the listed columns are overwritten in place.
        columns: Iterable of column names.
        prob: Per-character typo probability, forwarded as ``typo_prob``.

    Returns:
        The same DataFrame object.
    """
    for name in columns:
        df[name] = df[name].apply(lambda text: inject_typo_typographic(text, typo_prob=prob))
    return df

### Error Injections (Combined)
This function is used to inject errors into a dataframe.
It takes:
* The dataframe
* a list of error tuples
  * This list has the following format:\
  [ \
  (*name_of_error_1*, [*name_of_column_for_error_1*]), \
  (*name_of_error_2*, [*name_of_column_for_error_2*]), \
  (etc.) \
  ] \
  each tuple in this list represents 1 error to be applied to 1 column (some errors require two columns to be passed in the list in the second part of the tuple like swapping).
  * For example, if you want to apply the missing value error to the columns 'age' and 'review_text' and then swap the columns 'height' and 'weight', the errors list would look like:\
  [ \
  ('missing', ['age']), \
  ('missing', ['review_text']), \
  ('swapping', ['height', 'weight'])\
  ] \
* a fraction of the rows to which to apply the errors
* If verbose is set to True it will print, for each error, the column before and after the corruption

It returns:
* The corrupted dataframe


Possible error names:

* 'missing': Inject missing values
* 'swapping': Swap the values of two columns (takes two column names)
* 'category_mixup': Permute the categorical values of a column
* 'noise': Add noise and rounding to a numerical column
* 'scale': Randomly scale a numerical column
* 'typo_smart', 'typo_naive': Typo injection into textual columns (currently these do not support the fraction parameter)


In [None]:
# Inject errors into a Dataframe
def inject_error(df, errors, fraction=0.5, verbose=True):
    """Apply a sequence of error injections to ``df``.

    Args:
        df: DataFrame to corrupt. Several injections overwrite columns of
            this object in place, so pass a copy to keep the original.
        errors: Iterable of ``(error_name, [column, ...])`` tuples. The first
            listed column is the one corrupted; 'swapping' also uses the
            second one.
        fraction: Fraction forwarded to the jenga-based injections. The typo
            injections ignore it and use a fixed per-character probability
            of 0.1.
        verbose: When True, print the first listed column before and after
            each injection.

    Returns:
        The corrupted DataFrame.

    Supported error names: 'missing', 'swapping', 'category_mixup', 'noise',
    'scale', 'typo_smart', 'typo_naive'. An unknown name is reported and
    skipped.
    """
    for error, args in errors:
        column = args[0]
        error_df = pd.DataFrame()
        error_df['before_error'] = df[column]
        print(f"injecting error '{error}' on {fraction*100}% of column '{args[0]}' (args = {args})")

        if error == 'missing':
            df = missing_values(df, column, fraction=fraction)
        elif error == 'swapping':
            df = swapping_values(df, column, args[1], fraction=fraction)
        elif error == 'category_mixup':
            df = permute_categories(df, column, fraction=fraction)
        elif error == 'noise':
            # NOTE(review): noise_and_rounding is not defined in the visible
            # part of this file - confirm it exists before using 'noise'.
            df = noise_and_rounding(df, column, fraction=fraction)
        elif error == 'scale':
            df = scale(df, column, fraction=fraction)
        elif error == 'typo_smart':
            # Keyboard-aware typos; this branch previously called the naive
            # injector, making 'typo_smart' identical to 'typo_naive'.
            df = inject_typo_df(df, [column], 0.1)
        elif error == 'typo_naive':
            df = naive_typo_df(df, [column], 0.1)
        else:
            print(f"unknown error '{error}' - nothing injected")

        error_df['after_error'] = df[column]

        if verbose:
            print(error_df)

    return df

### Preprocessing functions

In [None]:
# Module-level encoders; prepare_categorical fits them when data_type == "train".
enc_cat = OneHotEncoder(handle_unknown='ignore')
# encoder = ce.QuantileEncoder(cols=['bust size', 'category'], quantile=0.5, m=1.0)
tar_enc = ce.MEstimateEncoder(cols=['bust_size', 'category'], m=5.0)

def prepare_categorical(df, columns1, columns2, column_y, data_type):
    """Encode categorical columns: one-hot for ``columns1``, target encoding for ``columns2``.

    Args:
        df: DataFrame to encode. ``columns1`` and ``columns2`` are dropped
            from this object in place.
        columns1: Columns encoded with the one-hot encoder ``enc_cat``.
        columns2: Columns encoded with the M-estimate target encoder ``tar_enc``.
        column_y: Name of the target column (values 'fit', 'small', 'large'),
            used to fit the target encoder.
        data_type: "train" fits both encoders first; any other value reuses
            the encoders fitted by an earlier "train" call.

    Returns:
        A new DataFrame: the remaining columns of ``df`` followed by the
        one-hot columns and the target-encoded columns.
    """
    global enc_bs, enc_cat1

    if data_type == "train":
        # The target encoder needs a numeric target.
        mapping = {'fit': 1, 'small': 2, 'large': 3}
        new_df = df.copy()
        new_df[column_y] = df[column_y].replace(mapping)

        enc_cat1 = enc_cat.fit(df[columns1])
        enc_bs = tar_enc.fit(new_df[columns2], new_df[column_y])

    # Carry df's index over to the one-hot frame: with the default RangeIndex
    # the column-wise concat below would misalign rows whenever df is not
    # indexed 0..n-1.
    enc_df1 = pd.DataFrame(
        enc_cat1.transform(df[columns1]).toarray(),
        columns=enc_cat1.get_feature_names_out(columns1),
        index=df.index,
    )
    enc_df2 = enc_bs.transform(df[columns2])

    df.drop(columns=columns1, inplace=True)
    df.drop(columns=columns2, inplace=True)
    df = pd.concat([df, enc_df1, enc_df2], axis=1)

    return df

In [None]:
#prepare numerical data
# Module-level scaler; prepare_numerical fits it when data_type == "train".
sc = StandardScaler()

def prepare_numerical(df, numerical_columns, data_type):
    """Coerce the numerical columns to numbers and standardise them.

    Args:
        df: DataFrame to process. It is modified in place: the numerical
            columns are coerced and then dropped, and the index is reset.
        numerical_columns: Names of the columns to scale.
        data_type: "train" fits the scaler first; any other value reuses the
            scaler fitted by an earlier "train" call.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the non-numerical columns
        of ``df`` followed by the scaled numerical columns.
    """
    for col in numerical_columns:
        # Unparseable values become NaN and are then replaced by 0.
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    if data_type == "train":
        global enc_num
        enc_num = sc.fit(df[numerical_columns])

    sc_df = pd.DataFrame(enc_num.transform(df[numerical_columns]), columns=numerical_columns)

    # Replace the raw columns by their scaled versions. The scaled columns
    # must stay in the result: a second drop after the concat used to remove
    # them again, leaving no numerical feature at all.
    df.drop(columns=numerical_columns, inplace=True)
    df.reset_index(drop=True, inplace=True)
    sc_df.reset_index(drop=True, inplace=True)
    df = pd.concat([df, sc_df], axis=1)

    return df

In [1]:
def convert_to_cm(height):
    """Convert a height value to a number.

    Args:
        height: A height such as ``5' 8"`` (feet and inches), any other
            value containing digits, or a missing value.

    Returns:
        * ``pd.NA`` when ``height`` is missing or contains no digits;
        * the height in centimetres (float) for the feet/inches format;
        * otherwise the first run of digits as a float, returned as-is with
          no unit conversion.
    """
    # Local import: `re` is not part of the visible import block of this file.
    import re

    if pd.isna(height):
        return pd.NA
    height = str(height)

    if re.match(r'^\d+\'\s*\d+\"$', height):
        ft, inches = height.split("'")
        total_inches = int(float(ft)) * 12 + int(inches.replace('"', ''))
        return total_inches * 2.54

    # Not in feet/inches format (e.g. after a column swap): fall back to the
    # first number found. Return a float rather than the matched string so
    # the column does not end up mixing strings and numbers.
    numeric = re.findall(r'\d+', height)
    if numeric:
        return float(numeric[0])
    return pd.NA

In [None]:
#prepare text columns
# Module-level TF-IDF vectorizer; prepare_text fits it when data_type == "train".
enc = TfidfVectorizer(max_features=1000, stop_words='english')
def prepare_text(df, text_columns, target_column, data_type):
    """Replace the text columns by TF-IDF features.

    The text columns of each row are joined into one document. The
    vectorizer is fitted only on training data; other data is transformed
    with the already fitted vocabulary so that train and test share the same
    feature columns.

    Args:
        df: DataFrame to process; its index is reset in place.
        text_columns: Names of the text columns to vectorize.
        target_column: Unused; kept for interface compatibility.
        data_type: "train" fits the vectorizer first; any other value reuses
            the vectorizer fitted by an earlier "train" call.

    Returns:
        A new DataFrame without ``text_columns`` and with one ``tf_<term>``
        column per vocabulary term.
    """
    corpus = df[text_columns].apply(lambda x: ' '.join(map(str, x)), axis=1)

    if data_type == "train":
        global enc_txt
        enc_txt = enc.fit(corpus)

    # Transform with the fitted encoder. Calling fit_transform here refitted
    # the vocabulary on every call, test data included.
    enc_df = pd.DataFrame(enc_txt.transform(corpus).toarray(), columns=enc_txt.get_feature_names_out())

    # Prefix the term columns so they cannot clash with existing column names.
    enc_df.columns = ['tf_' + col for col in enc_df.columns]

    df.reset_index(drop=True, inplace=True)
    enc_df.reset_index(drop=True, inplace=True)

    df = pd.concat([df, enc_df], axis=1)
    df.drop(columns=text_columns, inplace=True)

    return df

In [None]:
#Basic preprocessing
# Column groups consumed by basic_preproseccing and the prepare_* helpers.

# Cardinality <10
# (passed to prepare_categorical as columns1, i.e. one-hot encoded)
categorical_columns1 = ['rented_for', 'body_type']
# Cardinality >10
# (passed to prepare_categorical as columns2, i.e. target encoded)
categorical_columns2 = ['bust_size', 'category']
# 'Year', 'Month' and 'Day' are derived from 'review_date' in basic_preproseccing
numerical_columns = ['rating', 'weight', 'height', 'size', 'age', 'Year', 'Month', 'Day']
text_columns = ['review_text', 'review_summary']
target_column = 'fit'
# Columns removed from the feature matrix before training / prediction
columns_to_drop = ['item_id', 'user_id', 'review_date']

def basic_preproseccing(df, data_type):
    """Run the full feature preprocessing on a raw (or corrupted) DataFrame.

    Steps: extract the first integer from the raw numeric columns, convert
    ``height`` with convert_to_cm, derive Year/Month/Day from
    ``review_date``, then apply the numerical, categorical and text
    encoders.

    Args:
        df: DataFrame with the raw columns; it is modified in place by this
            function and by the prepare_* helpers.
        data_type: "train" fits the encoders; any other value reuses the
            encoders fitted by an earlier "train" call.

    Returns:
        The preprocessed DataFrame.
    """
    # 'height' has its own parser and the date parts are created below.
    skipped = {'height', 'Year', 'Month', 'Day'}
    for column in numerical_columns:
        if column in skipped:
            continue
        # Keep the first run of digits (e.g. "137lbs" -> 137.0).
        # expand=False makes extract return a Series for the single group.
        df[column] = df[column].astype(str).str.extract(r'(\d+)', expand=False).astype(float)

    df['height'] = df['height'].apply(convert_to_cm)
    df['review_date'] = pd.to_datetime(df['review_date'])
    df['Year'] = df['review_date'].dt.year
    df['Month'] = df['review_date'].dt.month
    df['Day'] = df['review_date'].dt.day

    df = prepare_numerical(df, numerical_columns, data_type)
    df = prepare_categorical(df, categorical_columns1, categorical_columns2, target_column, data_type)
    df = prepare_text(df, text_columns, target_column, data_type)

    return df

In [None]:
#Preprocessing required for Mice forest
def miceprec(df):
    """Clean the raw numeric columns so they can be imputed by miceforest.

    Extracts the first integer from 'rating', 'weight', 'size' and 'age' and
    converts 'height' with convert_to_cm.

    Args:
        df: DataFrame with the raw columns; it is modified in place.

    Returns:
        The same DataFrame object. Callers assign the result
        (``df = miceprec(df)``), which bound ``None`` while this function
        returned nothing.
    """
    numerical_columns = ['rating', 'weight', 'height', 'size', 'age']
    for column in numerical_columns:
        if column == 'height':
            continue
        # expand=False makes extract return a Series for the single group.
        df[column] = df[column].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
    df['height'] = df['height'].apply(convert_to_cm)
    return df

### Loading and Splitting Dataset

This function splits the dataset into a train and a test set. It first resamples the dataframe so that it is balanced with respect to the values of the target variable. It also fits the preprocessors to the training data and applies the preprocessing to the training data as well. It returns the preprocessed training data and the unprocessed testing data.

In [None]:
#This function loads the data from file and splits it into train/test sets
# it also preprocesses the training set
def split_data(random_state = None, verbose = True, path = 'renttherunway_final_data.json'):
    """Load the dataset, balance the classes, split it and preprocess the train part.

    Args:
        random_state: Seed for the train/test split. Class balancing and
            shuffling always use the fixed seed 42.
        verbose: When True, print_stats is called on the balanced data.
        path: JSON-lines file to load.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the train parts are
        preprocessed (the encoders are fitted on them) and the test parts are
        still raw.
    """
    start_split = time.time()
    df = pd.read_json(path, lines=True)

    # Removing spaces in column names
    df.columns = df.columns.str.replace(' ', '_')

    # Split the rows by target class.
    majority_class = df[df['fit'] == 'fit']
    minority_class1 = df[df['fit'] == 'small']
    minority_class2 = df[df['fit'] == 'large']

    # Downsample 'fit' and 'small' to the size of the 'large' class.
    # Sampling without replacement needs n_samples <= class size, so this
    # relies on 'large' being the smallest class.
    majority_downsampled = resample(majority_class,
                                    replace=False,
                                    n_samples=len(minority_class2),
                                    random_state=42)

    minority_downsampled_2 = resample(minority_class1,
                                      replace=False,
                                      n_samples=len(minority_class2),
                                      random_state=42)

    # Combine the three equally sized classes and shuffle them.
    balanced_df = pd.concat([majority_downsampled, minority_downsampled_2, minority_class2])
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

    df = balanced_df

    if verbose:
        print_stats(df)

    X = df.drop(columns=[target_column])
    y = df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
    end_split = time.time()

    print(f"Splitting training data took {end_split - start_split}s")

    # The target encoder needs the target, so preprocess X and y together.
    train = pd.concat([X_train, y_train], axis=1)

    start_train_preprocess = time.time()
    train = basic_preproseccing(train, 'train')

    X_train = train.drop(columns= columns_to_drop + [target_column])

    y_train = train[target_column]
    end_train_preprocess = time.time()

    print(f"preprocessing training data took {end_train_preprocess - start_train_preprocess}s")

    return X_train, X_test, y_train, y_test

### AutoFix

In [None]:
class metadata:
    """Per-column summary of a DataFrame, grouped by inferred column type.

    After construction the instance exposes three dicts
    (``numerical_features``, ``categorical_features``, ``text_features``)
    mapping a column name to ``{"clean": <percentage of non-missing values>,
    "unique": <number of distinct values>}``.
    """

    def determine_column_types(self,df):
        """Classify the columns of ``df`` and store their statistics on ``self``."""
        self.categorical_features = {}
        self.numerical_features = {}
        self.text_features = {}
        for name in df:
            series = df[name]
            n_unique = len(pd.unique(series))
            # Columns whose values are all distinct are skipped.
            if n_unique == len(series):
                continue
            # Columns with "id" anywhere in their name are skipped.
            if "id" in name:
                continue
            if is_numeric_dtype(series):
                bucket = self.numerical_features
            elif n_unique <= 200:
                bucket = self.categorical_features
            elif isinstance(series.iloc[0], str):
                bucket = self.text_features
            else:
                continue
            bucket[name] = {
                "clean": 100 - ((series.isna().sum() / len(series)) * 100),
                "unique": n_unique,
            }

    def __init__(self,df):
        self.determine_column_types(df)

class autofix:
    """Automatic data cleaning: typo correction, missing-value imputation and rescaling.

    Typical use: ``cleaned = autofix(df, ...).fix()``. The working copy of
    the data is kept in ``self.df``.
    """

    def preprocess(self):
        """Normalise column names and convert all-numeric string columns to numbers."""
        # Replace spaces in column names; the query strings built in
        # bootstrap_imputer use the column names as identifiers.
        self.df.rename(columns=lambda name: name.replace(" ", "_"), inplace=True)
        for column in self.df:
            values = self.df[column]
            # Only the first value is inspected to decide whether the column holds strings.
            if not isinstance(values.iloc[0], str):
                continue
            if np.sum(values.str.isnumeric()) == len(values):
                self.df[column] = values.astype(int)
            # Decimal strings: ignore one "." before the digit check.
            # (Series.replace(".", "") only replaced values equal to ".", so
            # this branch could never convert a decimal column.)
            elif np.sum(values.str.replace(".", "", n=1, regex=False).str.isnumeric()) == len(values):
                self.df[column] = values.astype(float)

    def correct_typo(self, columns, verbose=True):
        """Run the jamspell corrector over every value of the given columns.

        Processing stops at the first column that fails; columns corrected
        before the failure keep their corrected values.

        Args:
            columns: Names of the text columns to correct.
            verbose: When True, print progress and timing information.
        """
        tot_time = 0
        for i, col in enumerate(columns):
            if verbose:
                print(f"Correcting typos in column {i+1}/{len(columns)} ({self.df[col].shape[0]} rows)...")
            start = time.time()
            try:
                self.df[col] = self.df[col].apply(self.corrector.FixFragment)
            except Exception as exc:
                # Best effort: report the cause and stop. A bare `except:`
                # also swallowed KeyboardInterrupt and hid the error.
                print(f"Error correcting typos in column {i+1} ({exc!r}). Aborting.")
                return

            end = time.time()
            if verbose:
                print(f"Finished correcting typos in column {i+1}. Time taken: {end - start}s")
            tot_time += (end - start)
        if verbose:
            print(f"Finished correcting typos. Total time taken: {tot_time}s")

    def miceforest_imputer(self):
        """Impute the categorical and numerical columns of ``self.df`` with miceforest.

        The column names are hard-coded for the dataset used in this file.
        """
        # Determine the column types and columns to drop before imputation
        columns_to_drop = ['item_id', 'user_id', 'review_date', 'fit','review_text', 'review_summary','Year', 'Month', 'Day']
        categorical_columns = ['rented_for', 'body_type','bust_size', 'category']
        numerical_columns = ['rating', 'weight', 'height', 'size', 'age']

        # Work on a copy that only holds the columns to impute; columns that
        # are absent are ignored.
        df2 = self.df.drop(columns=columns_to_drop, errors='ignore')

        # Set the data types for categorical and numerical columns
        df2[categorical_columns] = df2[categorical_columns].astype('category')
        for col in numerical_columns:
            df2[col] = pd.to_numeric(df2[col], errors='coerce')

        # Create kernels.  #mice forest
        kernel = mf.ImputationKernel(
          data=df2,
          save_all_iterations=True,
          random_state=1343
        )
        # Run the MICE algorithm for a single iteration
        kernel.mice(1,verbose=True, n_estimators=50)
        completed_dataset = kernel.complete_data(dataset=0, inplace=False)

        # Swap the original columns for their imputed versions.
        imputed_columns = categorical_columns + numerical_columns
        self.df.drop(columns=imputed_columns, inplace=True)
        self.df[imputed_columns] = completed_dataset[imputed_columns]

    def bootstrap_imputer(self, df, column_to_predict, clean_mask):
        """Fill the missing values of one column from rows with matching categories.

        Rows where ``column_to_predict`` is missing are grouped by the values
        of the other categorical features. For the largest groups (at most
        101) the value is taken from the clean rows with the same category
        combination; all remaining rows get a value computed from all clean
        rows. Numerical columns use ``self.num_impute_bootstrap``
        (mode/mean/median), other columns use the mode.

        Args:
            df: DataFrame to impute.
            column_to_predict: Name of the column to fill.
            clean_mask: Boolean Series, True where the column has a value.

        Returns:
            ``df`` itself when nothing has to be done, otherwise a new
            DataFrame with the clean rows followed by the imputed rows (row
            order is not preserved).
        """
        if np.sum(clean_mask) == len(df):
            return df

        is_text = column_to_predict in self.metadata.text_features
        is_numeric_target = column_to_predict in self.metadata.numerical_features

        clean_data = df.loc[clean_mask]
        # Explicit copy: the rows are modified below and must not be a view on df.
        dirty_data = df.loc[~clean_mask].copy()

        # Text columns are not imputed: either blanked or left untouched.
        if is_text and self.text_fix == "no":
            dirty_data[column_to_predict] = ""
            return pd.concat([clean_data, dirty_data])
        if is_text and self.text_fix == "drop":
            return df

        features_to_subset = set(self.metadata.categorical_features.keys())
        features_to_subset.discard(column_to_predict)
        features_to_subset.difference_update(self.features_to_exclude)

        cleaned_index = pd.Index([])
        # Category combinations among the dirty rows, most frequent first.
        dirty_sizes = dirty_data.groupby(list(features_to_subset)).size().sort_values(ascending=False).reset_index()

        # Only the 101 most frequent combinations are handled individually.
        for sizes_index in range(min(len(dirty_sizes), 101)):
            group = dirty_sizes.loc[sizes_index]
            conditions = []
            for column in features_to_subset:
                value = group[column]
                # numpy scalars count as numbers too; they have no .replace().
                if isinstance(value, (int, float, np.integer, np.floating)):
                    conditions.append(f'{column} == {value}')
                else:
                    value = value.replace('"', '\\"')
                    conditions.append(f'{column} == "{value}"')
            query = ' and '.join(conditions)

            sample_space = clean_data.query(query)
            if len(sample_space) == 0:
                continue
            if is_numeric_target:
                impute_value = self.get_num_impute_bootstrap_value(sample_space[column_to_predict], self.num_impute_bootstrap)
            else:
                impute_value = sample_space[column_to_predict].mode()[0]
            data_to_impute = dirty_data.query(query)
            dirty_data.loc[data_to_impute.index, column_to_predict] = impute_value
            cleaned_index = cleaned_index.union(data_to_impute.index)

        # Fallback for the rows not covered above: use all clean rows.
        other_data_index = dirty_data.index.difference(cleaned_index)
        if is_numeric_target:
            impute_value = self.get_num_impute_bootstrap_value(clean_data[column_to_predict], self.num_impute_bootstrap)
        else:
            impute_value = clean_data[column_to_predict].mode()[0]
        dirty_data.loc[other_data_index, column_to_predict] = impute_value
        return pd.concat([clean_data, dirty_data])

    def scaling_fix(self,scaling_factors = (1000, 100), base_factor = 1):
        """Undo suspected scaling errors in the numerical columns.

        For every factor, values that are at least ``factor`` times the
        column minimum (a minimum of 0 is treated as 1) are divided by that
        factor. Finally the whole column is multiplied by ``base_factor``.

        Args:
            scaling_factors: Factors to test, applied in the given order.
            base_factor: Multiplier applied to every numerical column at the end.
        """
        for column in list(self.metadata.numerical_features.keys()):
            col_min = np.min(self.df[column])
            if col_min == 0:
                col_min = 1
            for factor in scaling_factors:
                mask = self.df[column] >= factor * col_min
                self.df.loc[mask, column] = self.df.loc[mask, column] / factor
            self.df[column] = self.df[column] * base_factor

    def impute_missing(self):
        """Impute every column with bootstrap_imputer, starting with ``self.best_cat``."""
        best = self.best_cat[0]
        self.df = self.bootstrap_imputer(self.df, best, self.df[best].notna())
        for column in self.df:
            if column != best:
                self.df = self.bootstrap_imputer(self.df, column, self.df[column].notna())

    def __init__(self,df, imputation_type_for_NonTextData="bootstrap", num_impute_bootstrap = "mode", text_fix = "yes",features_to_exclude = None, typo_fixes = None):
        """ @params:
                - df                              = Dataframe to impute
                - imputation_type_for_NonTextData = Default imputation is bootstrap
                                                    possible_imputation_methods ["bootstrap", "miceforest"].
                                                    MICEForest can be used for categorical and numerical imputations.
                                                    Remaining columns will be imputed with bootstrap imputation
                - num_impute_bootstrap            = Statistic used by the bootstrap imputation for numerical
                                                    columns: "mode", "mean", anything else means median
                - text_fix                        = "no" blanks missing text values, "drop" drops rows with
                                                    missing text values, any other value imputes them
                - features_to_exclude             = Categorical features not used for grouping in the
                                                    bootstrap imputation (default: none)
                - typo_fixes                      = Columns to spell-correct (default: all detected text columns)

            Raises ValueError for an unknown imputation type.
        """
        self.imputation_type_for_NonTextData = imputation_type_for_NonTextData
        possible_imputation_methods = ["bootstrap", "miceforest"]

        if self.imputation_type_for_NonTextData not in possible_imputation_methods:
            raise ValueError(f"Invalid imputation type: {self.imputation_type_for_NonTextData}. "
                             f"Allowed imputation methods are: {', '.join(possible_imputation_methods)}")
        self.corrector = jamspell.TSpellCorrector()
        self.corrector.LoadLangModel('en.bin')
        # Keep rows with at least round(25% of the column count) non-missing values.
        self.df = df.dropna(thresh = round(0.25 * len(df.columns)))
        self.num_impute_bootstrap = num_impute_bootstrap
        self.text_fix = text_fix
        # None instead of a mutable [] default, which would be shared between instances.
        self.features_to_exclude = [] if features_to_exclude is None else features_to_exclude
        self.preprocess()
        self.metadata = metadata(self.df)
        if not typo_fixes:
            self.typo_fixes = list(self.metadata.text_features.keys())
        else:
            self.typo_fixes = typo_fixes

    def get_num_impute_bootstrap_value(self, values, method):
        """Return the mode, mean or (for any other ``method``) median of ``values``."""
        if method == "mode":
            return values.mode()[0]
        if method == "mean":
            return values.mean()
        return values.median()

    def fix(self):
        """Run the whole cleaning pipeline and return the cleaned DataFrame.

        Order: typo correction, optional miceforest imputation, optional
        dropping of rows with missing text, bootstrap imputation of what is
        still missing, scaling fix.
        """
        self.correct_typo(self.typo_fixes)
        if self.imputation_type_for_NonTextData == "miceforest":
            print("USING MICEFOREST")
            self.miceforest_imputer()
        if self.text_fix == "drop":
            self.df = self.df.dropna(subset=list(self.metadata.text_features.keys()))
        self.non_text_columns = list(self.metadata.numerical_features.keys()) + list(self.metadata.categorical_features.keys())

        # Track the categorical features with the most distinct values among
        # those at least as clean as the current candidate.
        # Tuples are (name, unique count, clean percentage).
        self.best_cat = ("",0,0)
        self.second_best_cat = ("",0,0)
        for cat, stats in self.metadata.categorical_features.items():
            candidate = (cat, stats["unique"], stats["clean"])
            if stats["unique"] > self.best_cat[1] and stats["clean"] >= self.best_cat[2]:
                self.second_best_cat = self.best_cat
                self.best_cat = candidate
            elif stats["unique"] > self.second_best_cat[1] and stats["clean"] >= self.second_best_cat[2]:
                self.second_best_cat = candidate
        self.impute_missing()
        self.scaling_fix()
        return self.df

In [None]:
# AutoFix test

# Load the JSON-lines dataset into a DataFrame.
reviews = []
with open('./renttherunway_final_data.json', 'r') as file:
    for line in file:
        reviews.append(json.loads(line))
df = pd.DataFrame(reviews)
# miceprec cleans the numeric columns of df in place. Its result is not
# assigned back so that df stays bound to the DataFrame whatever miceprec
# returns.
miceprec(df)

print(df)
auto_imputer = autofix(df, imputation_type_for_NonTextData='miceforest',typo_fixes=['review_summary'])
auto_imputer.fix()
# The imputation reorders rows; restore the original order.
output = auto_imputer.df.sort_index()
output.dropna()

### Corrupt & Fix
This function runs a single experiment on a specific dataset with specific error injections and specific cleaning methods.
This function takes:
* a trained model
* a list of "error tuples" which define the errors to be injected into the data
* A number between 0 and 1 which defines the fraction of rows to inject the error on
* The unprocessed X_test data partition
* The y_test data for the accuracy report
* The training data that was used to train the model.

It returns:
* The accuracy report of the experiment (as a dict)

In [None]:
# Takes a model, test_data and a set of errors and applies the errors to the test data.
# The model is run on corrupted test data and the accuracy report is returned

def corrupt_and_fix(model, errors, X_test, y_test, X_train, y_train, auto=False, verbose=True, fraction=0.5):
    """Corrupt the test data, optionally clean it with AutoFix, and evaluate the model.

    Args:
        model: Fitted classifier exposing ``predict``.
        errors: List of error tuples as expected by inject_error.
        X_test: Raw (unprocessed) test features; corrupted in place, so pass a copy.
        y_test: Test labels.
        X_train: Preprocessed training features; only its columns are used, to
            align the test features with what the model was trained on.
        y_train: Unused; kept for interface compatibility.
        auto: When True, clean the corrupted data with autofix (miceforest
            imputation, typo fixing on 'review_summary') before preprocessing.
        verbose: Forwarded to inject_error.
        fraction: Fraction of rows to corrupt, forwarded to inject_error.

    Returns:
        The classification report as a dict.
    """
    # corrupt test data
    start_corruption = time.time()
    X_corrupted_test = inject_error(X_test, errors, fraction=fraction, verbose = verbose)
    stop_corruption = time.time()
    print(f"Corruption of test data time: {stop_corruption - start_corruption}s")

    corrupted_test = pd.concat([X_corrupted_test, y_test], axis = 1)

    start_cleaning = time.time()

    if auto:
        # miceprec works in place; its result used to be bound to a
        # misspelled, never-read variable.
        miceprec(corrupted_test)
        auto_imputer = autofix(corrupted_test, imputation_type_for_NonTextData='miceforest',typo_fixes=['review_summary'])
        auto_imputer.fix()
        # NOTE(review): a discarded `output.dropna()` call followed here; it
        # had no effect and was removed. Confirm whether rows with remaining
        # NaNs were meant to be dropped.
        corrupted_test = auto_imputer.df.sort_index()

    end_cleaning = time.time()
    print(f"Cleaning of test data time: {end_cleaning - start_cleaning}s")

    # preprocess corrupted data
    start_preprocess_test = time.time()
    corrupted_test = basic_preproseccing(corrupted_test, 'test')
    end_preprocess_test = time.time()
    print(f"Preprocessing of test data time: {end_preprocess_test - start_preprocess_test}s")

    X_corrupted_test = corrupted_test.drop(columns = columns_to_drop +[target_column])

    # ensure that the train and test sets have the same columns in the same
    # order: extra columns are dropped, missing ones are filled with 0
    X_corrupted_test = X_corrupted_test.reindex(columns=X_train.columns, fill_value=0)

    y_corrupted_test = corrupted_test[y_test.name]

    # use model to predict and return classification report
    start_prediction = time.time()
    y_corrupted_pred = model.predict(X_corrupted_test)
    end_prediction = time.time()
    print(f"Prediction of test data time: {end_prediction - start_prediction}s")

    report = classification_report(y_corrupted_test, y_corrupted_pred, output_dict = True)

    print(f"accuracy is {report['accuracy']}")
    return report

### Model Training

This cell pretrains 5 models for later use (since the same set of random states is used across experiments, we are able to split the data and train the models in advance, so there is no need to do that at runtime). It fills 5 lists: one with the trained models, and four with the various dataset partitions. X_train and y_train are already preprocessed after running this cell. X_test and y_test are not.

In [None]:
# Seeds for the train/test splits; one model is trained per seed.
random_states = [4, 256, 32, 42, 1]
# Parallel lists: entry k belongs to random_states[k].
models, X_tests, y_tests, X_trains, y_trains = [], [], [], [], []


for i, seed in enumerate(random_states):
  print(f"preparing model {i+1} of {len(random_states)}")
  X_train, X_test, y_train, y_test = split_data(random_state=seed, verbose = False)

  # Keep every partition so the experiments can reuse them later.
  X_tests.append(X_test)
  y_tests.append(y_test)
  X_trains.append(X_train)
  y_trains.append(y_train)

  model = RandomForestClassifier()
  start = time.time()
  model.fit(X_train, y_train)
  stop = time.time()
  print(f"Training time: {stop - start}s")

  models.append(model)

preparing model 1 of 5
Splitting training data took 1.6774237155914307s
preprocessing training data took 3.991675853729248s
Training time: 36.39355421066284s


In [None]:
# Persist each trained model in a file named after the seed of its split.
for i, seed in enumerate(random_states):
  dump(models[i], f'model_state_{seed}.joblib')

### Collecting Results

In [None]:
def start_report(file, mode):
    """Write the header row of a results CSV.

    Args:
        file: Path of the CSV file.
        mode: File mode passed to ``DataFrame.to_csv`` ('w' overwrites).
    """
    header = ["precision", "recall", "f1", "support", "error tuple", "random state"]
    # An empty frame writes only its header line (preceded by the unnamed index column).
    pd.DataFrame(columns=header).to_csv(file, mode=mode)

def append_report(report, file, random_state, errors):
    """Append one classification report to a results CSV.

    Args:
        report: Classification report dict (label -> metrics dict).
        file: Path of the CSV file; rows are appended without a header.
        random_state: Seed of the experiment, stored in every row.
        errors: Injected error tuples, stored as their string representation.
    """
    # One row per label/average of the report.
    rows = pd.DataFrame(report).T
    rows = rows.assign(**{'error_tuple': str(errors), 'random state': random_state})
    rows.to_csv(file, mode='a', header=False)

In [None]:
# Each inner list is one experiment whose error tuples are all injected together.
# Empty tuples () are placeholders that write_results filters out, so [()]
# is the run without any injected error.
error_tuples_list = [[('typo_smart',['review_summary']),
                ('typo_naive',['review_summary']),
                ('missing',['rented_for']),
                ('missing',['age']),
                ('missing',['category']),
                ('swapping',['weight','height']),
                ('swapping',['age','height']),
                ('scale',['rating']),
                ('scale',['weight']),
                ()],
                  [('typo_naive',['review_summary'])],
                  [('missing',['category'])],
                 [()] ]

def write_results(error_tuples_list, file, auto):
  """Run every experiment of ``error_tuples_list`` on every pretrained model.

  Each inner list of ``error_tuples_list`` is injected as a whole (empty
  tuples are skipped). The reports are written to ``file``, which is
  overwritten first.

  Args:
      error_tuples_list: List of lists of error tuples.
      file: Path of the results CSV.
      auto: Forwarded to corrupt_and_fix (True enables the AutoFix cleaning).
  """
  start_report(file, 'w')
  for mod_number, state in enumerate(random_states):
    print(f"Using random state: {random_states[mod_number]} ({mod_number+1}/{len(random_states)}) ")
    model = models[mod_number]

    for error_tuples in error_tuples_list:
      # Drop the () placeholders.
      errors = [error for error in error_tuples if error != ()]

      # Work on copies: the corruption modifies its input in place.
      X_corrupted_test = deepcopy(X_tests[mod_number])
      y_corrupted_test = deepcopy(y_tests[mod_number])
      print(f"error list: {errors}")
      #possible error names are defined in inject_error function

      # corrupts the test data and returns the classification report
      report = corrupt_and_fix(model, errors, X_corrupted_test, y_corrupted_test, X_trains[mod_number], y_trains[mod_number], auto=auto, verbose=False)
      print(report)
      # save results to file
      append_report(report, file, state, errors)

# Run all experiments twice: first with auto=False, then with auto=True
# (the flag is forwarded to corrupt_and_fix, where it enables the AutoFix cleaning).
write_results(error_tuples_list,'all_noclean.csv',False)
write_results(error_tuples_list,'all_clean.csv',True)

In [None]:
# One experiment per entry: each error tuple is injected on its own.
# The empty tuple () stands for the run without any injected error.
error_tuples = [('typo_smart',['review_summary']),
                ('typo_naive',['review_summary']),
                ('missing',['rented_for']),
                ('missing',['age']),
                ('missing',['category']),
                ('swapping',['weight','height']),
                ('swapping',['age','height']),
                ('category_mixup',['category']),
                ('category_mixup',['bust_size']),
                ('scale',['rating']),
                ('scale',['weight']),
                ()]

def write_results(error_tuples, file, auto):
  """Evaluate each single error on every pretrained model and log the reports.

  For each random state and each entry of ``error_tuples``, a fresh deep copy
  of that state's test split is handed to ``corrupt_and_fix`` with a
  one-element error list; the returned report is printed and appended to
  ``file``.

  Args:
    error_tuples: Iterable of error tuples such as ``('missing', ['age'])``.
      An empty tuple ``()`` produces a run with an empty error list.
    file: Path of the report file; it is started in ``'w'`` mode before any
      result is appended.
    auto: Forwarded unchanged as the ``auto`` argument of ``corrupt_and_fix``.
  """
  start_report(file, 'w')
  n_states = len(random_states)
  for mod_number, random_state in enumerate(random_states):
    print(f"Using random state: {random_state} ({mod_number+1}/{n_states}) ")
    model = models[mod_number]

    for error in error_tuples:
      # () is the placeholder entry: run with an empty error list.
      errors = [] if error == () else [error]

      # Work on copies so the stored test split is never modified by a run.
      X_corrupted_test = deepcopy(X_tests[mod_number])
      y_corrupted_test = deepcopy(y_tests[mod_number])
      print(f"error list: {errors}")
      # possible error names are defined in inject_error function

      # corrupts the test data and returns the classification report
      report = corrupt_and_fix(model, errors, X_corrupted_test, y_corrupted_test,
                               X_trains[mod_number], y_trains[mod_number],
                               auto=auto, verbose=False)
      print(report)
      # save results to file
      append_report(report, file, random_state, errors)

# Run the single-error experiment twice: first with auto=False, then with auto=True.
write_results(error_tuples, file='noclean.csv', auto=False)
write_results(error_tuples, file='clean.csv', auto=True)

This cell can be edited to run various experiments.
* Specify the list of columns you want to manipulate in the experiment
* Specify the file name where you want to save results to
* If you want to append to an existing file, make sure the line `start_report(file, 'w')` is commented out (otherwise it will overwrite the file)
* The first loop loops over all the pretrained models (run the training models cell first)
* Specify the fraction parameter
* The second (nested) loop can be edited freely to run the tests you want. It should build the errors list (as specified by the inject_error function), copy the correct datasets from the previously created lists, and call corrupt_and_fix once for each test you want to run.
* (For testing purposes you can comment out the append_report line as well.)

In [None]:
# Experiment cell: inject 'typo_smart' errors into each listed column, for every
# pretrained model, and append the resulting reports to `file`.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# columns to manipulate (one run per column and per model)
columns = ['review_summary']
#columns = ['review_text', 'review_summary']


print(columns)

# file name to save results to
file = "./full_df_no_cleaning.csv"

# comment this out if you only want to append to the file and not write a new header
start_report(file, 'w')

errors = []
for mod_number in range(len(random_states)):

  print(f"Using random state: {random_states[mod_number]} ({mod_number+1}/{len(random_states)}) ")

  model = models[mod_number]

  # NOTE(review): module-level name, presumably read by the corruption helpers -- confirm.
  fraction = .5

  # define the error tuples here
  # Iterate over `columns` (not `random_states`): the index is used to look up
  # a column, so it must be bounded by the number of columns.
  for col_number in range(len(columns)):
    errors = [('typo_smart', [columns[col_number]])]

    # Work on copies so the stored test split is never modified by a run.
    X_corrupted_test = deepcopy(X_tests[mod_number])
    y_corrupted_test = deepcopy(y_tests[mod_number])

    print(f"error list: {errors}")
    # possible error names are defined in inject_error function

    # corrupts the test data and returns the classification report
    report = corrupt_and_fix(model, errors, X_corrupted_test, y_corrupted_test,
                             X_trains[mod_number], y_trains[mod_number],
                             auto=False, verbose=False)

    # save results to file; pass the error list like the other experiment cells do
    append_report(report, file, random_states[mod_number], errors)



### Plotting Results

In [None]:
def plot_test_result(file):
  """Draw a bar chart of the mean accuracy per error tuple from a report CSV.

  The CSV's first column is renamed to ``"type"`` and only rows whose type is
  ``"accuracy"`` are kept. Their ``"precision"`` values are cast to float and
  averaged per ``"error tuple"``; the averaged table is printed and plotted.

  Args:
    file: Path of the report CSV. It must contain the columns
      ``"precision"``, ``"error tuple"`` and ``"random state"``.
  """
  report = pd.read_csv(file)
  report = report.rename(columns={report.columns[0]: "type"})

  # Keep only the accuracy rows, with the accuracy value as a float.
  accuracy_rows = report[report["type"] == "accuracy"]
  per_run = accuracy_rows[["error tuple", "precision", "random state"]].astype({"precision": float})

  # Average over all runs that share an error tuple.
  grouped_data = per_run.groupby('error tuple')['precision'].mean().reset_index()
  mean_accuracy = grouped_data['precision']

  # Plot the average accuracy against the error tuple
  plt.figure(figsize=(10, 6))
  plt.bar(grouped_data['error tuple'], mean_accuracy)

  print(grouped_data)

  plt.xlabel('Error tuple')
  plt.ylabel('Accuracy')
  # Zoom the y axis onto the observed range, with a small margin on both sides.
  lower = mean_accuracy.min() - 0.02
  upper = mean_accuracy.max() + 0.02
  plt.ylim(lower, upper)
  plt.xticks(rotation=90)
  plt.title('Accuracy vs. Error Tuple Bar graph')
  plt.grid(True)
  plt.show()


# Plot the averaged accuracies stored in the report CSV.
plot_test_result(file='full_df_no_cleaning.csv')

In [None]:
def _mean_accuracy_by_error(report_file):
  """Return the mean accuracy per error tuple from one report CSV.

  The CSV's first column is renamed to ``"type"``; only rows whose type is
  ``"accuracy"`` are kept, and their ``"precision"`` values (cast to float)
  are averaged per ``"error tuple"``.
  """
  report = pd.read_csv(report_file)
  report = report.rename(columns={report.columns[0]: "type"})
  rows = report[report["type"] == "accuracy"]

  per_run = pd.concat(
      [rows["error tuple"].astype(object),
       rows["precision"].astype(float),
       rows['random state']],
      axis=1)
  return per_run.groupby('error tuple')['precision'].mean().reset_index()


def plot_corrupt_vs_clean(clean_file, unclean_file, ylim=(0.675, 0.684)):
  """Plot mean accuracy per error tuple with and without autofix, side by side.

  Both report CSVs are reduced to the mean accuracy per error tuple and
  inner-joined on the error tuple, so only error tuples present in both files
  are shown. The joined table is printed and drawn as a grouped bar chart.

  Args:
    clean_file: Report CSV plotted under the label "With Autofix".
    unclean_file: Report CSV plotted under the label "Without Autofix".
    ylim: ``(bottom, top)`` limits of the y axis. The default keeps the fixed
      range this plot has always used. Pass ``None`` to derive the range from
      the plotted values (minimum - 0.02 to maximum + 0.02).
  """
  grouped_clean_data = _mean_accuracy_by_error(clean_file)
  grouped_unclean_data = _mean_accuracy_by_error(unclean_file)

  grouped_data = pd.merge(grouped_unclean_data, grouped_clean_data, on='error tuple',
                          how='inner', suffixes=("_unclean", "_clean"))

  print(grouped_data)

  # Plot the average accuracy against the error tuple
  plt.figure(figsize=(10, 6))
  X = np.arange(len(grouped_data['error tuple']))

  # Two bars per error tuple, shifted left/right around the tick position.
  plt.bar(X - 0.2, grouped_data['precision_unclean'], 0.4, label="Without Autofix")
  plt.bar(X + 0.2, grouped_data['precision_clean'], 0.4, label="With Autofix")

  plt.xlabel('Error Tuple', fontsize=18, fontweight='bold')
  plt.ylabel('Accuracy', fontsize=18, fontweight='bold')

  if ylim is None and not grouped_data.empty:
    # Cover both series so that no bar is cut off at either end.
    plotted = grouped_data[['precision_unclean', 'precision_clean']]
    ylim = (plotted.min().min() - 0.02, plotted.max().max() + 0.02)
  if ylim is not None:
    plt.ylim(*ylim)

  plt.xticks(X, grouped_data['error tuple'], rotation=90, fontsize=12)
  plt.title('Accuracy vs. Error Tuple Bar Graph', fontsize=18, fontweight='bold')
  plt.legend(fontsize=12)
  plt.grid(True)
  plt.tick_params(axis='y', labelsize=12)
  # Put the global font settings back to matplotlib's defaults.
  plt.rcParams.update({'font.size': plt.rcParamsDefault['font.size'],
                       'font.weight': plt.rcParamsDefault['font.weight']})
  plt.show()

# Compare the two single-error report files.
plot_corrupt_vs_clean(clean_file='clean.csv', unclean_file='noclean.csv')