## Missing Values

'Missingness' Terminology:
*   **MCAR:** Missing completely at random
*   **MAR:** Missing at random - the corruption is conditioned on the values of another column
*   **MNAR:** Missing not at random - the corruption is conditioned on the values of the column to which it is applied

In [None]:
from jenga.corruptions.generic import MissingValues

def missing_values(df, column, fraction=.5, missingness='MCAR'):
    """Corrupt ``column`` of ``df`` with jenga's ``MissingValues`` corruption.

    Args:
        df: DataFrame to corrupt; its ``column`` is overwritten in place.
        column: Name of the column to corrupt.
        fraction: Passed through to ``MissingValues`` as ``fraction``.
        missingness: Passed through to ``MissingValues`` as ``missingness``
            (the notebook text lists MCAR, MAR and MNAR).

    Returns:
        The same ``df`` object, with ``column`` replaced by the corrupted one.
    """
    corruption = MissingValues(column=column, fraction=fraction, missingness=missingness)
    corrupted = corruption.transform(df)
    df[column] = corrupted[column]
    return df

In [None]:
from jenga.basis import DataCorruption
class MillisInsteadOfSeconds(DataCorruption):
    """Corruption that multiplies a column by 1000 (seconds -> milliseconds)."""

    def transform(self, data, column):
        """Return a deep copy of ``data`` with ``column`` multiplied by 1000.

        The input frame is left untouched. ``self.column`` and
        ``self.fraction`` are (re)assigned on every call.
        """
        self.column = column
        # NOTE(review): the fraction is stored but never used below, so every
        # row of the column is scaled rather than a random 20% of them.
        self.fraction = 0.2

        scaled = data.copy(deep=True)
        scaled[self.column] = scaled[self.column] * 1000
        return scaled

def second_to_millis(df, column):
    """Forward-fill ``df`` and return a copy whose ``column`` is scaled by 1000.

    The forward fill and the scaling both produce new frames, so the frame
    passed in is not modified.
    """
    corruption = MillisInsteadOfSeconds()
    filled = df.ffill()
    return corruption.transform(filled, column)

## Adding Gaussian Noise and rounding

In [None]:
from jenga.corruptions.numerical import GaussianNoise

def noise_and_rounding(df, column, fraction=.5, missingness='MCAR'):
    """Add Gaussian noise to the number embedded in each value of ``column``.

    Every value is read as ``<prefix><number><suffix>`` (e.g. ``"137lbs"``).
    The numeric part is perturbed with jenga's ``GaussianNoise``, rounded to
    an integer and glued back between the original prefix and suffix.

    Fix over the previous version: the noise used to be applied to the
    stringified column and the recombination used the *un-noised* number, so
    the noise never reached the result. The noise is now applied to the
    extracted numeric part.

    Args:
        df: DataFrame to corrupt; ``column`` is overwritten in place.
        column: Name of the column to corrupt.
        fraction: Passed to ``GaussianNoise`` as ``fraction``.
        missingness: Passed to ``GaussianNoise`` as ``sampling``.

    Returns:
        The same ``df`` object. Values without a number (including missing
        values) are left as they were.
    """
    original = df[column]
    parts = original.astype(str).str.extract(r'^(\D*?)(\d+\.*\d*)(.*)')
    numbers = parts[1].astype(float)
    has_number = numbers.notna()
    if not has_number.any():
        return df

    # GaussianNoise works on a numeric column, so hand it a scratch frame in
    # which the column holds only the extracted numbers.
    numeric_df = df.loc[has_number].copy()
    numeric_df[column] = numbers[has_number]
    noisy = GaussianNoise(column=column, fraction=fraction, sampling=missingness).transform(numeric_df)[column]
    rounded = noisy.round(0).astype(int).astype(str)

    combined = original.astype(object).copy()
    combined.loc[has_number] = parts.loc[has_number, 0] + rounded + parts.loc[has_number, 2]
    df[column] = combined
    return df

## Unit Conversion

In [None]:
from jenga.basis import TabularCorruption

# transforms string data for height in feet and inches into corresponding height in meters and cm
class UnitTransform(TabularCorruption):
    """Corruption that rewrites imperial heights as metric ``"<m>m <cm>cm"``.

    Relies on ``self.column``, ``self.fraction`` and ``self.sample_rows``,
    which are expected to come from the ``TabularCorruption`` base class
    (not visible in this file).
    """

    # Imperial to metric
    def convert_to_metric(self, height):
        """Convert one height such as ``5' 8"`` or ``5ft 8in`` to metric.

        Fix over the previous version: the guard looked for ``ft``/``in``
        while the parser stripped ``'``/``"``, so neither notation was ever
        converted successfully. Both notations are now parsed by one pattern.

        Args:
            height: The value to convert.

        Returns:
            ``"<metres>m <centimetres>cm"`` for a recognised feet/inches
            string; any other value (other text, NaN, numbers) is returned
            unchanged.
        """
        import re  # local import: the notebook has no shared import cell

        if not isinstance(height, str):
            return height

        # \x22 is the double-quote character used as the inches mark.
        match = re.fullmatch(
            r"\s*(\d+(?:\.\d+)?)\s*(?:ft|')\s*(\d+(?:\.\d+)?)\s*(?:in|\x22)\s*",
            height,
        )
        if match is None:
            return height

        feet, inches = (float(group) for group in match.groups())
        height_in_cm = round(feet * 30.48 + inches * 2.54)
        meter, cm = divmod(height_in_cm, 100)
        return str(meter) + "m " + str(cm) + "cm"

    def transform(self, data):
        """Return a deep copy of ``data`` with sampled rows converted to metric.

        Rows are chosen by ``self.sample_rows(data)``; nothing is changed when
        ``self.fraction`` is not positive.
        """
        df = data.copy(deep=True)

        if self.fraction > 0:
            rows = self.sample_rows(data)
            df.loc[rows, self.column] = df.loc[rows, self.column].apply(self.convert_to_metric)

        return df

In [None]:

def Unit_inch_to_cm(df, column, fraction=.5, missingness='MCAR'):
    """Apply ``UnitTransform`` to ``column`` of ``df``.

    ``fraction`` is forwarded as ``fraction`` and ``missingness`` as
    ``sampling``. ``df`` is modified in place and returned.
    """
    corruption = UnitTransform(column=column, fraction=fraction, sampling=missingness)
    converted = corruption.transform(df)
    df[column] = converted[column]
    return df

# Text Errors

## Setup Textattack
This setup is system-specific and unstable.

In [None]:
import nltk
# Download the NLTK "stopwords" corpus.
# NOTE(review): presumably a prerequisite for the textattack cells below - confirm.
nltk.download('stopwords')

In [None]:
import textattack

## Basic Augmenter
Some of the code below is copied from the textattack docs

In [None]:
# Demo: augment one example sentence with textattack's EmbeddingAugmenter.
from textattack.augmentation import EmbeddingAugmenter
augmenter = EmbeddingAugmenter()
# Example sentence; it is reused by the CheckListAugmenter cell below.
s = 'Hello my friend, what is your mission?'
# Bare expression so the notebook displays the augmented result.
augmenter.augment(s)

In [None]:
# Demo: augment the sentence `s` (defined in the previous cell) with CheckListAugmenter.
from textattack.augmentation import CheckListAugmenter
# The keyword names suggest 20% of the words are swapped and 5 variants are
# produced per input - confirm against the textattack documentation.
cl = CheckListAugmenter(pct_words_to_swap=0.2, transformations_per_example=5)
cl.augment(s)

## Old Typo Injection

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [None]:
from textattack.attack_recipes import pruthi_2019

In [None]:
# Wrap a scikit-learn model so textattack can query it.
from textattack.models.wrappers import SklearnModelWrapper
# NOTE(review): `model` and `model_fit` are not defined in the visible part of
# this notebook; presumably a fitted classifier and its fitted vectoriser - confirm.
model_wrapper = SklearnModelWrapper(model,model_fit)

In [None]:
# Build the Pruthi2019 attack recipe against the wrapped model.
# NOTE(review): `goal_function` is not used anywhere in the visible cells.
goal_function = textattack.goal_functions.UntargetedClassification(model_wrapper)
attack = pruthi_2019.Pruthi2019.build(model_wrapper)

In [None]:
def transform_cvect(training, testing, column_name):
    """Fit a ``CountVectorizer`` on the training text and vectorise both splits.

    The vectoriser keeps at most 100 features over 1- to 3-grams and drops
    scikit-learn's English stop words.

    Args:
        training: Frame whose ``column_name`` column is used for fitting.
        testing: Frame transformed with the vectoriser fitted on ``training``.
        column_name: Name of the text column.

    Returns:
        ``(fitted_vectorizer, training_counts_df, testing_counts_df)``; both
        frames have one column per feature and a fresh default index.
    """
    vectorizer = CountVectorizer(
        max_features=100, ngram_range=(1, 3), stop_words=list(ENGLISH_STOP_WORDS)
    )
    fitted = vectorizer.fit(training[column_name])

    def _counts_frame(frame):
        # Dense count matrix for one split, labelled with the feature names.
        counts = fitted.transform(frame[column_name])
        return pd.DataFrame(counts.toarray(), columns=vectorizer.get_feature_names_out())

    training_counts = _counts_frame(training)
    testing_counts = _counts_frame(testing)
    return fitted, training_counts, testing_counts

In [None]:
def transform_tfidf(training, testing, column_name):
    """Fit a ``TfidfVectorizer`` on the training text and vectorise both splits.

    The vectoriser keeps at most 100 features over 1- to 3-grams and drops
    scikit-learn's English stop words.

    Fix over the previous version: feature names are read with
    ``get_feature_names_out()`` (as ``transform_cvect`` already does) instead
    of ``get_feature_names()``, which recent scikit-learn releases no longer
    provide.

    Args:
        training: Frame whose ``column_name`` column is used for fitting.
        testing: Frame transformed with the vectoriser fitted on ``training``.
        column_name: Name of the text column.

    Returns:
        ``(fitted_vectorizer, training_tfidf_df, testing_tfidf_df)``.
    """
    Tfidf = TfidfVectorizer(
        ngram_range=(1, 3), max_features=100, stop_words=list(ENGLISH_STOP_WORDS)
    )
    Tfidf_fit = Tfidf.fit(training[column_name])
    feature_names = Tfidf_fit.get_feature_names_out()

    Tfidf_training = Tfidf_fit.transform(training[column_name])
    Tfidf_training_df = pd.DataFrame(Tfidf_training.toarray(), columns=feature_names)

    Tfidf_testing = Tfidf_fit.transform(testing[column_name])
    Tfidf_testing_df = pd.DataFrame(Tfidf_testing.toarray(), columns=feature_names)

    return Tfidf_fit, Tfidf_training_df, Tfidf_testing_df

In [None]:
def build_example_model(X_train, y_train, X_test, y_test, name_of_test):
    """Fit a depth-4 random forest and print its train/test scores.

    Fix over the previous version: ``RandomForestClassifier`` was used without
    being imported in any visible cell, so it is imported here.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.
        name_of_test: Label inserted into the printed accuracy lines.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # Local import: the notebook has no shared import cell that provides it.
    from sklearn.ensemble import RandomForestClassifier

    rf = RandomForestClassifier(max_depth=4).fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print(
        "Training accuracy of " + name_of_test + ": ", rf.score(X_train, y_train)
    )
    print("Testing accuracy of " + name_of_test + ": ", rf.score(X_test, y_test))
    print(classification_report(y_test, y_pred))
    return rf

In [None]:
# Split the data, turn 'review_text' into count features and train the example
# model on those features. The raw frames are not valid model input: they hold
# free text and the 'fit' target column itself.
text_train, text_test = train_test_split(df)
cvectFit, cvect_training_df, cvect_testing_df = transform_cvect(text_train, text_test, 'review_text')
example_model = build_example_model(cvect_training_df, text_train['fit'], cvect_testing_df, text_test['fit'], 'test')

## Old Typo Fixing

In [None]:
# SLOW experimental model from https://huggingface.co/oliverguhr/spelling-correction-english-base
# Demo: correct one sentence with a Hugging Face text2text spelling model.
from transformers import pipeline
# Creating the pipeline loads the model named below.
fix_spelling = pipeline("text2text-generation",model="oliverguhr/spelling-correction-english-base")
# The line after this cell is the output recorded from a previous run.
print(fix_spelling("lets do a comparsion",max_length=2048))

[{'generated_text': "Let's do a comparison."}]


In [None]:
# Print the spell checkers that the installed neuspell package reports.
import neuspell
from neuspell.neuspell import available_checkers
print(f"available checkers: {available_checkers()}")

In [None]:
# Demo: correct one sentence with neuspell's BertChecker.
from neuspell.neuspell import BertChecker
checker = BertChecker()
# Load the checker's pretrained weights before correcting.
checker.from_pretrained()
checker.correct("I luk foward to receving your reply")

In [None]:
# Demo: correct one sentence with neuspell's CnnlstmChecker.
# The cell used to instantiate NestedlstmChecker, which was never imported;
# it now instantiates the class it actually imports.
from neuspell.neuspell import CnnlstmChecker
checker = CnnlstmChecker()
checker.from_pretrained()
checker.correct("I luk foward to receving your reply")

In [None]:
from autocorrect import Speller

def fix_typo_autocorrect(df, columns):
    """Run autocorrect's English ``Speller`` over each listed column.

    Args:
        df: DataFrame to correct; the listed columns are overwritten in place.
        columns: Iterable of column names to correct.

    Returns:
        The same ``df`` object.
    """
    speller = Speller(lang='en')
    for name in columns:
        corrected = df[name].apply(speller)
        df[name] = corrected
    return df

In [None]:
# Correct every review summary with autocorrect and report progress every 50
# items. `time` is imported here because no visible cell imports it.
import time

from autocorrect import Speller

summaries = df_typo['review_summary'].to_list()
spell = Speller(lang='en')
total = len(summaries)
start = time.time()
for i, summary in enumerate(summaries):
    if i % 50 == 0:
        elapsed = time.time() - start
        # The old message ran the total and "in" together ("50/1000in 1.2").
        print(f"{i}/{total} in {elapsed}")
    summaries[i] = spell(summary)

In [None]:
import pickle

import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification

# Load config, tokenizer and token-classification model for the spelling
# corrector identified by `path` (a Hugging Face model id).
path = "murali1996/bert-base-cased-spell-correction"
config = AutoConfig.from_pretrained(path)
tokenizer = AutoTokenizer.from_pretrained(path)
bert_model = AutoModelForTokenClassification.from_pretrained(path, config=config)
# NOTE(review): model_dict is not used by any visible cell.
model_dict = bert_model.state_dict()

In [None]:
# Put the model into evaluation mode before running inference.
bert_model.eval()

In [None]:

def load_vocab_dict(path_: str):
    """Load and return the pickled vocabulary stored at ``path_``.

    ``path_`` is the location of the vocab pickle file.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so only point
    this at vocabulary files from a trusted source.
    """
    with open(path_, 'rb') as handle:
        return pickle.load(handle)


def _tokenize_untokenize(input_text: str, bert_tokenizer):
    """Tokenize ``input_text`` and merge the sub-tokens back into words.

    A sub-token starting with ``"##"`` is appended (without the marker) to the
    word before it; every other sub-token starts a new word. The words are
    returned joined by single spaces.
    """
    words = []
    for piece in bert_tokenizer.tokenize(input_text):
        if not piece.startswith("##"):
            words.append(piece)
            continue
        # Continuation piece: extend the most recent word.
        words[-1] = words[-1] + piece[2:]
    return " ".join(words)


def _custom_bert_tokenize_sentence(input_text, bert_tokenizer, max_len):
    """Tokenize one sentence word by word within a sub-token budget.

    The text is first normalised with ``_tokenize_untokenize`` and split on
    single spaces. Words are consumed in order until adding the next word
    would exceed ``max_len - 2`` sub-tokens; words that yield no sub-tokens
    are skipped.

    Returns:
        ``(kept_text, sub_tokens, split_sizes)``: the kept words joined by
        spaces, the flat list of their sub-tokens, and the number of
        sub-tokens each kept word contributed.
    """
    budget = max_len - 2  # 512-2 = 510 with the default max_len of the caller
    kept_words = []
    flat_tokens = []
    pieces_per_word = []

    normalized = _tokenize_untokenize(input_text, bert_tokenizer)
    for word in normalized.split(" "):
        pieces = bert_tokenizer.tokenize(word)
        if len(flat_tokens) + len(pieces) > budget:
            break
        if not pieces:
            continue
        kept_words.append(word)
        pieces_per_word.append(len(pieces))
        flat_tokens.extend(pieces)

    return " ".join(kept_words), flat_tokens, pieces_per_word


def _custom_bert_tokenize(batch_sentences, bert_tokenizer, padding_idx=None, max_len=512):
    """Tokenize a batch of sentences and build padded model inputs.

    Each sentence goes through ``_custom_bert_tokenize_sentence``; the
    resulting sub-token lists are encoded with ``encode_plus`` and padded to
    a common length.

    Args:
        batch_sentences: Iterable of input strings.
        bert_tokenizer: Tokenizer providing ``tokenize``, ``encode_plus`` and
            ``pad_token_id``.
        padding_idx: Padding value for ``input_ids``; defaults to the
            tokenizer's ``pad_token_id``.
        max_len: Sub-token budget forwarded to the per-sentence tokenizer.

    Returns:
        ``(sentences, bert_dict, splits)`` where ``bert_dict`` holds the
        padded ``attention_mask`` and ``input_ids`` tensors.
    """
    pad_id = bert_tokenizer.pad_token_id if padding_idx is None else padding_idx

    per_sentence = [
        _custom_bert_tokenize_sentence(text, bert_tokenizer, max_len)
        for text in batch_sentences
    ]
    kept_sentences, token_lists, split_sizes = list(zip(*per_sentence))
    encoded = [bert_tokenizer.encode_plus(tokens) for tokens in token_lists]

    def _padded(field, fill):
        # Stack one field of every encoding into a (batch, longest) tensor.
        rows = [torch.tensor(item[field]) for item in encoded]
        return pad_sequence(rows, batch_first=True, padding_value=fill)

    input_ids = _padded("input_ids", pad_id)
    attention_masks = _padded("attention_mask", 0)
    bert_inputs = {"attention_mask": attention_masks,
                   "input_ids": input_ids
                   }
    return kept_sentences, bert_inputs, split_sizes


def _custom_get_merged_encodings(bert_seq_encodings, seq_splits, mode='avg', keep_terminals=False, device="cpu"):
    """Merge sub-token encodings of one sequence into one vector per word.

    Args:
        bert_seq_encodings: 2-D tensor of encodings whose first row is treated
            as [CLS] and whose row ``sum(seq_splits) + 1`` is treated as [SEP].
        seq_splits: List with the number of sub-tokens of each word.
        mode: ``'avg'`` (mean), ``'add'`` (sum) or ``'first'`` (first
            sub-token) of each word's encodings.
        keep_terminals: When true, the [CLS] and [SEP] rows are put back
            around the merged rows.
        device: Device for the divisor tensor used by ``'avg'``.

    Returns:
        Tensor with one row per word (plus two when ``keep_terminals``).

    Raises:
        Exception: If ``mode`` is not one of the supported values.
    """
    n_subtokens = sum(seq_splits)
    trimmed = bert_seq_encodings[:n_subtokens + 2, :]  # 2 for [CLS] and [SEP]
    cls_row = trimmed[0:1, :]
    sep_row = trimmed[-1:, :]
    word_rows = trimmed[1:-1, :]

    # One chunk per word, zero-padded to the longest word.
    per_word = torch.split(word_rows, seq_splits, dim=0)
    stacked = pad_sequence(per_word, batch_first=True, padding_value=0)

    if mode == "first":
        merged = stacked[:, 0, :]
    elif mode == "add":
        merged = torch.sum(stacked, dim=1)
    elif mode == 'avg':
        counts = torch.tensor(seq_splits).reshape(-1, 1).to(device)
        merged = torch.div(torch.sum(stacked, dim=1), counts)
    else:
        raise Exception("Not Implemented")

    if not keep_terminals:
        return merged
    return torch.cat((cls_row, merged, sep_row), dim=0)

In [None]:
# Demo: run the BERT spelling corrector on three misspelled sentences and
# print the predicted corrections. Gradients are disabled for inference.
with torch.no_grad():
    misspelled_sentences = ["Well,becuz badd spelln is ard to undrstnd wen ou rid it.",
                            "they fought a deadly waer",
                            "Hurahh!! we mad it...."]
    batch_sentences, batch_bert_dict, batch_splits = _custom_bert_tokenize(misspelled_sentences, tokenizer)
    # print(batch_sentences, "\n")
    outputs = bert_model(batch_bert_dict['input_ids'], attention_mask=batch_bert_dict["attention_mask"],
                          output_hidden_states=True)
    # NOTE(review): with output_hidden_states=True, outputs[1] is presumably
    # the tuple of per-layer hidden states and [-1] the last layer - confirm.
    sequence_output = outputs[1][-1]
    # sanity check -------->
    # sequence_output = bert_model.dropout(sequence_output)
    # temp_logits = bert_model.classifier(sequence_output)
    # x1 = [val.data for val in outputs[0].reshape(-1,)]
    # x2 = [val.data for val in temp_logits.reshape(-1,)]
    # assert all([a == b for a, b in zip(x1, x2)])
    # <-------- sanity check
    # Average the sub-token encodings of each word, one sentence at a time.
    bert_encodings_splitted = \
        [_custom_get_merged_encodings(bert_seq_encodings, seq_splits, mode='avg')
          for bert_seq_encodings, seq_splits in zip(sequence_output, batch_splits)]
    bert_merged_encodings = pad_sequence(bert_encodings_splitted,
                                          batch_first=True,
                                          padding_value=0
                                          )  # [BS,max_nwords_without_cls_sep,768]
    # Classify every word-level encoding with the model's classifier head.
    logits = bert_model.classifier(bert_merged_encodings)
    # The output vocabulary is read from "vocab.pkl" in the working directory.
    output_vocab = load_vocab_dict("vocab.pkl")
    # print(logits.shape)
    assert len(output_vocab["idx2token"]) == logits.shape[-1]
    argmax_inds = torch.argmax(logits, dim=-1)
    # Map the best index of each word to its token, dropping padded positions.
    outputs = [" ".join([output_vocab["idx2token"][idx.item()] for idx in argmaxs][:len(wordsplits)])
                for wordsplits, argmaxs in zip(batch_splits, argmax_inds)]
    print(outputs)

    print("complete")

In [None]:
# Time how long pyspellchecker takes to correct the 'review_text' column.
# The previous cell did not parse (`df_typo..apply`), indexed `.iloc` with a
# column label and applied the SpellChecker object as if it were a function.
import time

from spellchecker import SpellChecker

spell = SpellChecker()


def _spellcheck_text(text):
    """Correct ``text`` word by word; non-string values are returned as-is."""
    if not isinstance(text, str):
        return text
    # Keep the original word when correction() yields nothing for it
    # (pyspellchecker documents a None return in that case - confirm for the
    # installed version).
    return " ".join(spell.correction(word) or word for word in text.split())


start = time.time()
# NOTE(review): the source column was missing in the original expression;
# 'review_text' is assumed because it is the assignment target.
df_typo['review_text'] = df_typo['review_text'].apply(_spellcheck_text)
stop = time.time()
print(f"Typo fix time: {stop - start}s")

In [None]:
# Demo: get spelling suggestions from spellwise's Editex algorithm.
from spellwise import (CaverphoneOne, CaverphoneTwo, Editex, Levenshtein,
                       Soundex, Typox)
algorithm = Editex()

# NOTE(review): `words` is not defined in any visible cell; presumably the
# dictionary of known words - confirm.
algorithm.add_words(words)
suggestions = algorithm.get_suggestions("lovily")
# Show the word of the first suggestion.
suggestions[0]['word']

## Old Typo Test

In [None]:
def small_typo_dfs(base_df, columns, target, typo_probs, corrector, max_target_sample = 10000, random_state=1, verbose = True):
    """Build class-balanced typo-injected and typo-corrected copies of a sample.

    The frame is reduced to ``columns + [target]``, every target class is
    down-sampled to the same size (the smallest class size, capped at
    ``max_target_sample``) and the result is shuffled. For each probability
    in ``typo_probs`` one frame with injected typos (``naive_typo_df``) and
    one with those typos corrected (``correct_typo``) is produced.

    Changes over the previous version: the per-class samples are concatenated
    once instead of growing a frame seeded with an empty ``DataFrame`` inside
    the loop, and rows with a missing target are ignored when enumerating the
    classes instead of making the sampling fail.

    Args:
        base_df: Source DataFrame.
        columns: List of text columns that receive typos.
        target: Name of the class column used for balancing.
        typo_probs: Typo probabilities; ``0`` means "use the clean sample".
        corrector: Passed through to ``correct_typo``.
        max_target_sample: Upper bound on the rows kept per class.
        random_state: Seed for sampling and shuffling.
        verbose: Print progress messages and pass the flag to ``correct_typo``.

    Returns:
        ``(typo_dfs, fixed_dfs)``, two lists aligned with ``typo_probs``. For a
        probability of ``0`` both lists hold the same clean sample object.
    """
    df = base_df.filter(columns + [target])
    target_sample_size = min(int(df[target].value_counts().min()), max_target_sample)

    per_class_samples = [
        df.loc[df[target] == tval].sample(n=target_sample_size, random_state=random_state)
        for tval in df[target].dropna().unique()
    ]
    sampled_df = pd.concat(per_class_samples)
    sampled_df = sampled_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    if verbose:
        print("Injecting typos...")
    typo_dfs = []
    for prob in typo_probs:
        if prob == 0:
            typo_dfs.append(sampled_df)
        else:
            typo_dfs.append(naive_typo_df(sampled_df.copy(), columns, prob))

    if verbose:
        print("Fixing typos...")
    fixed_dfs = []
    for prob, typo_df in zip(typo_probs, typo_dfs):
        if prob == 0:
            fixed_dfs.append(sampled_df)
        else:
            fixed_dfs.append(correct_typo(typo_df.copy(), columns, corrector, verbose))

    if verbose:
        print("Fixed typos. Done.")
    return typo_dfs, fixed_dfs

In [None]:
# Load the dataset (one JSON record per line) and build typo / fixed variants
# of the two review text columns for typo probabilities 0, 0.05 and 0.1,
# balanced on 'fit' with at most 3000 rows per class.
df = pd.read_json('renttherunway_final_data.json', lines=True)
columns = ['review_text','review_summary']
# NOTE(review): `corrector` is not defined in any visible cell.
typo_dfs, fixed_dfs = small_typo_dfs(df,columns,'fit',[0,0.05,0.1],corrector,3000)

In [None]:
def word_count(df, column):
    """Estimate the total number of words in ``df[column]``.

    Each value counts as its number of space characters plus one; the
    per-value counts are summed.
    """
    spaces_per_value = df[column].str.count(' ')
    words_per_value = spaces_per_value + 1
    return words_per_value.sum()

# Categorical Data Fixes

## Reduce Category Data Duplication

In [None]:
import numpy as np
import pandas as pd
from strsimpy import Jaccard,Cosine

def reduce_category(df):
    """Collapse near-duplicate values of the ``category`` column.

    Two category strings are treated as similar when both their Jaccard and
    their cosine similarity (strsimpy, shingle size 1) exceed 0.7. Every
    category is mapped to the last category, in order of first appearance,
    that is similar to it. ``df['category']`` is overwritten in place.

    Returns:
        The same ``df`` object with the reduced ``category`` column.
    """
    jaccard_threshold = 0.7
    cosine_threshold = 0.7
    jaccard = Jaccard(1)
    cosine = Cosine(1)

    # Work on a copy that holds only the category column, minus missing rows.
    only_category = df.copy()[['category']]
    missing_labels = only_category.index[only_category['category'].isnull()]
    only_category = only_category.drop(index=missing_labels)
    unique_categories = pd.Series(only_category['category'].unique(), name='unique_category')

    def _is_similar(left, right):
        # Both similarities have to clear their threshold.
        return (1 - jaccard.distance(left, right) > jaccard_threshold
                and 1 - cosine.distance(left, right) > cosine_threshold)

    # Later categories overwrite earlier ones, so each token ends up mapped to
    # the last category that is similar to it.
    token_to_group = {}
    for category in unique_categories:
        for other_category in unique_categories:
            if _is_similar(category, other_category):
                token_to_group[other_category] = category

    mapping_dict = {category: token_to_group[category] for category in unique_categories}
    df['category'] = df['category'].map(mapping_dict)

    return df

## Category Imputation

In [None]:
import miceforest as mf
import pandas as pd

def apply_categorical_imputation_I(df, categorical_columns1, categorical_columns2):
    """Impute the given categorical columns with miceforest (MICE).

    A copy of ``df`` without the id, date, text and target columns is cast to
    ``category`` dtype for the listed columns and run through three MICE
    iterations. The imputed columns then replace the originals in ``df``.

    Args:
        df: DataFrame to impute; modified in place.
        categorical_columns1: First list of categorical column names.
        categorical_columns2: Second list of categorical column names.

    Returns:
        The same ``df`` object; the imputed columns are re-added after being
        dropped, so they end up as the last columns.
    """
    categorical_columns = categorical_columns1 + categorical_columns2
    excluded = ['item_id', 'user_id', 'review_date', 'fit','review_text', 'review_summary','Year', 'Month', 'Day']

    features = df.copy().drop(columns=excluded)
    features[categorical_columns] = features[categorical_columns].astype('category')

    # miceforest kernel holding the data to impute
    kernel = mf.ImputationKernel(
        data=features,
        save_all_iterations=True,
        random_state=1343,
    )
    # Three MICE iterations with 50 estimators per model
    kernel.mice(3, verbose=True, n_estimators=50)
    imputed = kernel.complete_data(dataset=0, inplace=False)

    # Swap the original categorical columns for their imputed versions
    df.drop(columns=categorical_columns, inplace=True)
    df[categorical_columns] = imputed[categorical_columns]

    return df

# Autofix (Old)

In [None]:
import json
import pandas as pd
from jenga.corruptions.generic import MissingValues
from jenga.corruptions.generic import SwappedValues
from jenga.corruptions.numerical import Scaling
from jenga.corruptions.numerical import GaussianNoise
import numpy as np
import random
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from pandas.api.types import is_numeric_dtype
from sklearn.svm import SVC

class metadata:
    """Sort the columns of a DataFrame into numerical, categorical and text.

    After construction the instance exposes three dicts keyed by column name:
    ``numerical_features``, ``categorical_features`` and ``text_features``.
    Each value is ``{"clean": <percentage of non-missing values>,
    "unique": <number of distinct values>}``.
    """

    def __init__(self,df):
        self.determine_column_types(df)

    def determine_column_types(self,df):
        """(Re)build the three feature dicts from ``df``.

        Columns whose values are all distinct, and columns whose name contains
        ``"id"``, are ignored. Of the rest, numeric dtypes are numerical,
        columns with at most ``max(20, 0.1% of the rows)`` distinct values are
        categorical, and columns whose first value is a string are text.
        """
        self.categorical_features = {}
        self.numerical_features = {}
        self.text_features = {}

        row_total = len(df)
        for name in df:
            column = df[name]
            distinct = len(pd.unique(column))
            if distinct == len(column) or "id" in name:
                continue

            stats = {"clean": 100 - ((column.isna().sum() / len(column)) * 100), "unique" : distinct}
            if is_numeric_dtype(column):
                self.numerical_features[name] = stats
            elif distinct <= max(20,0.001 * row_total):
                self.categorical_features[name] = stats
            elif isinstance(column.iloc[0], str):
                self.text_features[name] = stats

class autofix:
    """Heuristic cleaner: type coercion, group-wise imputation, rescaling.

    Typical use is ``autofix(df).fix()``. The constructor drops rows that have
    fewer than a quarter of the columns filled in and prints the result.
    """

    def preprocess(self):
        """Convert string columns that hold only numbers to ``int``/``float``.

        A column is considered when its first value is a string. It becomes
        ``int`` when every value is all digits, and ``float`` when every value
        is all digits after removing one decimal point.

        Fix over the previous version: the decimal point is removed with
        ``Series.str.replace``. ``Series.replace(".", "")`` only replaced
        values equal to ``"."``, so the float branch could never match.
        """
        if len(self.df) == 0:
            return
        for column in self.df:
            values = self.df[column]
            if not isinstance(values.iloc[0], str):
                continue
            if np.sum(values.str.isnumeric()) == len(values):
                self.df[column] = values.astype(int)
            elif np.sum(values.str.replace(".", "", n=1, regex=False).str.isnumeric()) == len(values):
                self.df[column] = values.astype(float)

    @staticmethod
    def _rows_matching(frame, key_values):
        """Return a boolean mask of the rows of ``frame`` equal to ``key_values`` in every keyed column."""
        matches = pd.Series(True, index=frame.index)
        for col, val in key_values.items():
            matches &= frame[col].eq(val).fillna(False).astype(bool)
        return matches

    def bootstrap_imputer(self, df, column_to_predict, clean_mask):
        """Fill ``column_to_predict`` using the most frequent value of similar rows.

        Rows are grouped by the categorical features (except the predicted
        column, ``"fit"`` and ``"height"``). For up to the 101 largest groups
        with more than 5 rows, the rows to impute receive the most frequent
        value among the clean rows of the same group. Whatever is still
        missing afterwards receives the most frequent value of all clean rows.

        Changes over the previous version: groups are matched with boolean
        masks instead of a ``DataFrame.query`` string (which broke on column
        names containing spaces and on values containing quotes), the loop
        can no longer run past the last group, groups without any clean row
        are skipped instead of raising, and the rows to impute are copied
        before being written to.

        Args:
            df: Frame to impute.
            column_to_predict: Column whose missing values are filled.
            clean_mask: Boolean mask, True for rows that already have a value.

        Returns:
            A new frame: the clean rows followed by the imputed rows.
        """
        group_cols = [
            name for name in self.metadata.categorical_features
            if name not in (column_to_predict, "fit", "height")
        ]
        clean_data = df.loc[clean_mask]
        nan_data = df.loc[~clean_mask].copy()
        if nan_data.empty or clean_data.empty:
            # Nothing to fill, or nothing to learn a fill value from.
            return pd.concat([clean_data, nan_data])

        if group_cols:
            group_sizes = df.groupby(group_cols).size().sort_values(ascending=False)
            group_keys = group_sizes.index.to_frame(index=False)
            for rank, size in enumerate(group_sizes.to_numpy()[:101]):
                if size <= 5:
                    break
                key_values = group_keys.iloc[rank]
                sample_space = clean_data.loc[
                    self._rows_matching(clean_data, key_values), column_to_predict
                ]
                modes = sample_space.mode()
                if modes.empty:
                    continue
                to_impute = self._rows_matching(nan_data, key_values)
                nan_data.loc[to_impute, column_to_predict] = modes.iloc[0]

        overall_modes = clean_data[column_to_predict].mode()
        if not overall_modes.empty:
            still_missing = nan_data[column_to_predict].isna()
            nan_data.loc[still_missing, column_to_predict] = overall_modes.iloc[0]
        return pd.concat([clean_data, nan_data])

    def scaling_fix(self,scaling_factors = (1000,100), base_factor = 1):
        """Undo unit mix-ups in the numerical features.

        For each factor, values that are at least ``factor`` times the column
        minimum (a minimum of 0 is treated as 1) are divided by that factor.
        Finally the whole column is multiplied by ``base_factor``.

        Args:
            scaling_factors: Factors to test, in order (immutable default).
            base_factor: Multiplier applied to every value at the end.
        """
        for column in list(self.metadata.numerical_features.keys()):
            values = self.df[column]
            smallest = np.min(values)
            if smallest == 0:
                smallest = 1
            for factor in scaling_factors:
                mask = values >= factor * smallest
                if mask.any():
                    # mask() builds a new Series and upcasts ints to float,
                    # so nothing is written into a slice of the frame.
                    values = values.mask(mask, values / factor)
            self.df[column] = values * base_factor

    def impute_missing(self):
        """Impute every column, starting with the best categorical column."""
        first = self.best_cat[0]
        # best_cat stays ("", 0, 0) when there is no categorical feature.
        ordered = ([first] if first else []) + [name for name in self.df if name != first]
        for column in ordered:
            self.df = self.bootstrap_imputer(self.df, column, self.df[column].notna())

    def __init__(self,df):
        self.df = df.dropna(thresh = round(0.25 * len(df.columns)))
        print(self.df)

    def fix(self):
        """Run preprocessing, metadata detection, imputation and rescaling.

        Returns:
            The cleaned DataFrame (also available as ``self.df``).
        """
        self.preprocess()
        self.metadata = metadata(self.df)
        self.non_text_columns = list(self.metadata.numerical_features.keys()) + list(self.metadata.categorical_features.keys())
        self.best_cat = ("",0,0)
        self.second_best_cat = ("",0,0)
        # Rank categorical features by distinct values, requiring at least the
        # cleanliness of the feature they would replace.
        for cat, stats in self.metadata.categorical_features.items():
            candidate = (cat, stats["unique"], stats["clean"])
            if candidate[1] > self.best_cat[1] and candidate[2] >= self.best_cat[2]:
                self.second_best_cat = self.best_cat
                self.best_cat = candidate
            elif candidate[1] > self.second_best_cat[1] and candidate[2] >= self.second_best_cat[2]:
                self.second_best_cat = candidate
        self.impute_missing()
        self.scaling_fix()
        return self.df


# Other

In [None]:
#stats
def print_stats(df):
    """Print the row count and a per-column summary of missing values.

    The summary lists, for every column, the number of missing values
    (``total_missing``) and their share of the rows in percent
    (``perc_missing``). Nothing is returned.
    """
    row_total = len(df)
    print("Number of rows:", row_total)

    missing_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'total_missing': missing_counts,
        'perc_missing': (missing_counts / row_total) * 100,
    })
    print(summary)