In [1]:
import pandas as pd
import numpy as np
import string, collections, typing, math, warnings
import imblearn
from matplotlib import pyplot as plt

In [2]:
#Configurations
warnings.filterwarnings('ignore')

In [3]:
#Global variables
resampling_methods = {'SMOTE': imblearn.over_sampling.SMOTE, 
                      'RandomOver': imblearn.over_sampling.RandomOverSampler,
                      #'ADASYN': imblearn.over_sampling.ADASYN,
                      'aug': None}
K_FOLDS = 5 #folds for k-fold x-validation
INPUT_NAME = 'text' #name of input column in df
THRESHOLD_PCT = 0.5 #minimum percent of text that must be taken up by a particular word
OUTPUT_NAMES = ['Binary', '4-type', '5-type']
AUG_NAME = '../data/augmented_dataset.csv'

In [4]:
#Word representations
def bow(text: str, wordlist: list[str]) -> list[int]:
    '''Represent text with bag of words.'''
    return [int(word in text) for word in wordlist]

def freq(text: str, wordlist: list[str]) -> list[float]:
    '''Represent text with frequencies.'''
    text = text.split()
    counter = collections.Counter(text)
    length = len(text)
    return [value_or_zero(word, counter)/length for word in wordlist]

def tfidf(text: str, word_dict: dict[str, float]) -> list[float]:
    '''Represent text with TF-IDF'''
    text = text.split()
    counter = collections.Counter(text)
    length = len(text)
    frequencies = [value_or_zero(word, counter)/length for word in word_dict]
    return [word_dict[word] * frequencies[i] for (i, word) in enumerate(word_dict)]

In [5]:
#Data formatting and wrangling functions
def remove_below_threshold(array: np.ndarray, value_array: np.ndarray, threshold: float) -> np.ndarray:
    '''Remove all elements from an array where same index in another array is greater than a particular threshold.'''
    return np.array([value for i, value in enumerate(array) if value_array[i] <= threshold])
    
def get_all_text(df: pd.DataFrame) -> str:
    '''Get a string with all text from a pandas DataFrame.'''
    return ' '.join(list(df[INPUT_NAME]))

def remove_uncommon_words(wordlist: list[str], text_counter: dict[str, int]) -> list[str]:
    '''Remove words that are uncommon, below a certain threshold.'''
    return [word for word in wordlist if text_counter[word]*100/len(wordlist) >= THRESHOLD_PCT]

def combine_answers(df: pd.DataFrame, names: list[str]) -> pd.DataFrame:
    '''Combine the answers to get the 5-type class'''
    df = df.fillna(0)
    columns = list(df.columns.values)
    data_type, classification = columns.index(names[0]), columns.index(names[1])
    df[OUTPUT_NAMES[2]] = df.apply(lambda x: x[data_type] + x[classification], axis=1)
    return df

def get_wordlist(df: pd.DataFrame) -> tuple[list[str], dict[str, int]]:
    '''Get the wordlist for a dataframe.'''
    all_text_split = get_all_text(df).split()
    all_text_counter = collections.Counter(all_text_split)
    return remove_uncommon_words(list(set(all_text_split)), all_text_counter), all_text_counter

def get_worddict(df: pd.DataFrame, frequency_counter: dict[str, int], 
                 wordlist: list[str]) -> dict[str, float]:
    '''Get the word dict for a dataframe, for the TF-IDF encoding.'''
    all_text_split = get_all_text(df).split()
    all_text_counter = collections.Counter(all_text_split)
    word_dict = dict()
    for word in wordlist:
        value = math.log(len(wordlist)/all_text_counter[word])
        word_dict[word] = value
    return word_dict

def resample(inputs: np.ndarray, outputs: np.ndarray, resampling_method: str) -> tuple[np.ndarray, np.ndarray]:
    '''Resample a dataset with a resampling method.'''
    try:
        if resampling_method not in ['aug', 'none']:
            inputs, outputs = methods[resampling_method]().fit_resample(inputs, outputs)
    except ValueError: #sometimes it doesn't generate any new samples, which gives value error. In that case, return original.
        pass
    return inputs, outputs

def value_or_zero(key: str, temp_dict: dict[str, int]) -> int:
    '''Return value in a dictionary or zero if key not in dictionary.'''
    return 0 if key not in temp_dict else temp_dict[key]

def delete_punctuation(input_string: str) -> str:
    '''Delete the punctuation from a string.'''
    return ''.join(char for char in input_string if char not in string.punctuation)

def find_distances(original: np.ndarray, resampled: np.ndarray) -> np.ndarray:
    '''Find the distances between two numpy ndarrays.'''
    return np.linalg.norm(original - resampled, axis=1)

def represent_text(df: pd.DataFrame, rep_func: typing.Callable,
                  rep_list: typing.Union[list[str], dict[str, float]]) -> np.ndarray:
    '''Represent text in a dataframe with a given representation method.'''
    representation = df[INPUT_NAME].apply(lambda x: rep_func(x, rep_list)).to_numpy()
    return np.array([np.array(x) for x in representation]) #it doesn't do this automatically for some reason

def train_test_split(df: pd.DataFrame, k: int) -> tuple[pd.DataFrame, pd.DataFrame]:
    '''Split a dataframe into train and test.'''
    begin, end = (k * len(df) // K_FOLDS), ((k+1) * len(df) // K_FOLDS)
    output_df = df.iloc[begin:end]
    input_df = df.drop(output_df.index)
    return input_df, output_df

In [6]:
#Visualization functions
def visualize_distances(distances: list[np.ndarray], names: list[str]) -> None:
    '''Visualize the distances found through the resampling calculation.'''
    ax, fig = plt.subplots()
    for i, distance in enumerate(distances):
        plt.plot(distance, alpha=0.4, label=names[i])
    plt.legend()
    plt.show()

In [7]:
#Primary distance functions
def normal_resample(inputs: np.ndarray, classes: np.ndarray, needed_classes: np.ndarray, method_name: str) -> np.ndarray:
    '''Resample with one of the traditional geometric methods.'''
    original = collections.Counter(classes)
    needed = collections.Counter(needed_classes)
    for key in original:
        needed[key] = original[key] if key not in needed else original[key] + needed[key]
    method = resampling_methods[method_name](sampling_strategy=needed)
    input_length = len(inputs)
    inputs, classes = method.fit_resample(inputs, classes)
    return inputs[input_length:]

def aug_resample(split: int, needed_classes: np.ndarray, 
                 rep_func: typing.Callable, rep_list: typing.Union[list[str], dict[str, float]]) -> np.ndarray:
    '''Perform augmented resampling.'''
    aug_df = pd.read_csv(AUG_NAME)
    begin, end = (split * len(aug_df) // K_FOLDS), ((split+1) * len(aug_df) // K_FOLDS)
    aug_df = aug_df.drop(aug_df.iloc[begin:end].index)
    needed_counter = collections.Counter(needed_classes)
    to_return = []
    for i in sorted(list(set(needed_classes))):
        temp_df = aug_df[aug_df['type'] == i]
        to_return += temp_df.sample(needed_counter[i])[INPUT_NAME].tolist()
    return np.array([np.array(rep_func(item, rep_list)) for item in to_return])

def resample(inputs: np.ndarray, classes: np.ndarray, needed_classes: np.ndarray, 
             method_name: str, split: int, rep_func: typing.Callable, 
             rep_list: typing.Union[list[str], dict[str, float]]) -> np.ndarray:
    '''Resample given inputs.'''
    return aug_resample(split, needed_classes, 
                        rep_func, rep_list) if method_name == 'aug' else normal_resample(inputs, classes, 
                                                                                needed_classes, method_name)

def k_fold_loop(df: pd.DataFrame, rep_func: typing.Callable, method_name: str, 
                rep_list: typing.Union[list[str], dict[str, float]], output_name: str) -> np.ndarray:
    '''Loop through each fold in k-fold cross validation to find result.'''
    all_arrays = []
    for i in range(K_FOLDS):
        input_df, output_df = train_test_split(df, i)
        input_arr, output_arr = represent_text(input_df, rep_func, rep_list), represent_text(output_df, rep_func, rep_list)
        input_class, output_class = input_df[output_name].to_numpy(), output_df[output_name].to_numpy()
        resampled = resample(input_arr, input_class, output_class, method_name, i, rep_func, rep_list)
        output_arr = np.array([item[1] for item in sorted(zip(output_class.tolist(), output_arr.tolist()))]) #correct order
        distances = find_distances(output_arr, resampled)
        all_arrays.append(distances)
    return np.concatenate(all_arrays)

def get_resampling_dist(df: pd.DataFrame, method_name: str, output_name: str) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    '''Loop through each rep method given an input and output df as well as a method name and find results.'''
    wordlist, frequency_counter = get_wordlist(df)
    word_dict = get_worddict(df, frequency_counter, wordlist)
    rep_methods = [{'name': 'bow', 'function': bow, 'wordlist': wordlist}, #methods for representing text
                   {'name': 'freq', 'function': freq, 'wordlist': wordlist},
                   {'name': 'tfidf', 'function': tfidf, 'wordlist': word_dict},]
    
    distance_list = []
    for rep_method in rep_methods:
        rep_name, rep_func, rep_list = rep_method['name'], rep_method['function'], rep_method['wordlist']
        distances = k_fold_loop(df, rep_func, method_name, rep_list, output_name)
        distance_list.append(distances)
    return tuple(distance_list)

def evaluate_methods(filename: str, input_name: str, output_names: list[str]) -> None:
    '''Evaluate all methods for a file.'''
    df = pd.read_csv(filename)
    df[INPUT_NAME] = df[input_name].astype(str) #ensure i/o columns all have the same names
    df = combine_answers(df, output_names)
    for (i, incorrect_output_name) in enumerate(output_names):
        df[OUTPUT_NAMES[i]] = df[incorrect_output_name]
    df = df[df[OUTPUT_NAMES[1]] != 1]
    
    for resampling_method in resampling_methods:
        bow_dist, freq_dist, tfidf_dist = get_resampling_dist(df, resampling_method, OUTPUT_NAMES[1])
        print(resampling_method)
        print(f'\tmean: {np.mean(bow_dist):.3f}, {np.mean(freq_dist):.3f}, {np.mean(tfidf_dist):.3f}')
        print(f'\tmedian: {np.median(bow_dist):.3f}, {np.median(freq_dist):.3f}, {np.median(tfidf_dist):.3f}')

In [8]:
evaluate_methods('../data/antisemitism_dataset.csv', INPUT_NAME, ['classification', 'type_of_antisemitism'])

SMOTE
	mean: 3.521, 0.164, 0.278
	median: 3.606, 0.152, 0.260
RandomOver
	mean: 3.811, 0.186, 0.291
	median: 3.873, 0.172, 0.271
aug
	mean: 3.786, 0.191, 0.310
	median: 3.742, 0.182, 0.297
