# Adressing Imbalanced Datasets

In the eary stages of EDA we identified an uneven distribution of the values in the binary target column. This means that our algorithms might be underperforming for two reasons:
- Not enough information from minority class to train properly
- Too much information from majority class, overwhelms the algorithm output

In both cases there is an underrepresentation of the minority class. For this notebook we develop functions that will be added to the `utils.py` file.

In order to avoid data leakage, the sampling techniques may only be applied on the training data. If we sample the testing data, it counts as data leakage since in real business settings the data will be heavily imbalanced.

In [1]:
import pandas as pd
import numpy as np
import utils, plot_help
import matplotlib.pyplot as plt

#avoid warning popping up
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

%matplotlib inline

In [2]:
def make_num_df (df, drop_cols=['latitude', 'longitude', 'postal_code']):
    """
    Drops columns and returns numerical entries from pandas dataframe
    df(pandas): dataframe
    drop_cols(list): array of columns to drop, deemed noisy and/or of little use
    """
    
    df_drop = df.drop(columns=drop_cols)
    
    #make dataframe of numeric types
    df_num = df_drop.select_dtypes(include=[np.float64, np.int64]).copy()
    
    return df_num

In [3]:
#change read limit to -1 to load entire dataframe
df_bus = utils.chunk_loader('data/cleaned/business_merge_feats.csv', read_limit=-1)

df_num = make_num_df(df_bus)

df_num.head()

Unnamed: 0,review_count,stars,road_type,GoodForKids,RestaurantsReservations,Caters,RestaurantsTableService,RestaurantsTakeOut,RestaurantsPriceRange2,OutdoorSeating,...,Health,Hair,cool_change,funny_change,stars_change,useful,avg_month_checkin,span_checkin,median_income,is_open
0,5,3.0,1.0,0,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,1.777778,5323,3.5,0
1,128,2.5,8.0,1,1,1,1,1,2,0,...,0,0,-0.042484,-0.04902,-0.075163,-0.156863,36.083333,15143,3.0,1
2,170,4.0,6.0,1,1,0,1,1,2,0,...,0,0,-0.11,-0.19,0.055,-0.215,57.083333,58518,3.5,1
3,3,5.0,1.0,0,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,-0.5,1.222222,8464,3.5,1
4,3,2.5,6.0,0,0,0,0,0,2,0,...,0,0,0.0,0.0,0.0,0.0,1.0,2971,3.0,1


In [4]:
df_num.shape

(161160, 102)

# Identify Minority Class

In [5]:
#split data
X_train, X_test, y_train, y_test = utils.train_test_scale (df_num, 
                                                           'is_open', 
                                                           random_state=None)

In [6]:
def binary_get_minority(y):
    """
    Identify the minority and majority class in a 1-dimensional array
    y(array): array containing target
    
    """
    #return the sorted unique elements of an array
    unique, counts = np.unique(y, return_counts=True)
    
    #check target is binary and 1-D
    if (len(unique) != 2) or (np.ndim(y)!= 1):
        print("Target must be binary and 1-dimensional... Returning None")
        return None
    
    if counts[0]<counts[1]:
        minority, majority = unique[0], unique[1]
    elif counts[0]==counts[1]:
        print("array is balanced... Returning classes as is")
        return unique[0], unique[1]
    else:
        minority, majority = unique[1], unique[0]
        
    return minority, majority

In [7]:
#assign minority class
minority, majority = binary_get_minority(y_train)
print("Minority class: {}\nMajority class: {}".format(minority, majority))

Minority class: 0
Majority class: 1


# Undersample

In [8]:
def split_major_minor(X, y):
    """
    Split feature and target arrays into majority and minority
    X(numpy): feature space
    y(numpy): targets
    """
    
    #identify minorit and majority
    minority, majority = binary_get_minority(y)
    
    #get indeces of minority
    minority_index = np.where(y==minority)
    #indeces of majority
    majority_index = np.where(y==majority)

    #get minority and majority features and targets
    y_minority, y_majority = y[minority_index].copy(), y[majority_index].copy()

    X_minority, X_majority = X[minority_index].copy(), X[majority_index].copy()
    
    return X_minority, X_majority, y_minority, y_majority


In [9]:
def parallel_permute(X, y):
    """
    Permute two arrays X and y in unision along first dimension
    X(array): n-dimensional array
    y(array): n-dimensional array
    """
    #check lengths are same
    assert len(X)== len(y)
    
    #create permuted index
    perm = np.random.permutation(len(y))

    #return permuted array to maintain randomness
    X_perm = X[perm]
    y_perm = y[perm]
    
    return X_perm, y_perm

In [10]:
#apply the split
X_minority, X_majority, y_minority, y_majority = split_major_minor(X_train, y_train)

In [11]:
def undersample(X, y):
    """
    Return balanced features and targets by undersampling
    X(numpy): feature array
    y(numpy): targets array
    """
    X_minority, X_majority, y_minority, y_majority = split_major_minor(X, y)
    
    #draw random indeces from majority and limit by number of entries in minority
    rand_idx = np.random.choice(len(y_majority), 
                                len(y_minority))

    #apply undersampling
    X_majority_under = X_majority[rand_idx]
    y_majority_under = y_majority[rand_idx]

    #check lengths
    assert len(X_majority_under) == len(X_minority)
    assert len(y_majority_under) == len(y_minority)


    #vertically stack
    X_under = np.concatenate((X_majority_under, X_minority), axis=0)
    y_under = np.concatenate((y_majority_under, y_minority), axis=0)

    X_under, y_under = parallel_permute(X_under, y_under)
    
    return X_under, y_under

In [12]:
#apply undersampling
X_under, y_under = undersample(X_train, y_train)

In [13]:
#get major and minor
X_minority, X_majority, y_minority, y_majority = split_major_minor(X_train, y_train)

In [14]:
#check balances
assert len (X_under) == 2* len(X_minority)
assert np.mean(y_under) == 0.5

# Oversample

In [15]:
n_copy = len(X_majority) / len(X_minority)
n_copy_int = int(n_copy)
n_copy_frac = n_copy % 1

In [16]:
n_copy_int

4

In [17]:
n_copy_frac

0.307863318237958

In [18]:
X_oversample = np.repeat(X_minority, n_copy_int, axis=0)
len(X_oversample)/ len(X_minority)

4.0

In [19]:
np.shape(X_oversample)

(97160, 101)

In [20]:
np.shape(X_minority)

(24290, 101)

In [21]:
rand_idx = np.random.choice(len(y_minority), int(n_copy_frac*len(y_minority)))

In [22]:
rand_idx

array([ 1493,  2538,   860, ...,  5953,  5463, 20429])

In [28]:
def oversample(X, y):
    #get major and minor
    X_minority, X_majority, y_minority, y_majority = split_major_minor(X, y)
    
    #get sizing different
    n_copy = len(X_majority) / len(X_minority)
    #get integer portion
    n_copy_int = int(n_copy)
    #get fraction portion
    n_copy_frac = n_copy % 1
    
    #replicate minority by integer portion
    X_minority_over = np.repeat(X_minority, n_copy_int, axis=0)
    y_minority_over = np.repeat(y_minority, n_copy_int, axis=0)

    #replicate minority by fraction
    rand_idx = np.random.choice(len(y_minority), int(n_copy_frac*len(y_majority)))
    X_over_frac = X_minority[rand_idx]
    y_over_frac = y_under[rand_idx]
    
    #concatenate to create oversampled minority
    X_minority_over = np.concatenate((X_minority_over, X_over_frac), axis=0)
    y_minority_over = np.concatenate((y_minority_over, y_over_frac), axis=0)
    
    #concatenate with majority class
    X_over = np.concatenate((X_minority_over, X_majority), axis=0)
    y_over = np.concatenate((y_minority_over, y_majority), axis=0)
    
    #shuffle
    X_over, y_over = parallel_permute(X_over, y_over)
    
    return X_over, y_over

In [29]:
X_over, y_over = oversample(X_train, y_train)

In [30]:
np.mean(y_over)

0.515747055706545

In [31]:
np.shape(X_train)

(128928, 101)

In [32]:
np.shape(X_over)

(234012, 101)