# Engineering Functions

In [1]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule so elements with the "container" class use the full
# browser width.
# NOTE(review): the ".container" selector presumably targets the classic
# Notebook layout - confirm it still matches the front end in use.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# This method replaces a value in the given columns.
# Parameters
# df (DataFrame) : The data frame we want to use.
# before (Object) : The value that is going to be replaced.
# after (Object) : The value that is going to be assigned.
# columns (List) : Column names. None or empty means all columns.
# inplace_ (Boolean) : True modifies df itself. False leaves df untouched
#                      and applies the replacement to a copy.
# Returns : The data frame holding the replaced values (df itself when
#           inplace_ is True, otherwise the modified copy).
def replace_values(df, before, after, columns=None, inplace_=True):
    target = df if inplace_ else df.copy()

    # Normalise to a list so that None, lists, tuples and pandas Index objects
    # are all accepted and can be tested for emptiness safely.
    columns = [] if columns is None else list(columns)

    if columns:
        for col in columns:
            # Assign the result back to the frame. Calling
            # df[col].replace(..., inplace=True) mutates a temporary Series,
            # which is not guaranteed to reach the frame (it never does when
            # pandas Copy-on-Write is active).
            target[col] = target[col].replace(before, after)
    else:
        target.replace(to_replace=before, value=after, inplace=True)

    return target

In [3]:
# This method fills NA with the string 'None'.
# Parameters
# df (DataFrame) : The data frame we want to use.
# columns (List of Strings) : The columns in which NA is replaced with 'None'.
#                             None or empty means all columns.
# inplace_ (Boolean) : True modifies df itself. False leaves df untouched
#                      and fills a copy.
# Returns : The filled data frame (df itself when inplace_ is True,
#           otherwise the modified copy).
def fill_na_with_none(df, columns=None, inplace_=True):
    target = df if inplace_ else df.copy()

    # Accept None, lists, tuples and pandas Index objects alike.
    columns = [] if columns is None else list(columns)

    if columns:
        for col in columns:
            # Assign back instead of df[col].fillna(..., inplace=True): the
            # chained form fills a temporary Series and may never reach the
            # frame (it does not under pandas Copy-on-Write).
            target[col] = target[col].fillna('None')
    else:
        target.fillna('None', inplace=True)

    return target

In [4]:
# This method fills NA with zero.
# Parameters
# df (DataFrame) : The data frame we want to use.
# columns (List of Strings) : The columns in which NA is replaced with zero.
#                             None or empty means all columns.
# inplace_ (Boolean) : True modifies df itself. False leaves df untouched
#                      and fills a copy.
# Returns : The filled data frame (df itself when inplace_ is True,
#           otherwise the modified copy).
def fill_na_with_zero(df, columns=None, inplace_=True):
    target = df if inplace_ else df.copy()

    # Accept None, lists, tuples and pandas Index objects alike.
    columns = [] if columns is None else list(columns)

    if columns:
        for col in columns:
            # Assign back instead of df[col].fillna(..., inplace=True): the
            # chained form fills a temporary Series and may never reach the
            # frame (it does not under pandas Copy-on-Write).
            target[col] = target[col].fillna(0)
    else:
        target.fillna(0, inplace=True)

    return target

In [5]:
# This method fills NA with the mode (most frequent value) of each column.
# Parameters
# df (DataFrame) : The data frame we want to use.
# columns (List of Strings) : The columns in which NA is replaced with the
#                             mode. None or empty means all columns.
# inplace_ (Boolean) : True modifies df itself. False leaves df untouched
#                      and fills a copy.
# Returns : The filled data frame (df itself when inplace_ is True,
#           otherwise the modified copy).
def fill_na_with_mode(df, columns=None, inplace_=True):
    target = df if inplace_ else df.copy()

    # Accept None, lists, tuples and pandas Index objects alike.
    columns = [] if columns is None else list(columns)
    if not columns:
        columns = list(target.columns)

    for col in columns:
        modes = target[col].mode()

        # A column holding only NA has no mode, so there is nothing to fill with.
        if modes.empty:
            continue

        # When several values tie, the first entry returned by mode() is used.
        # The result is assigned back because a chained
        # df[col].fillna(..., inplace=True) may never reach the frame.
        target[col] = target[col].fillna(modes.iloc[0])

    return target

In [6]:
# This method fills NA with the mean of each column.
# Parameters
# df (DataFrame) : The data frame we want to use.
# columns (List of Strings) : The columns in which NA is replaced with the
#                             mean. None or empty means all columns. Every
#                             selected column must support mean().
# inplace_ (Boolean) : True modifies df itself. False leaves df untouched
#                      and fills a copy.
# Returns : The filled data frame (df itself when inplace_ is True,
#           otherwise the modified copy).
def fill_na_with_mean(df, columns=None, inplace_=True):
    target = df if inplace_ else df.copy()

    # Accept None, lists, tuples and pandas Index objects alike.
    columns = [] if columns is None else list(columns)
    if not columns:
        columns = list(target.columns)

    for col in columns:
        # Assign back instead of df[col].fillna(..., inplace=True): the
        # chained form fills a temporary Series and may never reach the
        # frame (it does not under pandas Copy-on-Write).
        target[col] = target[col].fillna(target[col].mean())

    return target

In [7]:
# This method replaces categorical values with integer codes.
# Parameters
# df (DataFrame) : The data frame we want to use. It is modified in place.
# columns (List of Strings) : The columns whose values are replaced with codes.
# Returns : The data frame.
#
# Codes are 0..k-1 in sorted order of the distinct non-missing values, so the
# encoding is the same on every run. Missing values stay missing.
def replace_categorical_values(df, columns):
    for col in columns:
        categories = df[col].dropna().unique()

        try:
            ordered = sorted(categories)
        except TypeError:
            # Values that cannot be compared with each other (mixed types):
            # fall back to order of first appearance.
            ordered = list(categories)

        codes = {value: code for code, value in enumerate(ordered)}

        # Map the whole column in one step. Writing code after code into the
        # column while still comparing against it can re-match rows that were
        # already encoded whenever a code equals a remaining category value.
        df[col] = df[col].map(codes)

    return df

In [8]:
# This method creates dummy variables (one-hot encoding).
# Parameters
# df (DataFrame) : The data frame we want to use. It is not modified.
# todummy_list (Array) : The columns that we want to create dummy variables.
# drop_ (Boolean) : Drop the first dummy column of every encoded column or not.
# Returns : The data frame in which every listed column is replaced by its
#           dummy columns (named "<column>_<value>"), appended at the end.
def dummy_df(df, todummy_list, drop_=True):
    for x in todummy_list:
        dummies = pd.get_dummies(df[x], prefix=x, dummy_na=False, drop_first=drop_)

        # Name the axis through the `columns` keyword: passing it positionally
        # (df.drop(x, 1)) raises TypeError on pandas 2.0 and later.
        df = df.drop(columns=x)

        df = pd.concat([df, dummies], axis=1)

    return df

In [9]:
# This method filters the outliers out of an array using the IQR.
# Parameters
# X (Array) : The data that have outliers.
# outlier_constant (Float) : Multiplier applied to the interquartile range to
#                            get the margin added around the quartiles.
# Returns : A list with the values that lie inside the fences (inclusive).
def remove_array_outliers(X, outlier_constant = 1.5):
    values = np.array(X)

    q1 = np.percentile(values, 25)
    q3 = np.percentile(values, 75)

    # Distance the fences extend beyond the first and third quartiles.
    margin = (q3 - q1) * outlier_constant

    lower_fence = q1 - margin
    upper_fence = q3 + margin

    inside = (values >= lower_fence) & (values <= upper_fence)

    return values[np.where(inside)].tolist()

In [10]:
# This method removes outlier rows column by column using the IQR.
# Parameters
# df (DataFrame) : The data that have outliers.
# columns (Array) : The columns that we want to check. If empty all columns are chosen.
# outlier_constant (Float) : The outlier constant handed to outlier_calculations.
# Returns : The dataframe left after every selected column has been filtered.
def remove_outliers(df, columns=[], outlier_constant = 1.5):
    selected = columns if columns else df.columns

    # Each pass filters the result of the previous one, so a row survives only
    # if it passes the check of every selected column.
    filtered = df
    for name in selected:
        filtered = outlier_calculations(filtered, name, outlier_constant)

    return filtered

In [11]:
# This method removes the rows whose value in one column is an IQR outlier.
# Parameters
# df (DataFrame) : The data that have outliers.
# col (String) : The column that we want to check.
# outlier_constant (Float) : Multiplier applied to the interquartile range to
#                            get the margin added around the quartiles.
# Returns : The dataframe restricted to the rows whose value lies inside
#           [Q1 - margin, Q3 + margin]. Rows with a missing value in the
#           column are dropped because they fail both comparisons.
def outlier_calculations(df, col, outlier_constant):
    # Series.quantile ignores missing values, so a single NaN no longer turns
    # both fences into NaN (which removed every row).
    up = df[col].quantile(0.75)

    down = df[col].quantile(0.25)

    margin = (up - down) * outlier_constant

    # The lower fence hangs below the FIRST quartile and the upper fence above
    # the THIRD one. Building them as (up - margin, down + margin) gives
    # fences that are too narrow, and inverted when the constant is 0.
    lower_fence = down - margin
    upper_fence = up + margin

    df = df.loc[(df[col] >= lower_fence) & (df[col] <= upper_fence)]

    return df