In [6]:
import pandas as pd
import numpy as np
import streamlit as st

def outliers_zscore(df, list_var, z_threshold, keep_nan):
    """Detect z-score outliers column by column and blank or drop them.

    For every column named in ``list_var`` except the last two entries, a
    z-score ``(x - mean) / std`` is computed (pandas defaults: NaN skipped,
    sample standard deviation). Observations whose absolute z-score is
    strictly greater than ``z_threshold`` are treated as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to clean. It is modified in place and also returned.
    list_var : list of str
        Column names. The last two entries are skipped; the original code
        comment says these are the ``mergeid`` and ``country`` variables.
    z_threshold : float
        Absolute z-score above which an observation is an outlier.
    keep_nan : bool
        If truthy, outlier cells are replaced by NaN and the rows are kept.
        Otherwise every row containing an outlier is dropped.

    Returns
    -------
    (pandas.DataFrame, int)
        The cleaned frame and the total number of outliers found, summed
        over all processed columns.

    Notes
    -----
    When rows are dropped, the z-scores of later columns are computed on the
    rows that remain, so the result depends on the order of ``list_var``.
    """
    # The two trailing entries of list_var are never scored.
    var_to_describe = list_var[:-2]

    # Running total across all columns.
    tot_outliers = 0

    for var in var_to_describe:

        # Nothing to score in an entirely missing column.
        if df[var].isna().all():
            continue

        # Only plain int64/float64 columns are treated as quantitative.
        if df[var].dtype not in ["int64", "float64"]:
            continue

        z_scores = (df[var] - df[var].mean()) / df[var].std()

        # One mask serves both counting and replacement. NaN scores compare
        # False, so missing values are never flagged. A positional array is
        # used so that no index alignment is involved.
        is_outlier = (z_scores.abs() > z_threshold).to_numpy()
        n_outliers = int(is_outlier.sum())

        # Accumulate instead of overwriting, so the result is a true total.
        tot_outliers += n_outliers

        if n_outliers == 0:
            continue

        if keep_nan:
            # An int64 column cannot hold NaN: upcast explicitly rather than
            # relying on implicit upcasting during assignment.
            if df[var].dtype == "int64":
                df[var] = df[var].astype("float64")
            df.loc[is_outlier, var] = np.nan
        else:
            # Drop the flagged rows directly; no sentinel value is written,
            # so the column keeps its numeric dtype.
            df.drop(index=df.index[is_outlier], inplace=True)

    return df, tot_outliers

In [15]:
def count_outliers(scores, threshold):
    """Count the observations whose absolute score exceeds ``threshold``.

    Parameters
    ----------
    scores : iterable of numbers
        Scores to inspect, one per observation.
    threshold : number
        Cut-off; an observation counts only when ``abs(score)`` is strictly
        greater than this value.

    Returns
    -------
    int
        Number of scores beyond the threshold (0 for an empty iterable).
    """
    # NaN compares False against any threshold, so missing scores are not counted.
    return sum(1 for value in scores if abs(value) > threshold)

In [20]:
from scipy import stats
import pandas as pd
import numpy as np

def outliers_mscore(df, list_var, m_threshold, keep_nan):
    """Detect modified z-score (MAD-based) outliers and blank or drop them.

    For every column named in ``list_var`` except the last two entries, the
    score ``0.6745 * (x - median) / MAD`` is computed, where MAD is the
    median absolute deviation of the non-missing values. Observations whose
    absolute score is strictly greater than ``m_threshold`` are treated as
    outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to clean.
    list_var : list of str
        Column names. The last two entries are skipped; the original code
        comment says these are ``mergeid`` and ``country``.
    m_threshold : float
        Absolute score above which an observation is an outlier.
    keep_nan : bool
        If truthy, outlier cells are replaced by NaN in ``df`` itself (in
        place) and the rows are kept. Otherwise a filtered frame without the
        outlier rows is returned and the input frame is left untouched.

    Returns
    -------
    (pandas.DataFrame, int)
        The cleaned frame and the total number of outliers found, summed
        over all processed columns.

    Notes
    -----
    Columns whose MAD is zero or undefined are skipped: the score would be a
    division by zero and every value different from the median would be
    flagged. When rows are dropped, later columns are scored on the rows that
    remain, so the result depends on the order of ``list_var``.
    """
    # The two trailing entries of list_var are never scored.
    var_to_describe = list_var[:-2]

    # Running total across all columns.
    tot_outliers = 0

    for var in var_to_describe:

        # Nothing to score in an entirely missing column.
        if df[var].isna().all():
            continue

        # Only plain int64/float64 columns are treated as quantitative.
        if df[var].dtype not in ["int64", "float64"]:
            continue

        # Ignore NaN when computing the MAD, consistent with Series.median();
        # the default policy would propagate a single NaN to the whole score.
        mad = stats.median_abs_deviation(
            df[var].to_numpy(dtype="float64"), nan_policy="omit"
        )

        # Degenerate spread: the score is undefined, so flag nothing.
        if not np.isfinite(mad) or mad == 0:
            continue

        m_scores = 0.6745 * (df[var] - df[var].median()) / mad

        # One mask serves both counting and replacement. NaN scores compare
        # False, so missing values are never flagged.
        is_outlier = (m_scores.abs() > m_threshold).to_numpy()
        n_outliers = int(is_outlier.sum())

        # Accumulate instead of overwriting, so the result is a true total.
        tot_outliers += n_outliers

        if n_outliers == 0:
            continue

        if keep_nan:
            # An int64 column cannot hold NaN: upcast explicitly rather than
            # relying on implicit upcasting during assignment.
            if df[var].dtype == "int64":
                df[var] = df[var].astype("float64")
            df.loc[is_outlier, var] = np.nan
        else:
            # Filter by mask; no sentinel value is written, so the caller's
            # frame is not altered and genuine values are never mistaken for
            # the marker.
            df = df.loc[~is_outlier]

    return df, tot_outliers