Import packages

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
from os import listdir, getcwd, chdir
from os.path import isfile, join

Load the selected data

In [2]:
# Grab all the file names from the data directory/folder.
# Only regular files are kept (isfile), and only those whose name contains
# both the substring '.csv' and the substring "P".
features_data_path = "/home/gmcp/mpt-statistical-testing/features_data/"
feature_files = [f for f in listdir(features_data_path) if isfile(join(features_data_path, f)) and '.csv' in f and "P" in f]
# Uncomment to inspect how many files were selected and which ones:
#print(len(feature_files))
#print(feature_files)

In [3]:
# read .csv files into dictionary
def read_feature_files(files_list, pathname):
    """
    Read a set of .csv data files into pandas dataframes.

    The first column of every file is used as the dataframe index.
    No check is made that a file exists in the path: a missing file
    surfaces as the error raised by ``pd.read_csv``.

    Args:
        files_list : list of str
            list of strings of .csv file names
        pathname : str
            file path to folder or directory that contains the desired .csv files;
            a trailing path separator is optional

    Output: 
        features_dataframes : dict
            dictionary of pandas dataframes, keyed by file name
    """
    # join() only inserts a separator when one is missing, so both
    # "folder/" and "folder" are accepted as pathname.
    features_dataframes = {
        file: pd.read_csv(join(pathname, file), index_col=0)
        for file in files_list
    }
    return features_dataframes

In [4]:
# Load every selected feature file; all_dfs maps each file name to its dataframe
all_dfs = read_feature_files(feature_files, features_data_path)

Functions to do pairwise Pearson Correlation

In [37]:
# pairwise Pearson correlation
def corr_rowi_rowj(row_i, row_j):
    """
    Pearson correlation between row_i and row_j

    Args: 
        row_i, row_j : pd.Series
            row of data from dataframe, represented as a pandas series

    Output: 
        corr_ij : float
            Pearson correlation of row_i to row_j

    Raises:
        ValueError
            if either row has no non-zero entry, since the correlation
            of an all-zero row is undefined
    """
    # Series.any() is False when no entry is truthy, i.e. the row is all zeros.
    for row in (row_i, row_j):
        if not row.any():
            raise ValueError("A row is all zeros and does not work with .corr")
    corr_ij = row_i.corr(row_j)
    return corr_ij
    

def corr_rowi_vs_all(row_i, dataframe):
    """
    Correlate one row against every row of a dataframe.

    Args: 
        row_i : pd.Series
            the reference row, represented as a pandas series
        dataframe : pd.DataFrame
            dataframe whose rows are each correlated with row_i

    Output:
        corr_to_i : list
            Pearson correlation values (floats), one per row of dataframe,
            in the dataframe's row order
    """
    # The row label is not needed, only the row contents.
    return [
        corr_rowi_rowj(row_i, other_row)
        for _, other_row in dataframe.iterrows()
    ]
     
def pairwise_correlation(dataframe):
    """
    Build the row-by-row Pearson correlation table of a dataframe.

    Every row is correlated with every row (itself included) and the result
    is returned as a dataframe labelled by the input's index on both axes.
    If issues arise, the input dataframe might need to be transposed.

    Args: 
        dataframe : pd.DataFrame
            dataframe containing data of interest
    
    Output: 
        corr_df : pd.DataFrame
            pandas dataframe containing all pairwise Pearson correlation values
    """
    row_labels = dataframe.index
    # One list of correlations per row; stacked into a 2-D Numpy array.
    corr_matrix = np.array([
        corr_rowi_vs_all(current_row, dataframe)
        for _, current_row in dataframe.iterrows()
    ])
    return pd.DataFrame(corr_matrix, index=row_labels, columns=row_labels)

Function to pull descriptive statistics from one or more features of a single dataframe

In [41]:
# Runs descriptive statistics on specified features in a dataframe
def feature_descriptive_statistics(dataframe, features):
    """
    Compute descriptive statistics (mean, median, maximum, minimum,
    variance, standard deviation) for the given feature columns.

    Input the features as a list of str. A single feature name may also be
    given as a plain string. Use "all_features" to run the statistics on
    every column without needing to make a long list of names (this includes
    any identifier columns such as Track_ID that are present in the dataframe).
    Quantiles are not reported.

    Args:
        dataframe : pd.DataFrame
            dataframe containing data of interest
        features : list of str, or str
            list of strings which are the features stored as column names in the data frame.
            can use "all_features" to run all feature columns
    
    Output: 
        feat_descriptive_statistics_df : pd.DataFrame
            Pandas dataframe of the descriptive statistics as columns and features as rows
    """
    stat_names = ["mean", "median", "maximum", "minimum",
                  "variance", "standard deviation"]

    if isinstance(features, str):
        if features == "all_features":
            features = dataframe.columns.tolist()
        else:
            # A bare string would otherwise be iterated character by character.
            features = [features]
    else:
        features = list(features)

    feat_descriptive_statistics = []
    for feature in features:
        column = dataframe[feature]
        # Order must match stat_names.
        feat_descriptive_statistics.append([
            column.mean(),
            column.median(),
            column.max(),
            column.min(),
            column.var(),
            column.std(),
        ])

    # reshape keeps the array 2-D even when no features were requested
    stats_array = np.array(feat_descriptive_statistics).reshape(-1, len(stat_names))
    feat_descriptive_statistics_df = pd.DataFrame(
        stats_array,
        index=features,
        columns=stat_names)
    return feat_descriptive_statistics_df

In [7]:
# Small dummy dataset for trying out the functions: positional rows 10-19
# of one feature file, with the file's first column used as the index.
dummyFeatures_df1 = pd.read_csv("../features_data/features_P14_40nm_s2_v2.csv",index_col=0).iloc[10:20]
dummyFeatures_df1.head(10)

Unnamed: 0,Track_ID,alpha,D_fit,kurtosis,asymmetry1,asymmetry2,asymmetry3,AR,elongation,boundedness,...,Mean convex_hull,Std convex_hull,Mean convex_hull_norm,Std convex_hull_norm,Mean dist_tot,Std dist_tot,Mean dist_net,Std dist_net,Mean progression,Std progression
10,10.0,0.04458,111.125959,2.019197,0.617458,0.346346,0.1254,1.935849,0.483431,0.095876,...,0.0,0.0,0.0,0.0,23410.038597,8060.004574,2010.932948,64.207167,0.094711,0.026536
11,11.0,1.159148,10.766955,1.870196,0.993848,0.039279,0.557336,3.764191,0.734339,0.044533,...,0.0,0.0,0.0,0.0,37120.132308,11975.092611,2322.137756,40.500374,0.069408,0.022246
12,12.0,1.271617,8.371443,1.597442,0.404523,0.471676,0.066609,1.977858,0.494403,0.053624,...,0.0,0.0,0.0,0.0,37120.132308,11975.092611,2322.137756,40.500374,0.069408,0.022246
13,13.0,2.009474,2.820423,2.949629,0.68232,0.308665,0.150285,1.922496,0.479843,0.157116,...,0.0,0.0,0.0,0.0,22176.252529,4087.478085,1786.963066,71.774691,0.083388,0.015486
14,14.0,1.545154,5.008628,5.088765,0.988937,0.052737,0.518906,2.839301,0.647801,0.047887,...,0.0,0.0,0.0,0.0,45170.218915,29786.417,2144.765566,84.918362,0.063797,0.025918
15,15.0,0.316939,85.800203,3.112588,0.680587,0.309684,0.149556,2.193355,0.544078,0.143137,...,0.0,0.0,0.0,0.0,25775.270384,4500.622689,2175.050937,84.154787,0.086276,0.011273
16,16.0,0.361029,45.22819,2.359616,0.938111,0.126374,0.357796,2.430197,0.588511,0.089951,...,0.0,0.0,0.0,0.0,42333.631822,17200.589004,2472.800011,55.418964,0.068725,0.026378
17,17.0,0.800348,34.189379,2.740176,0.403265,0.47246,0.066331,1.625616,0.384848,0.169429,...,0.0,0.0,0.0,0.0,25775.270384,4500.622689,2175.050937,84.154787,0.086276,0.011273
18,18.0,0.338518,127.447869,2.602074,0.511113,0.407718,0.092675,1.566647,0.361694,0.047617,...,0.0,0.0,0.0,0.0,45170.218915,29786.417,2144.765566,84.918362,0.063797,0.025918
19,19.0,1.027666,28.13924,3.733755,0.883781,0.175718,0.282048,2.369166,0.577911,0.052379,...,0.0,0.0,0.0,0.0,25775.270384,4500.622689,2175.050937,84.154787,0.086276,0.011273


In [8]:
print(type(dummyFeatures_df1.iloc[1]))

<class 'pandas.core.series.Series'>


In [9]:
feature_descriptive_statistics(dummyFeatures_df1,["alpha","asymmetry3"])

Unnamed: 0,mean,median,maximum,minimum,variance,standard deviation
alpha,0.887447,0.914007,2.009474,0.04458,0.394947,0.628448
asymmetry3,0.236694,0.14992,0.557336,0.066331,0.033924,0.184185


In [10]:
feature_descriptive_statistics(dummyFeatures_df1,"all_features")

Unnamed: 0,mean,median,maximum,minimum,variance,standard deviation
Track_ID,14.500000,14.500000,19.000000,10.000000,9.166667e+00,3.027650
alpha,0.887447,0.914007,2.009474,0.044580,3.949468e-01,0.628448
D_fit,45.889829,31.164309,127.447869,2.820423,2.124469e+03,46.091959
kurtosis,2.807344,2.671125,5.088765,1.597442,1.042992e+00,1.021270
asymmetry1,0.710394,0.681454,0.993848,0.403265,5.308465e-02,0.230401
...,...,...,...,...,...,...
Std dist_tot,12637.295895,10017.548592,29786.417000,4087.478085,1.002836e+08,10014.171696
Mean dist_net,2172.965548,2175.050937,2472.800011,1786.963066,3.458460e+04,185.969354
Std dist_net,69.470266,77.964739,84.918362,40.500374,3.370169e+02,18.358019
Mean progression,0.077206,0.076398,0.094711,0.063797,1.271766e-04,0.011277


In [42]:
# Runs descriptive statistics on dataframes within a dictionary 
def multi_df_feat_descriptive_statistics(dataframes, features):
    """
    Run feature_descriptive_statistics on every dataframe of a dictionary.

    The same feature selection is applied to each dataframe, and the results
    are returned in a dictionary that reuses the input keys.
    To run the statistics on all features use "all_features".

    Args:
        dataframes : dict
            dictionary of dataframes containing data of interest
        features : list of str, or str
            list of strings which are the features stored as column names in the data frame.
            can use "all_features" to run all feature columns

    Output: 
        dfs_descriptive_statistics : dict
            dictionary of dataframes containing descriptive statistics of specified features
            utilizes the same keys as the input dataframe dictionary
    """
    return {
        name: feature_descriptive_statistics(frame, features)
        for name, frame in dataframes.items()
    }

Below is troubleshooting: getting a list comprehension to work on a dataframe so that it pulls the values of a feature column that satisfy the standard-deviation outlier criterion.

In [12]:
dummyFeatures_df1.dtypes

Track_ID            float64
alpha               float64
D_fit               float64
kurtosis            float64
asymmetry1          float64
                     ...   
Std dist_tot        float64
Mean dist_net       float64
Std dist_net        float64
Mean progression    float64
Std progression     float64
Length: 91, dtype: object

In [13]:
# Mean and standard deviation of the "alpha" column of the dummy data;
# the last line displays the type of the mean.
feat_mean = dummyFeatures_df1["alpha"].mean()
feat_std = dummyFeatures_df1["alpha"].std()
type(feat_mean)

numpy.float64

In [14]:
type(feat_std)

float

In [15]:
# "alpha" values lying at least one standard deviation above the mean
# (uses feat_mean and feat_std computed in an earlier cell)
dummy_above = [row_i["alpha"] for index, row_i in dummyFeatures_df1.iterrows() if row_i["alpha"] >= feat_mean + (1 * feat_std)]
print(dummy_above)

[2.009474037293017, 1.54515378436631]


In [16]:
# Median and interquartile range of the "alpha" column, for the IQR-based cutoff
feat_median = dummyFeatures_df1["alpha"].median()
feat_iqr = sp.stats.iqr(dummyFeatures_df1["alpha"])
print(feat_median,feat_iqr)

0.914006891498764 0.899354422817235


In [17]:
# "alpha" values lying at least 1.5 * IQR above the median
# (uses feat_median and feat_iqr computed in the previous cell)
outliers_above2 = [row_i["alpha"] for index, row_i in dummyFeatures_df1.iterrows() if row_i["alpha"] >= feat_median+(1.5*feat_iqr)]
print(outliers_above2)

[]


In [38]:
# Pulls out the values of a feature that lie beyond an outlier cutoff:
# a multiple of the std away from the mean, or 1.5*IQR away from the median
def feature_outliers(dataframe, features, outlier_method, std_multiplier=None):
    """
    Rapid calculation of the outliers of specified feature data within a dataframe.
    Has the options of STD multiplier and IQR for selecting an outlier selection parameter.

    Cutoffs used (a value equal to a cutoff counts as an outlier):
        "STD multiplier" : mean +/- multiplier * standard deviation
        "IQR"            : median +/- 1.5 * interquartile range
                           (note: measured from the median, not from the quartiles)

    Args:
        dataframe : pd.DataFrame
            dataframe containing data of interest
        features : list of str, or str
            list of strings which are the features stored as column names in the data frame.
            A single feature name may be given as a plain string.
            Can use "all_features" string to run all features.
        outlier_method : str
            either "STD multiplier" or "IQR" to specify method of determining outlier cutoff.
        std_multiplier : float, optional
            multiplier of the standard deviation for the "STD multiplier" method.
            When left as None, the user is prompted to enter the value.
            Ignored by the "IQR" method.

    Output:
        feature_outliers_dict : dict
            dictionary containing lists of found outliers above and below selected cutoff for
            specified features, under the keys "<feature> outliers above" and
            "<feature> outliers below"

    Raises:
        ValueError
            if outlier_method is neither "STD multiplier" nor "IQR"
    """
    if outlier_method not in ("STD multiplier", "IQR"):
        raise ValueError(
            'outlier_method must be "STD multiplier" or "IQR", got %r' % (outlier_method,))

    if isinstance(features, str):
        if features == "all_features":
            features = dataframe.columns.tolist()
        else:
            # A bare string would otherwise be iterated character by character.
            features = [features]

    use_std = outlier_method == "STD multiplier"
    if use_std:
        # Ask only once, and only when the caller did not supply the multiplier.
        if std_multiplier is None:
            std_multiplier = input("Enter the multiplier you want to use:")
        n_by_std = float(std_multiplier)

    feature_outliers_dict = {}
    for feature in features:
        column = dataframe[feature]
        if use_std:
            center = column.mean()
            spread = n_by_std * column.std()
        else:
            center = column.median()
            spread = 1.5 * sp.stats.iqr(column)
        # Boolean masks select the outlying values in their original row order.
        feature_outliers_dict[feature+" outliers above"] = column[column >= center + spread].tolist()
        feature_outliers_dict[feature+" outliers below"] = column[column <= center - spread].tolist()
    return feature_outliers_dict

In [19]:
feature_outliers(dummyFeatures_df1, ["alpha","kurtosis"], "STD multiplier")

Enter the multiplier you want to use: 3


{'alpha outliers above': [],
 'alpha outliers below': [],
 'kurtosis outliers above': [],
 'kurtosis outliers below': []}

In [20]:
feature_outliers(dummyFeatures_df1, ["alpha","kurtosis"], "STD multiplier")

Enter the multiplier you want to use: 2


{'alpha outliers above': [],
 'alpha outliers below': [],
 'kurtosis outliers above': [5.088764991351239],
 'kurtosis outliers below': []}

In [21]:
feature_outliers(dummyFeatures_df1, ["alpha","kurtosis"], "IQR")

{'alpha outliers above': [],
 'alpha outliers below': [],
 'kurtosis outliers above': [5.088764991351239],
 'kurtosis outliers below': []}

In [22]:
feature_outliers(dummyFeatures_df1, "all_features", "IQR")

{'Track_ID outliers above': [],
 'Track_ID outliers below': [],
 'alpha outliers above': [],
 'alpha outliers below': [],
 'D_fit outliers above': [],
 'D_fit outliers below': [],
 'kurtosis outliers above': [5.088764991351239],
 'kurtosis outliers below': [],
 'asymmetry1 outliers above': [],
 'asymmetry1 outliers below': [],
 'asymmetry2 outliers above': [],
 'asymmetry2 outliers below': [],
 'asymmetry3 outliers above': [0.5573361703399577, 0.5189056831788302],
 'asymmetry3 outliers below': [],
 'AR outliers above': [3.764191154993878, 2.839301412613269],
 'AR outliers below': [],
 'elongation outliers above': [0.7343386776005465],
 'elongation outliers below': [],
 'boundedness outliers above': [],
 'boundedness outliers below': [],
 'fractal_dim outliers above': [],
 'fractal_dim outliers below': [],
 'trappedness outliers above': [],
 'trappedness outliers below': [],
 'efficiency outliers above': [2.419955314929289],
 'efficiency outliers below': [],
 'straightness outliers abov

In [23]:
feature_outliers(dummyFeatures_df1, "all_features", "STD multiplier")

Enter the multiplier you want to use: 3


{'Track_ID outliers above': [],
 'Track_ID outliers below': [],
 'alpha outliers above': [],
 'alpha outliers below': [],
 'D_fit outliers above': [],
 'D_fit outliers below': [],
 'kurtosis outliers above': [],
 'kurtosis outliers below': [],
 'asymmetry1 outliers above': [],
 'asymmetry1 outliers below': [],
 'asymmetry2 outliers above': [],
 'asymmetry2 outliers below': [],
 'asymmetry3 outliers above': [],
 'asymmetry3 outliers below': [],
 'AR outliers above': [],
 'AR outliers below': [],
 'elongation outliers above': [],
 'elongation outliers below': [],
 'boundedness outliers above': [],
 'boundedness outliers below': [],
 'fractal_dim outliers above': [],
 'fractal_dim outliers below': [],
 'trappedness outliers above': [],
 'trappedness outliers below': [],
 'efficiency outliers above': [],
 'efficiency outliers below': [],
 'straightness outliers above': [],
 'straightness outliers below': [],
 'MSD_ratio outliers above': [],
 'MSD_ratio outliers below': [],
 'frames outlier

In [36]:
# Clustering to identify what features might contribute to data quality
def feature_clustering(dataframe, features):
    """
    Placeholder for a simple clustering method that will provide a tool in
    determining whether certain features have high contribution to data quality.

    Not implemented yet: the function has no body besides this docstring,
    so both arguments are currently unused and the call returns None.

    To be implemented in V2. Planning to use scipy k means, or 
    sci-kit learn NearestNeighbors or DBSCAN
    """

In [34]:
import unittest

import pandas as pd
import numpy as np
import scipy as sp

df_exp1 = pd.DataFrame([[-1,0,1],[1,0,-1],[0.5,0,0.5]], index=["A","B","C"])
df_exp2 = pd.DataFrame([[1,0,1],[-1,0,-1],[0,0,0]], index=["A","B","C"])
class TestCorrelation(unittest.TestCase):
    """
    Unit tests for the pairwise Pearson correlation functions.

    Uses the module-level fixtures df_exp1 (no all-zero row) and
    df_exp2 (last row is all zeros, which the correlation functions reject).
    """

    def test_corr_rowi_rowj_right_type(self):
        computed_correlation = corr_rowi_rowj(df_exp1.iloc[0],df_exp1.iloc[2])
        self.assertIsInstance(
            computed_correlation, (float, int),
            "Computed correlation is neither int nor float, it is %s" % type(computed_correlation))

    def test_for_row_of_0(self):
        # An all-zero row has no defined correlation and must be rejected.
        zero_row = df_exp2.iloc[2]
        self.assertFalse(zero_row.any())
        with self.assertRaises(Exception):
            corr_rowi_rowj(zero_row, df_exp2.iloc[0])

    def test_corr_rowi_rowj_right_value(self):
        computed_correlation = corr_rowi_rowj(df_exp2.iloc[1],df_exp2.iloc[0])
        # small tolerance guards against floating point round-off at the bounds
        self.assertTrue(
            0 <= abs(computed_correlation) <= 1.00 + 1e-12,
            "Something went wrong, and the correlation is outside [0,|1|]")
        # the two rows are exact negatives of each other
        self.assertAlmostEqual(computed_correlation, -1.0)

    def test_corr_rowi_vs_all_right_type(self):
        computed_correlation = corr_rowi_vs_all(df_exp1.iloc[1],df_exp1)
        self.assertIsInstance(
            computed_correlation, list,
            "Something went wrong, and the function did not return a list.")
        self.assertEqual(len(computed_correlation), len(df_exp1))

    def test_corr_rowi_vs_all_row_of_0(self):
        # df_exp2 contains an all-zero row, so the call must fail loudly
        # instead of returning NaN correlation values.
        with self.assertRaises(Exception):
            corr_rowi_vs_all(df_exp1.iloc[1],df_exp2)

    def test_corr_rowi_vs_all_work_across_df(self):
        # A row of one dataframe correlated against the rows of another;
        # the all-zero row of df_exp2 is left out.
        corr1 = corr_rowi_vs_all(df_exp1.iloc[1],df_exp2.iloc[:2])
        corr2 = corr_rowi_vs_all(df_exp1.iloc[1],df_exp1)
        self.assertIs(type(corr1), type(corr2), "Did not work across dataframes")
        self.assertEqual(len(corr1), 2)
        self.assertEqual(len(corr2), len(df_exp1))

    def test_pairwise_correlation_right_type(self):
        computed_correlation = pairwise_correlation(df_exp1)
        self.assertIsInstance(
            computed_correlation, pd.DataFrame,
            "Something went wrong, and the function did not return a DataFrame.")

    def test_pairwise_correlation_row_of_0_NA(self):
        # A row of 0 must be rejected rather than producing NaN entries.
        with self.assertRaises(Exception):
            pairwise_correlation(df_exp2)

    def test_pairwise_correlation_right_shape(self):
        computed_correlation = pairwise_correlation(df_exp1)
        # The result is rows x rows, independent of the number of columns.
        n_rows = len(df_exp1)
        self.assertEqual(
            computed_correlation.shape, (n_rows, n_rows),
            "Function resulted in different sized dataframe")

In [33]:
# Tiny hand-made dataframe for checking the statistics functions;
# the print shows the dtype of the statistics row computed for feature "B".
dummy1 = pd.DataFrame([[-1,0,1,1,0],[1,0,-1,0,1],[0.5,0.5,0.5,0,7]], columns=["A","B","C","D","E"])
dummy_stats = feature_descriptive_statistics(dummy1,["B"])
print(dummy_stats.iloc[0].dtype)

float64


In [39]:
print(feature_outliers(dummy1,"all_features","IQR"))

{'A outliers above': [], 'A outliers below': [-1.0], 'B outliers above': [0.5], 'B outliers below': [], 'C outliers above': [], 'C outliers below': [-1.0], 'D outliers above': [1.0], 'D outliers below': [], 'E outliers above': [7.0], 'E outliers below': []}


In [46]:
import unittest

import pandas as pd
import numpy as np
import scipy as sp

class TestStatistics(unittest.TestCase):
    """
    Unit tests for the descriptive statistics and outlier functions.

    The fixtures are class attributes and are therefore reached through self.
    """
    dummy_df1 = pd.DataFrame([[-1,0,1,0,0],[1,0,-1,0,1],[0.5,0.5,0.5,0,7]], columns=["A","B","C","D","E"])
    dummy_df2 = pd.DataFrame([[1,0,1,2,1],[-1,0,-1,0,2],[0,0,0,1,30]], columns=["A","B","C","D","E"])
    dummy_dict = {"dummy_df1": dummy_df1, "dummy_df2": dummy_df2}
    dummy_list = ["dummy_df1", "dummy_df2"]
    
    def test_feature_descriptive_statistics_output1(self):
        stats_df = feature_descriptive_statistics(self.dummy_df1,["A"])
        self.assertIsInstance(stats_df,pd.DataFrame)

    def test_feature_descriptive_statistics_output2(self):
        stats_df = feature_descriptive_statistics(self.dummy_df1,"all_features")
        self.assertIsInstance(stats_df,pd.DataFrame)

    def test_feature_descriptive_statistics_output3(self):
        stats_df = feature_descriptive_statistics(self.dummy_df1,["B"])
        # dtype is a numpy dtype object, so check its kind rather than isinstance(float)
        self.assertTrue(np.issubdtype(stats_df.iloc[0].dtype, np.floating))

    def test_feature_descriptive_statistics_output4(self):
        # column "A" of dummy_df2 is [1, -1, 0], whose mean is exactly 0
        stats_df = feature_descriptive_statistics(self.dummy_df2,["A"])
        self.assertTrue(np.isclose(stats_df["mean"].iloc[0],0))

    def test_multi_df_feat_descriptive_statistics_output1(self):
        stats_dict = multi_df_feat_descriptive_statistics(self.dummy_dict,"all_features")
        self.assertIsInstance(stats_dict,dict)

    def test_multi_df_feat_descriptive_statistics_output2(self):
        stats_dict = multi_df_feat_descriptive_statistics(self.dummy_dict,["C"])
        self.assertIsInstance(stats_dict,dict)

    def test_multi_df_feat_descriptive_statistics_output3(self):
        stats_dict = multi_df_feat_descriptive_statistics(self.dummy_dict,"all_features")
        self.assertIsInstance(stats_dict["dummy_df2"],pd.DataFrame)

    # The outlier tests use the "IQR" method because "STD multiplier"
    # prompts for user input, which is not available in an automated test.
    def test_feature_outliers_output1(self):
        outliers_list = feature_outliers(self.dummy_df1,"all_features","IQR")
        self.assertIsInstance(outliers_list,dict)

    def test_feature_outliers_output2(self):
        outliers_list = feature_outliers(self.dummy_df1,["A","B"],"IQR")
        self.assertIsInstance(outliers_list,dict)

    def test_feature_outliers_output3(self):
        outliers_list = feature_outliers(self.dummy_df1,"all_features","IQR")
        # keys are "<feature> outliers above" / "<feature> outliers below"
        self.assertIsInstance(outliers_list["A outliers above"],list)
        self.assertIsInstance(outliers_list["A outliers below"],list)

    def test_feature_outliers_output4(self):
        outliers_list = feature_outliers(self.dummy_df1,"all_features","IQR")
        # column "E" is [0, 1, 7]: median 1, IQR 3.5, so the upper cutoff is 6.25
        self.assertEqual(outliers_list["E outliers above"],[7])

    def test_feature_clustering(self):
        # feature_clustering is not implemented yet
        pass
