In [4]:
#!/usr/bin/env python

"""
Here, we use psturng() from statsmodels, which can be imported as below:

        from statsmodels.stats.libqsturng import psturng

psturng(q, r, v) evaluates the upper-tail probability (1 - the area from 0 to q)
            for a studentized range having v degrees of freedom and r samples.

  Definition of the function:

    def psturng(q, r, v):
        Parameters
        ----------
        q : (scalar, array_like)
            quantile value of Studentized Range
            q >= 0.
        r : (scalar, array_like)
            The number of samples
            r >= 2 and r <= 200
            (values over 200 are permitted but not recommended)
        v : (scalar, array_like)
            The sample degrees of freedom
            if p >= .9:
                v >=1 and v <= inf
            else:
                v >=2 and v <= inf

        Returns
        -------
        p : (scalar, array_like)
            1. - area from zero to q under the Studentized Range
            distribution. When v == 1, p is bound between .001
            and .1, when v > 1, p is bound between .001 and .9.
            Values between .5 and .9 are 1st order approximations.

Because it uses interpolation from R data, the probability it returns is not
exactly the same as the one from R's ptukey().
E.g.,
   Python:
    >>> print(psturng(3.997799075635331, 10, 465.4956))
    0.13000739280348494

   R:
    > 1 - ptukey(3.997799075635331,  10, 465.4956)
    0.1305053
"""

import pandas as pd
import numpy as np
import scipy.stats as ss

def flatten(*iterables):
    """Recursively flatten arbitrarily nested iterables.

    Args:
        *iterables: any mix of scalars and (possibly nested) iterables.

    Yields:
        The leaf elements in depth-first, left-to-right order. An object
        that does not support iteration is yielded unchanged, and so is
        a string.
    """
    for item in iterables:
        # A string is iterable and each of its characters is again a
        # one-character string, so descending into it would never terminate.
        if isinstance(item, str):
            yield item
            continue
        try:
            children = iter(item)
        except TypeError:
            # Not iterable: this is a leaf value.
            yield item
        else:
            for child in children:
                yield from flatten(child)

def steel_dwass(data, rank_method='average'):
    """
    Steel-Dwass pairwise ranking test

    The Steel-Dwass method performs the multiple comparisons whilst
    controlling the overall experiment-wise error rate
    (it is the non-parametric equivalent to the Tukey All-Pairs method).
    This Python version is adapted from the R implementation at
    http://aoki2.si.gunma-u.ac.jp/R/Steel-Dwass.html

    Args:
        data (pandas dataframe):
            one group per row (the row index labels identify the groups);
            at least 3 groups are required. The values of each row are
            flattened, so a cell may itself hold a nested sequence.
    Kwargs:
        rank_method (str, optional):
            The method used to assign ranks to tied elements.
            The options are 'average', 'min', 'max', 'dense' and 'ordinal'.

    Returns:
        dataframe:
            one row per pair of groups, indexed by the (label, label)
            tuple, with columns 't' (the test statistic) and 'p' (the
            value returned by psturng(t * sqrt(2), number of groups, inf)).

    Raises:
        ValueError: if rank_method is not one of the supported options,
            or if data has fewer than 3 groups.
    """
    import itertools as it
    from statsmodels.stats.libqsturng import psturng

    r_method_types = ('average', 'min', 'max', 'dense', 'ordinal')
    if rank_method not in r_method_types:
        raise ValueError("Unknown rank method: "
                         "it should be 'average', "
                         "'min', 'max', 'dense', 'ordinal'")

    # The rows are used as samples (groups).
    labels = list(data.index)
    ngroups = len(labels)
    if ngroups < 3:
        raise ValueError("The input data should have at least 3 groups")

    # Flatten every group once instead of once per pair it takes part in.
    samples = {label: list(flatten(data.loc[label].values))
               for label in labels}

    # All unordered pairs of groups.
    combs = list(it.combinations(labels, 2))
    static = []

    for first, second in combs:
        d1 = samples[first]
        d2 = samples[second]
        d1_num = len(d1)
        d2_num = len(d2)
        # Total number of observations in the pooled pair
        N = d1_num + d2_num
        # Ranks of the pooled observations; the first d1_num belong to d1
        r = ss.rankdata(d1 + d2, method=rank_method)

        # Observed rank sum of the first group, its expectation and variance
        R = r[:d1_num].sum()
        E = d1_num * (N + 1) / 2
        V = (d1_num * d2_num / (N * (N - 1))
             * (np.sum(r ** 2) - N * (N + 1) ** 2 / 4))
        t = abs(R - E) / np.sqrt(V)
        # Upper-tail studentized-range probability with infinite
        # degrees of freedom
        p_val = psturng(t * np.sqrt(2), ngroups, np.inf)
        static.append([t, p_val])

    return pd.DataFrame(static, columns=['t', 'p'], index=combs)



if __name__ == '__main__':
    # Demo: three groups of ten observations each (the sample data as
    # shown in R), one group per row.
    inx = ['g1', 'g2', 'g3']
    data = np.asarray([
        [5, 4, 6, 3, 3, 7, 6, 5, 3, 5],
        [8, 4, 3, 3, 7, 9, 8, 7, 3, 4],
        [7, 6, 8, 9, 10, 9, 8, 9, 7, 8],
    ])

    # Alternative sample data (four groups of unequal size):
    #    data = [[6.9, 7.5, 8.5, 8.4, 8.1, 8.7, 8.9, 8.2, 7.8, 7.3, 6.8],
    #            [9.6, 9.4, 9.5, 8.5, 9.4, 9.9, 8.7, 8.1, 7.8, 8.8],
    #            [5.7, 6.4, 6.8, 7.8, 7.6, 7.0, 7.7, 7.5, 6.8, 5.9],
    #            [7.6, 8.7, 8.5, 8.5, 9.0, 9.2, 9.3, 8.0, 7.2, 7.9, 7.8]]

    # Label the rows and run the pairwise test before printing anything.
    df = pd.DataFrame(data, index=inx)
    mc = steel_dwass(df)

    # Show the input table followed by the pairwise results.
    print(df, mc, sep='\n')


    0  1  2  3   4  5  6  7  8  9
g1  5  4  6  3   3  7  6  5  3  5
g2  8  4  3  3   7  9  8  7  3  4
g3  7  6  8  9  10  9  8  9  7  8
                 t         p
(g1, g2)  0.808763  0.680859
(g1, g3)  3.585362  0.001000
(g2, g3)  2.268701  0.060304
