# Quantile

This file looks at quantiles without streaming data; the one-dimensional datasets used for testing are randomly generated.

In [6]:
import numpy as np
import pandas as pd

In [7]:
# Draw the test sample from a locally seeded Generator so that every
# Restart & Run All reproduces the same 1000 standard-normal values,
# without touching NumPy's global random state.
DATASET_SEED = 0
dataset = np.random.default_rng(DATASET_SEED).standard_normal(1000)

### Definition
For a population of discrete values, or for a continuous population density, the $k$-th $q$-quantile is the data value where the cumulative distribution function crosses $k/q$. That is, $x$ is a $k$-th $q$-quantile for a variable $X$ if
$$
\Pr[X < x] \leq \frac{k}{q} \quad \text{and} \quad \Pr[X \leq x] \geq \frac{k}{q}
$$

Different methods for estimating quantiles:
https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample


In [9]:
# Compare NumPy's percentile estimation methods on the integers 1..10, where
# the 95th percentile falls between the two largest samples (9 and 10).
a = np.arange(1, 11)

q = np.percentile(a, 95)

# `method=` replaces the `interpolation=` keyword, which is deprecated since
# NumPy 1.22.
q_lin  = np.percentile(a, 95, method='linear')
q_low  = np.percentile(a, 95, method='lower')
q_high = np.percentile(a, 95, method='higher')
q_near = np.percentile(a, 95, method='nearest')
q_mid  = np.percentile(a, 95, method='midpoint')

print('default: {}, lin: {}, low: {}, high: {}, near: {}，mid: {}'
      .format(q, q_lin, q_low, q_high, q_near, q_mid))
# The printed values show that the default method is 'linear'.
# From the NumPy documentation:
#     This optional parameter specifies the method to use when the desired
#     quantile lies between two data points `i` and `j`:
#         * linear: `i + (j - i) * fraction`, where `fraction` is the
#           fractional part of the index surrounded by `i` and `j`.

default: 9.549999999999999, lin: 9.549999999999999, low: 9, high: 10, near: 10，mid: 9.5


### My Quantile

#### Type
R-4, SAS-1, SciPy-(0,1), Maple-3
#### Algo
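$$ h = Np$$
$$Q_p = x_{\lfloor h \rfloor} + (h - \lfloor h \rfloor)\,(x_{\lfloor h \rfloor + 1} - x_{\lfloor h \rfloor})$$

Note: `compute_quantile` below works on 0-based indices with $h = (N-1)p$, which corresponds to the R-7 rule (NumPy's default linear method) rather than R-4.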

In [30]:
# p is a floating-point number in the range [0, 1]
# applies to one-dimensional arrays only
import math
def compute_quantile(a, p):
    """Estimate the p-quantile of a one-dimensional sample by linear interpolation.

    The sample is sorted, the fractional 0-based position ``h = (N - 1) * p``
    is computed, and the result is interpolated linearly between the two
    order statistics surrounding ``h``. This is the same rule as NumPy's
    default ``'linear'`` percentile method (type R-7).

    Parameters
    ----------
    a : array_like
        One-dimensional sample. The input is not modified; a sorted copy is
        used internally.
    p : float
        Requested quantile, in the closed range [0, 1].

    Returns
    -------
    scalar
        The interpolated quantile value.

    Raises
    ------
    ValueError
        If ``p`` is outside [0, 1] or ``a`` is empty.
    """
    if not (0 <= p <= 1):
        raise ValueError("Input p must be in the range [0,1]")

    # np.sort returns a sorted copy, so the caller's array keeps its order.
    values = np.sort(np.asarray(a))
    n = len(values)
    if n == 0:
        raise ValueError("Input a must contain at least one value")

    h = (n - 1) * p
    h_below = math.floor(h)
    # Clamp the upper neighbour: for p == 1 (or a single-element sample)
    # h_below is already the last index and the interpolation weight is 0.
    h_above = min(h_below + 1, n - 1)
    x_below, x_above = values[h_below], values[h_above]

    return x_below + (h - h_below) * (x_above - x_below)

In [33]:
# Cross-check compute_quantile against NumPy's default (linear) percentile on
# a reproducible standard-normal sample drawn from a locally seeded Generator.
arr = np.random.default_rng(1).standard_normal(1000)
a1 = compute_quantile(arr, 0.3)
a2 = np.percentile(arr, 30)
print (a1, a2)
# Fail loudly if the two estimates diverge beyond floating-point tolerance.
assert np.isclose(a1, a2), "compute_quantile disagrees with np.percentile"

a,  [-2.96007430e+00 -2.86446142e+00 -2.84672269e+00 -2.67217052e+00
 -2.64113420e+00 -2.62784823e+00 -2.55340525e+00 -2.52697260e+00
 -2.50755348e+00 -2.49106032e+00 -2.40495030e+00 -2.39999441e+00
 -2.39167160e+00 -2.30704376e+00 -2.30062525e+00 -2.29974684e+00
 -2.27573622e+00 -2.26534135e+00 -2.19007641e+00 -2.16095943e+00
 -2.12977133e+00 -2.11285821e+00 -2.08709134e+00 -2.07593381e+00
 -2.06662678e+00 -2.04077315e+00 -2.01314343e+00 -2.00993520e+00
 -1.95570074e+00 -1.93275765e+00 -1.92599380e+00 -1.91814077e+00
 -1.90071400e+00 -1.88504474e+00 -1.87213495e+00 -1.84083935e+00
 -1.83205513e+00 -1.82681939e+00 -1.80577742e+00 -1.80549684e+00
 -1.80455388e+00 -1.80203013e+00 -1.79799459e+00 -1.76675456e+00
 -1.76244235e+00 -1.73358554e+00 -1.73317588e+00 -1.72971130e+00
 -1.72860602e+00 -1.72713151e+00 -1.72062980e+00 -1.72060899e+00
 -1.70763228e+00 -1.70730852e+00 -1.70172961e+00 -1.70157863e+00
 -1.69701025e+00 -1.68842245e+00 -1.68510951e+00 -1.68402271e+00
 -1.65800904e+00 -1.6