# Intro to Statistics - NumPy

In [1]:
import numpy as np
import scipy as sp

We will have a look at normal distributions using a dataset containing information about the heights and weights of 500 people.

In [None]:
# Parse the CSV into a structured array: the header row supplies the field
# names, the first column is read as byte strings of up to 8 bytes ('S8')
# and the remaining three columns as 8-byte integers ('i8').
from_file = np.genfromtxt(
    fname="../datasets/500_Person_Gender_Height_Weight_Index.csv",
    dtype=('S8', 'i8', 'i8', 'i8'),
    names=True,
    delimiter=',',
)
dataset = from_file  # both names refer to the same array
dataset.shape


The dataset as loaded is a structured array

Structured arrays are ndarrays whose datatype is a composition of simpler datatypes organized as a sequence of named fields.

Structured datatypes are implemented in numpy to have base type `numpy.void`

We have to break our structured array into homogeneous arrays, one per column.

In [None]:
# Look at the Python type of a single record (the first row) of the structured array
type(dataset[0])

In [None]:
# Display the loaded structured array
dataset

In [None]:
# Split the structured array into one homogeneous array per column.
# A structured array can be indexed by field name, which extracts a whole
# column at once without looping over the records in Python. The fields are
# looked up by position through `dtype.names` because the names come from
# the CSV header row. `.copy()` makes each column an independent array
# (plain field access returns a view onto `dataset`).
genders = dataset[dataset.dtype.names[0]].copy()
heights = dataset[dataset.dtype.names[1]].copy()
weights = dataset[dataset.dtype.names[2]].copy()

In [None]:
# First column of the dataset: the gender labels
genders

In [None]:
# Second column of the dataset: the heights
heights

In [None]:
# Third column of the dataset: the weights
weights

## A Histogram with Numpy

Let us now visualize the distribution of heights and weights using a histogram

In [None]:
# Bin both measurements with np.histogram's default settings; each call
# returns a pair (counts per bin, bin edges).
w_hist, w_bin_edges = np.histogram(a=weights)
h_hist, h_bin_edges = np.histogram(a=heights)
# Only the height histogram is printed: the counts first, then the edges.
print(h_hist, h_bin_edges, sep="\n")

## Probability Density Functions (PDFs)

A probability density function (PDF) describes the probability of the value of a continuous random variable falling within a range. If the random variable can only have specific values (like throwing dice), a probability mass function (PMF) would be used to describe the probabilities of the outcomes.

Famous PDFs are the Gaussian, Exponential, and Laplace density functions.

Famous PMFs are the Poisson and Binomial mass functions.

### Gaussian Distribution/PDF

The Gaussian or normal distribution is the most common PDF (probability density function) for continuous variables.

In [None]:
import matplotlib.pyplot as plt
from scipy.stats import norm

# Evaluate the standard normal density on a fine grid from -4 up to
# (but not including) 4, in steps of 1e-4, and draw the resulting curve.
x = np.arange(-4, 4, 1e-4)
plt.plot(x, norm.pdf(x))


## A Histogram and Kernel Density Estimation (KDE) with SciPy and Matplotlib

Kernel density estimation is the process of estimating an unknown probability density function using a kernel function.

SciPy offers a KDE estimator, `scipy.stats.gaussian_kde`, which estimates a density function using Gaussian kernels. We will now see how we can use it to plot the density function on top of our height histogram.


In [None]:
from scipy.stats import gaussian_kde

# Build a Gaussian-kernel density estimate from the height sample. The
# resulting object is callable: passing it an array of points returns the
# estimated density at those points.
h_density = gaussian_kde(dataset=heights)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots(figsize=(8,6))
# An "interface" to matplotlib.axes.Axes.hist() method
ax.hist(
  x=heights,
  density=True,
  bins='auto',
  color='#0504aa',
  alpha=0.7,
  rwidth=0.85
)
ax.grid(axis='y', alpha=0.75)
# set labels, title
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('500 subjects height histogram')
# add a text 
# ax.text(55, 0.09, r'$\mu=15, b=3$')

# plot the Kernel Density Distribution on top
grid = np.arange(heights.min(), heights.max(), 0.01) # create a linear grid on which to plot the KDE
_ = ax.plot(grid, h_density(grid).T, color="r")

#### Getting mean and standard deviation for our height distributions:

In [None]:
# Arithmetic mean over all heights
h_mean = np.mean(heights)
h_mean

In [None]:
# Standard deviation over all heights (NumPy's default, i.e. ddof=0)
h_std = np.std(heights)
h_std

### Getting Males and Females

In [None]:
# Boolean mask that is True where the gender label equals b"Male".
# The gender column is loaded as byte strings (dtype 'S8'), hence the bytes
# literal. The array holding the labels is named `genders`.
genders == b"Male"

In [None]:
# partition heights and weights by gender


### Outlier detection

In [None]:
def detect_outliers(arr: np.ndarray, whisker: float = 1.5, verbose: bool = True) -> np.ndarray:
    """Flag outliers using interquartile-range (IQR) fences.

    A value is flagged when it lies strictly below ``Q1 - whisker * IQR`` or
    strictly above ``Q3 + whisker * IQR``, where ``Q1`` and ``Q3`` are the
    0.25 and 0.75 quantiles of ``arr`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    arr : array_like
        Numeric data. Lists and tuples are accepted and converted with
        ``np.asarray``.
    whisker : float, default 1.5
        Multiple of the IQR added beyond each quartile to obtain the fences.
    verbose : bool, default True
        When true, print the lower fence and then the upper fence.

    Returns
    -------
    np.ndarray
        Boolean array with the same shape as ``arr``; ``True`` marks an
        outlier. Empty input yields an empty boolean array.
    """
    values = np.asarray(arr)
    if values.size == 0:
        # Quantiles of an empty sample are undefined; nothing can be flagged.
        return np.zeros(values.shape, dtype=bool)

    # One call computes both quartiles.
    lower_quartile, upper_quartile = np.quantile(values, [0.25, 0.75])
    iqr = upper_quartile - lower_quartile
    lower_fence = lower_quartile - whisker * iqr
    upper_fence = upper_quartile + whisker * iqr
    if verbose:
        print(lower_fence)
        print(upper_fence)
    return np.logical_or(values < lower_fence, values > upper_fence)



In [None]:
# Boolean mask over `heights`: True where detect_outliers flags the value
mask = detect_outliers(heights)
mask

In [None]:
# Boolean indexing: keep only the heights that were flagged as outliers
heights[mask]

In [None]:
# Indices of the flagged entries (a boolean mask can be passed to np.where directly)
print(np.where(mask))
# Distinct mask values, the index of each value's first occurrence, and how often each occurs
print(np.unique(mask, return_index=True, return_counts=True))

In [None]:
# Keep only the heights that were not flagged by the outlier mask
filtered_heights = heights[np.logical_not(mask)]
filtered_heights.shape

In [None]:
# Apply the same height-outlier mask to the other two columns so that all
# arrays stay aligned row by row, and keep the results in named variables.
filtered_weights = weights[~mask]
filtered_genders = genders[~mask]
filtered_weights.shape, filtered_genders.shape