# Chapter 4 - The Measure of All Things - Statistics

In [2]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Basic Statistics

### Mode

In [5]:
from scipy import stats

# Marks table: 5 rows x 3 columns (mixed ints/floats are stored as float64).
marks = np.array([
    [8.0, 9.0, 7.0],
    [4.5, 6.0, 3.5],
    [8.5, 10.0, 9.0],
    [8.0, 6.5, 9.5],
    [9.0, 10.0, 7.5],
])

# axis=0 is SciPy's default: the mode is computed separately for each column,
# so three modes (and three counts) come back for this 3-column table.
stats.mode(marks, axis=0)

ModeResult(mode=array([ 8. , 10. ,  3.5]), count=array([2., 2., 1.]))

In [6]:
# mtcars data as CSV; the raw-gist URL embeds a revision hash, so the file contents are pinned
url1 = 'https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv'
cars = pd.read_csv(url1)

# Preview the first five rows
cars.head()

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [7]:
# Select the 'mpg' column and find its mode(s); every value tied for the highest count is returned
cars.loc[:, 'mpg'].mode()

0    10.4
1    15.2
2    19.2
3    21.0
4    21.4
5    22.8
6    30.4
Name: mpg, dtype: float64

We may want to see the mode of fuel consumption (`mpg`) separately for cars with automatic (0) and manual (1) transmissions.

In [9]:
# Split the cars by transmission type ('am': 0 = automatic, 1 = manual);
# `ct` is reused by the grouped statistics in the cells that follow
ct = cars.groupby(['am'])

# A group may have several modes, so each group contributes one row per tied value
ct['mpg'].apply(pd.Series.mode)

am   
0   0    10.4
    1    15.2
    2    19.2
1   0    21.0
    1    30.4
Name: mpg, dtype: float64

### Median

In [11]:
# Select all rows and the 'mpg' column, then calculate the median of its values
cars.loc[:, 'mpg'].median() 

19.2

In [12]:
# Calculate the median of the 'mpg' column for each transmission group
ct['mpg'].median()  

am
0    17.3
1    22.8
Name: mpg, dtype: float64

### Mean

In [14]:
# Print the label 'MPG average: ' followed by the mean of the 'mpg' column, rounded to 2 decimal places
print('MPG average: ', cars.mpg.mean().round(2))  

MPG average:  20.09


In [15]:
# Calculate the mean of the 'mpg' column for each transmission group
ct['mpg'].mean()

am
0    17.147368
1    24.392308
Name: mpg, dtype: float64

### Geometric Mean

In [17]:
# Define a function `g_mean` to calculate the geometric mean of input `x`
def g_mean(x):
    """Return the geometric mean of `x`, computed as exp(mean(ln(x))).

    The logarithm is taken element-wise with `np.log`, the logs are averaged
    with the `.mean()` method of that result, and the average is mapped back
    with `np.exp`. Values must be positive for the logarithm to be real and
    finite.
    """
    return np.exp(np.log(x).mean())

In [18]:
bsg_ages = [45, 30, 50, 26, 40]  # Data: The Battlestar Galactica officers ages

g_mean(bsg_ages)                 # Geometric mean from Data

37.090904350447026

In [19]:
from scipy.stats import gmean

gmean(bsg_ages)                  # Geometric mean using scipy library

37.090904350447026

In [20]:
# MPG geometric mean
gmean(cars.mpg)

19.25006404155361

In [21]:
# Another way
gmean(cars['mpg'])

19.25006404155361

In [22]:
# Calculate the geometric mean of the 'mpg' column for each transmission group
ct['mpg'].apply(gmean)

am
0    16.721440
1    23.649266
Name: mpg, dtype: float64

### Harmonic Mean

In [24]:
# Define a function `h_mean` to calculate the harmonic mean of input `x`
def h_mean(x):
    """Return the harmonic mean of the values in `x`: n / sum(1 / x_i).

    Parameters
    ----------
    x : iterable of numbers
        Any iterable is accepted, including one-shot iterators such as
        generators (the values are materialised once so they can be both
        counted and summed).

    Returns
    -------
    The number of values divided by the sum of their reciprocals.

    Raises
    ------
    ZeroDivisionError
        If `x` is empty, or if it contains a plain Python zero.
    """
    values = list(x)
    # Accumulate under a descriptive name instead of shadowing the builtin `sum`
    reciprocal_total = sum(1 / val for val in values)
    return len(values) / reciprocal_total

In [25]:
bsg_ages = [45, 30, 50, 26, 40]  # Data: The Battlestar Galactica officers ages

h_mean(bsg_ages)                 # Data Harmonic Mean

35.96679987703658

### Range

In [27]:
cars.mpg.max(), cars.mpg.min()

(33.9, 10.4)

In [28]:
mpg_range = cars.mpg.max() - cars.mpg.min()

print(mpg_range)

23.5


In [29]:
# Range by Numpy library
np.ptp(cars.mpg)

23.5

In [30]:
# Range grouped by transmission type
ct['mpg'].apply(np.ptp)

am
0    14.0
1    18.9
Name: mpg, dtype: float64

### Quantile

In [32]:
mpg = cars.loc[:, 'mpg']                 # Select the 'mpg' column from the DataFrame `cars`

print('MPG Q1: ', mpg.quantile(0.25))    # Print the 25th percentile (Q1) of the 'mpg' column
print('MPG Q2: ', mpg.quantile(0.50))    # Print the 50th percentile (Q2/median) of the 'mpg' column
print('MPG Q3: ', mpg.quantile(0.75))    # Print the 75th percentile (Q3) of the 'mpg' column

MPG Q1:  15.425
MPG Q2:  19.2
MPG Q3:  22.8


In [33]:
# Print the quantiles of the 'mpg' column for the 10th percentile (0.1) and the maximum value (1)
print(mpg.quantile([0.1, 1]))

0.1    14.34
1.0    33.90
Name: mpg, dtype: float64


In [34]:
# Descriptive Statistics
mpg.describe()

count    32.000000
mean     20.090625
std       6.026948
min      10.400000
25%      15.425000
50%      19.200000
75%      22.800000
max      33.900000
Name: mpg, dtype: float64

### Mean Absolute Deviation

In [36]:
# Define a function `md` to calculate the mean absolute deviation of input `x`
def md(x, axis=None):
    """Return the mean absolute deviation of `x` about its mean.

    Parameters
    ----------
    x : array-like
        Data to summarise (the notebook passes pandas Series; NumPy arrays
        and nested lists also work).
    axis : int, tuple of ints or None, optional
        Axis or axes to reduce over. ``None`` (the default) reduces over all
        elements and yields a single number.

    Returns
    -------
    The mean of ``|x - mean(x)|`` taken along `axis`.
    """
    avg = np.mean(x, axis)
    if axis is not None and isinstance(avg, np.ndarray) and avg.ndim < np.ndim(x):
        # NumPy dropped the reduced axis; put it back with length 1 so each
        # mean is subtracted from the slice it was computed from. Without
        # this, axis=1 on a 2-D array broadcasts the means along the wrong
        # dimension (an error for non-square input, wrong numbers for square).
        avg = np.expand_dims(avg, axis)
    dev = np.absolute(x - avg)
    return np.mean(dev, axis)

In [37]:
# Apply 'md' in 'mpg'
md(mpg)

4.714453125

In [38]:
# Mean absolute deviation applied in 'mpg' grouped by transmission type
ct['mpg'].apply(md)

am
0    3.044875
1    5.237870
Name: mpg, dtype: float64

### Variance & Standard Deviation

In [40]:
mpg.var()           # Sample Variance, ddof=1, default

36.32410282258065

In [41]:
mpg.var(ddof=0)     # Population Variance

35.188974609375

In [42]:
mpg.std()           # Sample Standard Deviation, ddof=1, default

6.026948052089105

In [43]:
mpg.std(ddof=0)     # Population Standard Deviation

5.932029552301219

In [44]:
# Calculate the sample variance of the 'mpg' column for each transmission group
ct['mpg'].var() 

am
0    14.699298
1    38.025769
Name: mpg, dtype: float64

In [45]:
# Calculate the sample standard deviation of the 'mpg' column for each transmission group
ct['mpg'].std()

am
0    3.833966
1    6.166504
Name: mpg, dtype: float64