# Statistics Functions - Data

In [3]:
import numpy as np
import pandas as pd
from pydataset import data
import os
import matplotlib.pyplot as plt
import random
import seaborn as sns

## Random

In [4]:
print(random.random())  #any random value 0 and 1

0.5765885305959892


In [5]:
random.randint(10, 100)  #any random integer between 10 and 100

66

In [6]:
random.randrange(11, 35, 2) #random even no between 11 and 35

27

In [7]:
my_list = ['apple', 'banana', 'cherry']
random.choice(my_list)

'cherry'

In [8]:
my_list = [1, 2, 3, 4, 5]
random.shuffle(my_list)
print(my_list)

[2, 1, 3, 4, 5]


In [9]:
random.sample(['red', 'blue'], counts=[6, 2], k=8)  #create 4 values of red, 2 values of blue, in random order; total 5

['blue', 'red', 'red', 'red', 'red', 'red', 'red', 'blue']

In [10]:
random.sample(['Nayeema', 'Huma', 'Ratiba', 'Mohammad'], k=3)  # without replacement

['Nayeema', 'Ratiba', 'Huma']

In [11]:
random.choices(['HR', 'IT'], k=3) # with replacement

['HR', 'IT', 'IT']

In [12]:
# I have school of 1000 students, I want to pickup only 10 students (without replacement) and allot them 3 projects (with replacement)

In [13]:
df = data('mtcars')
df.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [14]:
df.dtypes

mpg     float64
cyl       int64
disp    float64
hp        int64
drat    float64
wt      float64
qsec    float64
vs        int64
am        int64
gear      int64
carb      int64
dtype: object

In [15]:
df.describe()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,20.090625,6.1875,230.721875,146.6875,3.596563,3.21725,17.84875,0.4375,0.40625,3.6875,2.8125
std,6.026948,1.785922,123.938694,68.562868,0.534679,0.978457,1.786943,0.504016,0.498991,0.737804,1.6152
min,10.4,4.0,71.1,52.0,2.76,1.513,14.5,0.0,0.0,3.0,1.0
25%,15.425,4.0,120.825,96.5,3.08,2.58125,16.8925,0.0,0.0,3.0,2.0
50%,19.2,6.0,196.3,123.0,3.695,3.325,17.71,0.0,0.0,4.0,2.0
75%,22.8,8.0,326.0,180.0,3.92,3.61,18.9,1.0,1.0,4.0,4.0
max,33.9,8.0,472.0,335.0,4.93,5.424,22.9,1.0,1.0,5.0,8.0


In [16]:
mpgvalues = df.mpg.sort_values().to_list() 
mpgvalues[0], mpgvalues[8], mpgvalues[16] , mpgvalues[24], mpgvalues[31]  #16,24,32]]  #32 : min(0), max(32), 25% (32/4=8), 50%(16), 75%(24)

(10.4, 15.5, 19.2, 22.8, 33.9)

## Functions 1

- Mean
- Median
- Mode

### Mean

In [17]:
df.mean(axis=0) #mean each col

mpg      20.090625
cyl       6.187500
disp    230.721875
hp      146.687500
drat      3.596563
wt        3.217250
qsec     17.848750
vs        0.437500
am        0.406250
gear      3.687500
carb      2.812500
dtype: float64

In [18]:
df.mean(axis=1).head() #mean each row

Mazda RX4            29.907273
Mazda RX4 Wag        29.981364
Datsun 710           23.598182
Hornet 4 Drive       38.739545
Hornet Sportabout    53.664545
dtype: float64

### Median

In [20]:
df.median(axis=0)

mpg      19.200
cyl       6.000
disp    196.300
hp      123.000
drat      3.695
wt        3.325
qsec     17.710
vs        0.000
am        0.000
gear      4.000
carb      2.000
dtype: float64

In [21]:
x = [1,2,3,3,4,5]
x

[1, 2, 3, 3, 4, 5]

In [28]:
float(np.median(x))

3.0

In [24]:
random.shuffle(x)
x

[1, 3, 2, 3, 5, 4]

In [25]:
np.median(x)

np.float64(3.0)

### MODE - max freq

In [27]:
from scipy import stats

In [29]:
df.mode() #pandas mode

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,10.4,8.0,275.8,110.0,3.07,3.44,17.02,0.0,0.0,3.0,2.0
1,15.2,,,175.0,3.92,,18.9,,,,4.0
2,19.2,,,180.0,,,,,,,
3,21.0,,,,,,,,,,
4,21.4,,,,,,,,,,
5,22.8,,,,,,,,,,
6,30.4,,,,,,,,,,


In [31]:
df.gear.value_counts()

gear
3    15
4    12
5     5
Name: count, dtype: int64

In [32]:
df.carb.value_counts()

carb
4    10
2    10
1     7
3     3
6     1
8     1
Name: count, dtype: int64

In [33]:
df.cyl.mode()

0    8
Name: cyl, dtype: int64

In [34]:
df.cyl.value_counts()

cyl
8    14
4    11
6     7
Name: count, dtype: int64

In [35]:
import statistics as stats  #note same module in scipy

In [36]:
x1 = [1, 2, 2, 3, 4, 4, 4, 5,5,5,5]
x1

[1, 2, 2, 3, 4, 4, 4, 5, 5, 5, 5]

In [37]:
max(set(x1), key= x1.count)

5

In [38]:
print('Mode of list is ', stats.mode(x1))  # error if tie, more than 1 mode

Mode of list is  5


In [39]:
print('Multi Mode of list is ', stats.multimode(x1))

Multi Mode of list is  [5]


In [40]:
x2 = [1,1,2,2,3]
print(x2, ' : Multi Mode of list is ', stats.multimode(x2))

[1, 1, 2, 2, 3]  : Multi Mode of list is  [1, 2]


## Functions 2

- Minimum
- Maximum
- Range
- Count
- Quantile/ Quartile
- IQR
- Standard Deviation 
- Variance

In [41]:
df.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [42]:
df.describe()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,20.090625,6.1875,230.721875,146.6875,3.596563,3.21725,17.84875,0.4375,0.40625,3.6875,2.8125
std,6.026948,1.785922,123.938694,68.562868,0.534679,0.978457,1.786943,0.504016,0.498991,0.737804,1.6152
min,10.4,4.0,71.1,52.0,2.76,1.513,14.5,0.0,0.0,3.0,1.0
25%,15.425,4.0,120.825,96.5,3.08,2.58125,16.8925,0.0,0.0,3.0,2.0
50%,19.2,6.0,196.3,123.0,3.695,3.325,17.71,0.0,0.0,4.0,2.0
75%,22.8,8.0,326.0,180.0,3.92,3.61,18.9,1.0,1.0,4.0,4.0
max,33.9,8.0,472.0,335.0,4.93,5.424,22.9,1.0,1.0,5.0,8.0


###  Minimum

In [44]:
df[['mpg','wt']].max() #default axis=0

mpg    33.900
wt      5.424
dtype: float64

### Maximum

In [45]:
df[['mpg','wt']].min(axis=0)

mpg    10.400
wt      1.513
dtype: float64

### Range

In [46]:
df[['mpg','wt']].max() - df[['mpg','wt']].min()

mpg    23.500
wt      3.911
dtype: float64

In [47]:
x3 = np.array([4, 8, 15, 16, 23, 42])
x3

array([ 4,  8, 15, 16, 23, 42])

In [48]:
df.apply(lambda col: col.max() - col.min()) #pandas

mpg      23.500
cyl       4.000
disp    400.900
hp      283.000
drat      2.170
wt        3.911
qsec      8.400
vs        1.000
am        1.000
gear      2.000
carb      7.000
dtype: float64

### Count

In [49]:
x4 = [1, 2, 2, 3, 3, 3, 4]
print(x4.count(4))   # Output: 3

1


In [50]:
from collections import Counter

In [51]:
x4 = [1, 2, 2, 2, 3, 3, 3, 3, 4, 5, 5, 5,5,5]
counts = Counter(x4)
print(counts)

Counter({5: 5, 3: 4, 2: 3, 1: 1, 4: 1})


In [52]:
# most common TWO items
counts.most_common(2)

[(5, 5), (3, 4)]

In [53]:
counts.most_common(1) # highest - 

[(5, 5)]

In [55]:
df.gear.value_counts()

gear
3    15
4    12
5     5
Name: count, dtype: int64

In [57]:
df.describe().head(1)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0


In [58]:
df.describe(include='all').loc['count']

mpg     32.0
cyl     32.0
disp    32.0
hp      32.0
drat    32.0
wt      32.0
qsec    32.0
vs      32.0
am      32.0
gear    32.0
carb    32.0
Name: count, dtype: float64

In [59]:
df.count()

mpg     32
cyl     32
disp    32
hp      32
drat    32
wt      32
qsec    32
vs      32
am      32
gear    32
carb    32
dtype: int64

### Quantile/ Quartile

In [60]:
x5 = [10, 20, 30, 40, 50, 60, 70]

# 25th, 50th (median), and 75th percentiles
q25 = np.quantile(x5, 0.25) #Q1
q50 = np.quantile(x5, 0.5) #Q2  , median
q75 = np.quantile(x5, 0.75) #Q3
print(q25, q50, q75)

25.0 40.0 55.0


In [61]:
import statistics as stats
stats.median(x5)      # 40
#stats.quantiles(x5, n=100)   # quartiles

40

In [62]:
df.describe()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,20.090625,6.1875,230.721875,146.6875,3.596563,3.21725,17.84875,0.4375,0.40625,3.6875,2.8125
std,6.026948,1.785922,123.938694,68.562868,0.534679,0.978457,1.786943,0.504016,0.498991,0.737804,1.6152
min,10.4,4.0,71.1,52.0,2.76,1.513,14.5,0.0,0.0,3.0,1.0
25%,15.425,4.0,120.825,96.5,3.08,2.58125,16.8925,0.0,0.0,3.0,2.0
50%,19.2,6.0,196.3,123.0,3.695,3.325,17.71,0.0,0.0,4.0,2.0
75%,22.8,8.0,326.0,180.0,3.92,3.61,18.9,1.0,1.0,4.0,4.0
max,33.9,8.0,472.0,335.0,4.93,5.424,22.9,1.0,1.0,5.0,8.0


In [63]:
df.describe(include='all').loc[['min','25%','50%','75%','max']]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
min,10.4,4.0,71.1,52.0,2.76,1.513,14.5,0.0,0.0,3.0,1.0
25%,15.425,4.0,120.825,96.5,3.08,2.58125,16.8925,0.0,0.0,3.0,2.0
50%,19.2,6.0,196.3,123.0,3.695,3.325,17.71,0.0,0.0,4.0,2.0
75%,22.8,8.0,326.0,180.0,3.92,3.61,18.9,1.0,1.0,4.0,4.0
max,33.9,8.0,472.0,335.0,4.93,5.424,22.9,1.0,1.0,5.0,8.0


## Percentile

In [66]:
from scipy import stats
import numpy as np

In [67]:
x6 = np.array([10, 20, 30, 40, 50, 60, 70])
q25, q50, q75 = np.percentile(x6, [25, 50, 75], method='linear')
print(q25, q50, q75)

25.0 40.0 55.0


In [65]:
np.quantile(x6, [0.25, 0.5, 0.75], method='midpoint')
#Different method options (e.g., 'linear', 'lower', 'higher', 'midpoint', 'nearest') 
#change how quantiles are interpolated

array([25., 40., 55.])

### IQR

- difference in Q3 & Q1

In [68]:
x7= [10, 20, 30, 40, 50, 60, 70]

Q1 = np.percentile(x7, 25)
Q3 = np.percentile(x7, 75)
IQR = Q3 - Q1

print(x7, "\n\n Q1:", Q1, " Q3:", Q3, " IQR:", IQR)

[10, 20, 30, 40, 50, 60, 70] 

 Q1: 25.0  Q3: 55.0  IQR: 30.0


In [69]:
from scipy.stats import iqr
import numpy as np

In [70]:
x7B = np.array([10, 20, 30, 40, 50, 60, 70])
IQR = iqr(x7B)
print(x7B, "\n IQR:", IQR)

[10 20 30 40 50 60 70] 
 IQR: 30.0


In [71]:
import statistics as stats

In [73]:
x7C = [10, 20, 30, 40, 50, 60, 70]
q = stats.quantiles(x7C, n=4)
IQR = q[2] - q[0]
print(x7C, "\n Q[2]", q[2], "Q[0]", q[0], ":: IQR:", IQR)

[10, 20, 30, 40, 50, 60, 70] 
 Q[2] 60.0 Q[0] 20.0 :: IQR: 40.0


### Outliers

In [75]:
x8 = [-150, -100,-90, -25, -5, -1, 10, 20, 30, 40, 50, 60, 70, 500, 1000] #vary this data

In [76]:
Q1 = np.percentile(x8, 25)
Q3 = np.percentile(x8, 75)
IQR = Q3 - Q1
Q2 = np.percentile(x8, 50)

lower_bound = Q1 - 1.5 * IQR  # <
upper_bound = Q3 + 1.5 * IQR  # >
print(Q1, Q2, Q3, IQR, lower_bound, upper_bound)
# values < lower_bound and values > upper_bound are oultiers

-15.0 20.0 55.0 70.0 -120.0 160.0


In [77]:
outliers = [x for x in x8 if x < lower_bound or x > upper_bound]
print("Outliers:", outliers)

Outliers: [-150, 500, 1000]


### Standard Deviation/Variance

In [83]:
import statistics as stats

x9 = [10, 20, 30, 40, 50]
print(x9)

In [84]:
np.std(x9), np.var(x9) #std is squareroot of var

NameError: name 'x9' is not defined