# Measures of Central Tendency

In [1]:
import pandas as pd
import numpy as np
import statistics

# Read the Titanic passenger data from the shared datasets folder.
df = pd.read_csv('../00_datasets/titanic.csv')

# Restrict the frame to rows whose `sex` column equals 'male'.
is_male = df['sex'] == 'male'
df_male = df[is_male]

# Central-tendency summaries for the male subset, printed one per line:
#   - mean of `fare`
#   - median of `fare`
#   - mode (single most common value) of `who`
summaries = (
    (np.mean, 'fare'),
    (np.median, 'fare'),
    (statistics.mode, 'who'),
)
for summarize, column in summaries:
    print(summarize(df_male[column]))


25.523893414211443
10.5
man


# Variance and standard deviation

In [None]:
import pandas as pd
import numpy as np

# Read the Titanic passenger data.
df = pd.read_csv('../00_datasets/titanic.csv')

# Spread of `fare` within each `who` group: one row per group, with the
# standard deviation ("std") and the variance ("var") as columns.
fare_by_who = df.groupby('who')['fare']
fare_spread = fare_by_who.agg(['std', 'var'])
print(fare_spread)

# Notes:
# - Passing the NumPy callables instead, i.e. .agg([np.std, np.var]), has
#   historically produced the same table; recent pandas versions warn about
#   that spelling, so the string names are preferred.
# - np.std(df['fare'], ddof=1) is the sample standard deviation,
#   np.std(df['fare'], ddof=0) is the population standard deviation.

             std          var
who                          
child  33.466438  1120.002500
man    44.021339  1937.878308
woman  60.318995  3638.381145


# Quartiles, quintiles, and deciles

In [10]:
import pandas as pd
import numpy as np

# Read the Titanic passenger data.
df = pd.read_csv('../00_datasets/titanic.csv')

fare_values = df['fare']

# Quartiles: cut points at 0%, 25%, 50%, 75% and 100% of the fare
# distribution, i.e. minimum, Q1, median, Q3 and maximum. The first, middle
# and last entries therefore match np.min, np.median and np.max of the column.
quartile_probs = [0, 0.25, 0.5, 0.75, 1]
print(np.quantile(fare_values, quartile_probs))

# Quintiles: splitting into 5 groups needs 6 cut points
# (0, 0.2, 0.4, 0.6, 0.8, 1), generated here with np.linspace.
quintile_probs = np.linspace(0, 1, 6)
print(np.quantile(fare_values, quintile_probs))

# Deciles would follow the same pattern with np.linspace(0, 1, 11).


[  0.       7.9104  14.4542  31.     512.3292]
[  0.       7.8542  10.5     21.6792  39.6875 512.3292]


# Outliers

<img src='../0_resources/images/outliers.avif' />

In [28]:
import pandas as pd
import numpy as np

# Read the Titanic passenger data.
df = pd.read_csv('../00_datasets/titanic.csv')

fare = df['fare']

# Fence width for the usual 1.5 * IQR outlier rule (Tukey's fences).
IQR_MULTIPLIER = 1.5

# Quartiles are taken with Series.quantile rather than np.quantile:
# np.quantile returns NaN as soon as the column contains a missing value,
# which would turn both bounds into NaN and silently report zero outliers.
# Series.quantile skips missing values and uses the same (linear)
# interpolation, so results are unchanged for columns without NaN.
q1 = fare.quantile(0.25)  # 1st quartile
q3 = fare.quantile(0.75)  # 3rd quartile
iqr = q3 - q1  # interquartile range
print("IQR: ", iqr)

# Anything outside [lower_bound, upper_bound] is flagged as an outlier.
lower_bound = q1 - IQR_MULTIPLIER * iqr
upper_bound = q3 + IQR_MULTIPLIER * iqr
print("BOUNDS: ", lower_bound, upper_bound)

# Boolean mask keeps the original row index, so each outlier can be traced
# back to its passenger row in `df`.
is_outlier = (fare < lower_bound) | (fare > upper_bound)
outliers = fare[is_outlier]
print("Number of outliers: ", outliers.count())
print(outliers)


IQR:  23.0896
BOUNDS:  -26.724 65.6344
Number of outliers:  116
1       71.2833
27     263.0000
31     146.5208
34      82.1708
52      76.7292
         ...   
846     69.5500
849     89.1042
856    164.8667
863     69.5500
879     83.1583
Name: fare, Length: 116, dtype: float64
