### LIFE EXPECTANCY STATISTICAL ANALYSIS

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the UN "Population Growth, Fertility and Mortality Indicators" CSV
# from data.un.org into a DataFrame, decoding the bytes as latin-1.

df = pd.read_csv(
    "http://data.un.org/_Docs/SYB/CSV/"
    "SYB64_246_202110_Population%20Growth,%20Fertility%20and%20Mortality%20Indicators.csv",
    encoding="latin-1",
)
df.head()

In [None]:
# List the distinct values that occur in the "Unnamed: 3" column
df.loc[:, "Unnamed: 3"].unique()

In [None]:
# Keep only the rows whose "Unnamed: 3" value is the both-sexes life
# expectancy series. reset_index() renumbers the rows and stores the
# previous row labels in a new "index" column.

is_life_expectancy = df["Unnamed: 3"].eq("Life expectancy at birth for both sexes (years)")
df_mask = df.loc[is_life_expectancy].reset_index()
print(df_mask.shape)
df_mask.head()

In [None]:
# Analysing the data under the "Population growth and indicators of fertility and mortality" column.
# A notebook cell only displays the value of its last expression, so the
# distinct-value count is printed explicitly instead of being computed and
# silently discarded.
print(df_mask["Population growth and indicators of fertility and mortality"].nunique())
# Show the rows for one example entry ("Afghanistan")
df_mask[df_mask["Population growth and indicators of fertility and mortality"] == "Afghanistan"]

In [None]:
# Removing the leading rows that are not individual countries.
# NOTE(review): assumes row labels 0-89 hold the non-country (aggregate)
# entries in this release of the dataset -- confirm if the source file changes.
# errors="ignore" keeps the cell re-runnable: once those labels are gone a
# second run is a no-op instead of raising KeyError.
df_mask.drop(labels=range(0, 90), axis=0, inplace=True, errors="ignore")

In [None]:
# Build the frame used for the analysis: discard the columns that are not
# needed, give the two remaining "Unnamed" columns descriptive names and
# renumber the rows from 0 (the old row labels are not kept).
df1 = (
    df_mask
    .drop(columns=["index", "T03", "Unnamed: 3", "Unnamed: 5", "Unnamed: 6"])
    .rename(
        columns={
            "Unnamed: 2": "Year",
            "Unnamed: 4": "Life expectancy at birth for both sexes (years)",
        }
    )
    .reset_index(drop=True)
)
print(df1.shape)
df1.head()

In [None]:
# Cast the "Life expectancy at birth for both sexes (years)" column to float,
# then print the frame summary to confirm the resulting dtypes
life_col = "Life expectancy at birth for both sexes (years)"
df1[life_col] = df1[life_col].astype(float)
df1.info()

In [None]:
# Extract the life-expectancy column as a NumPy array and print how many
# values it holds
df_array = df1["Life expectancy at birth for both sexes (years)"].to_numpy()
N = df_array.shape[0]
print(N)

In [None]:
# Plotting a histogram to show the distribution of life expectancy at birth (NumPy)

plt.hist(df_array, bins=25)

# Mean and median of the life-expectancy values, rounded to one decimal place.
# np.round is used instead of the np.round_ alias, which was deprecated in
# NumPy 1.25 and removed in NumPy 2.0.
df_mean = np.round(np.mean(df_array), decimals=1)
df_med = np.round(np.median(df_array), decimals=1)

# Mark the mean and the median with dashed vertical lines
plt.axvline(df_mean, linestyle="--", color="orange", label="Mean Life expectancy")
plt.axvline(df_med, linestyle="--", color="green", label="Median Life expectancy")
# The trailing semicolon suppresses the notebook's echo of the Legend object
plt.legend(loc="upper left");

In [None]:
# Summary statistics of the life-expectancy values, each rounded to one
# decimal place. np.round is used instead of the np.round_ alias, which was
# deprecated in NumPy 1.25 and removed in NumPy 2.0.
# df_mean and df_med are read from the cell that computes the mean and median.

# Range of the life-expectancy values
df_range = np.round(np.max(df_array) - np.min(df_array), decimals=1)

# Q1 and Q3 of the life-expectancy values
q1, q3 = np.quantile(df_array, [0.25, 0.75])
q1 = np.round(q1, decimals=1)
q3 = np.round(q3, decimals=1)
# IQR, taken from the rounded quartiles so it agrees with the printed Q1/Q3;
# rounded again to remove floating-point noise from the subtraction
iqr = np.round(q3 - q1, decimals=1)

# Standard deviation of the life-expectancy values
# (np.std default, i.e. the population form with ddof=0)
sd = np.round(np.std(df_array), decimals=1)

# One standard deviation above and below the mean
sum_ms = np.round(df_mean + sd, decimals=1)
dif_ms = np.round(df_mean - sd, decimals=1)

# Adjacent string literals are used instead of a backslash continuation inside
# the literal, which leaked the next line's indentation into the output.
# The last label reads "Mean - STD" because dif_ms is df_mean - sd.
print(
    "Mean is {}, Median is {}, Range is {}, Q1 is {}, Q3 is {}\n"
    "IQR is {}, Standard deviation is {}, Mean + STD = {}, Mean - STD = {}".format(
        df_mean, df_med, df_range, q1, q3, iqr, sd, sum_ms, dif_ms
    )
)

# Plotting a histogram to show the distribution of life expectancy at birth
plt.hist(df_array, bins=25)

# Dashed vertical lines for the quartiles and for one standard deviation
# either side of the mean
plt.axvline(q1, linestyle="--", color="orange", label="First Quartile")
plt.axvline(q3, linestyle="--", color="green", label="Third Quartile")
plt.axvline(sum_ms, linestyle="--", color="yellow", label="Mean + STD")
plt.axvline(dif_ms, linestyle="--", color="cyan", label="Mean - STD")
# The trailing semicolon suppresses the notebook's echo of the Legend object
plt.legend(loc="upper left");

In [None]:
# Percentile-vs-percentile comparison of the data with a Beta(a, b) sample:
# the life-expectancy values are divided by 100 and their percentiles are
# plotted against the percentiles of N random Beta(a, b) draws.
a = 13
b = 5

# 100 evenly spaced percentile ranks from 0 to 100
p = np.linspace(0, 100, num=100)

xx = np.random.beta(a, b, size=N)
y = np.percentile(df_array / 100, p)
yy = np.percentile(xx, p)

# Dots: data percentiles (x) against Beta-sample percentiles (y)
plt.plot(y, yy, "o")
# Red y = x reference line
plt.plot(y, y, color="red")

In [None]:
from scipy.stats import beta

# Draw N values from Beta(a, b) with SciPy and show them as a 25-bin histogram
r = beta.rvs(a, b, size=N)
plt.hist(r, 25)

# Dashed vertical lines for the sample mean and the first, second and third
# quartiles, drawn in this order so the legend entries keep the same order
for position, colour, marker_label in (
    (np.mean(r), "magenta", "Mean"),
    (np.quantile(r, 0.25), "gold", "Q1"),
    (np.quantile(r, 0.5), "yellow", "Q2"),
    (np.quantile(r, 0.75), "cyan", "Q3"),
):
    plt.axvline(position, linestyle="--", color=colour, label=marker_label)

plt.legend(loc="upper left");