In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Standard Score
from sklearn.preprocessing import StandardScaler
import seaborn as sns

In [None]:
# Fighter table kept as plain text. The header is spread over several lines
# (the "(scaled)", "(lb)" and "(cm)" suffixes and the "Weight"/"Reach" names
# each sit on their own line); after that comes one whitespace-separated row
# per fighter: rank, two-word name, age, scaled height, weight, reach.
raw_data="""Rank Name Age Height
(scaled)
Weight
(lb)
Reach
(cm)
1 Alexander Volkanovski 33 1.00 144.5 182
2 Islam Makhachev 31 4.60 154.5 179
3 Leon Edwards 31 6.40 169.5 188
4 Kamaru Usman 35 6.40 170 193
5 Francis Ngannou 35 10.00 257 211
6 Israel Adesanya 33 10.00 185 203
7 Charles Oliveira 33 4.60 175 188
8 Alex Pereira 35 9.64 184.6 200
9 Aljamain Sterling 33 1.72 153 180
10 Jiří Procházka 30 8.92 205 203"""

In [None]:
def parse_pasted_table(text, column_types):
    """Turn the pasted table text into a typed DataFrame.

    Layout expected in ``text`` (blank lines are ignored):

    * the first line holds the leading column names, separated by whitespace;
    * until the first line that starts with a digit, a line starting with "("
      is a suffix appended to the previous column name, and any other line is
      one more column name;
    * every remaining line is a row: rank, name, then one token for each
      remaining column.

    The name is everything between the rank and the trailing value tokens, so
    it may consist of one word or several.

    Args:
        text: Raw table text.
        column_types: One type per column, in column order.

    Returns:
        DataFrame whose columns carry the parsed names and ``column_types``.

    Raises:
        ValueError: If ``text`` contains no non-blank line.
    """
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    if not lines:
        raise ValueError("table text is empty")

    header = lines[0].split()
    first_row = len(lines)  # stays here when the text has a header but no rows
    for pos, line in enumerate(lines[1:], start=1):
        if line[0].isdigit():
            first_row = pos
            break
        if line.startswith("("):
            header[-1] += f" {line}"
        else:
            header.append(line)

    # Rank and Name lead every row; all other columns are single tokens at the
    # end, which pins down where the (possibly multi-word) name stops.
    n_trailing = len(header) - 2
    records = []
    for line in lines[first_row:]:
        tokens = line.split()
        name_end = len(tokens) - n_trailing
        records.append([tokens[0], " ".join(tokens[1:name_end]), *tokens[name_end:]])

    frame = pd.DataFrame(records, columns=header)
    return frame.astype(dict(zip(header, column_types)))


data = parse_pasted_table(raw_data, [int, str, int, float, float, int])
columns = list(data.columns)

# Show each column's dtype next to its name to confirm the casts.
display(data.rename(columns={col: f"{col} ({data[col].dtype})" for col in data.columns}))

In [None]:
# (a) Equal width binning
# Cut the half-open range (lower_bound_open, upper_bound_closed] into n_bins
# intervals of identical width.
lower_bound_open = 125
upper_bound_closed = 265
n_bins = 4
interval_width = (upper_bound_closed - lower_bound_open) / n_bins
# n_bins + 1 edges; each neighbouring pair of edges delimits one bin.
edges = [lower_bound_open + interval_width * k for k in range(n_bins + 1)]
lower_bounds = edges[:-1]
upper_bounds = edges[1:]
col_ = data.columns[4]

# Empty list; nothing in this cell fills or reads it.
bins = []
def get_df_bin(df: pd.DataFrame, col: str, lower_bound: float, upper_bound: float) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``col`` value lies in (lower_bound, upper_bound].

    The lower bound is exclusive and the upper bound inclusive.
    """
    values = df[col]
    in_bin = values.gt(lower_bound) & values.le(upper_bound)
    return df.loc[in_bin]

# Print each interval label, then the rows that fall inside it.
for bin_low, bin_high in zip(lower_bounds, upper_bounds):
    print(f"({bin_low},{bin_high}]")
    display(get_df_bin(data, col_, bin_low, bin_high))

In [None]:
# (b) Min-max reverse scaling
# Fit a min-max scaler on the scaled heights with target range [l, u], so the
# smallest scaled value maps to l and the largest to u.
l, u = 168, 193
col = data.columns[3]
scaler = MinMaxScaler(feature_range=(l, u))
scaler.fit(data[[col]])

# Rows to highlight are listed 1-based; shift them to the 0-based index labels.
idxs = [position - 1 for position in (4, 6, 8)]

df_b = data.copy()
df_b["Height (original)"] = scaler.transform(df_b[[col]])

# Two decimals for every float column.
float_formats = {name: "{:.2f}" for name in df_b.columns if df_b[name].dtype == float}
# Grey background on every cell of a row whose index label is in idxs.
df_b_style = df_b.style.apply(
    lambda row: ["background: gray" if row.name in idxs else ""] * len(row),
    axis=1,
).format(float_formats)
display(df_b_style)

In [None]:
# (c) Standard Score
col = data.columns[5]
df_c = data.copy()
mean = df_c[col].mean()
# ddof=0 gives the population standard deviation, which is what StandardScaler
# divides by. pandas defaults to ddof=1 (sample std), which would make the
# printed value disagree with the "Reach (standard score)" column.
std = df_c[col].std(ddof=0)
# Externally supplied standard deviation, used for the second score column.
std_given = 10.53
scaler = StandardScaler().fit(df_c[[col]])
df_c["Reach (standard score)"] = scaler.transform(df_c[[col]])


def my_scaler(x):
    """Standard score of ``x`` using the column mean and the given std."""
    return (x - mean) / std_given


df_c["Reach (standard score with given std)"] = my_scaler(df_c[col])

# Rows to highlight are listed 1-based; shift them to the 0-based index labels.
idxs = [position - 1 for position in (3, 5)]
# Two decimals for every float column.
float_formats = {name: "{:.2f}" for name in df_c.columns if df_c[name].dtype == float}
# Grey background on every cell of a row whose index label is in idxs.
df_c_style = df_c.style.apply(
    lambda row: ["background: gray" if row.name in idxs else ""] * len(row),
    axis=1,
).format(float_formats)
print(f"Mean: {mean:.2f}, Std: {std:.2f}, Std (given): {std_given:.2f}")
display(df_c_style)