## Import section

In [None]:
!pip install missingno > /dev/null
!pip install tabulate > /dev/null

In [None]:
import pandas as pd
import numpy as np
import missingno as msno
from matplotlib import pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from tabulate import tabulate
from IPython.display import display
%matplotlib inline

### Read data, treat 0 as NaN

In [None]:
def nan_data(df, percent, random_state=None):
    """Return a copy of ``df`` with roughly ``percent`` of its cells set to NaN.

    Every cell is masked independently with probability ``percent``, so the
    realised share of missing values only approximates ``percent``.

    Args:
        df: DataFrame to mask; it is not modified.
        percent: Probability (0..1) that a single cell is replaced by NaN.
        random_state: Optional seed or ``numpy.random.Generator`` that makes
            the mask reproducible. With ``None`` the global NumPy random
            state is used, exactly as before this parameter existed.

    Returns:
        A new DataFrame with the same shape, index and columns as ``df``.
    """
    if random_state is None:
        draws = np.random.random(df.shape)
    else:
        draws = np.random.default_rng(random_state).random(df.shape)
    return df.mask(draws < percent)

# Zeros in the CSV are read as missing values (NaN).
missing_values = [0]
data = pd.read_csv("./data/Fish.csv", na_values=missing_values)
# Optional weight filter, currently disabled:
# data = data[data.Weight < 1200]
# data = data[data.Weight > 100]

# Snapshot of the frame before any artificial gaps are introduced.
original_data = data.copy()

# Randomly blank out cells (probability 0.08 each) in every column except 'Species'.
feature_cols = data.columns.drop('Species')
data[feature_cols] = nan_data(data[feature_cols], 0.08)

####  Describe data

In [None]:
# Summary statistics, a separator line, then the first ten rows.
summary = data.describe()
preview = data.head(10)
print(summary, "-----------------------------------", preview, sep="\n")

## Visualize missing data

#### Percentage of missing data + non-missing data as bars

In [None]:
# Share of missing cells per column, in percent.
null_counts = data.isna().sum()
print(null_counts * 100 / len(data))
# Draw the missingno bar chart and print whatever msno.bar returns.
bar_result = msno.bar(data)
print(bar_result)

#### Missing data matrix and "heatmap"

In [None]:
# Missing-data matrix of the whole frame.
matrix_result = msno.matrix(data)
print(matrix_result)
# The heatmap shows nullity correlation: how strongly the presence or
# absence of one variable affects the presence of another.
heatmap_result = msno.heatmap(data)
print(heatmap_result)

# Regression

In [None]:
def plot_data(axes, title, x_train, y_train, x_test, y_test):
    """Draw one subplot: training points as a scatter plus a fitted line.

    Args:
        axes: Axes-like object providing ``set_title``, ``scatter`` and ``plot``.
        title: Title shown above the subplot.
        x_train: X coordinates drawn as green scatter points.
        y_train: Y coordinates drawn as green scatter points.
        x_test: X coordinates of the thin red line.
        y_test: Y coordinates of the thin red line.
    """
    axes.set_title(title)
    axes.scatter(x_train, y_train, color="green")
    axes.plot(x_test, y_test, color="red", linewidth=1)

# Regressors compared below; each one is also used as an IterativeImputer estimator.
models = [LinearRegression(),
          BayesianRidge()]

# Accumulators for the result tables rendered after the comparison loop.
res, stats_x, stats_y = [], [], []
colx, coly = 'Weight', 'Length2'


def to_df(a):
    """Wrap a two-column array in a DataFrame labelled (colx, coly)."""
    return pd.DataFrame(data=a, index=range(len(a)), columns=[colx, coly])


def iter_imputer(m):
    """Return the (imputer, title, datasource) case for IterativeImputer with estimator ``m``."""
    def impute(a):
        return IterativeImputer(estimator=m, max_iter=20, random_state=0).fit_transform(a)
    return impute, f"{m.__class__.__name__} imputation", data


# Each case is (imputer or None, plot/table title, source frame).
# An imputer of None means "just drop the rows containing NaN".
cases = [
    (None, "Full, original data", original_data),
    (None, "deleted NaN rows", data),
    (lambda a: SimpleImputer().fit_transform(a), "Mean imputation", data),
    # limit_direction="both" also fills NaNs that precede the first valid
    # value; the default leaves them in place and the regression fit then fails.
    (lambda a: to_df(a).interpolate(limit_direction="both").to_numpy(), "Interpolation imputation", data),
    # ffill() replaces the deprecated fillna(method='ffill'); bfill() covers
    # leading NaNs that have no earlier observation to carry forward.
    (lambda a: to_df(a).ffill().bfill().to_numpy(), "Hot Deck LOCF imputation", data),
    iter_imputer(models[0]),
    iter_imputer(models[1]),
]
# One subplot row per imputation case, one column per regression model.
f, axarr = plt.subplots(len(cases), len(models), sharex=True, sharey=True, figsize=(12, 12))
f.suptitle(f"{coly} vs {colx}")
# Rows of DataFrame.describe() that are copied into the statistics tables.
stat_keys = ("mean", "std", "min", "25%", "50%", "75%", "max")
for i, (imputer, title, datasource) in enumerate(cases):
    data_slice = datasource[[colx, coly]]
    if imputer is None:
        # No imputer: keep only the complete rows.
        data_slice = data_slice.dropna().to_numpy()
    else:
        data_slice = imputer(data_slice.to_numpy())
    xs = data_slice[:, 0].reshape((-1, 1))
    ys = data_slice[:, 1].reshape((-1, 1))
    # Fixed seed (same value the IterativeImputer uses) so reruns split the
    # rows identically; the held-out targets are not needed for plotting.
    x_train, x_test, y_train, _ = train_test_split(xs, ys, test_size=0.4, random_state=0)
    coeffs = []
    for j, model in enumerate(models):
        y_pred = model.fit(x_train, y_train.ravel()).predict(x_test)
        plot_data(axarr[i, j], title, x_train, y_train, x_test, y_pred)
        coeffs.append(model.coef_.ravel()[0])
    # One row per case: title, sample count, then one slope per model.
    res.append((title, len(xs), *(f"{c:.4f}" for c in coeffs)))
    descr = pd.DataFrame(data=data_slice, columns=[colx, coly]).describe()
    for c, stats in [(colx, stats_x), (coly, stats_y)]:
        stats.append((title, *(descr[c][k] for k in stat_keys)))

# Regression slopes per imputation case.
display(tabulate(res, headers=['Type', 'Number of samples', 'linear regression', 'bayesian ridge'], tablefmt="html"))

# Descriptive statistics per column. Each row carries the case title plus
# seven statistics; tabulate applies a shorter header list to the last
# columns, so the title column stays unlabelled.
stat_headers = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
for label, rows in ((colx, stats_x), (coly, stats_y)):
    # Label taken from colx/coly so it follows the analysed columns.
    print(label)
    display(tabulate(rows, headers=stat_headers, tablefmt="html"))