# Functions

### Mean squared error evaluation

In [None]:
def eval_mse(y, yhat):
    """
    Compute a squared-error cost between targets and predictions.

    The squared differences are summed over the first len(y) examples and
    divided by 2*m (not m), so the result is half of the conventional
    mean squared error.

    Args:
      y    : (ndarray  Shape (m,) or (m,1))  target value of each example
      yhat : (ndarray  Shape (m,) or (m,1))  predicted value of each example
    Returns:
      err: summed squared error divided by 2*m
    """
    n_examples = len(y)
    # Start the sum at 0.0 so the accumulator is floating point from the start.
    total = sum(((yhat[k] - y[k]) ** 2 for k in range(n_examples)), 0.0)
    return total / (2 * n_examples)

### Student's and Welch's t-tests and Mann-Whitney U test automatic evaluation
https://towardsdatascience.com/statistical-significance-testing-of-two-independent-sample-means-with-scipy-638cb834b4d1

In [None]:
from scipy.stats import levene, ttest_ind, mannwhitneyu, shapiro

def check_normality(grp, alpha=0.05):
    """Run the Shapiro-Wilk test on `grp` and report whether normality holds.

    Prints a one-line verdict including the p-value, and returns True when
    the p-value is not below `alpha` (normality assumption met), otherwise
    False.
    """
    p_value = shapiro(grp).pvalue
    # Written as `not (p < alpha)` so that a NaN p-value is reported as
    # "normal", exactly as a plain if/else on `p < alpha` would.
    looks_normal = not (p_value < alpha)

    if looks_normal:
        print(f"Sampled from normal distribution (p-value={p_value:.4f})  => Normality assumption is met")
    else:
        print(f"Sampled from non-normal distribution (p-value={p_value:.4f}) => Normality assumption is not met")

    return looks_normal

def check_mean_significance(grp1, grp2, alpha=0.05, alternative='two-sided'):
    """Pick and run a two-sample significance test, printing each step.

    Procedure:
      1. Check each group for normality with `check_normality` (Shapiro-Wilk).
      2. If both groups pass, run Levene's test for equal variances and then
         Student's t-test (equal variances) or Welch's t-test (unequal).
      3. Otherwise fall back to the Mann-Whitney U test.

    Args:
      grp1, grp2  : the two independent samples to compare
      alpha       : significance level used for every test in the procedure
      alternative : alternative hypothesis passed through to `ttest_ind` /
                    `mannwhitneyu` (e.g. 'two-sided')
    Returns:
      None; the outcome is only printed.
    """

    print("========== Checking for normality assumption for first group ==========")
    is_grp1_norm = check_normality(grp1, alpha=alpha)

    print("\n========== Checking for normality assumption for second group ==========")
    is_grp2_norm = check_normality(grp2, alpha=alpha)

    # Logical `and` (not bitwise `&`): both operands are plain booleans.
    if is_grp1_norm and is_grp2_norm:
        print("\n========== Checking for equality of population variance ==========")
        levene_pvalue = levene(grp1, grp2).pvalue
        if levene_pvalue < alpha:
            print(f"Groups have unequal variance (p-value = {levene_pvalue:.4f}) => Equal variance assumption is not met")
            equal_var = False
            test = "Welch's t-test"
        else:
            print(f"Groups have equal variance (p-value = {levene_pvalue:.4f}) => Equal variance assumption is met")
            equal_var = True
            test = "Student's t-test"

        print(f"\n========== Checking for statistical significance of difference in means ({test}) ==========")
        t_pvalue = ttest_ind(grp1, grp2, equal_var=equal_var, alternative=alternative).pvalue
        if t_pvalue < alpha:
            print(f"We reject the null hypothesis (p-value = {t_pvalue:.4f}).")
        else:
            print(f"We do not reject the null hypothesis (p-value = {t_pvalue:.4f}).")

    else:
        # At least one group failed the normality check: use the
        # non-parametric test instead of a t-test.
        print("\n========== Checking for statistical significance of difference in means (Mann-Whitney U Test) ==========")
        mw_pvalue = mannwhitneyu(grp1, grp2, alternative=alternative).pvalue
        if mw_pvalue < alpha:
            print(f"We reject the null hypothesis (p-value = {mw_pvalue:.4f})")
        else:
            print(f"We do not reject the null hypothesis (p-value = {mw_pvalue:.4f})")

### Plotting box and bar plots in parallel

In [None]:
def BoxAndBar(x_name, y_name, x_label=None, y_label=None, dataframe_name=df, dpi=120):
    """Draw a bar plot and a box plot of `y_name` grouped by `x_name`.

    Two separate figures are created and each is saved as a transparent PNG
    in the current working directory: '<y_name>.png' for the bar plot and
    'box<y_name>.png' for the box plot. Both figures are then shown.

    Args:
      x_name         : column used on the x axis
      y_name         : column used on the y axis (also used in the file names)
      x_label        : x-axis label; defaults to `x_name`
      y_label        : y-axis label; defaults to `y_name`
      dataframe_name : data passed to seaborn; the default is whatever the
                       global `df` referred to when this function was defined
      dpi            : resolution of both figures
    """

    # Identity check: `== None` would invoke the argument's own __eq__.
    if x_label is None:
        x_label = x_name
    if y_label is None:
        y_label = y_name

    # Bar plot: mean per group with standard-deviation error bars.
    plt.figure(figsize=(9, 5), dpi=dpi)  # width and height in inches
    plot = sns.barplot(x=x_name, y=y_name, data=dataframe_name, errorbar="sd", capsize=0.1, estimator=np.mean)
    plot.set_xlabel(x_label, fontsize=24)
    plot.set_ylabel(y_label, fontsize=24)
    plot.tick_params(labelsize=17)
    plt.savefig(f'{y_name}.png', transparent=True, bbox_inches='tight')

    # Box plot of the same columns in a second, narrower figure.
    plt.figure(figsize=(6, 5), dpi=dpi)  # width and height in inches
    plot2 = sns.boxplot(x=x_name, y=y_name, data=dataframe_name)
    plot2.set_xlabel(x_label, fontsize=22)
    plot2.set_ylabel(y_label, fontsize=22)
    plot2.tick_params(labelsize=17)
    plt.savefig(f'box{y_name}.png', transparent=True, bbox_inches='tight')
    plt.show()

### Plot histograms

In [None]:
def makeHist (x_name, hue, x_label=None, dataframe_name=df, dpi=100):
    """Draw filled kernel-density curves of `x_name`, one per `hue` level.

    The figure is saved as a transparent PNG named 'KDE_<x_name>.png' in the
    current working directory and then shown.

    Args:
      x_name         : column whose distribution is plotted
      hue            : column used to split the data into separate curves
      x_label        : x-axis label; defaults to `x_name`
      dataframe_name : data passed to seaborn; the default is whatever the
                       global `df` referred to when this function was defined
      dpi            : resolution of the figure
    """

    if x_label is None:
        x_label = x_name

    # Honour the caller's dpi; it was previously hard-coded to 100 here,
    # which silently ignored the parameter.
    plt.figure(figsize=(9, 6), dpi=dpi)  # width and height in inches
    plot = sns.kdeplot(
       data=dataframe_name, x=x_name, hue=hue,
       fill=True, common_norm=False, palette="tab10",
       alpha=.5, linewidth=0, warn_singular=False
    )
    plot.set_xlabel(x_label, fontsize=24)
    plot.set_ylabel("Density", fontsize=24)
    plot.tick_params(labelsize=17)
    plt.savefig(f'KDE_{x_name}.png', transparent=True, bbox_inches='tight')
    plt.show()

### Confusion matrix plotter

In [None]:
def plot_confusion_matrix(y,y_predict):
    """Show the confusion matrix of `y` versus `y_predict` as an annotated heatmap."""
    from sklearn.metrics import confusion_matrix

    matrix = confusion_matrix(y, y_predict)

    axes = plt.subplot()
    # annot=True writes the count inside each cell.
    sns.heatmap(matrix, annot=True, ax=axes)
    axes.set_title('Confusion Matrix')
    axes.set_xlabel('Predicted labels')
    axes.set_ylabel('True labels')
    # Optional custom tick labels (kept disabled, as in the original cell):
    # axes.xaxis.set_ticklabels(['did not land', 'land']); axes.yaxis.set_ticklabels(['did not land', 'landed'])
    plt.show()

### Threshold iteration

In [None]:
def thresholds_tests (X_test,y_test):
    """Print accuracy and confusion matrix of `logreg` for a sweep of thresholds.

    Probabilities come from the global `logreg` model's `predict_proba`;
    column 1 of that output (presumably the positive class - confirm against
    the model's `classes_`) is compared against each threshold, and values
    strictly greater than the threshold are labelled 1, all others 0.

    Args:
      X_test : features passed to `logreg.predict_proba`
      y_test : true labels; anything `pd.DataFrame` accepts
    Returns:
      None; results are only printed.
    """
    # Only column 1 is ever used, so select it once up front.
    proba_col1 = pd.DataFrame(logreg.predict_proba(X_test)).iloc[:, 1]
    threshold_list = [0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,.7,.75,.8,.85,.9,.95,.99]
    # The true labels do not depend on the threshold: reshape them once.
    y_true = pd.DataFrame(y_test).values.reshape(-1, 1)

    for i in threshold_list:
        print ('\n******** For i = {} ******'.format(i))
        # Vectorised comparison replaces the deprecated element-wise
        # DataFrame.applymap(lambda x: 1 if x > i else 0).
        y_pred = (proba_col1 > i).astype("int64").values.reshape(-1, 1)

        test_accuracy = accuracy_score(y_true, y_pred)
        print('Our testing accuracy is {}'.format(test_accuracy))

        print(confusion_matrix(y_true, y_pred))

In [None]:
def manual_threshold_model(threshold_val):
    """Classify with a custom probability threshold and report test accuracy.

    Reads the globals `pred_proba_df` (predicted probabilities, one column
    per class) and `y_test` (true labels). Column 1 of `pred_proba_df` is
    labelled 1 where it is strictly greater than `threshold_val`, else 0.

    Args:
      threshold_val : probability cut-off applied to column 1
    Returns:
      y_pred: integer ndarray of shape (n, 1) with the thresholded labels
    """
    # Vectorised comparison replaces the deprecated element-wise
    # DataFrame.applymap(lambda x: 1 if x > threshold_val else 0).
    y_pred = (pred_proba_df.iloc[:, 1] > threshold_val).astype("int64").values.reshape(-1, 1)

    test_accuracy = accuracy_score(y_test.values.reshape(-1, 1), y_pred)
    print('The testing accuracy is {}'.format(test_accuracy))

    return y_pred