# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data the number of positive points >> the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from the given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score: compute tpr and fpr for each of the different thresholds, then use numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array), not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [2]:
# Mount Google Drive at '/content/drive/' so the CSV paths used in the cells
# below resolve.
# NOTE(review): `google.colab` presumably only exists inside Google Colab —
# confirm before running this notebook in another environment.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
# Dataset A: read the labels / scores file and show a preview plus its shape.
filepath = (
    '/content/drive/My Drive/Colab Notebooks/Assignments/'
    'As7_Compute Performance metrics without Sklearn/5_a.csv'
)
df_a = pd.read_csv(filepath)
print(df_a.head(), df_a.shape, sep='\n')

     y     proba
0  1.0  0.637387
1  1.0  0.635165
2  1.0  0.766586
3  1.0  0.724564
4  1.0  0.889199
(10100, 2)


In [4]:
# Turn the probability scores into hard class labels with a 0.5 cut-off:
# a score strictly below 0.5 becomes class 0, anything else becomes class 1.

df_a['y_pred'] = (~(df_a['proba'] < 0.5)).astype('int64')
df_a['y_pred'].sum()

10100

In [None]:
def confusion_mat(y_actual, y_pred):
    '''
    Build the 2x2 confusion matrix for binary labels encoded as 0 / 1.

    Parameters
    ----------
    y_actual : array-like of shape (n,)
        Ground-truth class labels.
    y_pred : array-like of shape (n,)
        Predicted class labels; paired with y_actual position by position.

    Returns
    -------
    pandas.DataFrame
        Rows are the predictions ('pred_0', 'pred_1') and columns are the
        ground truth ('actual_0', 'actual_1'), i.e. [[TN, FN], [FP, TP]].
        This layout is the transpose of sklearn.metrics.confusion_matrix.

    Raises
    ------
    ValueError
        If y_actual and y_pred do not have the same shape.
    '''
    # Plain arrays: pandas would otherwise re-align two Series on their index
    # labels and silently pair up the wrong rows.
    actual = np.asarray(y_actual)
    pred = np.asarray(y_pred)
    if actual.shape != pred.shape:
        raise ValueError('y_actual and y_pred must have the same shape')

    TP = int(np.sum((actual == 1) & (pred == 1)))   # True positive
    TN = int(np.sum((actual == 0) & (pred == 0)))   # True negative
    FP = int(np.sum((actual == 0) & (pred == 1)))   # False positive
    FN = int(np.sum((actual == 1) & (pred == 0)))   # False negative

    return pd.DataFrame(
        data=[[TN, FN], [FP, TP]],
        index=['pred_0', 'pred_1'],
        columns=['actual_0', 'actual_1'],
    )

In [None]:
def F1_score(y_actual, y_pred):
    '''
    F1 score (harmonic mean of precision and recall) for 0 / 1 labels.

    Parameters
    ----------
    y_actual : array-like of shape (n,)
        Ground-truth class labels.
    y_pred : array-like of shape (n,)
        Predicted class labels; paired with y_actual position by position.

    Returns
    -------
    float
        2 * precision * recall / (precision + recall), or 0.0 when there are
        no true positives (precision and/or recall are then 0 or undefined).

    Raises
    ------
    ValueError
        If y_actual and y_pred do not have the same shape.
    '''
    # Plain arrays: avoids pandas index re-alignment and accepts lists.
    actual = np.asarray(y_actual)
    pred = np.asarray(y_pred)
    if actual.shape != pred.shape:
        raise ValueError('y_actual and y_pred must have the same shape')

    TP = int(np.sum((actual == 1) & (pred == 1)))   # True positive
    FP = int(np.sum((actual == 0) & (pred == 1)))   # False positive
    FN = int(np.sum((actual == 1) & (pred == 0)))   # False negative

    # Without a single true positive every denominator below may be zero;
    # report the conventional 0.0 instead of dividing by zero.
    if TP == 0:
        return 0.0

    recall = TP / (TP + FN)
    precision = TP / (TP + FP)

    return 2 * recall * precision / (recall + precision)  # Harmonic mean

In [None]:
def AUC_score(y_actual, y_prob):
    '''
    Area under the ROC curve for 0 / 1 labels and real-valued scores.

    Every distinct score is used as a threshold tau with the rule
    "predict 1 if score >= tau, else 0".  The resulting (FPR, TPR) points,
    together with the (0, 0) end point of the "predict nothing positive"
    classifier, are integrated with the trapezoidal rule.

    Parameters
    ----------
    y_actual : array-like of shape (n,)
        Ground-truth class labels.
    y_prob : array-like of shape (n,)
        Scores; a higher score means "more likely class 1".

    Returns
    -------
    float
        Area under the ROC curve, between 0 and 1.

    Raises
    ------
    ValueError
        If the inputs differ in shape, or if y_actual does not contain both
        a 0 and a 1 (TPR or FPR would be undefined).
    '''
    labels = np.asarray(y_actual)
    scores = np.asarray(y_prob, dtype=float)
    if labels.shape != scores.shape:
        raise ValueError('y_actual and y_prob must have the same shape')

    is_pos = labels == 1
    is_neg = labels == 0
    n_pos = int(is_pos.sum())
    n_neg = int(is_neg.sum())
    if n_pos == 0 or n_neg == 0:
        raise ValueError('AUC is undefined unless y_actual contains both classes 0 and 1')

    # Walk the points from the highest score down; the running counts are the
    # TP / FP obtained when the current score is used as threshold.
    order = np.argsort(scores)[::-1]
    sorted_scores = scores[order]
    tp_running = np.cumsum(is_pos[order])
    fp_running = np.cumsum(is_neg[order])

    # Tied scores share one threshold, so keep only the last row of each tie group.
    group_end = np.r_[np.flatnonzero(np.diff(sorted_scores)), sorted_scores.size - 1]

    # Prepend (0, 0): without it the area between the origin and the first
    # threshold is lost whenever the top score is tied across both classes.
    tpr = np.r_[0.0, tp_running[group_end] / n_pos]
    fpr = np.r_[0.0, fp_running[group_end] / n_neg]

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; support both.
    integrate = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz
    return float(integrate(tpr, fpr))

In [None]:
def accuracy(y_actual, y_pred):
    '''
    Fraction of correctly classified points for 0 / 1 labels.

    Parameters
    ----------
    y_actual : array-like of shape (n,)
        Ground-truth class labels.
    y_pred : array-like of shape (n,)
        Predicted class labels; paired with y_actual position by position.

    Returns
    -------
    float
        (TP + TN) / n.

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    '''
    # Plain arrays: avoids pandas index re-alignment and accepts lists.
    actual = np.asarray(y_actual)
    pred = np.asarray(y_pred)
    if actual.shape != pred.shape:
        raise ValueError('y_actual and y_pred must have the same shape')
    if actual.size == 0:
        raise ValueError('accuracy is undefined for empty input')

    TP = int(np.sum((actual == 1) & (pred == 1)))   # True positive
    TN = int(np.sum((actual == 0) & (pred == 0)))   # True negative

    return (TP + TN) / actual.size

In [None]:
# Results of custom Implementation
# Printed in this order for dataset A: confusion matrix, F1 score, AUC and
# accuracy. The label-based metrics use df_a.y_pred (0.5 cut-off), while the
# AUC is computed from the raw scores in df_a.proba.

print(confusion_mat(df_a.y, df_a.y_pred))
print(F1_score(df_a.y, df_a.y_pred))
print(AUC_score(df_a.y, df_a.proba))
print(accuracy(df_a.y, df_a.y_pred))

        actual_0  actual_1
pred_0         0         0
pred_1       100     10000
0.9950248756218906
0.48829900000000004
0.9900990099009901


In [None]:
# Results from sklearn library
# Cross-check only: the same four metrics for dataset A computed with
# scikit-learn. sklearn's confusion_matrix puts the actual classes on the rows
# and the predictions on the columns, i.e. it is the transpose of the
# DataFrame returned by confusion_mat.

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score

print(confusion_matrix(df_a.y, df_a.y_pred))
print(f1_score(df_a.y, df_a.y_pred))
print(roc_auc_score(df_a.y, df_a.proba))
print(accuracy_score(df_a.y, df_a.y_pred))

[[    0   100]
 [    0 10000]]
0.9950248756218906
0.48829900000000004
0.9900990099009901


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data the number of positive points << the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from the given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score: compute tpr and fpr for each of the different thresholds, then use numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [None]:
# Dataset B: read the labels / scores file and show a preview plus its shape.
filepath = (
    '/content/drive/My Drive/Colab Notebooks/Assignments/'
    'As7_Compute Performance metrics without Sklearn/5_b.csv'
)

df_b = pd.read_csv(filepath)
print(df_b.head(), df_b.shape, sep='\n')

     y     proba
0  0.0  0.281035
1  0.0  0.465152
2  0.0  0.352793
3  0.0  0.157818
4  0.0  0.276648
(10100, 2)


In [None]:
# Hard class labels for dataset B: score strictly below 0.5 -> 0, otherwise 1.
df_b['y_pred'] = (~(df_b.proba < 0.5)).astype('int64')
df_b['y_pred'].sum()

294

In [None]:
# Results of custom Implementation
# Printed in this order for dataset B: confusion matrix, F1 score, AUC and
# accuracy. The label-based metrics use df_b.y_pred (0.5 cut-off), while the
# AUC is computed from the raw scores in df_b.proba.

print(confusion_mat(df_b.y, df_b.y_pred))
print(F1_score(df_b.y, df_b.y_pred))
print(AUC_score(df_b.y, df_b.proba))
print(accuracy(df_b.y, df_b.y_pred))

        actual_0  actual_1
pred_0      9761        45
pred_1       239        55
0.2791878172588833
0.9377570000000001
0.9718811881188119


In [None]:
# Results from sklearn library
# Cross-check only: the same four metrics for dataset B computed with
# scikit-learn. These names are not imported in this cell, so an earlier cell
# must already have imported them from sklearn.metrics.

print(confusion_matrix(df_b.y, df_b.y_pred))
print(f1_score(df_b.y, df_b.y_pred))
print(roc_auc_score(df_b.y, df_b.proba))
print(accuracy_score(df_b.y, df_b.y_pred))

[[9761  239]
 [  45   55]]
0.2791878172588833
0.9377570000000001
0.9718811881188119


<font color='red'><b>C.</b></font> Compute the best probability threshold (similarly to the ROC curve computation), i.e. the one that gives the lowest value of metric <b>A</b>, for the given data <strong>5_c.csv</strong>
<br>

You will predict the label of a data point like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negatives} + 100 \times \text{number of false positives}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [None]:
# Dataset C: read the labels / scores file, then show a preview, its shape and
# the number of positive labels.
filepath = (
    '/content/drive/My Drive/Colab Notebooks/Assignments/'
    'As7_Compute Performance metrics without Sklearn/5_c.csv'
)

df_c = pd.read_csv(filepath)
print(df_c.head(), df_c.shape, sum(df_c.y), sep='\n')

   y      prob
0  0  0.458521
1  0  0.505037
2  0  0.418652
3  0  0.412057
4  0  0.375579
(2852, 2)
1047


In [None]:
# Baseline hard labels for dataset C: score strictly below 0.5 -> 0, otherwise 1.
df_c['y_pred'] = (~(df_c.prob < 0.5)).astype('int64')
df_c['y_pred'].sum()

753

In [None]:
def best_threshold(y_actual, y_prob, fn_cost=500, fp_cost=100):
    '''
    Find the score threshold that minimises the cost metric

        A = fn_cost * (number of false negatives) + fp_cost * (number of false positives)

    under the rule "predict 0 if score < threshold, else 1".

    Candidate thresholds are every distinct score plus one value just above
    the largest score (the "predict everything as 0" classifier).  If several
    candidates reach the minimum, the smallest one is returned.

    Parameters
    ----------
    y_actual : array-like of shape (n,)
        Ground-truth class labels (0 / 1).
    y_prob : array-like of shape (n,)
        Scores; a higher score means "more likely class 1".
    fn_cost : number, default 500
        Cost of one false negative.
    fp_cost : number, default 100
        Cost of one false positive.

    Returns
    -------
    float
        The threshold with the lowest cost A.

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    '''
    labels = np.asarray(y_actual)
    scores = np.asarray(y_prob, dtype=float)
    if labels.shape != scores.shape:
        raise ValueError('y_actual and y_prob must have the same shape')
    if scores.size == 0:
        raise ValueError('cannot choose a threshold from empty input')

    pos_scores = np.sort(scores[labels == 1])
    neg_scores = np.sort(scores[labels == 0])

    # Distinct scores in ascending order, plus one value just above the
    # maximum so that "no point predicted positive" is also evaluated.
    candidates = np.unique(scores)
    candidates = np.append(candidates, np.nextafter(candidates[-1], np.inf))

    # For threshold t: positives scoring below t are missed (FN), negatives
    # scoring at or above t are flagged (FP).  searchsorted(..., 'left')
    # counts the entries strictly below t.
    FN = np.searchsorted(pos_scores, candidates, side='left')
    FP = neg_scores.size - np.searchsorted(neg_scores, candidates, side='left')

    cost = fn_cost * FN + fp_cost * FP   # Metric formula

    # argmin returns the first minimum, i.e. the smallest optimal threshold.
    return float(candidates[np.argmin(cost)])

In [None]:
# Performance of optimal threshold derived for given metric
# sel_threshold is the cut-off chosen by best_threshold for the cost metric A.
# y_pred is a new module-level Series built with that cut-off; it is separate
# from the 0.5-based df_c['y_pred'] column.
# The AUC line is computed from the raw scores, so it does not depend on the
# selected threshold.

sel_threshold = best_threshold(df_c.y, df_c.prob)
print('optimal threshold : ', sel_threshold)

y_pred = pd.Series([0 if x < sel_threshold else 1 for x in df_c['prob']])

print(confusion_mat(df_c.y, y_pred))
print(F1_score(df_c.y, y_pred))
print(AUC_score(df_c.y, df_c.prob))
print(accuracy(df_c.y, y_pred))

optimal threshold :  0.2300390278970873
        actual_0  actual_1
pred_0       785        78
pred_1      1020       969
0.6383399209486166
0.8288141557331724
0.615007012622721


In [None]:
# Performance for given metric without optimal threshold to understand the difference
# Uses the df_c.y_pred column, i.e. the labels derived with the fixed 0.5
# cut-off. The AUC call takes the same arguments as in the optimal-threshold
# cell, so only the label-based metrics can differ.

print(confusion_mat(df_c.y, df_c.y_pred))
print(F1_score(df_c.y, df_c.y_pred))
print(AUC_score(df_c.y, df_c.prob))
print(accuracy(df_c.y, df_c.y_pred))

        actual_0  actual_1
pred_0      1637       462
pred_1       168       585
0.65
0.8288141557331724
0.7791023842917251


<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 1:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 2:</b> <b>5_d.csv</b> has two columns, Y and predicted_Y (named <b>y</b> and <b>pred</b> in the file); both are real-valued
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 (coefficient of determination): https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [None]:
# Dataset D (regression): read the actual / predicted values and show a
# preview plus its shape.
filepath = (
    '/content/drive/My Drive/Colab Notebooks/Assignments/'
    'As7_Compute Performance metrics without Sklearn/5_d.csv'
)

df_d = pd.read_csv(filepath)
print(df_d.head(), df_d.shape, sep='\n')

       y   pred
0  101.0  100.0
1  120.0  100.0
2  131.0  113.0
3  164.0  125.0
4  154.0  152.0
(157200, 2)


In [None]:
def mean_square_error(y_actual, y_pred):
    '''
    Mean squared error between actual and predicted values.

    Parameters
    ----------
    y_actual : pandas.Series or numpy.ndarray
        Observed target values.
    y_pred : pandas.Series or numpy.ndarray
        Predicted values, same length as y_actual.

    Returns
    -------
    float
        mean((y_actual - y_pred) ** 2)

    Notes
    -----
    The subtraction is delegated to the inputs, so two pandas Series are
    paired by index label rather than by position.
    '''
    squared_error = (y_actual - y_pred) ** 2
    return np.mean(squared_error)

In [None]:
# MSE from custom implementation
print('MSE from custom implementation : ', mean_square_error(df_d.y, df_d.pred))

# MSE from sklearn library to verify
# (r2_score is imported here as well and is used by a later cell.)
from sklearn.metrics import mean_squared_error, r2_score
print('MSE from sklearn : ', mean_squared_error(df_d.y, df_d.pred))

MSE from custom implementation :  177.16569974554707
MSE from sklearn :  177.16569974554707


In [None]:
def modified_MAPE(y_actual, y_pred):
    '''
    Modified Mean Absolute Percentage Error, in percent.

    Unlike the plain MAPE, each absolute error is divided by the mean of
    abs(y_actual) rather than by its own abs(y_actual):

        mean(abs(y_actual - y_pred) / mean(abs(y_actual))) * 100

    Parameters
    ----------
    y_actual : pandas.Series or numpy.ndarray
        Observed target values.
    y_pred : pandas.Series or numpy.ndarray
        Predicted values, same length as y_actual.

    Returns
    -------
    float
        The modified MAPE as a percentage.
        NOTE(review): when mean(abs(y_actual)) is 0 the division is left to
        numpy / pandas — presumably this yields inf or nan; confirm if that
        case matters.
    '''
    abs_error = abs(y_actual - y_pred)
    mean_abs_actual = abs(y_actual).mean()   # shared denominator for every point

    return np.mean(abs_error / mean_abs_actual) * 100

In [None]:
# Modified MAPE (in percent) for dataset D, shown as the cell's output value.
modified_MAPE(df_d.y, df_d.pred)

12.912029940096314

In [None]:
def r_squared_error(y_actual, y_pred):
    '''
    Coefficient of determination (R^2) of a regression.

        R^2 = 1 - SS_res / SS_tot

    where SS_res is the sum of squared residuals and SS_tot the sum of
    squared deviations of y_actual from its mean.

    Parameters
    ----------
    y_actual : array-like of shape (n,)
        Observed target values.
    y_pred : array-like of shape (n,)
        Predicted values; paired with y_actual position by position.

    Returns
    -------
    float
        R^2.  For a constant y_actual (SS_tot == 0) the ratio is undefined;
        1.0 is returned for a perfect prediction and 0.0 otherwise.

    Raises
    ------
    ValueError
        If y_actual is empty.
    '''
    actual = np.asarray(y_actual, dtype=float)
    pred = np.asarray(y_pred, dtype=float)
    if actual.size == 0:
        raise ValueError('R^2 is undefined for empty input')

    # np.sum instead of the builtin sum: vectorised, and its pairwise
    # summation accumulates less rounding error on long inputs.
    ss_tot = float(np.sum((actual - actual.mean()) ** 2))  # sum of squares total
    ss_res = float(np.sum((actual - pred) ** 2))           # sum of squares residuals

    # Constant target: avoid dividing by zero.
    if ss_tot == 0.0:
        return 1.0 if ss_res == 0.0 else 0.0

    return 1 - (ss_res / ss_tot)

In [None]:
# r2 from custom implementation
print('r2 from custom implementation : ', r_squared_error(df_d.y, df_d.pred))

# r2 from sklearn implementation
# (r2_score is not imported in this cell; an earlier cell must have imported it.)
print('r2 from sklearn : ', r2_score(df_d.y, df_d.pred))

r2 from custom implementation :  0.9563582786990964
r2 from sklearn :  0.9563582786990937
