# Compute performance metrics for the given Y and Y_score without sklearn

In [74]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

In [75]:
# mount drive to access files

from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).



## A. Compute performance metrics for the given data '5_a.csv'

In [76]:
# read data using pandas

df=pd.read_csv('/content/drive/MyDrive/5_a.csv')
df_a = pd.DataFrame(df)
df_auc = df.copy(deep=True)
df_a.head()


Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199


In [77]:
# check value counts of classes

df_a['y'].value_counts()

1.0    10000
0.0      100
Name: y, dtype: int64

Highly imbalanced dataset. 
99 % data points belong to class 1.




In [78]:
#derive the class labels from given score
predicted=[]
for i in df_a["proba"]:
  if i <= 0.5:
    predicted.append(0)
  else:
    predicted.append(1)
    #print(i)
df_a["predicted"] = predicted
df_a.head()
df_a['predicted'].value_counts()


1    10100
Name: predicted, dtype: int64

In [79]:
# define a function for confusion matrix
# handle all test cases

def confusion_matrix(df):
  TP=0
  TN=0
  FP=0
  FN=0
  F1_N = 0
  for i in range(len(df['y'])):
    if df.iloc[i]['predicted'] == 1 and df.iloc[i]['y'] == 1:
      TP = TP +1
    elif df.iloc[i]['predicted'] == 0 and df.iloc[i]['y'] == 0:
      TN = TN +1
    elif df.iloc[i]['predicted'] == 1 and df.iloc[i]['y'] == 0:
      FP = FP +1
    elif df.iloc[i]['predicted'] == 0 and df.iloc[i]['y'] == 1:
      FN = FN +1
  confusion_matrix = np.array([[TN ,FN],[FP ,TP]])
  total_0 = TN + FP
  total_1 = FN + TP
  total_0pred = TN + FN
  total_1pred = FP + TP
  if total_0pred == 0:
    if total_1pred == 0:     
      precision_TN = 0
      precision_FN = 0
      precision_FP = 0
      precision_TP = 0
    else:
      precision_TN = 0
      precision_FN = 0
      precision_FP = FP / total_1pred
      precision_TP = TP / total_1pred
  elif total_1pred == 0:      
      precision_TN = TN / total_0pred
      precision_FN = FN / total_0pred
      precision_FP = 0
      precision_TP = 0
  else:
      precision_TN = TN / total_0pred
      precision_FN = FN / total_0pred
      precision_FP = FP / total_1pred
      precision_TP = TP / total_1pred
  precision_matrix = np.array([[precision_TN ,precision_FN],[precision_FP ,precision_TP]])

  if total_0 == 0:
    if total_1 == 0:     
      recall_TN = 0
      recall_FN = 0
      recall_FP = 0
      recall_TP = 0
    else:
      recall_TN = 0
      recall_FN = FN / total_1
      recall_FP = 0
      recall_TP = TP / total_1
  elif total_1pred == 0:      
      recall_TN = TN / total_0
      recall_FN = 0
      recall_FP = FP / total_0
      recall_TP = 0
  else:
      recall_TN = TN / total_0
      recall_FN = FN / total_1
      recall_FP = FP / total_0
      recall_TP = TP / total_1
  recall_matrix = np.array([[recall_TN ,recall_FN],[recall_FP ,recall_TP]])
  if precision_TN == 0 or recall_TN == 0:
    if precision_TP == 0 or recall_TP == 0:
      F1_N = 0
      F1_1 = 0
    else:
      F1_1 = 2*((precision_TP *recall_TP)/(precision_TP +recall_TP))
  elif precision_TP == 0 or recall_TP == 0:
    F1_1 = 0
    F1_N = 2*((precision_TN *recall_TP)/(precision_TN +recall_TP))
  else:
    F1_N = 2*((precision_TN *recall_TP)/(precision_TN +recall_TP))
    F1_1 = 2*((precision_TP *recall_TP)/(precision_TP +recall_TP))




  Accuracy= (TN+TP)/(TN+TP+FN+FP)
  
  print("Confusion Matrix \n",confusion_matrix)
  print("Precision Matrix \n",precision_matrix)
  print("Recall Matrix \n", recall_matrix)
  print("F1 Score for Class 0 \n", F1_N)
  print("F1 Score for Class 1 \n", F1_1)
  print("Accuracy \n", Accuracy)


confusion_matrix(df_a)


Confusion Matrix 
 [[    0     0]
 [  100 10000]]
Precision Matrix 
 [[0.         0.        ]
 [0.00990099 0.99009901]]
Recall Matrix 
 [[0. 0.]
 [1. 1.]]
F1 Score for Class 0 
 0
F1 Score for Class 1 
 0.9950248756218906
Accuracy 
 0.9900990099009901


In [80]:
# define a function for auc score
# handle all test cases


def auc_score(df):
  df_sorted = df.sort_values(by = ['proba'],ascending=False)
  pred = df_sorted["proba"].to_numpy()
  actual = df_sorted["y"].to_numpy()
  count_0 = actual.tolist().count(0)
  count_1 = actual.tolist().count(1)
  TPR_array = []
  FPR_array = []
  for i in pred:
    TP = 0
    FP= 0
    a=np.where(pred > i,1, 0)
    for j in range(len(a)):
      if actual[j]==a[j] == 1:
        TP = TP +1
      elif actual[j] != a[j] and a[j] == 1:
        FP = FP +1
    FPR = FP / count_0
    TPR = TP / count_1
    TPR_array.append(TPR)
    FPR_array.append(FPR)
  auc = np.trapz(TPR_array, FPR_array)
  print(auc)


auc_score(df_auc)

0.48829900000000004




## B. Compute performance metrics for the given data '5_b.csv'

In [81]:
df=pd.read_csv('/content/drive/MyDrive/5_b.csv')
df_b = pd.DataFrame(df)
df_b_auc = df_b.copy(deep=True)
df_b.head()

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [82]:
df_b['y'].value_counts()

0.0    10000
1.0      100
Name: y, dtype: int64

In [83]:
predicted=[]
for i in df_b["proba"]:
  if i <= 0.5:
    predicted.append(0)
  else:
    predicted.append(1)
    #print(i)
df_b["predicted"] = predicted
df_b.head()
df_b['predicted'].value_counts()

0    9806
1     294
Name: predicted, dtype: int64

In [84]:
confusion_matrix(df_b)

Confusion Matrix 
 [[9761   45]
 [ 239   55]]
Precision Matrix 
 [[0.99541097 0.00458903]
 [0.81292517 0.18707483]]
Recall Matrix 
 [[0.9761 0.45  ]
 [0.0239 0.55  ]]
F1 Score for Class 0 
 0.7085183743227995
F1 Score for Class 1 
 0.2791878172588833
Accuracy 
 0.9718811881188119


In [85]:
auc_score(df_b_auc)

0.9376570000000001


### C. Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data 


In [86]:
df_c=pd.read_csv('/content/drive/MyDrive/5_c.csv')
df_c.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [87]:
df_c['y'].value_counts()

0    1805
1    1047
Name: y, dtype: int64

In [88]:
def threshold(df):
  df_sorted = df.sort_values(by = ['prob'],ascending=False)
  pred = df_sorted["prob"].to_numpy()
  actual = df_sorted["y"].to_numpy()
  threshold = {}
  for i in pred:
    FN = 0
    FP= 0
    a=np.where(pred > i,1, 0)
    for j in range(len(a)):
      if actual[j] != a[j] and a[j] == 0:
        FN = FN +1
      elif actual[j] != a[j] and a[j] == 1:
        FP = FP +1
    A = ((500* FN) + (100 * FP))
    threshold.update({i:A})
  threshold_prob = min(threshold, key=threshold.get)
  print(threshold_prob)

threshold(df_c)

0.2298716443615991



## D.</b></font> Compute performance metrics(for regression) for the given data 5_d.csv

<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [89]:
df_d=pd.read_csv('/content/drive/MyDrive/5_d.csv')
df_d.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [90]:
def error(df):
  sum_err = 0
  MAPE_err = 0
  SS_total = 0
  pred = df["pred"].to_numpy()
  actual = df["y"].to_numpy()
  avg = np.mean(actual)
  for i in range(len(pred)):
    err = (actual[i]-pred[i])
    sqrd_err = err**2
    sum_err = sum_err + sqrd_err
    abs_err = abs(err/ avg)
    MAPE_err = MAPE_err + abs_err
    total_err = (actual[i]-avg)**2
    SS_total = SS_total + total_err

  MSE = sum_err / len(pred)
  MAPE = MAPE_err / len(pred)
  sqr_R = (1-(sum_err / SS_total))
  
  
  print("Mean Squared Error : ",MSE)
  print("Mean Absolute Percentage Error : ",MAPE)
  print("R^2 Error : ", sqr_R)

error(df_d)
  

Mean Squared Error :  177.16569974554707
Mean Absolute Percentage Error :  0.12912029940096315
R^2 Error :  0.9563582786990964
