In [1]:
import pandas as pd
import numpy as np

In [2]:
data_5_a = pd.read_csv(r"..\A.csv")

In [3]:
data_5_a.head()

Unnamed: 0,y,proba
0,1,0.637387
1,1,0.635165
2,1,0.766586
3,1,0.724564
4,1,0.889199


In [4]:
def predicted_y(dataset, threshold): 
    '''
    Function to get y_pred values from predicted probability score for a threshold
    '''
    dataset['y_pred'] = np.where(dataset['proba']>=threshold , 1 ,0 )
    return dataset['y_pred']

In [5]:
predicted_y(data_5_a,0.5)

0        1
1        1
2        1
3        1
4        1
        ..
10095    1
10096    1
10097    1
10098    1
10099    1
Name: y_pred, Length: 10100, dtype: int32

In [6]:
data_5_a.head()

Unnamed: 0,y,proba,y_pred
0,1,0.637387,1
1,1,0.635165,1
2,1,0.766586,1
3,1,0.724564,1
4,1,0.889199,1


Confusion Matrix

In [7]:
def confusion_matrix(dataset):
    
    TP = ((dataset['y']==1.0) & (dataset['y_pred'] == 1.0)).sum() 
    FP = ((dataset['y']==0.0) & (dataset['y_pred'] == 1.0)).sum() 
    TN = ((dataset['y']==0.0) & (dataset['y_pred'] == 0.0)).sum() 
    FN = ((dataset['y']==1.0) & (dataset['y_pred'] == 0.0)).sum() 
    
    conf_matrix = np.array([[TN,FN],[FP,TP]])
    return conf_matrix

In [8]:
print("Confusion matrix of 5_A :\n",confusion_matrix(data_5_a))

Confusion matrix of 5_A :
 [[    0     0]
 [  100 10000]]


In [9]:
def f1_score(dataset):
    '''
    Func returns f1 score of dataset
    '''
    
    confusion_matrix_values = confusion_matrix(dataset)
    recall_num = confusion_matrix_values[1,1]
    recall_deno = (dataset['y'] ==1).sum()
    recall = recall_num/ recall_deno
    
    prec_num = confusion_matrix_values[1,1]
    prec_deno = (dataset['y_pred'] ==1).sum()
    precision = prec_num/ prec_deno
    
    f1_score = 2*(precision)*(recall)/(precision+recall)
    
    return f1_score

In [10]:
print("f1 score of 5_A :\n",f1_score(data_5_a))

f1 score of 5_A :
 0.9950248756218906


In [11]:
def accuracy(dataset):
    '''
    Func returns accuracy score for dataset passed.
    '''
    confusion_matrix_values = confusion_matrix(dataset)
    num = confusion_matrix_values[1,1] + confusion_matrix_values[0,0] 
    deno = np.sum(confusion_matrix_values)
    accuracy = num/deno
    
    return accuracy

In [12]:
print("Accuracy score of 5_A :\n",accuracy(data_5_a))

Accuracy score of 5_A :
 0.9900990099009901


In [13]:
from tqdm import tqdm
import time
def AUC_score(dataset):
    tpr_array = []
    fpr_array = []
    dataset.drop(columns = ['y_pred'])
    dataset = dataset.sort_values( by= ["proba"], ascending = False)
    
    for threshold in tqdm(dataset['proba'].unique()):
        dataset['y_pred'] =np.where( dataset['proba'] >=  threshold, 1,0)
        TP = ((dataset['y']==1.0) & (dataset['y_pred'] == 1.0)).sum() 
        FP = ((dataset['y']==0.0) & (dataset['y_pred'] == 1.0)).sum() 
        TN = ((dataset['y']==0.0) & (dataset['y_pred'] == 0.0)).sum() 
        FN = ((dataset['y']==1.0) & (dataset['y_pred'] == 0.0)).sum()
        P = (TP+FN)  
        N = (TN+FP)
        TPR = (TP/P)
        FPR = (FP/N)
        tpr_array.append(TPR)
        fpr_array.append(FPR)
        
    auc = np.trapz(tpr_array,fpr_array)
    return auc
        
        

In [14]:
print("AUC of 5_A :\n",AUC_score(data_5_a))

100%|██████████| 10100/10100 [00:22<00:00, 444.51it/s]

AUC of 5_A :
 0.48829900000000004





### B.csv

In [15]:
data_5_b = pd.read_csv(r"..\B.csv")

In [16]:
data_5_b.shape

(10100, 2)

In [17]:
predicted_y(data_5_b,0.5)

data_5_b.head()

Unnamed: 0,y,proba,y_pred
0,0,0.281035,0
1,0,0.465152,0
2,0,0.352793,0
3,0,0.157818,0
4,0,0.276648,0


In [18]:
print("Confusion matrix of 5_B :\n",confusion_matrix(data_5_b))

Confusion matrix of 5_B :
 [[9761   45]
 [ 239   55]]


In [19]:
print("f1 score of 5_B :\n",f1_score(data_5_b))

f1 score of 5_B :
 0.2791878172588833


In [20]:
print("Accuracy score of 5_B :\n",accuracy(data_5_b))

Accuracy score of 5_B :
 0.9718811881188119


In [21]:
print("AUC of 5_B:\n",AUC_score(data_5_b))

100%|██████████| 10099/10099 [00:23<00:00, 425.98it/s]

AUC of 5_B:
 0.9377570000000001





## C

In [22]:
data_5_c = pd.read_csv(r"..\C.csv")

In [23]:
data_5_c.head(5)

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [24]:
data_5_c.rename(columns = {'prob':'proba'},inplace = True) # Renaming probability score column for above defined functions to be applicable

In [25]:
data_5_c.head(5)

Unnamed: 0,y,proba
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [26]:
def best_threshold(dataset):
    '''
    Func to calculate dcitonary of A score  with its corresponding threshold value
    '''
    score = {}
    for i in tqdm(dataset.proba):
        y_pred = predicted_y(dataset, i)
        dataset['y_pred'] = y_pred
        matrix = confusion_matrix(dataset)
        FP = matrix[1,0]
        FN = matrix[0,1]
        A = (500*FN) + (100*FP)
        score[i] = A
        dataset.drop(columns=['y_pred'])
    return score

In [27]:
scores = best_threshold(data_5_c) # A Score:Threshold dictonary of 5_C dataset

100%|██████████| 2852/2852 [00:08<00:00, 323.65it/s]


In [28]:
sorted_scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1])} #Sorted dictonary based on A score

In [29]:
lowest_A = list(sorted_scores.values())[0]

In [30]:
print("Lowest A score value for 5_C dataset : ", lowest_A)

Lowest A score value for 5_C dataset :  141000


In [31]:
best_threshold = list(sorted_scores.keys())[0]

In [32]:
print("Probability score to be taken as threshold which gave lowest A score : ", best_threshold)

Probability score to be taken as threshold which gave lowest A score :  0.2300390278970873


## D

In [33]:
data_5_d = pd.read_csv(r"..\D.csv")

In [34]:
data_5_d.head(5)

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [35]:
def abs_error(dataset):
    '''
    Func to calculate absolute error = actual value - predicted value
    '''
    error= []
    error = dataset['y']-dataset['pred']
    return error

In [36]:
error_list = abs_error(data_5_d)
sq_errors = [number ** 2 for number in error_list]
error_sum = sum(sq_errors)
mean_sq_error = error_sum / len(data_5_d)
print("Mean square errror for 5_D dataset :", mean_sq_error)

Mean square errror for 5_D dataset : 177.16569974554707


In [37]:
def MAPE(dataset):
    '''
    Func to calculate MAPE 
    '''
    deno = sum(data_5_d['y'])
    num = sum(abs(abs_error(dataset)))
    MAPE = num/deno
    return MAPE

In [38]:
print("MAPE for 5_D dataset : ", MAPE(data_5_d))

MAPE for 5_D dataset :  0.1291202994009687


In [39]:
def R2_error(dataset):
    SS = []
    mean = sum(dataset['y']) / len(dataset)
    SS = dataset['y'] - mean
    SS_total = sum([number ** 2 for number in SS])
    R2 = 1 - ( error_sum/ SS_total)
    return R2

In [40]:
print("R2_error for 5_D dataset : ",R2_error(data_5_d))

R2_error for 5_D dataset :  0.9563582786990964
