# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
#importing libraries which are needed
import numpy as np
import pandas as pd

## A. Compute performance metrics for the given data '5_a.csv'

In [2]:
df = pd.read_csv('5_a.csv')
df

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199
...,...,...
10095,1.0,0.665371
10096,1.0,0.607961
10097,1.0,0.777724
10098,1.0,0.846036


In [3]:
# Convert each probability to a class label using 0.5 as the threshold.
# A score exactly equal to the threshold is labelled positive (>=), which is the same
# rule used for 5_b.csv and by the `proba >= 0.5` count check in this notebook.

df['y_pred'] = [1 if i>=0.5 else 0 for i in df['proba']]
df

Unnamed: 0,y,proba,y_pred
0,1.0,0.637387,1
1,1.0,0.635165,1
2,1.0,0.766586,1
3,1.0,0.724564,1
4,1.0,0.889199,1
...,...,...,...
10095,1.0,0.665371,1
10096,1.0,0.607961,1
10097,1.0,0.777724,1
10098,1.0,0.846036,1


In [4]:
len(df[df['proba'] >=0.5])   

10100

***observations***
1. All 10,100 rows have `proba >= 0.5`, so every point is predicted as positive. The data set is also highly imbalanced: positive class labels >> negative class labels.

In [5]:
df

Unnamed: 0,y,proba,y_pred
0,1.0,0.637387,1
1,1.0,0.635165,1
2,1.0,0.766586,1
3,1.0,0.724564,1
4,1.0,0.889199,1
...,...,...,...
10095,1.0,0.665371,1
10096,1.0,0.607961,1
10097,1.0,0.777724,1
10098,1.0,0.846036,1


In [6]:
df['y'].value_counts()

1.0    10000
0.0      100
Name: y, dtype: int64

***observations***
1. As mentioned before, the positive class heavily outnumbers the negative class (10,000 positive labels vs. only 100 negative labels).

### Confusion Matrix

In [7]:
def confusion_matrix(y,y_pred):
    """Build the 2x2 confusion matrix for binary labels.

    Parameters
    ----------
    y : sequence or pandas Series
        Actual class labels; only the values 0 and 1 are counted.
    y_pred : sequence or pandas Series
        Predicted class labels; only the values 0 and 1 are counted.

    Returns
    -------
    numpy.ndarray
        ``[[TN, FP], [FN, TP]]`` -- rows are the actual class (0, 1) and
        columns are the predicted class (0, 1).
    """
    pairs = pd.DataFrame({'y': y, 'y_pred': y_pred})

    # One boolean mask per side of the matrix, reused for all four cells.
    actual_neg = pairs['y'] == 0
    actual_pos = pairs['y'] == 1
    pred_neg = pairs['y_pred'] == 0
    pred_pos = pairs['y_pred'] == 1

    true_neg = int((actual_neg & pred_neg).sum())
    false_pos = int((actual_neg & pred_pos).sum())
    false_neg = int((actual_pos & pred_neg).sum())
    true_pos = int((actual_pos & pred_pos).sum())

    return np.array([[true_neg, false_pos], [false_neg, true_pos]])


confusion_matrix(df['y'], df['y_pred'])


array([[    0,   100],
       [    0, 10000]])

### F1 score

In [8]:
def f1_score(y,ypred):
    """Compute the F1 score (harmonic mean of precision and recall).

    Parameters
    ----------
    y : sequence or pandas Series
        Actual binary class labels (0 or 1).
    ypred : sequence or pandas Series
        Predicted binary class labels (0 or 1).

    Returns
    -------
    float
        The F1 score. Returns 0.0 when there are no true positives, where
        precision and/or recall would otherwise be an undefined 0/0.
    """
    # Reuse the confusion matrix helper defined earlier in the notebook;
    # its layout is [[TN, FP], [FN, TP]].
    cn=confusion_matrix(y,ypred)

    FP=cn[0][1]
    FN=cn[1][0]
    TP=cn[1][1]

    # With no true positives both precision and recall are 0 (or undefined),
    # so report 0.0 instead of dividing by zero.
    if TP == 0:
        return 0.0

    # precision: share of predicted positives that are actually positive
    precision = TP/(TP+FP)
    # recall: share of actual positives that were predicted positive
    recall = TP/(TP+FN)

    return 2*((precision*recall) /(precision+recall))

f1_score(df['y'], df['y_pred'])

0.9950248756218906

### Accuracy

In [9]:
def accuracy_score(y,y_pred):
    """Return the fraction of points whose predicted label equals the actual label.

    Parameters
    ----------
    y : sequence or pandas Series
        Actual binary class labels (0 or 1).
    y_pred : sequence or pandas Series
        Predicted binary class labels (0 or 1).

    Returns
    -------
    float
        (TP + TN) / (TN + FP + FN + TP).
    """
    # The confusion matrix helper returns [[TN, FP], [FN, TP]].
    (true_neg, false_pos), (false_neg, true_pos) = confusion_matrix(y, y_pred)

    correct = true_pos + true_neg
    total = true_neg + false_pos + false_neg + true_pos
    return correct / total

accuracy_score(df['y'], df['y_pred'])

0.9900990099009901

### AUC score

In [10]:
def auc_score(y,proba):
    """Compute the area under the ROC curve, rounded to 4 decimals.

    Every distinct score is used as a threshold (a point is predicted
    positive when ``proba >= threshold``). The resulting (FPR, TPR) points,
    starting from (0, 0), are integrated with the trapezoidal rule.

    Parameters
    ----------
    y : sequence or pandas Series
        Actual binary class labels; only the values 0 and 1 are counted.
    proba : sequence or pandas Series
        Predicted scores / probabilities, one per label.

    Returns
    -------
    float
        AUC rounded to 4 decimal places, or NaN when ``y`` does not contain
        both classes (TPR or FPR is undefined in that case).

    Raises
    ------
    ValueError
        If ``y`` and ``proba`` have different lengths.
    """
    labels = np.asarray(y)
    scores = np.asarray(proba, dtype=float)
    if labels.shape[0] != scores.shape[0]:
        raise ValueError("y and proba must have the same length")

    n_pos = int(np.sum(labels == 1))
    n_neg = int(np.sum(labels == 0))
    if n_pos == 0 or n_neg == 0:
        return float('nan')

    # Sort once by descending score; cumulative counts then give TP/FP for
    # every threshold without rebuilding a confusion matrix per row.
    order = np.argsort(-scores, kind="mergesort")
    scores_sorted = scores[order]
    labels_sorted = labels[order]

    # Tied scores share one threshold, so only the last index of each run of
    # equal scores is a valid ROC point.
    group_ends = np.r_[np.flatnonzero(np.diff(scores_sorted)), scores_sorted.size - 1]
    tp = np.cumsum(labels_sorted == 1)[group_ends]
    fp = np.cumsum(labels_sorted == 0)[group_ends]

    # Prepend the origin so the curve starts at (0, 0) even if the highest
    # score is shared by several points.
    tpr = np.r_[0.0, tp / n_pos]
    fpr = np.r_[0.0, fp / n_neg]

    # Trapezoidal rule written out explicitly (np.trapz is deprecated in NumPy 2).
    area = np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0)

    score = round(float(area), 4)

    return score
    
auc_score(df['y'],df['proba'])    

0.4883

## B. Compute performance metrics for the given data '5_b.csv'


### Importing another data set and checking

In [11]:
df_b=pd.read_csv('5_b.csv')
df_b.head()

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [12]:
# Converting probability value to class label by 0.5 as threshold
df_b['y_pred'] = [1 if i>=0.5 else 0 for i in df_b['proba']]
df_b

Unnamed: 0,y,proba,y_pred
0,0.0,0.281035,0
1,0.0,0.465152,0
2,0.0,0.352793,0
3,0.0,0.157818,0
4,0.0,0.276648,0
...,...,...,...
10095,0.0,0.474401,0
10096,0.0,0.128403,0
10097,0.0,0.499331,0
10098,0.0,0.157616,0


### Confusion Matrix

In [14]:
confusion_matrix(df_b['y'],df_b['y_pred'])

array([[9761,  239],
       [  45,   55]])

### F1 score

In [15]:
f1_score(df_b['y'],df_b['y_pred'])

0.2791878172588833

### Accuracy

In [16]:
accuracy_score(df_b['y'],df_b['y_pred'])

0.9718811881188119

### AUC score

In [17]:
auc_score(df_b['y'],df_b['proba'])

0.9378

### C. Compute the best threshold (similarly to ROC curve computation) of probability that gives the lowest value of metric <b>A</b> for the given data
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negatives} + 100 \times \text{number of false positives}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [18]:
df_c=pd.read_csv('5_c.csv')
df_c.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [20]:
def best_threshold(y,proba,fn_cost=500,fp_cost=100):
    """Find the score threshold that minimises A = fn_cost * FN + fp_cost * FP.

    Every distinct value in ``proba`` is tried as a threshold; a point is
    predicted positive when ``proba >= threshold``.

    Parameters
    ----------
    y : sequence or pandas Series
        Actual binary class labels; only the values 0 and 1 are counted.
    proba : sequence or pandas Series
        Predicted scores / probabilities, one per label.
    fn_cost : number, default 500
        Cost charged for each false negative.
    fp_cost : number, default 100
        Cost charged for each false positive.

    Returns
    -------
    float
        The threshold with the lowest cost. When several thresholds share
        the lowest cost, the smallest of them is returned.

    Raises
    ------
    ValueError
        If the inputs are empty or have different lengths.
    """
    labels = np.asarray(y)
    scores = np.asarray(proba, dtype=float)
    if labels.shape[0] != scores.shape[0]:
        raise ValueError("y and proba must have the same length")
    if scores.size == 0:
        raise ValueError("cannot choose a threshold from empty input")

    # Sort once by descending score so cumulative counts give TP/FP for each
    # candidate threshold instead of recomputing a confusion matrix per row.
    order = np.argsort(-scores, kind="mergesort")
    scores_sorted = scores[order]
    labels_sorted = labels[order]

    # Last index of each run of equal scores == one entry per distinct threshold.
    group_ends = np.r_[np.flatnonzero(np.diff(scores_sorted)), scores_sorted.size - 1]
    thresholds = scores_sorted[group_ends]

    tp = np.cumsum(labels_sorted == 1)[group_ends]
    fp = np.cumsum(labels_sorted == 0)[group_ends]
    fn = int(np.sum(labels == 1)) - tp

    costs = fn_cost * fn + fp_cost * fp

    # Thresholds are in descending order, so the last minimal-cost entry is
    # the smallest threshold among ties.
    best = np.flatnonzero(costs == costs.min())[-1]

    return thresholds[best]

best_threshold(df_c['y'],df_c['prob'])

0.2300390278970873

## D. Compute performance metrics (for regression) for the given data 5_d.csv
<pre>    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will have two columns, Y and predicted_Y; both are real-valued features
</pre>
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE </li>
<li> Compute R^2 error </li>
</ol>

In [21]:
#https://statisticsbyjim.com/regression/mean-squared-error-mse/


In [22]:
df_d=pd.read_csv('5_d.csv')
df_d.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [23]:
def mean_squared_error(y,ypred):
    """Return the mean of the squared differences between actual and predicted values.

    Parameters
    ----------
    y : sequence or pandas Series
        Actual values.
    ypred : sequence or pandas Series
        Predicted values.

    Returns
    -------
    float
        sum((y - ypred)**2) / len(y).
    """
    frame = pd.DataFrame({'y': y, 'pred': ypred})
    residuals = frame['y'] - frame['pred']
    squared = (residuals ** 2).tolist()

    # Accumulate in plain Python, left to right, then average over the inputs.
    total = sum(squared)
    count = len(y)
    return total / count

In [24]:
def mean_absolute_percentage_error(y,ypred):
    """Compute the aggregate absolute percentage error.

    This is the total absolute error divided by the total of the actual
    values, expressed as a percentage:

        sum(|y - ypred|) / sum(y) * 100

    Because the division is done once on the totals (not per sample), a
    single actual value of 0 does not make the result undefined.

    Parameters
    ----------
    y : sequence or pandas Series
        Actual values.
    ypred : sequence or pandas Series
        Predicted values.

    Returns
    -------
    float
        The error as a percentage of the summed actual values.
    """
    df=pd.DataFrame({'y':y,'pred':ypred})
    error=list(abs(df['y'] - df['pred']))
    mape =sum(error)/sum(df['y'])*100

    return mape
    

In [25]:
def r_2(y,ypred):
    """Compute the coefficient of determination (R squared).

    Parameters
    ----------
    y : sequence or pandas Series
        Actual values.
    ypred : sequence or pandas Series
        Predicted values.

    Returns
    -------
    float
        1 - SS_res / SS_tot, where SS_res is the sum of squared residuals
        and SS_tot is the sum of squared deviations of ``y`` from its mean.
    """
    frame = pd.DataFrame({'y': y, 'pred': ypred})
    actual = frame['y']
    centre = np.mean(actual)

    # Total sum of squares: spread of the actual values around their mean.
    total_ss = 0
    for value in actual:
        total_ss += (value - centre) ** 2

    # Residual sum of squares: spread of the actual values around the predictions.
    residual_ss = sum(((actual - frame['pred']) ** 2).tolist())

    return 1 - (residual_ss / total_ss)

In [26]:
print('Mean Squared Error :' ,mean_squared_error(df_d['y'] , df_d['pred']))
print('Mean Absolute Percentage Error :',mean_absolute_percentage_error(df_d['y'] , df_d['pred']))
print('R squared :' , r_2(df_d['y'] , df_d['pred']))

Mean Squared Error : 177.16569974554707
Mean Absolute Percentage Error : 12.91202994009687
R squared : 0.9563582786990964
