In [1]:
import pandas as pd
import joblib
import plotly.express as px
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
import warnings
warnings.filterwarnings('ignore')

def plot_pr_curve(model, x, y, title='Precision-Recall curve'):
    y_pred = model.predict_proba(x)[:,1]
    precision, recall, thresholds = precision_recall_curve(y, y_pred)
    df = pd.DataFrame({'Precision':precision[:-1],
                                    'Recall':recall[:-1],
                                    'Threshold':thresholds})
    
#     fig = px.scatter(df, x='Recall', y='Precision')
#     fig.update_layout(title=title)
#     fig.show()
    
    best_recall = 0
    threshold = 0
    for i in range(len(recall)-1):
        if precision[i] > 0.8:
            best_recall = round(recall[i], 3)
            threshold = round(thresholds[i], 3)
            break

    print(f'auc: {auc(recall, precision)}')
    print(f'at 0.8 precision, recall is approx {best_recall}, at threshold {threshold}')

In [2]:
from src.dataset_importer import TestImporter
from src.utils import import_test_configuration

config = import_test_configuration('test_settings/test_settings.ini')
X, y = TestImporter(config).make_test()

rf1 = joblib.load('n_models/rf2.sav')
svc1 = joblib.load('n_models/svc_95.sav')
gnb1 = joblib.load('models/model_37/trained.sav')
mlp1 = joblib.load('models/model_39/trained.sav')
gradboost1 = joblib.load('n_models/gradboost2.sav')

In [13]:
X.TotalLinesOfCode

0          6.0
1          1.0
2          9.0
3          9.0
4         12.0
          ... 
203072     1.0
203073     1.0
203074     7.0
203075    15.0
203076    22.0
Name: TotalLinesOfCode, Length: 203077, dtype: float64

### 80-20 split, 95th percentile

In [3]:
%%time
plot_pr_curve(rf1, X, y, title='Precision Recall curve for RandomForest')

auc: 0.297585594568989
at 0.8 precision, recall is approx 0.205, at threshold 0.104
Wall time: 8.98 s


In [11]:
%%time
plot_pr_curve(gnb1, X, y, title='Precision Recall curve for NaiveBayes')

auc: 0.11927500863997646
at 0.8 precision, recall is approx 0, at threshold 0
Wall time: 446 ms


In [5]:
%%time
plot_pr_curve(mlp1, X, y, title='Precision Recall curve for MLP')

auc: 0.6345016681264436
at 0.8 precision, recall is approx 0.487, at threshold 0.409
Wall time: 473 ms


In [6]:
%%time
plot_pr_curve(gradboost1, X, y, title='Precision Recall curve for GradBoostClassifier')

auc: 0.7037847165026446
at 0.8 precision, recall is approx 0.68, at threshold 0.198
Wall time: 4.28 s


In [7]:
%%time
plot_pr_curve(svc1, X, y, title='Precision Recall curve for SVM')

auc: 0.11508867332105673
at 0.8 precision, recall is approx 0.018, at threshold 0.141
Wall time: 37min 47s
