In [1]:
! [ -e /content ] && pip install -Uqq fastai

[K     |████████████████████████████████| 194kB 2.8MB/s 
[K     |████████████████████████████████| 51kB 5.9MB/s 
[?25h

In [2]:
from fastai.vision.all import *

In [3]:
# Fetch the CandleDemo archive with fastai's untar_data and keep the returned location in `path`;
# later cells read the TestSkewed/ and Models/ sub-directories from it.
# NOTE(review): the archive is served over plain HTTP, so nothing protects its contents in transit.
path = untar_data('http://eve.kean.edu/~valentm3/CandleDemo.tgz')

In [6]:
# Collect the image files under TestSkewed/; the evaluation loop further down iterates over them.
# NOTE(review): presumably "Skewed" refers to the heavy True/False class imbalance of this set — confirm.
test_files = get_image_files(path/'TestSkewed')

In [7]:
# Sanity check: how many test images were found.
print(len(test_files))

13132


In [8]:
def label_func(fname):
    """Derive the boolean label of an image from its file name.

    Args:
        fname: File name (a string, not a full path); the evaluation loop
            passes `some_path.name`.

    Returns:
        True when the first character of `fname` is '1', False otherwise.

    NOTE(review): presumably this must be defined before `load_learner` runs
    because the exported learner refers to it by name — confirm.
    """
    leading_char = fname[0]
    return leading_char == '1'

In [9]:
# Locate the exported learner file inside the already-extracted archive (nothing is downloaded here)
model_files = get_files(path/'Models')
if len(model_files) == 0:
    # Fail with a clear message instead of an opaque IndexError from [0]
    raise FileNotFoundError(f"No exported learner found under {path/'Models'}")
# NOTE(review): the first file found is used; if Models/ ever holds more than one file, select it by name
model = model_files[0]

# Load the learner that was trained.
# NOTE(review): load_learner unpickles the file, which can execute arbitrary code; only load files
# from a trusted source (this one comes from an archive fetched over plain HTTP).
learn = load_learner(model)

In [10]:
# Gets accuracy on the test set, and all 4 possibilities for the confusion matrix
def _safe_ratio(numerator, denominator):
    """Return numerator/denominator, or NaN when the denominator is 0."""
    return numerator/denominator if denominator else float('nan')

total = len(test_files)
positives = 0
negatives = 0
true_positives = 0
false_positives = 0
true_negatives = 0
false_negatives = 0
for _file in test_files:

    # Ground truth comes from the file name, the prediction from the learner's decoded label.
    # NOTE(review): assumes predict() returns the label as the string 'True'/'False' — confirm.
    target = label_func(_file.name)
    pred = learn.predict(_file)[0] == 'True'

    # Tally up everything
    if target:
        positives += 1
        if pred:
            true_positives += 1 # Actual true, pred true
        else:
            false_negatives += 1 # Actual true, pred false
    else:
        negatives += 1
        if pred:
            false_positives += 1 # Actual false, pred true
        else:
            true_negatives += 1 # Actual false, pred false

# Correct in general: a prediction is right exactly when it lands in the TP or TN cell
correct = true_positives + true_negatives

# Ratios fall back to NaN instead of raising ZeroDivisionError when a denominator is 0
# (empty test set, no positive predictions, or no actual negatives)
accuracy = _safe_ratio(correct, total)
precision_pct = _safe_ratio(true_positives, true_positives + false_positives)*100
false_positive_rate_pct = _safe_ratio(false_positives, negatives)*100

print(f'Accuracy for test set is: {accuracy}')
print(f'Actual True: {positives} | Predicted True: {true_positives}')
print(f'Actual True: {positives} | Predicted False: {false_negatives}')
print(f'Actual False: {negatives} | Predicted False: {true_negatives}')
print(f'Actual False: {negatives} | Predicted True: {false_positives}')

print(f'Percentage of predicting buy and being correct: {precision_pct}') # Precision / Positive Prediction Value
print(f'Percentage of not buys, where we predicted buy: {false_positive_rate_pct}') # False positive rate

# Motivation for both
# 1) Not considering labels and given it said True what is the chance it's right?
# 2) Looking at a label that's False, what's the chance of it saying True?

# What's the relationship? In this specific context, there are going to be considerably more instances it sees where
# the label would be False, which is why it was trained on a True/False ratio of 1/100.
# The observed false positive rate is relatively low at 15%, which answers the above question of:
# "Given it's shown a False, what's the chance it predicts True?"
# However, as stated, it is going to be shown a lot of Falses, and if each time it's shown a False
# it has a 15% chance of being wrong, the probability of it correctly identifying N Falses in a row is (1-0.15)^N
# (assuming the predictions are independent).
# For example, if we were to look at 100 stocks that were False, the probability of it correctly identifying them
# all as False is about 9e-6% (0.85^100 ≈ 8.7e-8).
# Let's now observe the precision, which is described as "Given it said True, what is the chance of being correct?"
# There is a distinct difference between this reasoning and the previous. In the other example, we worked under
# the assumption we were being shown 100 Falses, and wanted to know the probability of correctly identifying all of
# them as False. The key is the assumption about what we are looking at, which, given that the majority of stocks
# we'll look at will be False, is a reasonable circumstance to consider. But now, we are not considering what we might look at.
# We are solely concerned with: given the prediction is already True, what's the chance of it being correct?
# That number is relatively low, at 6%. The usefulness of each of these numbers depends on the situation in which it is used.
# If the goal is to initially scan the stocks and filter for ones that may be good to look at,
# the first-mentioned analysis of predicting N Falses correctly isn't problematic.
# Assume we are going to filter 100 stocks that are False. The most likely split between correctly and incorrectly
# identified Falses is 85 correct and 15 incorrect, with a probability of about 11%: the probability of one particular
# sequence with that split, multiplied by the number of different orderings in which it could occur
# (the binomial coefficient C(100, 15)).
# Using this number, if we are filtering through 100 stocks that are False, there's an 11 percent chance that it
# will identify exactly 85 of them correctly as False, and 15 of them incorrectly as True.
# However this sounds, it's missing a major point. First, we are assuming they are all False,
# which actually has a high probability of being true. Secondly, there's an 11% chance that 85 correct and 15 incorrect
# WILL be the output. That isn't particularly interesting, because... what if it's not the output?
# Predicting the output is one thing, but we need to find confidence in what the output actually is.
# For that, we need to turn to our next measure, the precision.
# The precision answers: given the prediction is True, what's the chance of being correct? That number,
# as calculated before, is 6%.
# Using the same sequential logic as before, predicting a bunch of Trues correctly in a row (over some time frame) is going to be difficult.
# Assuming we use this model as a filtering technique, there's a 94% chance that any given True prediction is a false positive.
# If we were to filter some number of stocks, and 100 came back as True, about 94 of them are in reality probably False.
# BUT, about 6% of them are in fact True, so the filtered set can safely be used for further analysis.
# It offers a way to filter a large number of stocks into a smaller, more manageable batch.
# The concern isn't missing true positives, but decreasing the batch of stocks to a smaller subset
# that hopefully contains a potential True.
# One final conclusion is that, given a 6% chance of being correct with a positive prediction,
# a filtering process would have to return at least 17 stocks (1/0.06 ≈ 16.7) for there to be, on average, 1 that is a hit.

Accuracy for test set is: 0.8529546146816935
Actual True: 138 | Predicted True: 128
Actual True: 138 | Predicted False: 10
Actual False: 12994 | Predicted False: 11073
Actual False: 12994 | Predicted True: 1921
Percentage of predicting buy and being correct: 6.246949731576379
Percentage of not buys, where we predicted buy: 14.783746344466678
