In [1]:
import pandas as pd
import html

# Task 3: Research and theory
### Task 3A: Research - State of the art solutions

### Task 3B: Theory - MSE versus MAE

$$ MSE = \frac{1}{N}\sum_{i=1}^{N}(y_i - \hat{y}_i)^2 $$

$y_i$ is the actual expected output and $\hat{y}_i$ is the model's prediction.

MSE measures averages squared error of our predictions. For each point, it calculates square difference between the predictions and the target and then average those values.

The higher this value, the worse the model is. It's never negative since we're squaring the individual prediction-wise error before summing them, but would be zero for a perfect model. 

*Advantage*: Useful if we have unexpected values that we should care about. Vey high or low value that we should pay attention.

*Disadvantage*: If we make a single very bad prediction, the squaring will make the error even worse and it may skew the metric towards overestimating the model’s badness. That is a particularly problematic behaviour if we have noisy data (that is, data that for whatever reason is not entirely reliable) — even a “perfect” model may have a high MSE in that situation, so it becomes hard to judge how well the model is performing. On the other hand, if all the errors are small, or rather, smaller than 1, than the opposite effect is felt: we may underestimate the model’s badness.

*Note that* if we want to have a constant prediction the best one will be the mean value of the target values. It can be found by setting the derivative of our total error with respect to that constant to zero, and find it from this equation.


$$ MAE = \frac{1}{N}\sum_{i=1}^{N}|y_i - \hat{y}_i| $$

MAE calculates the error as an average of absolute differences between the target values and the predictions. The MAE is a linear score which means that all the individual differences are weighted equally in the average. For example, the difference between 10 and 0 will be twice the difference between 5 and 0.

What is important about this metric is that it penalizes huge errors that not as that badly as MSE does. Thus, it’s not that sensitive to outliers as mean square error.


#### MAE – Mean Absolute Error
MAE is the most intuitive of them all. The name in itself is pretty good at telling us what’s going on.

- Mean: average
- Absolute: without direction, get rid of any negative signs
Simply put, the average difference observed in the predicted and actual values across the whole test set.

In the background, the algorithm takes the differences in all of the predicted and actual prices, adds them up and then divides them by the number of observations. It doesn’t matter if the prediction is higher or lower than the actual price, the algorithm just looks at the absolute value. A lower value indicates better accuracy.

As a general guide, I think we can use MAE when we aren’t too worried about the outliers.

#### Mean Squared Error
I personally don’t focus too much on MSE as I see it as a stepping stone for calculating RMSE. However, let’s see what’s it about.

- Mean: average
- Squared: square the errors so a difference of 2, becomes 4, a difference of 3 becomes 9
As you can see, as a result of the squaring, it assigns more weight to the bigger errors. The algorithm then continues to add them up and average them. If you are worried about the outliers, this is the number to look at. Keep in mind, it’s not in the same unit as our dependent value. In our case, the value was roughly 82,3755,495, this is NOT the dollar value of the error like MAE. As before, lower the number the better.

### Task 3C: Theory - analyze a less obvious dataset

In [2]:
def read_file(filename):
    """ Reads a file. """
    
    f = open(filename, "r")
    lines = f.readlines()
    f.close()
    
    return lines


def create_dataframe(lines):
    """ Create a dataframe from a csv file. """
    
    # get column names from first line
    col_names = lines[0].split(';')
    cols = [col_names[i].strip() for i in range(len(col_names))]
    
    # prepare data frame
    amount_lines = len(lines)
    df = pd.DataFrame(columns=cols, index=range(amount_lines - 1))
    
    # fill dataframe
    i = 0
    for line in lines[1:]:

        parts = line.split(';', 1)

        df.loc[i].label = parts[0]
        df.loc[i].text = parts[1].strip()

        i = i + 1
        
    return df

def create_dataframe_stratified(lines):
    """ Create a dataframe from a csv file. """
    
    # get column names from first line
    col_names = lines[0].split(';')
    cols = [col_names[i].strip() for i in range(len(col_names))]
    
    # prepare data frame
    amount_lines = len(lines)
    df = pd.DataFrame(columns=cols, index=range(9309))
    
    # fill dataframe
    i = 0
    for line in lines[1:]:

        parts = line.split(';', 1)
        
        if parts[0] == "spam":
            for j in range(6):
                df.loc[i + j].label = parts[0]
                df.loc[i + j].text = parts[1].strip()
            
            i = i + 6
                
        else:
            df.loc[i].label = parts[0]
            df.loc[i].text = parts[1].strip()

            i = i + 1
        
    return df

In [3]:
lines = read_file("data/SmsCollection.csv")
df = create_dataframe(lines)
df_stratified = create_dataframe_stratified(lines)

display(df.describe())
display(df_stratified.describe())

Unnamed: 0,label,text
count,5574,5574
unique,2,5160
top,ham,"Sorry, I'll call later"
freq,4827,30


Unnamed: 0,label,text
count,9309,9309
unique,2,5160
top,ham,"Sorry, I'll call later"
freq,4827,30


In [4]:
# For mac, uncomment:
# import nltk
# import ssl

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# nltk.download()

In [5]:
from nltk import stem
from nltk.corpus import stopwords

stemmer = stem.SnowballStemmer('english')
stopwords = set(stopwords.words('english'))

In [6]:
def clean_messages(msg, unescape_html, lower, rm_stopwords, use_stemmer):
    """ This is a function that converts messages to all lowercase letters, removes stopwords
        and stems all words: stemming reduces inflection forms to normalise words with the same lemma."""
    
    if unescape_html:
        # unescape html
        msg = html.unescape(msg)
    
    if lower:
        # converting messages to lowercase
        msg = msg.lower()
    
    if rm_stopwords:
        # removing stopwords
        msg = [word for word in msg.split() if word not in stopwords]
        
    if rm_stopwords and use_stemmer:
        # using a stemmer
        msg = " ".join([stemmer.stem(word) for word in msg])
    elif rm_stopwords:
        msg = " ".join([word for word in msg])
    
    return msg

In [7]:
## show
def display_big(df):
    with pd.option_context('display.min_rows', 50, 'display.max_colwidth', 10000):
        display(df)
        


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# use term frequency - inverse document frequency
from sklearn.feature_extraction.text import TfidfVectorizer

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

from sklearn import svm


def clean_and_split_data(df, test_size, clean_options, stratified):
    df_train = df.copy()
    
    
    df_train["text"] = df_train["text"].apply(clean_messages, args=(clean_options))
    
    # Use stratified sampling to split test- and training set
    if stratified is True:
        
        split_model = StratifiedShuffleSplit(n_splits = 1, test_size = test_size, random_state = 1)
        
        for train_index, test_index in split_model.split(df_train['text'], df_train['label']):
                trainmsg, testmsg = df_train['text'][train_index], df_train['text'][test_index]
                trainlabel, testlabel = df_train['label'][train_index], df_train['label'][test_index]


    else:
        # split dataset into test and training
        trainmsg, testmsg, trainlabel, testlabel = train_test_split(
                    df_train['text'], 
                    df_train['label'], 
                    test_size = test_size, 
                    random_state = 1)
    
   

    return trainmsg, testmsg, trainlabel, testlabel



def train_model(trainmsg, trainlabel, C):
        
    # vectorize
    vectorizer = TfidfVectorizer()
    trainmsg = vectorizer.fit_transform(trainmsg)
    
    # actually classify and train
    my_svm = svm.SVC(C=C) #, gamma='auto')
    my_svm.fit(trainmsg, trainlabel)
    
    return vectorizer, my_svm


In [9]:
from sklearn.metrics import confusion_matrix

stratified = False

In [10]:
def run_model(df, clean_options, test_size, C, stratified):
    trainmsg, testmsg, trainlabel, testlabel = clean_and_split_data(df, test_size=test_size, 
                                                                    clean_options=clean_options,
                                                                    stratified=stratified)

    vectorizer, my_svm = train_model(trainmsg=trainmsg, 
                                        trainlabel=trainlabel,
                                        C=C)

    testmsg = vectorizer.transform(testmsg)

    y_pred = my_svm.predict(testmsg)
    m = confusion_matrix(testlabel, y_pred)
    print(m)
    
    n = (m[1][0] + m[1][1] + m[0][0] + m[0][1])
    positives_error = m[1][0] / (m[1][0] + m[1][1])
    negatives_error = m[0][1] / (m[0][0] + m[0][1])
    
    print("False positives: ",  round(positives_error * 100, 1), "% +/-", get_ci(n, positives_error), "%")
    print("False negatives: ",  round(negatives_error * 100, 1), "% +/-", get_ci(n, negatives_error), "%")
    
def get_ci(n, error, confidence=0.99):
    
    if confidence==0.99:
        const = 2.58
    else:
        return None
    
    return round(const * ((error * (1-error))/n)**0.5 * 100, 1)

In [18]:
clean_options1 = [False, # html
                    False, # lower
                    False, # stopwords
                    False] # stemmer

clean_options2 = [True, # html
                    True, # lower
                    True, # stopwords
                    True] # stemmer

clean_options3 = [True, # html
                    True, # lower
                    False, # stopwords
                    False] # stemmer

clean_options4 = [False, # html
                    False, # lower
                    True, # stopwords
                    True] # stemmer

In [19]:
test_size = 0.2
C = 1

run_model(df, clean_options1, test_size, C, stratified)


test_size = 0.2
C = 1000

run_model(df, clean_options1, test_size, C, stratified)


test_size = 0.1
C = 1000

run_model(df, clean_options1, test_size, C, stratified)

[[977   0]
 [ 13 125]]
False positives:  9.4 % +/- 2.3 %
False negatives:  0.0 % +/- 0.0 %
[[976   1]
 [ 12 126]]
False positives:  8.7 % +/- 2.2 %
False negatives:  0.1 % +/- 0.2 %
[[490   1]
 [  5  62]]
False positives:  7.5 % +/- 2.9 %
False negatives:  0.2 % +/- 0.5 %


In [20]:

test_size = 0.2
C = 1

run_model(df, clean_options2, test_size, C, stratified)


test_size = 0.2
C = 1000

run_model(df, clean_options2, test_size, C, stratified)

test_size = 0.1
C = 1000

run_model(df, clean_options2, test_size, C, stratified)

[[976   1]
 [ 14 124]]
False positives:  10.1 % +/- 2.3 %
False negatives:  0.1 % +/- 0.2 %
[[976   1]
 [ 11 127]]
False positives:  8.0 % +/- 2.1 %
False negatives:  0.1 % +/- 0.2 %
[[491   0]
 [  5  62]]
False positives:  7.5 % +/- 2.9 %
False negatives:  0.0 % +/- 0.0 %


In [21]:

test_size = 0.2
C = 1

run_model(df, clean_options3, test_size, C, stratified)


test_size = 0.2
C = 1000

run_model(df, clean_options3, test_size, C, stratified)


test_size = 0.1
C = 1000

run_model(df, clean_options3, test_size, C, stratified)

[[977   0]
 [ 13 125]]
False positives:  9.4 % +/- 2.3 %
False negatives:  0.0 % +/- 0.0 %
[[976   1]
 [ 12 126]]
False positives:  8.7 % +/- 2.2 %
False negatives:  0.1 % +/- 0.2 %
[[490   1]
 [  6  61]]
False positives:  9.0 % +/- 3.1 %
False negatives:  0.2 % +/- 0.5 %


In [22]:

test_size = 0.2
C = 1

run_model(df, clean_options4, test_size, C, stratified)


test_size = 0.2
C = 1000

run_model(df, clean_options4, test_size, C, stratified)


test_size = 0.1
C = 1000

run_model(df, clean_options4, test_size, C, stratified)

[[976   1]
 [ 14 124]]
False positives:  10.1 % +/- 2.3 %
False negatives:  0.1 % +/- 0.2 %
[[976   1]
 [ 11 127]]
False positives:  8.0 % +/- 2.1 %
False negatives:  0.1 % +/- 0.2 %
[[491   0]
 [  6  61]]
False positives:  9.0 % +/- 3.1 %
False negatives:  0.0 % +/- 0.0 %


In [23]:
test_size = 0.2
C = 1

run_model(df_stratified, clean_options1, test_size, C, stratified)


test_size = 0.2
C = 1000

run_model(df_stratified, clean_options1, test_size, C, stratified)


test_size = 0.1
C = 1000

run_model(df_stratified, clean_options1, test_size, C, stratified)

[[943   2]
 [  0 917]]
False positives:  0.0 % +/- 0.0 %
False negatives:  0.2 % +/- 0.3 %
[[944   1]
 [  0 917]]
False positives:  0.0 % +/- 0.0 %
False negatives:  0.1 % +/- 0.2 %
[[482   0]
 [  0 449]]
False positives:  0.0 % +/- 0.0 %
False negatives:  0.0 % +/- 0.0 %


In [24]:
test_size = 0.2
C = 1

run_model(df_stratified, clean_options2, test_size, C, stratified)


test_size = 0.2
C = 1000

run_model(df_stratified, clean_options2, test_size, C, stratified)


test_size = 0.1
C = 1000

run_model(df_stratified, clean_options2, test_size, C, stratified)

[[944   1]
 [  0 917]]
False positives:  0.0 % +/- 0.0 %
False negatives:  0.1 % +/- 0.2 %
[[944   1]
 [  0 917]]
False positives:  0.0 % +/- 0.0 %
False negatives:  0.1 % +/- 0.2 %
[[482   0]
 [  0 449]]
False positives:  0.0 % +/- 0.0 %
False negatives:  0.0 % +/- 0.0 %
