In [3]:
# Import our libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Location of the tab-separated SMS Spam Collection file
sms_data_path = '/Users/Administrator/Documents/Work/pylearn/Bayes/smsspamcollection/SMSSpamCollection'

# Load the file; it is read without a header row, so both column names are supplied here
df = pd.read_table(sms_data_path, sep='\t', header=None, names=['label', 'sms_message'])

# Encode the text labels as integers: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(label_codes)

# Split messages and labels into training and testing portions (seeded for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    df['sms_message'], df['label'], random_state=1
)

# Learn the vocabulary from the training messages only and build their token-count matrix
count_vector = CountVectorizer()
training_data = count_vector.fit_transform(X_train)

# The test messages are only transformed with the already-fitted vectorizer, never fitted
testing_data = count_vector.transform(X_test)

# Train a multinomial Naive Bayes model (fit returns the estimator itself), then predict the test labels
naive_bayes = MultinomialNB().fit(training_data, y_train)
predictions = naive_bayes.predict(testing_data)

# Report every metric for the Naive Bayes predictions on the test split
for metric_label, metric in (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
):
    print(metric_label, format(metric(y_test, predictions)))

Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562


In [None]:
# New code from here

In [17]:
# Import the Bagging, RandomForest, and AdaBoost Classifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

> **Step 2:** Now that you have imported each of the classifiers, `instantiate` each with the hyperparameters specified in each comment.  In the upcoming lessons, you will see how we can automate the process of finding the best hyperparameters.  For now, let's get comfortable with the process and our new algorithms.

In [27]:
# Bagging ensemble: 200 weak learners (n_estimators), every other setting left at its default
My_BaggingClassifier = BaggingClassifier(n_estimators=200)

# Random forest: 200 weak learners (n_estimators), every other setting left at its default
My_RandomForestClassifier = RandomForestClassifier(n_estimators=200)

# AdaBoost ensemble: 300 weak learners (n_estimators) with a learning_rate of 0.2
My_AdaBoostClassifier = AdaBoostClassifier(n_estimators=300, learning_rate=0.2)

> **Step 3:** Now that you have instantiated each of your models, `fit` them using the **training_data** and **y_train**.  This may take a bit of time - you are fitting 700 weak learners, after all!

In [28]:
# Train the bagging ensemble on the vectorized training messages and their labels
My_BaggingClassifier.fit(training_data,y_train)

# Train the random forest on the same training matrix and labels
My_RandomForestClassifier.fit(training_data,y_train)

# Train the AdaBoost ensemble last: as the final expression in the cell, its return
# value is what the notebook echoes as the cell output
My_AdaBoostClassifier.fit(training_data,y_train)


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.2,
                   n_estimators=300, random_state=None)

> **Step 4:** Now that you have fit each of your models, you will use each to `predict` on the **testing_data**.

In [29]:
# Predict the test labels with each fitted ensemble, in order: bagging, random forest, AdaBoost
(My_BaggingClassifier_predict,
 My_RandomForestClassifier_predict,
 My_AdaBoostClassifier_predict) = [
    fitted_model.predict(testing_data)
    for fitted_model in (My_BaggingClassifier,
                         My_RandomForestClassifier,
                         My_AdaBoostClassifier)
]


> **Step 5:** Now that you have made your predictions, compare your predictions to the actual values using the function below for each of your models - this will give you the `score` for how well each of your models is performing. It might also be useful to show the Naive Bayes model again here, so we can compare them all side by side.

In [30]:
def print_metrics(y_true, preds, model_name=None):
    '''
    Print the accuracy, precision, recall, and F1 score of a set of predictions.

    INPUT:
    y_true - the y values that are actually true in the dataset (NumPy array or pandas series)
    preds - the predictions for those values from some model (NumPy array or pandas series)
    model_name - (str - optional) a name associated with the model if you would like to add it to the print statements

    OUTPUT:
    None - prints one line per metric, followed by blank lines as a separator
    '''
    # One row per metric: (scoring function, label when no name is given,
    # label prefix when a name is given). Only the accuracy line uses the
    # word "for"; the wording is kept exactly as the notebook prints it.
    metrics = (
        (accuracy_score, 'Accuracy score: ', 'Accuracy score for '),
        (precision_score, 'Precision score: ', 'Precision score '),
        (recall_score, 'Recall score: ', 'Recall score '),
        (f1_score, 'F1 score: ', 'F1 score '),
    )

    for metric, plain_label, named_prefix in metrics:
        # Identity test: `== None` can be fooled by objects that override __eq__
        if model_name is None:
            label = plain_label
        else:
            label = named_prefix + model_name + ' :'
        print(label, format(metric(y_true, preds)))

    # Blank lines so consecutive reports are visually separated
    print('\n\n')

In [34]:
# Report the three ensembles first, then Naive Bayes, so all four can be compared side by side
for model_preds, model_label in (
    (My_BaggingClassifier_predict, "My_BaggingClassifier"),
    (My_RandomForestClassifier_predict, "My_RandomForestClassifier"),
    (My_AdaBoostClassifier_predict, "My_AdaBoostClassifier"),
    (predictions, "Naive Bayes"),
):
    print_metrics(y_test, model_preds, model_label)


Accuracy score for My_BaggingClassifier : 0.9734386216798278
Precision score My_BaggingClassifier : 0.9021739130434783
Recall score My_BaggingClassifier : 0.8972972972972973
F1 score My_BaggingClassifier : 0.899728997289973



Accuracy score for My_RandomForestClassifier : 0.9827709978463748
Precision score My_RandomForestClassifier : 1.0
Recall score My_RandomForestClassifier : 0.8702702702702703
F1 score My_RandomForestClassifier : 0.930635838150289



Accuracy score for My_AdaBoostClassifier : 0.9770279971284996
Precision score My_AdaBoostClassifier : 0.9693251533742331
Recall score My_AdaBoostClassifier : 0.8540540540540541
F1 score My_AdaBoostClassifier : 0.9080459770114943



Accuracy score for Naive Bayes : 0.9885139985642498
Precision score Naive Bayes : 0.9720670391061452
Recall score Naive Bayes : 0.9405405405405406
F1 score Naive Bayes : 0.9560439560439562





### Recap

Now you have seen the whole process for a few ensemble models! 

1. **Import** the model.
2. **Instantiate** the model with the hyperparameters of interest.
3. **Fit** the model to the training data.
4. **Predict** on the test data.
5. **Score** the model by comparing the predictions to the actual values.

And that's it.  This is a very common process for performing machine learning.


### But, Wait...

You might be asking - 

* What do these metrics mean? 

* How do I optimize to get the best model?  

* There are so many hyperparameters to each of these models, how do I figure out what the best values are for each?

**This is exactly what the last two lessons of this course on supervised learning are all about.**

**Notice, you can obtain a solution to this notebook by clicking the orange icon in the top left!**
