# SI 618 Day 11: Classification

Version 2023.11.08.1.CT

Today we're going to focus on classification and model evaluation. 

In [None]:
import seaborn as sns
import pandas as pd

In [None]:
# Load the iris measurements bundled with seaborn.
data = sns.load_dataset(name='iris')
# Count the non-null entries of every column within each species group.
data.groupby(by='species').count()

In [None]:
from sklearn.model_selection import train_test_split

# "target" contains the column name of the classification labels
target = "species"

# Split settings: seed for the shuffle, and the fraction of rows held out for testing.
random_state = 7
test_size = 0.33

# Labels are the target column; features are every remaining column.
y = data[target]
X = data.drop(columns=target)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)


In [None]:
# number of rows that ended up in the held-out test split
len(X_test)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the training split
# (fit() returns the estimator itself, so the call can be chained).
model = GaussianNB().fit(X_train, y_train)

# Predict labels for the test rows; `predictions` is a second name for the same array.
predictions = y_pred = model.predict(X_test)


In [None]:
# labels the model predicted for the test rows
y_pred

In [None]:
# true test labels as a plain array, for eyeballing against the predictions
y_test.values

$\text{accuracy} = \frac{\text{True Positives} + \text{True Negatives}}{\text{All Samples}}$

$\text{precision} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Positives}}$

$\text{recall} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}$

$F_1 = \frac{2 \times (\text{precision} \times \text{recall})}{\text{precision} + \text{recall}}$

In [None]:
# grid of pairwise feature plots, coloured by species
sns.pairplot(data=data, hue='species')

In [None]:
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


## Support Vector Machines

In [None]:
from sklearn import svm

# RBF-kernel support vector classifier, trained on the same training split.
model = svm.SVC(kernel="rbf", gamma="scale").fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy, confusion matrix and per-class report, one after the other.
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)


## Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from IPython.display import Image

In [None]:
# Entropy-based decision tree, limited to depth 6 and seeded for repeatability;
# fit() returns the estimator, so construction and training are chained.
model = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    max_depth=6,
    min_samples_split=2,
    random_state=3,
).fit(X_train, y_train)

y_pred = model.predict(X_test)

# evaluate predictions: score() reports mean accuracy on the given data
accuracy = model.score(X_test, y_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

In [None]:
# true labels of the test split
y_test

In [None]:
# based on https://mljar.com/blog/visualize-decision-tree/
fig = plt.figure(figsize=(25,20))
# plot_tree returns the list of drawn annotations; binding it to '_' keeps the
# notebook from echoing that list underneath the figure.
_ = plot_tree(model,
              # Feature names in the column order the model was fitted on. Passed as
              # a plain list because some scikit-learn releases reject a pandas Index.
              feature_names=list(X.columns),
              # Take the class labels from the fitted model: classes_ is in the order
              # the tree indexes its outputs. y.unique() is in order of first
              # appearance, which only matches when the data happens to be ordered.
              class_names=model.classes_.tolist(),
              filled=True)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

# Fraction of test rows the decision tree classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Raw accuracy, then the confusion matrix, then the per-class report.
print(
    accuracy,
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

## Random Forests

In [None]:
seed = 42

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Seed the forest so its randomised training gives the same trees on every run.
# Without it the accuracy below, the feature importances, and the later
# cross-validation / grid search that reuse `model` change from run to run.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=seed,
    criterion='entropy',
    max_depth=None,
    min_samples_split=2)

model.fit(X_train,y_train)

y_pred = model.predict(X_test)

# evaluate predictions: score() reports mean accuracy on the test split
accuracy = model.score(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix followed by the per-class precision / recall / F1 report.
print(
    confusion_matrix(y_true=y_test, y_pred=y_pred),
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

In [None]:
# Importance the fitted model assigns to each feature, one value per column of X.
feat_importance = model.feature_importances_

# Horizontal bar chart with one bar per feature, labelled by column name.
pd.DataFrame(
    {'Feature Importance': feat_importance},
    index=X.columns.tolist(),
).plot.barh()

##  Cross-validation




### Tuning the model
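The settings we choose before training a model (for example, the number of trees in a forest or the maximum tree depth) are called _hyperparameters_.  Setting them to appropriate values by hand and re-running the experiment each time can be tedious, so we use a technique called cross-validation to compare candidate settings.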

From http://scikit-learn.org/stable/modules/cross_validation.html:

> When evaluating different settings (“hyperparameters”) for estimators, such as the C setting that must be manually set for an SVM, there is still a risk of overfitting on the test set because the parameters can be tweaked until the estimator performs optimally. This way, knowledge about the test set can “leak” into the model and evaluation metrics no longer report on generalization performance. To solve this problem, yet another part of the dataset can be held out as a so-called “validation set”: training proceeds on the training set, after which evaluation is done on the validation set, and when the experiment seems to be successful, final evaluation can be done on the test set.

> However, by partitioning the available data into three sets, we drastically reduce the number of samples which can be used for learning the model, and the results can depend on a particular random choice for the pair of (train, validation) sets.

> A solution to this problem is a procedure called cross-validation (CV for short). A test set should still be held out for final evaluation, but the validation set is no longer needed when doing CV. In the basic approach, called k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:

> A model is trained using $k-1$ of the folds as training data;
the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy).
The performance measure reported by k-fold cross-validation is then the average of the values computed in the loop. This approach can be computationally expensive, but does not waste too much data (as is the case when fixing an arbitrary validation set), which is a major advantage in problems such as inverse inference where the number of samples is very small.

The mean score and the 95% confidence interval (2 x the standard deviation) of the score estimate are hence given by:


In [None]:
# display the estimator currently bound to `model`
model

In [None]:
# the full label column (all rows, not just one split)
y


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation over the full dataset; n_jobs=-1 uses every available core.
scores = cross_val_score(estimator=model, X=X, y=y, cv=5, n_jobs=-1)

# Mean fold accuracy and twice the standard deviation across folds, both as percentages.
print(
    "Accuracy: %0.2f%% (+/- %0.2f)"
    % (scores.mean() * 100, scores.std() * 2 * 100)
)

But let's say we want to fiddle with the hyperparameters.

To do this, we first set up a parameter grid with the lists of parameters we want to try:

In [None]:
# Candidate hyperparameter values: every combination of these two lists is tried.
param_grid = dict(
    # 5 to 40 in steps of 5, then two larger forests
    n_estimators=[*range(5, 45, 5), 50, 100],
    max_depth=[2, 5, 7, 9],
)

We now run the cross-validation on the classifier. Note: this will take a while (why?).

In [None]:
# display the estimator that will be handed to the grid search
model

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid, scoring each combination with 10-fold
# cross-validation; n_jobs=-1 spreads the fits over every available core.
grid_clf = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
)
grid_clf.fit(X, y)

In [None]:
# the estimator built with the best-scoring parameter combination from the grid search
grid_clf.best_estimator_

In [None]:
# the parameter combination that achieved the best mean cross-validated score
grid_clf.best_params_

In [None]:
# The per-combination search results load straight into a DataFrame;
# preview the first five rows.
results = pd.DataFrame(data=grid_clf.cv_results_)
results.head(n=5)

In [None]:
# Continue with the estimator the grid search selected. Reading it from grid_clf
# keeps `model` in step with best_params_; a hand-copied set of hyperparameters
# here would be overwritten immediately and could disagree with the search result.
model = grid_clf.best_estimator_

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the selected estimator over the full dataset.
# NOTE(review): the hyperparameters were tuned on this same data, so this score
# is likely optimistic - confirm against a held-out set.
scores = cross_val_score(estimator=model, X=X, y=y, cv=10)

# Mean fold accuracy and twice the standard deviation across folds, both as percentages.
print(
    "Accuracy: %0.2f%% (+/- %0.2f)"
    % (scores.mean() * 100, scores.std() * 2 * 100)
)

## BREAK

# Part 2: Your first Kaggle competition

In preparation for tackling the next homework assignment, we are going to spend some time in class working through your first [Kaggle competition](https://www.kaggle.com/competitions).

Specifically, we'll be working through the [Titanic competition](https://www.kaggle.com/c/titanic).

You'll need to download the dataset, which consists of three files: train.csv, test.csv, and gender_submission.csv.

The dataset has already been split into training and testing datasets, although you don't have access to the correct labels for the testing dataset.  You create a csv file that you submit to Kaggle, which will report your overall accuracy on the test dataset.  You will not be told *which* rows were correctly classified.

We recommend that you use cross-validation to evaluate your model, and then use the entire training dataset to train your model before applying it to the test dataset.

Your target accuracy is 85% (i.e. you should keep trying until you achieve at least 85%).  You can look at some of the other notebooks that have been submitted but be careful not to cheat (some of the notebooks that achieve 100% accuracy do so by downloading the solution data file -- that's cheating). Good luck!


You might find it useful to use one or more of the following classifiers:

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# from https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
# Each display name is written next to its estimator so the two cannot drift apart;
# the pairs are then unzipped into the parallel lists `names` and `classifiers`.
_named_classifiers = [
    ("Logistic Regression", LogisticRegression()),
    ("Nearest Neighbors", KNeighborsClassifier(n_neighbors=3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Gaussian Process", GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB()),
]

names, classifiers = map(list, zip(*_named_classifiers))

In [None]:
# start by loading in the dataset and conducting some basic EDA

In [None]:
# train your models and then evaluate them using cross-validation

In [None]:
# apply your best model to the test set and create a submission file

In [None]:
# upload your submission file to Kaggle and record your score