In [111]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression

## Question 1

In [2]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [3]:
# a small digression: compare dummy classifiers with each other
baseline_names = ['stratified', 'most_frequent', 'prior', 'uniform']
baselines = (DummyClassifier(strategy=strategy) for strategy in baseline_names)

for name, baseline in zip(baseline_names, baselines):
    print(name)
    print(np.mean(cross_val_score(baseline, X, y, cv=10)))
    print()

stratified
0.36

most_frequent
0.33333333333333337

prior
0.33333333333333337

uniform
0.37333333333333335



In [4]:
models = [DummyClassifier(strategy='most_frequent'),
          MultinomialNB(),
          GaussianNB(),
          LinearSVC(),
          SVC(),
          LogisticRegression()]

for model in models:
    print(str(model), end='\n\n')
    print(np.mean(cross_val_score(model, X, y, cv=50)), end='\n\n')

DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

0.33333333333333326

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

0.9533333333333333

GaussianNB(priors=None)

0.9533333333333335

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

0.96

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

0.9733333333333334

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

0.96



All non-dummy classifiers have very high accuracy. The highest accuracy belongs to SVC

## Question 2

In [6]:
#data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data", header=None)
#data.to_csv("./data/abalone.csv", header=False, index=False)

In [7]:
data = pd.read_csv("./data/abalone.csv", header=None)
data.head()
data.shape

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


(4177, 9)

In [8]:
y0 = data[8].values
y = np.where(y0 <= 10, 0, 1)
y

X = data.loc[:, :8]
X[0] = X[0].astype('category').cat.codes
X = X.values
X


array([1, 0, 0, ..., 0, 0, 1])

array([[ 2.    ,  0.455 ,  0.365 , ...,  0.101 ,  0.15  , 15.    ],
       [ 2.    ,  0.35  ,  0.265 , ...,  0.0485,  0.07  ,  7.    ],
       [ 0.    ,  0.53  ,  0.42  , ...,  0.1415,  0.21  ,  9.    ],
       ...,
       [ 2.    ,  0.6   ,  0.475 , ...,  0.2875,  0.308 ,  9.    ],
       [ 0.    ,  0.625 ,  0.485 , ...,  0.261 ,  0.296 , 10.    ],
       [ 2.    ,  0.71  ,  0.555 , ...,  0.3765,  0.495 , 12.    ]])

In [9]:
# bake-off
models = [DummyClassifier(strategy='most_frequent'),
          MultinomialNB(),
          GaussianNB(),
          LinearSVC(),
          LogisticRegression()]

for model in models:
    print(str(model), end='\n\n')
    print(np.mean(cross_val_score(model, X, y, cv=50)), end='\n\n')

DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

0.6535741572562024

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

0.7325021339714257

GaussianNB(priors=None)

0.779039922757231

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

1.0

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

0.9995180722891567



Who wins the bake-off? The Linear SVC does! With perfect accuracy? Why might that be? Perhaps the data is linearly separable with no instance on the separation border.

### Now try a three class problem:

In [10]:
bins = np.array([0, 8.5, 10.5, 30])
y = np.digitize(y0, bins=bins)
y

array([3, 1, 2, ..., 2, 2, 3])

In [11]:
# This is an equal frequency discretisation
sum(y == 1) / len(y)
sum(y == 2) / len(y)
sum(y == 3) / len(y)

0.33684462532918363

0.3167344984438592

0.34642087622695716

In [12]:
for model in models:
    print(str(model), end='\n\n')
    print(np.mean(cross_val_score(model, X, y, cv=50, verbose=1)), end='\n\n')

DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

0.346439544610086

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

0.5330097829316858

GaussianNB(priors=None)



[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.1s finished


0.828374607674395

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)



[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   19.2s finished


0.9885471947227319

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

0.9571031665090366



[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.4s finished


Ha! The LinearSVM is still the most accurate albeit the slowest. Logistic Regression also performs well.

### Use all 29 classes

In [13]:
y = y0

In [14]:
for model in models:
    print(str(model), end='\n\n')
    print(np.mean(cross_val_score(model, X, y, cv=50, verbose=1)), end='\n\n')

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.1s finished


DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

0.1657654207032828

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)



[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.2s finished


0.1680083838124116

GaussianNB(priors=None)



[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.4s finished


0.9984267881910046

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)



[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.0min finished


0.3095220787448958

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

0.3028236857602358



[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   14.1s finished


Why is LinearSVC so slow? Because a model is built for pairs of classes. Why is GaussianNB so accurate?

## Question 3

In [87]:
adult = pd.read_csv('./data/adult.csv', header=None)

y = adult[14].astype('category').cat.codes
y

adult = adult.drop([14], axis=1)
adult.head()

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        1
8        1
9        1
10       1
11       1
12       0
13       0
14       1
15       0
16       0
17       0
18       0
19       1
20       1
21       0
22       0
23       0
24       0
25       1
26       0
27       1
28       0
29       0
        ..
32531    0
32532    1
32533    1
32534    0
32535    0
32536    1
32537    0
32538    1
32539    1
32540    0
32541    0
32542    0
32543    0
32544    0
32545    1
32546    0
32547    0
32548    0
32549    0
32550    0
32551    0
32552    0
32553    0
32554    1
32555    0
32556    0
32557    1
32558    0
32559    0
32560    1
Length: 32561, dtype: int8

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [89]:
categorical_columns = np.arange(14)[adult.dtypes == 'object']

adult_onehot = pd.get_dummies(adult, columns=categorical_columns)
adult_numerical = adult.select_dtypes(include='int64')

adult_onehot.head()
adult_numerical.head()

Unnamed: 0,0,2,4,10,11,12,1_?,1_Federal-gov,1_Local-gov,1_Never-worked,...,13_Portugal,13_Puerto-Rico,13_Scotland,13_South,13_Taiwan,13_Thailand,13_Trinadad&Tobago,13_United-States,13_Vietnam,13_Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,0,2,4,10,11,12
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40


In [112]:
# alternatively:

#enc = OneHotEncoder(handle_unknown='ignore')
#enc.transform(adult) couldn't convert string to float??

ValueError: could not convert string to float: 'United-States'

In [91]:
X = adult_numerical

for model in models:
    print(str(model), end='\n\n')
    print(np.mean(cross_val_score(model, X, y, cv=10, verbose=1)), end='\n\n')

DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

0.7591904489970196

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

0.7825619776418057

GaussianNB(priors=None)



[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished


0.7951845260142772

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)



[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   18.8s finished


0.5692325790076789

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

0.7975492777227503



[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.7s finished


LinearSVC has now become not only the slowest but also the least accurate. The zero-R classifier has quite high accuracy of 75%, and other classifiers do not improve much above this value.

In [93]:
X = adult_onehot

for model in models:
    print(str(model), end='\n\n')
    print(np.mean(cross_val_score(model, X, y, cv=10, verbose=1)), end='\n\n')

DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

0.7591904489970196

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)



[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.7s finished


0.7826541058042441

GaussianNB(priors=None)



[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.9s finished


0.795276635317317

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)



[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   44.3s finished


0.7840977739120201

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

0.7975185463326391



[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.8s finished


Adding one-hot encoded discrete attributes doesn't improve any classifiers except LinearSVC (yet it's still the worst)

## Question 4

In [106]:
#vec = DictVectorizer()
#X = vec.fit_transform(adult.values).toarray()