# Guessing the number: linear regression

## Using more variables

In [1]:
from sklearn.datasets import load_boston
from sklearn.preprocessing import scale
boston = load_boston()
X = scale(boston.data)
y = boston.target

In [2]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression(normalize=True)
regression.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [3]:
print(regression.score(X, y))

0.7406077428649428


In [4]:
print([a + ':' + str(round(b, 2)) for a, b in zip(
    boston.feature_names, regression.coef_,)])

['CRIM:-0.92', 'ZN:1.08', 'INDUS:0.14', 'CHAS:0.68', 'NOX:-2.06', 'RM:2.67', 'AGE:0.02', 'DIS:-3.1', 'RAD:2.66', 'TAX:-2.08', 'PTRATIO:-2.06', 'B:0.86', 'LSTAT:-3.75']


## Understanding limitations and potential problems

# Moving to Logistic Regression

## Applying logistic regression

In [5]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data[:-1,:]
y = iris.target[:-1]

In [6]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic.fit(X, y)
single_row_pred = logistic.predict(
    iris.data[-1, :].reshape(1, -1))
single_row_pred_proba = logistic.predict_proba(
    iris.data[-1, :].reshape(1, -1))
print ('Predicted class %s, real class %s' 
       % (single_row_pred, iris.target[-1]))
print ('Probabilities for each class from 0 to 2: %s' 
       % single_row_pred_proba)

Predicted class [2], real class 2
Probabilities for each class from 0 to 2: [[0.00168787 0.28720074 0.71111138]]


## Considering when classes are more than two

In [7]:
from sklearn.datasets import load_digits
digits = load_digits()
train = range(0, 1700)
test = range(1700, len(digits.data))
X = digits.data[train]
y = digits.target[train]
tX = digits.data[test]
ty = digits.target[test]

In [8]:
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.multiclass import OneVsOneClassifier
OVR = OneVsRestClassifier(logistic).fit(X, y)
OVO = OneVsOneClassifier(logistic).fit(X, y)
print('One vs rest accuracy: %.3f' % OVR.score(tX, ty))
print('One vs one accuracy: %.3f' % OVO.score(tX, ty))

One vs rest accuracy: 0.938
One vs one accuracy: 0.969


# Making Things as Simple as Naïve Bayes

## Predicting text classifications

In [9]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(
    subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(
    subset='test', remove=('headers', 'footers', 'quotes'))

In [10]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
Bernoulli = BernoulliNB(alpha=0.01)
Multinomial = MultinomialNB(alpha=0.01)

In [11]:
import sklearn.feature_extraction.text as txt
multinomial = txt.HashingVectorizer(stop_words='english', 
                                binary=False, norm=None)
binary = txt.HashingVectorizer(stop_words='english',
                           binary=True, norm=None)

In [12]:
import numpy as np
target = newsgroups_train.target
target_test = newsgroups_test.target
multi_X = np.abs(
    multinomial.transform(newsgroups_train.data))
multi_Xt = np.abs(
    multinomial.transform(newsgroups_test.data))
bin_X = binary.transform(newsgroups_train.data)
bin_Xt = binary.transform(newsgroups_test.data)

In [13]:
Multinomial.fit(multi_X, target)
Bernoulli.fit(bin_X, target)

from sklearn.metrics import accuracy_score
for name, model, data in [('BernoulliNB', Bernoulli, bin_Xt), 
                      ('MultinomialNB', Multinomial, multi_Xt)]:
    accuracy = accuracy_score(y_true=target_test, 
                              y_pred=model.predict(data))
    print ('Accuracy for %s: %.3f' % (name, accuracy))

Accuracy for BernoulliNB: 0.570
Accuracy for MultinomialNB: 0.651


In [14]:
print('number of posts in training: %i' 
      % len(newsgroups_train.data))
D={word:True for post in newsgroups_train.data 
   for word in post.split(' ')}
print('number of distinct words in training: %i' 
      % len(D))
print('number of posts in test: %i' 
      % len(newsgroups_test.data))

number of posts in training: 11314
number of distinct words in training: 300972
number of posts in test: 7532


# Exploring Lazy Learning with K-nearest Neighbors

## Predicting after observing neighbors

In [15]:
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
digits = load_digits()
train = range(0, 1700)
test = range(1700, len(digits.data))
pca = PCA(n_components = 25)
pca.fit(digits.data[train])
X = pca.transform(digits.data[train]) 
y = digits.target[train]
tX = pca.transform(digits.data[test]) 
ty = digits.target[test]

In [16]:
from sklearn.neighbors import KNeighborsClassifier
kNN = KNeighborsClassifier(n_neighbors=5, p=2)
kNN.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [17]:
print('Accuracy: %.3f' % kNN.score(tX,ty) )
print('Prediction: %s Actual: %s' 
      % (kNN.predict(tX[-15:,:]),ty[-15:]))

Accuracy: 0.990
Prediction: [2 2 5 7 9 5 4 8 1 4 9 0 8 9 8] Actual: [2 2 5 7 9 5 4 8 8 4 9 0 8 9 8]


## Choosing wisely your k parameter

In [18]:
for k in [1, 5, 10, 50, 100, 200]:
    kNN = KNeighborsClassifier(n_neighbors=k).fit(X, y)
    print('for k = %3i accuracy is %.3f' 
          % (k, kNN.score(tX, ty)))

for k =   1 accuracy is 0.979
for k =   5 accuracy is 0.990
for k =  10 accuracy is 0.969
for k =  50 accuracy is 0.959
for k = 100 accuracy is 0.959
for k = 200 accuracy is 0.907
