In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Perceptron
import numpy as np

In [None]:
# Load Iris and keep only the two petal measurements (columns 2 and 3) as features.
PETAL_COLUMNS = [2, 3]
iris = datasets.load_iris()
X, y = iris.data[:, PETAL_COLUMNS], iris.target

In [None]:
# stratify=y keeps the class-label proportions of the input set in both splits;
# test_size=.3 holds out 30% of the rows for testing.
split = train_test_split(X, y, test_size=.3, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = split

In [None]:
# Fit the scaler on the training split only (per-feature sample mean and standard
# deviation), then apply that same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = (sc.transform(part) for part in (X_train, X_test))

In [None]:

# Train a perceptron on the standardized training data and score it on the test split.
ppn = Perceptron(eta0=.1, random_state=1).fit(X_train_std, y_train)
y_pred = ppn.predict(X_test_std)

n_misclassified = (y_test != y_pred).sum()
print('Misclassified examples: %d' % n_misclassified)
print(f'Accuracy: {100 * accuracy_score(y_test,y_pred):.2f}%')

In [None]:
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

''' View decision regions for classifier '''
def plot_decision_regions(X,y,classifier,test_idx=None,resolution=.2):
    '''
        Draw a classifier's decision regions over two features and overlay the samples.

        @param X: 2-D array of samples; only columns 0 and 1 are used
        @param y: 1-D array of class labels aligned with the rows of X
                  (at most 5 distinct classes, one marker/color pair each)
        @param classifier: fitted object whose predict() accepts an (n, 2) array
        @param test_idx: optional indices (range, list or array) of the rows of X
                         to circle as test samples; None or empty draws no circles
        @param resolution: step of the grid on which the classifier is evaluated
    '''
    markers = ('o','s','^','v','<')
    colors = ('red','blue','lightgreen','gray','cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # Pad the plotted area by one unit on every side of the data.
    x1_min,x1_max = X[:,0].min() - 1, X[:,0].max() + 1
    x2_min,x2_max = X[:,1].min() - 1, X[:,1].max() + 1

    xx1,xx2 = np.meshgrid(np.arange(x1_min,x1_max,resolution),np.arange(x2_min,x2_max,resolution))

    # Predict a label for every grid point, then fold the flat result back into the grid shape.
    lab = classifier.predict(np.array([xx1.ravel(),xx2.ravel()]).T)
    lab = lab.reshape(xx1.shape)
    plt.contourf(xx1,xx2,lab,alpha=.3,cmap=cmap)
    plt.xlim(xx1.min(),xx1.max())
    plt.ylim(xx2.min(),xx2.max())

    for idx,cl in enumerate(classes):
        in_class = (y == cl)
        plt.scatter(
            x=X[in_class,0],
            y=X[in_class,1],
            alpha=.8,
            c=colors[idx],
            marker=markers[idx],
            label=f'Class {cl}',
            edgecolor='black'
        )

    # `if test_idx:` raises ValueError for NumPy arrays with more than one element,
    # so test for None and emptiness explicitly.
    if test_idx is not None:
        test_rows = np.asarray(test_idx)
        if test_rows.size > 0:
            X_marked = X[test_rows,:]
            plt.scatter(
                X_marked[:,0],
                X_marked[:,1],
                c='none',
                edgecolor='black',
                alpha=1,
                linewidth=1,
                marker='o',
                s=100,
                label='Test Set'
            )

# Stack the standardized splits back together, training rows first, so the test
# samples occupy rows 105-149 of the combined arrays.
X_combined_std = np.concatenate([X_train_std, X_test_std], axis=0)
y_combined = np.concatenate([y_train, y_test])

test_rows = range(105,150)
plot_decision_regions(X_combined_std, y_combined, classifier=ppn, test_idx=test_rows)
plt.xlabel('Petal Length [standardized]')
plt.ylabel('Petal Width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

# Logistic Regression

The Perceptron's major issue is that it never converges on datasets that aren't perfectly linearly separable, meaning that we can't optimize it much beyond what you see above.

Another, more powerful option is logistic regression. Logistic regression performs very well on linearly separable datasets.

The general premise of how LR works starts with the concept of odds, defined as P/(1-P). So, an event that has a 75% chance of happening would be 75/(100-75) -> 75/25 -> 3/1 or 3:1 odds. 

In [None]:
def odds(x):
    """Return the odds x / (1 - x) of an event that has probability x."""
    complement = 1 - x
    return x / complement
# Plot the odds against the probability expressed in percent. Passing the x values
# explicitly makes the axis show the real probability instead of the list index
# (which was off by one percent).
probabilities = np.arange(.01,1,.01)
plt.plot(100 * probabilities, odds(probabilities))
plt.xlabel('Positive Event Probability (Percentage)')
plt.ylabel('Event Odds')
plt.show()

If we treat our probabilities on a range of 0-1 (0% to 100%), we can then use the logit function to map those odds onto the entire real number spectrum, defined as logit(p) = log(p/(1-p)).

In [None]:
def logit(p):
    """Return the natural log of the odds of probability p (the log-odds)."""
    ratio = odds(p)
    return np.log(ratio)

# Plot the logit against the probability itself. Without explicit x values the axis
# showed the list index 0..98 while being labelled as a probability.
probabilities = np.arange(.01,1,.01)
plt.plot(probabilities, logit(probabilities))
plt.axhline(0,color='k')
plt.xlabel('Positive Event Probability')
# The curve is log(odds), not the odds themselves.
plt.ylabel('Log-Odds (logit)')
plt.show()

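We're close, but we need the opposite direction: given a real-valued input z computed from the features, we want the probability of the positive class (the logit takes the probability as its input and returns the real value). To do this, we use the inverse of the logit function, known as the logistic sigmoid function $\sigma(z) = 1/(1+e^{-z})$.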

In [None]:
def sigmoid(z):
    """Logistic sigmoid 1 / (1 + e^-z); maps any real z into the interval (0, 1)."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

# Evaluate the sigmoid on z in [-7, 7) and plot the resulting curve.
z = np.arange(-7,7,.1)
sigma_z = sigmoid(z)

plt.plot(z,sigma_z)
plt.axvline(0,color='k')
plt.ylim(-.1,1.1)
plt.xlabel('z')
# Raw string: '\s' is not a valid escape sequence, so a plain literal triggers an
# invalid-escape warning on current Python versions.
plt.ylabel(r'$\sigma (z)$')
plt.yticks([0,.5,1])
ax = plt.gca()
ax.yaxis.grid(True)
plt.tight_layout()
plt.show()

The major benefit of this over the thresholding used for the Perceptron is that we can also extract an estimate of the probability that a data record belongs to a specific class. This is used for things such as weather forecasting, to determine not only whether it's going to rain, but also how likely it is.

In [None]:
def loss_1(z):
    """Logistic loss for a positive example (y=1): -log(sigmoid(z))."""
    predicted = sigmoid(z)
    return -np.log(predicted)
def loss_0(z):
    """Logistic loss for a negative example (y=0): -log(1 - sigmoid(z))."""
    predicted = sigmoid(z)
    return -np.log(1-predicted)

# Plot both per-class losses against the predicted probability sigmoid(z).
z = np.arange(-10,10,.1)
sigma_z = sigmoid(z)

# The loss helpers are built from NumPy operations, so they accept the whole array at once.
c1 = loss_1(z)
plt.plot(sigma_z,c1,label='L(w,b) if y=1')

c0 = loss_0(z)
plt.plot(sigma_z,c0,linestyle='--',label='L(w,b) if y=0')

plt.ylim(0.0,5.1)
plt.xlim([0,1])

# Raw string: '\s' is not a valid escape sequence, so a plain literal triggers an
# invalid-escape warning on current Python versions.
plt.xlabel(r'$\sigma(z)$')
plt.ylabel('L(w,b)')

plt.legend()
plt.tight_layout()
plt.show()

# Logistic Regression & Gradient Descent Implementation

In [None]:
class LogisticRegressionGD:
    '''
        Binary logistic regression classifier trained with full-batch gradient descent.

        @param eta: learning rate
        @param n_iter: total epochs
        @param random_state = seed for random weight generation

        Attributes set by fit():
            w_: weight vector with one entry per feature
            b_: bias term
            losses_: mean log loss recorded once per epoch
    '''
    def __init__(self,eta=0.01,n_iter=50,random_state=1):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self,X,y):
        '''
            Learn w_ and b_ from the training data.

            @param X: training data, 2-D array with one row per sample
            @param y: target values for training (expected to be 0/1 labels)
            @return: self
        '''
        rgen = np.random.RandomState(self.random_state)
        self.w_ = rgen.normal(loc=0.0,scale=0.01,size=X.shape[1])
        # np.float_ was removed in NumPy 2.0; np.float64 is the same type under its stable name.
        self.b_ = np.float64(0.)
        self.losses_ = []

        n_samples = X.shape[0]
        eps = np.finfo(float).eps

        for _ in range(self.n_iter):
            output = self.activation(self.net_input(X))
            errors = (y-output)
            # The constant factor 2.0 only rescales the step, i.e. it acts as a doubled learning rate.
            self.w_ += self.eta * 2.0 * X.T.dot(errors) / n_samples
            self.b_ += self.eta * 2.0 * errors.mean()
            # Keep probabilities strictly inside (0, 1): a saturated sigmoid would give
            # log(0) = -inf and turn the loss into NaN.
            clipped = np.clip(output,eps,1.0 - eps)
            # Both log-likelihood terms are averaged over the samples
            # (previously only the second term was divided by n).
            loss = (
                -y.dot(np.log(clipped)) - (1-y).dot(np.log(1-clipped))
            ) / n_samples
            self.losses_.append(loss)
        return self

    def net_input(self,X):
        ''' Linear combination of the features: X . w_ + b_ '''
        return np.dot(X,self.w_) + self.b_

    def activation(self,z):
        ''' Sigmoid of z; z is clipped to [-250, 250] before exponentiation to avoid overflow. '''
        return 1. / (1. + np.exp(-np.clip(z,-250,250))) #Sigmoid

    def predict(self,X):
        ''' Class label per row of X: 1 where the sigmoid output is >= .5, else 0. '''
        return np.where(self.activation(self.net_input(X)) >= .5 , 1, 0)

In [None]:
# The hand-written classifier is binary, so train it on classes 0 and 1 only.
binary_mask = (y_train == 0) | (y_train == 1)
X_train_01_subset = X_train_std[binary_mask]
y_train_01_subset = y_train[binary_mask]

lrgd = LogisticRegressionGD(eta=.3, n_iter=1000, random_state=1)
lrgd.fit(X_train_01_subset, y_train_01_subset)

plot_decision_regions(X_train_01_subset, y_train_01_subset, classifier=lrgd)
plt.xlabel('Petal Length [standardized]')
plt.ylabel('Petal Width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

# Sklearn Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# LogisticRegression's multi_class argument is deprecated (scikit-learn 1.5+).
# Wrapping the estimator in OneVsRestClassifier trains the same one-vs-rest scheme
# without relying on it.
lr = OneVsRestClassifier(LogisticRegression(C=100.0,solver='lbfgs'))
lr.fit(X_train_std,y_train)

plot_decision_regions(X=X_combined_std,y=y_combined,classifier=lr,test_idx=range(105,150))
plt.xlabel('Petal Length [standardized]')
plt.ylabel('Petal Width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Probability that each of the first three test flowers belongs to each class.
first_three_flowers = X_test_std[:3,:]
lr.predict_proba(first_three_flowers)

# Handling Overfitting via Regularization (the C parameter)

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# C is inversely proportional to the regularization parameter, so decreasing its value
# means increasing the regularization strength, as shown below.
# This graph shows only the weight coefficients of the Iris-versicolor-versus-rest
# classifier (class 1). Weight coefficients approaching 0 lead to underfitting due to
# over-aggressive regularization. So, paradoxically, where regularization can reduce
# overfitting, over-regularization can lead to underfitting.

weights,params = [],[]
for c in np.arange(-5,5):
    # OneVsRestClassifier replaces the deprecated multi_class='ovr' argument; the separate
    # name keeps the `lr` model fitted in the earlier cell intact.
    lr_c = OneVsRestClassifier(LogisticRegression(C=10.**c))
    lr_c.fit(X_train_std,y_train)
    # estimators_[1] is the binary class-1-vs-rest model; its coef_ holds a single row.
    weights.append(lr_c.estimators_[1].coef_[0])
    params.append(10.**c)
weights = np.array(weights)
plt.plot(params,weights[:,0],label='Petal Length')
plt.plot(params,weights[:,1],linestyle='--',label='Petal Width')
plt.ylabel('Weight Coefficient')
plt.xlabel('C')
plt.legend(loc='upper left')
plt.xscale('log')
plt.show()

# Support Vector Machines

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM on the standardized petal features.
svm = SVC(C=1, kernel='linear', random_state=1)
svm.fit(X_train_std, y_train)

plot_decision_regions(
    X_combined_std,
    y_combined,
    classifier=svm,
    test_idx=range(105,150),
)
plt.xlabel('Petal Length [standardized]')
plt.ylabel('Petal Width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# If our dataset is larger than memory, an alternative is SGDClassifier, which is trained
# with stochastic gradient descent and supports incremental learning through partial_fit.

from sklearn.linear_model import SGDClassifier

# loss='perceptron' gives a perceptron, 'log_loss' ('log' before scikit-learn 1.1) logistic
# regression, and 'hinge' a linear SVM.
# Bound to its own name so the fitted Perceptron `ppn` used by the earlier cells is not overwritten.
sgd_ppn = SGDClassifier(loss='perceptron')

## Kernel SVMs for nonlinear classification

In [None]:
# Creating a synthetic XOR data set for non-linear classification
np.random.seed(1)

X_xor = np.random.randn(200,2)
# XOR of the signs of the two features: class 1 when exactly one of them is positive.
# Both thresholds must be 0; the previous `> 1` on the second feature skewed the
# quadrants so the data no longer formed an XOR pattern.
y_xor = np.logical_xor(X_xor[:,0] > 0,X_xor[:,1] > 0)
y_xor = np.where(y_xor,1,0)

plt.scatter(
    X_xor[y_xor == 1,0],
    X_xor[y_xor == 1,1],
    c='royalblue',
    marker='s',
    label='Class 1'
)
plt.scatter(
    X_xor[y_xor == 0,0],
    X_xor[y_xor == 0,1],
    c='tomato',
    marker='o',
    label='Class 0'
)
plt.xlim([-3,3])
plt.ylim([-3,3])
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

## Classification using the Radial Basis Function (RBF), aka Gaussian Kernel

In [None]:
# RBF-kernel SVM fitted to the synthetic XOR data.
svm = SVC(C=10.0, gamma=.1, kernel='rbf', random_state=1)
svm.fit(X_xor, y_xor)

plot_decision_regions(X=X_xor, y=y_xor, classifier=svm)
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

## Application of RBF on Iris Dataset

In [None]:
# RBF-kernel SVM on the standardized Iris petal features with a small gamma.
svm = SVC(C=1.0, gamma=.2, kernel='rbf', random_state=1)
svm.fit(X_train_std, y_train)

plot_decision_regions(
    X_combined_std,
    y_combined,
    classifier=svm,
    test_idx=range(105,150),
)
plt.xlabel('Petal Length [standardized]')
plt.ylabel('Petal Width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Same model with a very large gamma (100.0).
HIGH_GAMMA = 100.0
svm = SVC(C=1.0, gamma=HIGH_GAMMA, kernel='rbf', random_state=1)
svm.fit(X_train_std, y_train)

plot_decision_regions(
    X_combined_std,
    y_combined,
    classifier=svm,
    test_idx=range(105,150),
)
plt.xlabel('Petal Length [standardized]')
plt.ylabel('Petal Width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

# Decision Tree Learning

Decision trees are easily interpretable, as we can think of them as breaking our data down and classifying it using a series of questions. \
These questions are generated by starting at the root and splitting it and each subsequent node until all leaf nodes are pure, i.e., all examples at each leaf belong \
to the same class. At each node we split on the feature that provides the largest Information Gain (IG), as those tend to be the most informative \
features.

# Entropy Impurity Measure

Entropy is 0 when all examples at a node belong to the same class, and it is largest when the distribution of class \
labels at a node is even. Splitting on entropy therefore aims to maximize the mutual information in the tree.

In [None]:
def entropy(p):
    """Binary entropy, in bits, of a class-membership probability p.

    Accepts a scalar or an array with values in [0, 1]. The endpoints p = 0 and
    p = 1 return 0 (the limit of x*log2(x) as x -> 0) instead of NaN.
    """
    p = np.asarray(p, dtype=float)
    q = 1.0 - p
    # log2(0) is evaluated and then discarded by np.where, so silence its warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        term_p = np.where(p > 0, -(p * np.log2(p)), 0.0)
        term_q = np.where(q > 0, -(q * np.log2(q)), 0.0)
    return term_p + term_q

x = np.arange(0.0,1.0,0.01)
# p = 0 is stored as None instead of being passed to entropy().
ent = [None if p == 0 else entropy(p) for p in x]

plt.plot(x,ent)
plt.xlabel('Class Membership Probability p(i=1)')
plt.ylabel('Entropy')
plt.show()

## Visualization of Entropy, Gini, and Classification Impurity Measures

In [None]:
def gini(p):
    """Gini impurity of a two-class node whose positive-class probability is p."""
    q = 1-p
    return p*q + q*(1-q)
def error(p):
    """Misclassification error of a two-class node: 1 minus the larger class probability."""
    class_probabilities = [p,1-p]
    return 1-np.max(class_probabilities)

# Entropy halved so that it peaks at .5 like the other two measures; None/zero entries stay None.
sc_ent = [None if not e else e*.5 for e in ent]
err = [error(i) for i in x]

fig = plt.figure()
ax = plt.subplot(111)

# One (values, legend label, line style, color) tuple per impurity curve.
curves = (
    (ent, 'Entropy', '-', 'black'),
    (sc_ent, 'Entropy (Scaled)', '-', 'lightgray'),
    (gini(x), 'Gini Impurity', '--', 'red'),
    (err, 'Misclassification Error', '-.', 'green'),
)
for values, lab, ls, c in curves:
    ax.plot(x, values, label=lab, linestyle=ls, lw=2, color=c)

ax.legend(loc='upper center', bbox_to_anchor=(.5,1.15), ncol=5, fancybox=True, shadow=False)
for level in (.5, 1.0):
    ax.axhline(y=level, linewidth=1, color='k', linestyle='--')
plt.ylim([0,1.1])
plt.xlabel('p(i=1)')
plt.ylabel('Impurity Index')
plt.show()

# Implementing a Decision Tree in SKlearn

In [None]:
from sklearn.tree import DecisionTreeClassifier
# The tree is fitted on the unscaled petal measurements, so the axes below are in cm.
tree_model = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)
tree_model.fit(X_train, y_train)

X_combined = np.concatenate([X_train, X_test], axis=0)
y_combined = np.concatenate([y_train, y_test])

plot_decision_regions(
    X_combined,
    y_combined,
    classifier=tree_model,
    test_idx=range(105,150),
)
plt.xlabel('Petal Length [cm]')
plt.ylabel('Petal Width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()


# Visualizing Our Tree

In [None]:
from sklearn import tree

# tree_model was fitted on the two petal columns only (X = iris.data[:,[2,3]]), so the
# names must list exactly those two features in that order. Passing all four Iris
# names labelled the petal splits as 'Sepal Length' / 'Sepal Width'.
feature_names = ['Petal Length','Petal Width']
tree.plot_tree(tree_model,feature_names=feature_names,filled=True)
plt.show()

# Random Forest Implementation

In [None]:
import re

# Scratch check, unrelated to the classifiers: consecutive 4-character alphanumeric chunks of a string.
four_char_chunks = re.compile(r'[a-zA-Z0-9]{4}')
four_char_chunks.findall('thisthatthose')

In [None]:
# Scratch check, unrelated to the classifiers: a dict may mix int and str keys.
d = dict([(100000, 1), ('B', 2), ('C', 3)])
d[100000]

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 25 trees fitted on the unscaled petal features, using 2 parallel jobs.
forest = RandomForestClassifier(n_estimators=25, n_jobs=2, random_state=1)
forest.fit(X_train, y_train)

plot_decision_regions(X=X_train, y=y_train, classifier=forest)
plt.xlabel('Petal Length [cm]')
plt.ylabel('Petal Width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

# K-Nearest Neighbors Implementation

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 5-nearest-neighbours classifier; Minkowski distance with p=2 is the Euclidean distance.
knn = KNeighborsClassifier(n_neighbors=5,p=2,metric='minkowski')
knn.fit(X_train_std,y_train)

plot_decision_regions(X_combined_std,y_combined,classifier=knn,test_idx=range(105,150))
plt.xlabel('Petal Length [standardized]')
# The second feature column is petal width; this axis was mislabelled as petal length.
plt.ylabel('Petal Width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()