## Sklearn
* Scikit Learn
* https://scikit-learn.org/stable/
* built on Numpy, Scipy, Matplotlib 
* works with pandas
* workflow might be
    * pandas/numpy to load and manipulate data
    * sklearn to build and validate a model
    * matplotlib to visualize results
    * pandas/numpy to save results
* started by a Google intern in 2007 and opensource for anyone to use
* also has build in metric calculations, feature extraction and transformation tools

#### models follow a pretty similar syntax

#### very well documented code base, easy to follow and understand as well as extract information from a trained model

## Logistic Regression
* https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [71]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [72]:
df = pd.read_csv("../data/iris.csv")

In [73]:
df.head(1)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa


#### we will make a binary classificaiton, where 1 is the Setosa class and 0 is all other classes

In [74]:
df["classification"] = np.where(df["variety"] == "Setosa",1, 0)

In [75]:
df.sample(5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,classification
36,5.5,3.5,1.3,0.2,Setosa,1
13,4.3,3.0,1.1,0.1,Setosa,1
90,5.5,2.6,4.4,1.2,Versicolor,0
45,4.8,3.0,1.4,0.3,Setosa,1
145,6.7,3.0,5.2,2.3,Virginica,0


#### split out our features and our target

In [106]:
x = df.drop(["variety", "classification", "petal.length", "petal.width", "sepal.width"], axis = 1)

In [107]:
y = df["classification"]

#### initialize our model

In [108]:
reg = LogisticRegression() # we store model object: what information it has, and how to interact with it

#### fit or train our model using the fit() method and passing in our x and y

In [109]:
reg.fit(x,y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

#### make predictions using our model, using the predict() method and passing in some feature data
* note, feature data must be the same size/schema, we can't make a model on 5 features and pass in 10

In [110]:
yhat = reg.predict(x)

In [111]:
yhat

array([0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [112]:
np.array(y)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

#### we can score our model using the accuracy score from sklearn

In [113]:
accuracy_score(y, yhat)

0.74

#### now we want to gather our coefficients, perhaps for interpretation.  we do so by accessing to coef_ attribuet from out model object

In [84]:
reg.coef_

array([[-0.99519005,  2.40545503, -3.13030777]])

#### ordering stays consistent, so we can zip the columns from out dataframe that was passed into the model and our coefficients
* note ethe coef_ is a nested array, so we have to get the values from the first element

In [85]:
for i in zip(x.columns, reg.coef_[0]):
    print(i)

('sepal.length', -0.9951900465348612)
('sepal.width', 2.405455025159528)
('petal.width', -3.130307770074779)


#### we also need to get our intercept, as this is a regression problem

In [86]:
reg.intercept_

array([0.34292399])

#### Logistic Equation
.247 + .41(sepal.length) + 1.46(sepal.width) + -2.26(petal.length) + -1.02(petal.width)

In [87]:
x.head(2)

Unnamed: 0,sepal.length,sepal.width,petal.width
0,5.1,3.5,0.2
1,4.9,3.0,0.2


#### custom prediction function

In [88]:
def custom_predict(sepal_length, sepal_width, petal_width):
    return 0.34292399 + -0.9951900465348612*(sepal_length) + 2.405455025159528*(sepal_width) + -3.130307770074779*(petal_width)

In [89]:
pred = custom_predict(5.1, 3.5, .2)
pred

3.0604857867156

In [90]:
pred = custom_predict(4.9, 3.5, .2)
pred

3.259523796022572

In [91]:
y.head(2)

0    1
1    1
Name: classification, dtype: int64

#### let's us numpy to make this cleaener

In [92]:
reg.intercept_[0] + np.sum(np.multiply(reg.coef_[0], np.array(x.iloc[149,:])))

-3.946886193862824

In [93]:
np.array(x.head(1))

array([[5.1, 3.5, 0.2]])

In [94]:
def pred(features, coefs, intercept):
    return intercept + np.sum(np.multiply(features, coefs))

In [95]:
p = pred(np.array(x.head(1)), reg.coef_[0], reg.intercept_[0])
p

3.0604857880644745

#### predict probability
* Positive class probabilities are computed as
* 1 / (1 + np.exp(-self.decision_function(X))) where decision function is
* .247 + .371*(sepal_length) + 1.409*(sepal_width) + -2.152*(petal_length) + -.954*(petal_width)
* we are just applying the sigmoid function to our decision function

In [96]:
reg.predict_proba(x.head(2))

array([[0.04476692, 0.95523308],
       [0.11336745, 0.88663255]])

#### we can get the decision function using some matrix multiplication then summing across the axis and adding the intercept back in

In [97]:
# we can get the decision function using some matrix multiplication
# then summing across the axis
x.head()*reg.coef_

Unnamed: 0,sepal.length,sepal.width,petal.width
0,-5.075469,8.419093,-0.626062
1,-4.876431,7.216365,-0.626062
2,-4.677393,7.697456,-0.626062
3,-4.577874,7.456911,-0.626062
4,-4.97595,8.659638,-0.626062


In [98]:
(x.head()*reg.coef_).sum(1)

0    2.717562
1    1.713872
2    2.394001
3    2.252975
4    3.057626
dtype: float64

In [99]:
(x*reg.coef_).sum(1)+reg.intercept_

0      3.060486
1      2.056796
2      2.736925
3      2.595899
4      3.400550
         ...   
145   -6.308192
146   -5.860721
147   -5.170062
148   -4.848415
149   -3.946886
Length: 150, dtype: float64

In [100]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [101]:
# Positive class probabilities are computed as
sigmoid(-4.801632)

0.008149369173106819

In [102]:
1 - sigmoid(-4.801632)

0.9918506308268932

In [103]:
preds = pd.DataFrame(sigmoid((x*reg.coef_).sum(axis = 1) + reg.intercept_), columns = ["positive"])
preds["negative"] = 1 - preds["positive"]

In [104]:
preds.head(10)

Unnamed: 0,positive,negative
0,0.955233,0.044767
1,0.886633,0.113367
2,0.939171,0.060829
3,0.930597,0.069403
4,0.967722,0.032278
5,0.956812,0.043188
6,0.952777,0.047223
7,0.9488,0.0512
8,0.910017,0.089983
9,0.931526,0.068474


In [105]:
reg.predict_proba(x.head(10))

array([[0.04476692, 0.95523308],
       [0.11336745, 0.88663255],
       [0.06082932, 0.93917068],
       [0.06940283, 0.93059717],
       [0.03227827, 0.96772173],
       [0.04318779, 0.95681221],
       [0.04722279, 0.95277721],
       [0.05119996, 0.94880004],
       [0.08998273, 0.91001727],
       [0.06847384, 0.93152616]])

## Normalize
* remove magnitude of our features
* center data
* standard scaler
* z score norm
* min-max

In [114]:
df = pd.read_csv("../data/iris.csv")
df.head(1)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa


In [115]:
df.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


#### Standard Scaler
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [119]:
from sklearn.preprocessing import StandardScaler

In [120]:
x = df.drop("variety", 1)

In [121]:
x.head(1)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2


In [122]:
scaler = StandardScaler()
scaler.fit(x)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [123]:
# note the transform returns a numpy array
x_scaler = scaler.transform(x)
x_scale_df = pd.DataFrame(x_scaler, columns = x.columns)
x_scale_df.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,-2.775558e-16,-9.695948e-16,-8.652338e-16,-4.662937e-16
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


In [124]:
help(scaler.transform(x))

Help on ndarray object:

class ndarray(builtins.object)
 |  ndarray(shape, dtype=float, buffer=None, offset=0,
 |          strides=None, order=None)
 |  
 |  An array object represents a multidimensional, homogeneous array
 |  of fixed-size items.  An associated data-type object describes the
 |  format of each element in the array (its byte-order, how many bytes it
 |  occupies in memory, whether it is an integer, a floating point number,
 |  or something else, etc.)
 |  
 |  Arrays should be constructed using `array`, `zeros` or `empty` (refer
 |  to the See Also section below).  The parameters given here refer to
 |  a low-level method (`ndarray(...)`) for instantiating an array.
 |  
 |  For more information, refer to the `numpy` module and examine the
 |  methods and attributes of an array.
 |  
 |  Parameters
 |  ----------
 |  (for the __new__ method; see Notes below)
 |  
 |  shape : tuple of ints
 |      Shape of created array.
 |  dtype : data-type, optional
 |      Any objec

In [125]:
x_scale_df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


#### Min Max Scaler
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

In [126]:
from sklearn.preprocessing import MinMaxScaler

In [127]:
minmax = MinMaxScaler()
minmax.fit(x)

x_minmax = minmax.transform(x)
x_minmax = pd.DataFrame(x_minmax, columns = x.columns)
x_minmax.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,0.428704,0.440556,0.467458,0.458056
std,0.230018,0.181611,0.299203,0.317599
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.567797,0.5
75%,0.583333,0.541667,0.694915,0.708333
max,1.0,1.0,1.0,1.0


* https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing

#### center data
* https://scikit-learn.org/stable/modules/preprocessing.html

In [128]:
from sklearn.preprocessing import scale

In [129]:
scaler = scale(x)
x_scale = pd.DataFrame(scaler, columns = x.columns)
x_scale.mean(axis=0)

sepal.length   -2.775558e-16
sepal.width    -9.695948e-16
petal.length   -8.652338e-16
petal.width    -4.662937e-16
dtype: float64

## countvectorizer()
* https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [135]:
from sklearn.feature_extraction.text import CountVectorizer

In [136]:
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [137]:
vectorizer = CountVectorizer()

In [144]:
x = vectorizer.fit_transform(corpus)
x

<4x9 sparse matrix of type '<class 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [145]:
x.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [146]:
df = pd.DataFrame(x.toarray(), columns = vectorizer.get_feature_names())

In [147]:
df

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


In [148]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [149]:
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus)
df = pd.DataFrame(x.toarray(), columns = vectorizer.get_feature_names())
df

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


## Featuree Extraction
* https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text

In [150]:
from sklearn.feature_selection import VarianceThreshold

In [151]:
x = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
df = pd.DataFrame(x)
df

Unnamed: 0,0,1,2,3
0,0,2,0,3
1,0,1,4,3
2,0,1,1,3


In [152]:
df.describe()

Unnamed: 0,0,1,2,3
count,3.0,3.0,3.0,3.0
mean,0.0,1.333333,1.666667,3.0
std,0.0,0.57735,2.081666,0.0
min,0.0,1.0,0.0,3.0
25%,0.0,1.0,0.5,3.0
50%,0.0,1.0,1.0,3.0
75%,0.0,1.5,2.5,3.0
max,0.0,2.0,4.0,3.0


In [153]:
selector = VarianceThreshold()
tst = selector.fit_transform(df)
tst

array([[2, 0],
       [1, 4],
       [1, 1]])

In [154]:
selector.variances_

array([0.        , 0.22222222, 2.88888889, 0.        ])

In [155]:
df.columns[selector.get_support()]

Int64Index([1, 2], dtype='int64')