## Train Test Split for cross validation 

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import cross_val_score

In [2]:
# Load the iris dataset as a (features, labels) pair instead of a Bunch object.
X, y = datasets.load_iris(return_X_y=True)

In [4]:
# Display the shapes of the feature matrix and the label vector.
X.shape, y.shape

((150, 4), (150,))

We can now quickly sample a training set while holding out 40% of the data for testing (evaluating) our classifier:

In [5]:
# Hold out part of the data for evaluation; the remainder is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,    # fraction of rows reserved for the test split
    random_state=0,   # fixed seed so the split is reproducible
)

In [6]:
# Display the shapes of the training features and training labels.
X_train.shape, y_train.shape

((90, 4), (90,))

In [7]:
# Display the shapes of the held-out test features and test labels.
X_test.shape, y_test.shape

((60, 4), (60,))

Fitting a simple SVM classifier (`SVC` performs classification, not regression)

In [8]:
# Build a linear-kernel support vector classifier, then fit it on the
# training split only.
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, y_train);  # trailing ';' stops the cell from echoing the estimator

In [9]:
# Score the fitted classifier on the held-out test split.
clf.score(X_test, y_test)

0.9666666666666667

In [12]:
# 5-fold cross-validation over the full dataset; one score per fold.
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5)

In [13]:
# Display the per-fold scores returned by cross_val_score.
scores

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [15]:
# Summarise the per-fold scores by their mean and standard deviation.
mean_score, std_score = scores.mean(), scores.std()
print("%0.2f accuracy with a standard deviation of %0.2f" % (mean_score, std_score))

0.98 accuracy with a standard deviation of 0.02


## Preprocessing

For many machine learning models, various preprocessing techniques not only help improve efficiency, but often are important for ensuring meaningful results.

- StandardScaler (aka Z-score)
- Normalizer (vector normalization)
- Binarizer

The typical process is:

- Choose an appropriate preprocessing method and import it.
- Construct a rescale object by fitting the chosen method to the training set only!
- Transform your training, validation, and test sets using the constructed rescale object.


In [None]:
# Example: Standardization / Z-scoring
#   -- the procedure is the same for Normalizer and Binarizer
# NOTE(review): x_trn, x_val and x_tst are not defined in the visible notebook;
# presumably the train / validation / test feature matrices -- confirm.
from sklearn.preprocessing import StandardScaler

# Fit on the training set only, so no statistics from the validation or test
# data leak into the scaling parameters. fit() is an instance method, so the
# scaler must be instantiated first.
rescale = StandardScaler().fit(x_trn)

# Apply the same fitted transform to every split.
xx_trn = rescale.transform(x_trn)
xx_val = rescale.transform(x_val)
xx_tst = rescale.transform(x_tst)

## Modeling / 算法

Scikit-Learn makes modeling really easy. The recipe is:

1. Construct
2. Fit
3. Predict
4. Evaluate



### Supervised Learning 

### Supervised Learning - Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Recipe: construct -> fit on the rescaled training features -> predict on
# the rescaled validation features.
# NOTE(review): y_tr is not defined in the visible notebook; presumably the
# training targets aligned with xx_trn -- confirm.
model = LinearRegression()
model.fit(xx_trn, y_tr)
y_pred = model.predict(xx_val)

### Supervised Learning - Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Construct with default settings, fit on the rescaled training features,
# then predict labels for the rescaled validation features.
# NOTE(review): y_tr is not defined in the visible notebook; presumably the
# training labels aligned with xx_trn -- confirm.
model = LogisticRegression()
model.fit(xx_trn, y_tr)
y_pred = model.predict(xx_val)

### Supervised Learning - SVM

In [None]:
from sklearn.svm import SVC

# Other accepted kernel strings: 'poly', 'rbf', 'sigmoid'.
# NOTE(review): y_tr is not defined in the visible notebook; presumably the
# training labels aligned with xx_trn -- confirm.
model = SVC(kernel='linear')
model.fit(xx_trn, y_tr)
y_pred = model.predict(xx_val)

### Supervised Learning - Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings: fit on the rescaled training
# features, then predict labels for the rescaled validation features.
# NOTE(review): y_tr is not defined in the visible notebook; presumably the
# training labels aligned with xx_trn -- confirm.
model = GaussianNB()
model.fit(xx_trn, y_tr)
y_pred = model.predict(xx_val)

### Supervised Learning - knn

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using the 3 nearest neighbours.
# NOTE(review): y_tr is not defined in the visible notebook; presumably the
# training labels aligned with xx_trn -- confirm.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(xx_trn, y_tr)
y_pred = model.predict(xx_val)

### Supervised Learning - decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based splits, depth capped at 10, fixed seed for reproducibility.
# NOTE(review): x_trn, x_val and y_tr are not defined in the visible notebook;
# presumably the unscaled train / validation features and training labels -- confirm.
model = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=0)
model.fit(x_trn, y_tr)  # Vars do not have to be normalized/standardized for DTs!
y_pred = model.predict(x_val)

### Supervised Learning - Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# NOTE(review): x_trn, x_val and y_tr are not defined in the visible notebook;
# presumably the unscaled train / validation features and training labels -- confirm.
model = GradientBoostingClassifier(
    max_depth=5,
    n_estimators=1000,
    subsample=0.5,        # fraction of training samples used to fit each base learner
    random_state=0,
    learning_rate=0.001,
)
model.fit(x_trn, y_tr)  # Vars do not have to be normalized/standardized for tree-based models!
y_pred = model.predict(x_val)

### Supervised Learning - Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): x_trn, x_val and y_tr are not defined in the visible notebook;
# presumably the unscaled train / validation features and training labels -- confirm.
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    n_jobs=4,          # number of jobs to run in parallel
    max_depth=10,
    random_state=0,    # fixed seed, matching the other tree-based examples, so results are reproducible
)
model.fit(x_trn, y_tr)  # Vars do not have to be normalized/standardized for tree-based models!
y_pred = model.predict(x_val)

### UnSupervised Learning

### UnSupervised Learning - PCA

In [None]:
from sklearn.decomposition import PCA

# A float n_components in (0, 1) selects the number of components needed for
# the explained variance to exceed that fraction (here 95%).
model = PCA(n_components=0.95)
model.fit(xx_trn)  # unsupervised: PCA.fit ignores y, so no labels are passed
# PCA has no predict(); project new data onto the components with transform():
# xx_val_pca = model.transform(xx_val)

### UnSupervised Learning - k-Means Clustering

In [None]:
from sklearn.cluster import KMeans

# Three clusters, fixed seed so centroid initialisation is reproducible.
model = KMeans(n_clusters=3, random_state=1)
model.fit(xx_trn)  # unsupervised: KMeans.fit ignores y, so no labels are passed
# To assign validation rows to the nearest learned cluster centre:
#y_pred = model.predict(xx_val)