In [1]:
import pandas as pd
import numpy as np
import openml
import os

# Download OpenML data

In [2]:
dataset_ids = [61]

In [None]:
for dataset_id in dataset_ids:
    print ('Get dataset id', dataset_id)
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y, categorical_indicator, attribute_names = dataset.get_data(dataset_format='dataframe', target=dataset.default_target_attribute)
    if len(np.unique(y)) != 2:
        print ('Not binary classification')
        #continue
    vals = {}
    for i, name in enumerate(attribute_names):
        vals[name] = X[name]
    vals['target'] = y
    df = pd.DataFrame(vals)
    df.to_csv('./data/{0}.csv'.format(dataset_id), index=False)
    print('Dataset {} saved successfully'.format(dataset_id))

# Train a machine learning model

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20
dataset = openml.datasets.get_dataset(61)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format='array',
    target=dataset.default_target_attribute
)

In [6]:
dataset.name

'iris'

In [13]:
#split the data into train & test sets
x_train,x_test, y_train, y_test=train_test_split(X,y,test_size=0.30)

In [14]:
from sklearn.svm import SVC
model=SVC( C=100)
model.fit(x_train, y_train)

SVC(C=100)

In [15]:
#Predictions from the trained model
pred=model.predict(x_test)

In [16]:
#Model Evaluation
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,pred))

[[14  0  0]
 [ 0 15  0]
 [ 0  0 16]]


In [17]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00        15
           2       1.00      1.00      1.00        16

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [18]:
print('Accuracy of SVM classifier on test set: {:.2f}'.format(model.score(x_test, y_test)))

Accuracy of SVM classifier on test set: 1.00


# using GridSearch

In [51]:
from sklearn.model_selection import GridSearchCV 
  
# defining parameter range 
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, cv=5) 
  
# fitting the model for grid search 
grid.fit(x_train, y_train) 

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.111, total=   0.6s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.111, total=   0.6s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.2s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.111, total=   0.6s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.111, total=   0.6s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.111, total=   0.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.111, total=   0.7s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.111, total=   0.7s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.111, total=   0.7s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.111, total=   0.7s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed:  1.1min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [52]:
grid_predictions = grid.predict(x_test) 
  
# print classification report 
print(classification_report(y_test, grid_predictions)) 


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        67
           1       0.97      0.99      0.98        67
           2       0.98      0.98      0.98        60
           3       0.93      0.98      0.96        56
           4       1.00      0.98      0.99        66
           5       0.98      0.95      0.96        57
           6       0.98      0.96      0.97        45
           7       1.00      0.97      0.99        69
           8       0.98      0.98      0.98        59
           9       0.95      0.98      0.96        54

    accuracy                           0.98       600
   macro avg       0.98      0.98      0.98       600
weighted avg       0.98      0.98      0.98       600



In [53]:
print('Accuracy of SVM classifier on test set: {:.2f}'.format(grid.score(x_test, y_test)))

Accuracy of SVM classifier on test set: 0.98


In [54]:
# print best parameter after tuning 
print(grid.best_params_) 

# print how our model looks after hyper-parameter tuning 
print(grid.best_estimator_) 

{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [55]:
# Print the tuned parameters and score 
print("Tuned SVM Parameters: {}".format(grid.best_params_))  
print("Best score is {}".format(grid.best_score_)) 

Tuned SVM Parameters: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
Best score is 0.9835714285714285


# Pipeline

In [104]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier


In [94]:
# Construct svm pipeline
pipe_svm = Pipeline([('svm',SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))])


# Construct Random Forest pipeline
num_trees = 100
max_features = 1
pipe_rf = Pipeline([('ss4', StandardScaler()),
                    ('rf', RandomForestClassifier(n_estimators=num_trees, max_features=max_features))])


# Construct DT pipeline
pipe_dt = Pipeline([('ss3', StandardScaler()),
                    ('dt', tree.DecisionTreeClassifier(random_state=42))])

In [121]:
pipe_svm.fit(x_train, y_train) 
print('Accuracy of SVM classifier on test set: {:.2f}'.format(pipe_svm.score(x_test, y_test)))

Accuracy of SVM classifier on test set: 0.98


In [122]:
pipe_dt.fit(x_train, y_train) 
print('Accuracy of DT classifier on test set: {:.2f}'.format(pipe_dt.score(x_test, y_test)))

Accuracy of DT classifier on test set: 0.85


In [128]:
pipe_rf.fit(x_train, y_train) 
print('Accuracy of RF classifier on test set: {:.2f}'.format(pipe_rf.score(x_test, y_test)))

Accuracy of RF classifier on test set: 0.96


In [124]:
pipe_dic = {0: 'Decision Tree', 1:'Random Forest', 2:'Support Vector Machines'}

In [125]:
pipelines = [pipe_dt,pipe_rf,pipe_svm]

In [126]:
for pipe in pipelines:
    pipe.fit(x_train, y_train)

In [130]:
for idx, val in enumerate(pipelines):
    print('%s pipeline test accuracy: %.2f' % (pipe_dic[idx], val.score(x_test, y_test)))

Decision Tree pipeline test accuracy: 0.85
Random Forest pipeline test accuracy: 0.96
Support Vector Machines pipeline test accuracy: 0.98


In [136]:
best_accuracy = 0
best_classifier = 0
best_pipeline = ''

for idx, pipe in enumerate(pipelines):
    if pipe.score(x_test, y_test) > best_accuracy:
        best_accuracy = pipe.score(x_test, y_test)
        best_pipeline = pipe
        best_classifier = idx

print('%s is the classifier has the best accuracy of %.2f' % (pipe_dic[best_classifier],best_accuracy))
#print(pipe)

Support Vector Machines is the classifier has the best accuracy of 0.98
