In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn import svm
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('whitegrid')

In [2]:
df = sns.load_dataset('iris')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
df = df.loc[:,['petal_length','petal_width','species']]
df.head()

Unnamed: 0,petal_length,petal_width,species
0,1.4,0.2,setosa
1,1.4,0.2,setosa
2,1.3,0.2,setosa
3,1.5,0.2,setosa
4,1.4,0.2,setosa


In [20]:
X = df.iloc[:,0:2]
X.head()

Unnamed: 0,petal_length,petal_width
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2


In [34]:
y = df.iloc[:,[-1]]
y.head()

Unnamed: 0,species
0,setosa
1,setosa
2,setosa
3,setosa
4,setosa


In [35]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y = y.apply(le.fit_transform)


In [36]:
def get_integer_mapping(le):
    '''
    Return a dict mapping labels to their integer values
    from an SKlearn LabelEncoder
    le = a fitted SKlearn LabelEncoder
    '''
    res = {}
    for cl in le.classes_:
        res.update({cl:le.transform([cl])[0]})

    return res
get_integer_mapping(le)
#species_to_num = {'setosa': 0,
                  #'versicolor': 1,
                  #'virginica': 2}
#df['tmp'] = df['species'].map(species_to_num)
#y = df['tmp']

{'setosa': 0, 'versicolor': 1, 'virginica': 2}

In [126]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [140]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

scores = []

for i in range(0,3):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    from sklearn.preprocessing import RobustScaler
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    scaler = RobustScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    svm_params = {'C':[0.1, 0.5, 1, 2, 5, 10, 20],
                  'gamma':[0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1],
                  'kernel': ['linear', 'rbf']}
    rf_params =  { 
        'n_estimators': [200, 500],
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth' : [4,5,6,7,8],
        'criterion' :['gini', 'entropy']
    }
    log_reg_params = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
    gs_params= {"cv": 3, "n_jobs": -1, "verbose": 1}


    
    for name, clf, params in [("SVM",SVC(), svm_params),
                              ("Random Forrest", RandomForestClassifier(), rf_params),
                              ("Logistic regression", LogisticRegression(), log_reg_params)]:
        grid = GridSearchCV(estimator=clf, param_grid=params, **gs_params)
        grid.fit(X_train, y_train)
        scores.append((name, grid.score(X_test, y_test), grid.best_params_))
    for name, score, params in scores:
        print("Score {0}: {1:0.2f}\t".format(name, score), params)

Fitting 3 folds for each of 98 candidates, totalling 294 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 294 out of 294 | elapsed:    1.5s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  1.6min finished
  self.best_estimator_.fit(X, y, **fit_params)


Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  42 | elapsed:    0.2s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Score SVM: 0.93	 {'C': 0.5, 'gamma': 0.25, 'kernel': 'rbf'}
Score Random Forrest: 0.96	 {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}
Score Logistic regression: 0.96	 {'C': 1000.0, 'penalty': 'l2'}
Fitting 3 folds for each of 98 candidates, totalling 294 fits


[Parallel(n_jobs=-1)]: Done 294 out of 294 | elapsed:    1.3s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  1.6min finished
  self.best_estimator_.fit(X, y, **fit_params)


Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  42 | elapsed:    0.2s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Score SVM: 0.93	 {'C': 0.5, 'gamma': 0.25, 'kernel': 'rbf'}
Score Random Forrest: 0.96	 {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}
Score Logistic regression: 0.96	 {'C': 1000.0, 'penalty': 'l2'}
Score SVM: 0.96	 {'C': 20, 'gamma': 0.75, 'kernel': 'rbf'}
Score Random Forrest: 0.96	 {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}
Score Logistic regression: 0.96	 {'C': 100.0, 'penalty': 'l1'}
Fitting 3 folds for each of 98 candidates, totalling 294 fits


[Parallel(n_jobs=-1)]: Done 294 out of 294 | elapsed:    1.2s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  1.6min finished
  self.best_estimator_.fit(X, y, **fit_params)


Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Score SVM: 0.93	 {'C': 0.5, 'gamma': 0.25, 'kernel': 'rbf'}
Score Random Forrest: 0.96	 {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}
Score Logistic regression: 0.96	 {'C': 1000.0, 'penalty': 'l2'}
Score SVM: 0.96	 {'C': 20, 'gamma': 0.75, 'kernel': 'rbf'}
Score Random Forrest: 0.96	 {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}
Score Logistic regression: 0.96	 {'C': 100.0, 'penalty': 'l1'}
Score SVM: 0.93	 {'C': 0.1, 'gamma': 0.75, 'kernel': 'rbf'}
Score Random Forrest: 0.93	 {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}
Score Logistic regression: 0.93	 {'C': 100.0, 'penalty': 'l2'}


[Parallel(n_jobs=-1)]: Done  42 out of  42 | elapsed:    0.3s finished
  y = column_or_1d(y, warn=True)


In [186]:

print(np.array(scores)[0][0], np.array(scores)[0][1])
pouet = np.array(scores)
pouet=pd.DataFrame(pouet)
_ = pouet.loc[pouet[0]=='SVM',:]
__=pouet.loc[pouet[0]=='Random Forrest',:]
___=pouet.loc[pouet[0]=='Logistic regression',:]



print(np.mean(___[1]),___[0].unique()[0])
print(np.mean(__[1]),__[0].unique()[0])
print(np.mean(_[1]),_[0].unique()[0])

SVM 0.9333333333333333
0.9481481481481482 Logistic regression
0.9481481481481482 Random Forrest
0.9407407407407407 SVM


In [109]:
for name, score, params in scores:
    print("Score {0}: {1:0.2f}\t".format(name, score), params)

Score SVM: 0.96	 {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
Score Random Forrest: 0.96	 {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}
Score Logistic regression: 0.93	 {'C': 10.0, 'penalty': 'l2'}


In [90]:
svm = SVC(C=0.1, gamma=1,kernel='rbf')
svm.fit(X_train, y_train)

y__test_pred=svm.predict(X_test)

  y = column_or_1d(y, warn=True)


In [91]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
confusion_matrix(y_test, y_test_pred)

array([[19,  0,  0],
       [ 0, 13,  0],
       [ 0,  0, 13]], dtype=int64)

In [92]:
print("Precision Score: \t {0:.4f}".format(precision_score(y_test, 
                                                           y_test_pred, 
                                                           average='weighted')))
print("Recall Score: \t\t {0:.4f}".format(recall_score(y_test,
                                                     y_test_pred, 
                                                     average='weighted')))
print("F1 Score: \t\t {0:.4f}".format(f1_score(y_test,
                                             y_test_pred, 
                                             average='weighted')))

Precision Score: 	 1.0000
Recall Score: 		 1.0000
F1 Score: 		 1.0000


In [94]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(svm,X_test,y_test,cv=3))
print(f"f1:{cross_val_score(svm,X_test,y_test,cv=3,scoring='f1_weighted')}")

[1.         0.92857143 1.        ]
f1:[1.         0.92743764 1.        ]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
