# Exercise
This is a continuation of exercise_1.ipynb
1. Compute the feature selection procedure VarianceTreshold with threshold 0.3 and use the reduced data for both SVM and Random forest. How does the performance change?
2. Compute the three feature extraction procedures PCA, LDA and KPCA and use the reduced data for both SVM and Random forest. How does the performance change?


In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import make_scorer,f1_score,confusion_matrix,roc_curve,auc,precision_score,recall_score,accuracy_score
from sklearn.decomposition import KernelPCA,PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',header=None)
X = df.loc[: , 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
y = le.fit_transform(y)
np.bincount(y)

array([357, 212], dtype=int64)

## 1 - Variance Threshold

In [2]:
sel = VarianceThreshold(threshold=0.3)
X_th=sel.fit_transform(X)

In [3]:
X_train_th , X_test_th , y_train_th , y_test_th = train_test_split(X_th,
                                        y, 
                                        test_size=0.25, 
                                        stratify=y, 
                                        random_state=1)

In [4]:
param_range = [0.001 , 0.01 , 0.1 , 1.0 , 10.0 , 100.0 , 1000.0]
degree=[2,3,4,5,6,7]

pipe_svm = make_pipeline(StandardScaler(), 
            SVC(random_state=1, probability=True))

param_grid = [{'svc__C': param_range, 'svc__kernel': ['linear']},
                {'svc__C': param_range,'svc__gamma': param_range,'svc__kernel': ['rbf']}, 
                {'svc__C': param_range , 'svc__degree': degree , 'svc__kernel':['poly']}]

scorer=make_scorer(f1_score)

gs = GridSearchCV(estimator=pipe_svm, 
        param_grid=param_grid, 
        scoring=scorer, 
        n_jobs=-1, 
        refit=True, 
        cv=10)

gs=gs.fit(X_train_th , y_train_th)
print(gs.best_score_)
print(gs.best_params_)

0.9517558651026393
{'svc__C': 1000.0, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}


In [5]:
n_estimators = [10 , 50 , 100]
max_depth=[10 , 100 , 200]

pipe_forest = make_pipeline(RandomForestClassifier(random_state=1))

param_grid_tree = [{'randomforestclassifier__n_estimators': n_estimators, 'randomforestclassifier__criterion': ['gini'] , 'randomforestclassifier__max_depth': max_depth},
            {'randomforestclassifier__n_estimators': n_estimators, 'randomforestclassifier__criterion': ['entropy'] , 'randomforestclassifier__max_depth': max_depth}]
                
gs_t = GridSearchCV(estimator=pipe_forest, 
        param_grid=param_grid_tree, 
        scoring=scorer, 
        n_jobs=-1, 
        refit=True, 
        cv=10)

gs_t=gs_t.fit(X_train_th , y_train_th)
print(gs_t.best_score_)
print(gs_t.best_params_)

0.9286959837529917
{'randomforestclassifier__criterion': 'gini', 'randomforestclassifier__max_depth': 10, 'randomforestclassifier__n_estimators': 50}


## 2 - PCA

In [6]:
X_train_ex , X_test_ex , y_train_ex , y_test_ex = train_test_split(X,
                                        y, 
                                        test_size=0.25, 
                                        stratify=y, 
                                        random_state=1)

In [7]:
from sklearn.decomposition import PCA
param_range=[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
degree = [3,4,5,6]
n_comps=[5,6,7,8,9,10,11,12]
param_grid_svm = [{'pca__n_components':n_comps,'svc__C':param_range,'svc__kernel':['linear']},
                    {'pca__n_components':n_comps,'svc__C':param_range,'svc__gamma':param_range,'svc__kernel':['rbf']},
                    {'pca__n_components':n_comps,'svc__C': param_range , 'svc__degree': degree , 'svc__kernel':['poly']}]
scorer=make_scorer(f1_score, average='weighted')

pipe_svm=make_pipeline(StandardScaler(),
                PCA(),
                SVC(probability=True, random_state=1))

gs1=GridSearchCV(estimator=pipe_svm,
        param_grid=param_grid_svm,
        scoring=scorer,
        n_jobs=-1,
        refit=True,
        cv=10)
gs1=gs1.fit(X_train_ex, y_train_ex)
print(gs1.best_score_)
print(gs1.best_params_)

0.9811499029476165
{'pca__n_components': 11, 'svc__C': 1000.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}


In [8]:
param_range=[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
n_estimators = [10 , 50 , 100, 200, 300]
max_depth=[10 , 100 , 200]
param_grid_tree =[{'pca__n_components':n_comps,'randomforestclassifier__n_estimators': n_estimators, 'randomforestclassifier__criterion': ['gini'] , 'randomforestclassifier__max_depth': max_depth},
            {'pca__n_components':n_comps,'randomforestclassifier__n_estimators': n_estimators, 'randomforestclassifier__criterion': ['entropy'] , 'randomforestclassifier__max_depth': max_depth}]

pipe_tree=make_pipeline(PCA(),
                RandomForestClassifier(random_state=1))

gs2=GridSearchCV(estimator=pipe_tree,
        param_grid=param_grid_tree,
        scoring=scorer,
        n_jobs=-1,
        refit=True,
        cv=10)

gs2=gs2.fit(X_train_ex, y_train_ex)
print(gs.best_score_)
print(gs.best_params_)

0.9517558651026393
{'svc__C': 1000.0, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}


## 2 - LDA

In [9]:
param_grid_svm = [{'svc__C':param_range,'svc__kernel':['linear']},
                    {'svc__C':param_range,'svc__gamma':param_range,'svc__kernel':['rbf']},
                    {'svc__C': param_range , 'svc__degree': degree , 'svc__kernel':['poly']}]   

pipe_svm=make_pipeline(StandardScaler(),
            LDA(n_components=1),
            SVC(random_state=1))

gs3=GridSearchCV(estimator=pipe_svm,
        param_grid=param_grid_svm,
        scoring=scorer,
        n_jobs=-1,
        refit=True,
        cv=10)
gs3=gs3.fit(X_train_ex, y_train_ex)
print(gs.best_score_)
print(gs.best_params_)

0.9517558651026393
{'svc__C': 1000.0, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}


In [10]:
param_range=[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
n_estimators = [10 , 50 , 100 , 200, 300]
max_depth=[10 , 100 , 200]
param_grid_tree =[{'randomforestclassifier__n_estimators': n_estimators, 'randomforestclassifier__criterion': ['gini'] , 'randomforestclassifier__max_depth': max_depth},
            {'randomforestclassifier__n_estimators': n_estimators, 'randomforestclassifier__criterion': ['entropy'] , 'randomforestclassifier__max_depth': max_depth}]

pipe_tree=make_pipeline(LDA(n_components=1),
                RandomForestClassifier(random_state=1))

gs4=GridSearchCV(estimator=pipe_tree,
        param_grid=param_grid_tree,
        scoring=scorer,
        n_jobs=-1,
        refit=True,
        cv=10)

gs4=gs4.fit(X_train_ex, y_train_ex)
print(gs.best_score_)
print(gs.best_params_)

0.9517558651026393
{'svc__C': 1000.0, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}


## 2 - KPCA

In [11]:
param_range=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
degree = [4,5,6,7,8,9]
n_comps=[4,5,6,7,8,9,10,11,12]

param_grid_svm = [{'kernelpca__n_components':n_comps,'kernelpca__gamma':param_range,'svc__C':param_range,'svc__kernel':['linear']},
                    {'kernelpca__n_components':n_comps,'kernelpca__gamma':param_range,'svc__C':param_range,'svc__gamma':param_range,'svc__kernel':['rbf']},
                    {'kernelpca__n_components':n_comps,'kernelpca__gamma':param_range,'svc__C': param_range , 'svc__degree': degree , 'svc__kernel':['poly']}]   

pipe_svm=make_pipeline(StandardScaler(),
            KernelPCA(kernel='rbf'),
            SVC(random_state=1))

gs5=GridSearchCV(estimator=pipe_svm,
        param_grid=param_grid_svm,
        scoring=scorer,
        n_jobs=-1,
        refit=True,
        cv=10)
        
gs5=gs5.fit(X_train_ex, y_train_ex)
print(gs.best_score_)
print(gs.best_params_)

0.9517558651026393
{'svc__C': 1000.0, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}




In [12]:
param_grid_tree =[{'kernelpca__n_components':n_comps,'kernelpca__gamma':param_range,'randomforestclassifier__n_estimators': n_estimators, 'randomforestclassifier__criterion': ['gini'] , 'randomforestclassifier__max_depth': max_depth},
            {'kernelpca__n_components':n_comps,'kernelpca__gamma':param_range,'randomforestclassifier__n_estimators': n_estimators, 'randomforestclassifier__criterion': ['entropy'] , 'randomforestclassifier__max_depth': max_depth}]

pipe_tree=make_pipeline(KernelPCA(kernel='rbf'),
                RandomForestClassifier(random_state=1))

gs6=GridSearchCV(estimator=pipe_tree,
        param_grid=param_grid_tree,
        scoring=scorer,
        n_jobs=-1,
        refit=True,
        cv=10)
        
gs6=gs6.fit(X_train_ex, y_train_ex)
print(gs.best_score_)
print(gs.best_params_)

KeyboardInterrupt: 