In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import NuSVC, SVC
from sklearn import model_selection
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import metrics

from sklearn.svm import NuSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier # <- Here is our boy


In [3]:
# 0. load our PCA 4200 data_set
data_train = pd.read_csv('PCA_X_train_4200.csv')
X = data_train.drop(['id', 'label'], axis=1)
y = data_train['label']

In [4]:
data_test = pd.read_csv('PCA_X_test_4200.csv')
X_test = data_test.drop(['id'], axis=1)


In [8]:
# 1. create a Log_reg class 
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

class Log_Reg(BaseEstimator, RegressorMixin):
    def __init__(self,bs=5,epochs=100,lr=0.01):
        self.bs = bs
        self.epochs = epochs
        self.lr = lr
        self.w = 0
        self.b = 0
        self.losses = []

    #sigmoid function 
    def sigmoid(self,z):
        return 1.0/(1 + np.exp(-z))

    #loss function
    def loss(self,y, y_hat):
        loss = -np.mean(y*(np.log(y_hat)) - (1-y)*np.log(1-y_hat))
        return loss


    #gradient descent 
    def gradients(self,X, y, y_hat):
        m = X.shape[0]
        dw = (1/m)*np.dot(X.T, (y_hat - y))
        db = (1/m)*np.sum((y_hat - y)) 
        return dw, db

    def fit(self,X, y):
       
        if type(X) != np.array:
            X = np.array(X)
        if type(y) != np.array:
            y = np.array(y)
        
        X, y = check_X_y(X, y)
        m, n = X.shape
    
    # Initializing weights and bias to zeros.
        w = np.zeros((n,1))
        b = 0
        y = y.reshape(m,1)
        losses = []
        for epoch in range(self.epochs):
            for i in range((m-1)//self.bs + 1):
                
                # random batch of size bs.
                index = np.random.randint(0,(m-1)//self.bs + 1)
                # Defining batches. SGD.
                start_i = index*self.bs
                end_i = start_i + self.bs
                xb = X[start_i:end_i]
                yb = y[start_i:end_i]
                
                # Calculating hypothesis/prediction.
                y_hat = self.sigmoid(np.dot(xb, w) + b)
                
                # Getting the gradients of loss w.r.t parameters.
                dw, db = self.gradients(xb, yb, y_hat)
                
                # Updating the parameters.
                w -= self.lr*dw
                b -= self.lr*db
            
            # Calculating loss and appending it in the list.
            l = self.loss(y, self.sigmoid(np.dot(X, w) + b))
            losses.append(l)
            
        # returning weights, bias and losses(List).
        self.w = w
        self.b = b
        self.losses.append(losses)
        return w, b, losses

    #predict function 
    def predict(self,X):
        if type(X) != np.array:
            X = np.array(X)
        preds = self.sigmoid(np.dot(X, self.w) + self.b)
       
        pred_class = []
        pred_class = [1 if i > 0.5 else 0 for i in preds]
        return np.array(pred_class)
        

In [4]:
# 2. Create train and test set                         #
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20,
                                                    random_state = 42)

In [14]:
## SVC hyperparameters tuning
from sklearn.model_selection import GridSearchCV
 
# defining parameter range
param_grid_svc = {'C': [0.1, 1, 10],
              'gamma': [1, 0.1, 0.01],
              'kernel': ['linear', 'sigmoid']}
 
grid_svc = GridSearchCV(SVC(), param_grid_svc, refit = True, verbose = 3)
 
# fitting the model for grid search
grid_svc.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.664 total time= 4.9min
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.657 total time= 4.5min
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.655 total time= 4.3min
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.653 total time= 4.3min
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.658 total time= 4.3min
[CV 1/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.663 total time= 4.2min
[CV 2/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.655 total time= 4.3min
[CV 3/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.654 total time= 4.3min
[CV 4/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.653 total time= 4.3min
[CV 5/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.656 total time= 4.2min
[CV 1/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.664 total time= 4.2min
[CV 2/5] END ...C=0.1, gamma=0.1, kernel=linear;

In [17]:
## best parameters for SVC
grid_svc.best_params_

{'C': 1, 'gamma': 1, 'kernel': 'linear'}

In [10]:
# 3. initializing classifiers
classifier1 = SVC(C = 1, gamma = 1, kernel = "linear")


classifier3 = NuSVC(gamma='auto', kernel = "linear", nu = 0.6)

# Initializing Random Forest classifier
classifier4 = RandomForestClassifier(n_estimators = 100, criterion = "gini", max_depth = 10,
                                      min_samples_leaf = 0.05,
                                     min_samples_split = 0.005, n_jobs = -1, random_state = 42)

classifier5 = Log_Reg(bs=10, epochs=10, lr=1)
# from sklearn.linear_model import LogisticRegression
# classifier5 = LogisticRegression(C = 50, solver = "lbfgs", max_iter = 2000, random_state = 1000)


In [11]:
# 4. Stacking Classifier                          #

from distutils.log import Log

sclf = StackingCVClassifier(classifiers = [classifier3, classifier4],
                            shuffle = False,
                            cv = 5,
                            meta_classifier = classifier5)

In [17]:
sclf.fit(X, y)

In [15]:
y_pred = sclf.predict(X_test)

In [16]:
print("Accuracy of SVC:", metrics.accuracy_score(y_test, y_pred))
print("Precision of SVC:", metrics.precision_score(y_test, y_pred))
print("Recall of SVC:", metrics.recall_score(y_test, y_pred))
print("F1 score of SVC:", metrics.f1_score(y_test, y_pred))

Accuracy of SVC: 0.7262147221414024
Precision of SVC: 0.6689655172413793
Recall of SVC: 0.528816199376947
F1 score of SVC: 0.590691605045672
