<h1> Sound Classification - part 5 </h1>

In this part we will ensemble our models.

In [1]:
import pandas as pd
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [3]:
## Load the saved features (X), labels (Y) and metadata written by earlier parts.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that you produced yourself.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# MFCC features from part 1
X_mfcc = _read_pickle('part1_X.pickle')

# other features and the labels from part 3
X = _read_pickle('part3_X.pickle')
Y = _read_pickle('part3_Y.pickle')

# all features side by side (column-wise concatenation)
X_all = pd.concat([X, X_mfcc], axis=1)

# metadata written to file in part 1
df = _read_pickle('part1_df.pickle')

My transformers from part 4.

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
# The transformers below inherit from the sklearn.base classes
# (BaseEstimator, TransformerMixin). This makes them compatible with
# scikit-learn's Pipelines

class DropColumns(BaseEstimator, TransformerMixin):
    """Drop columns that are dominated by a single value.

    During ``fit`` a column is marked for removal when its most frequent
    non-missing value accounts for more than ``percentage_treshold`` of the
    rows. ``transform`` then removes the marked columns.

    Parameters
    ----------
    percentage_treshold : float or one-element sequence of float
        Maximum allowed share (0..1) of the most frequent value. The
        one-element list form used so far (e.g. ``[0.9]``) is still accepted.

    Attributes
    ----------
    cols_to_drop_ : list
        Column labels selected for removal by ``fit``.
    """

    def __init__(self, percentage_treshold):
        # Stored untouched so that get_params()/clone() round-trip the value.
        self.percentage_treshold = percentage_treshold

    def fit(self, X, y = None):
        """Record which columns of DataFrame ``X`` exceed the threshold."""
        # np.ravel lets both a plain number and a one-element list work.
        threshold = np.ravel(self.percentage_treshold)[0]
        num_of_samples = len(X)

        self.cols_to_drop_ = []
        for col in X.columns:
            # value_counts() ignores NaN and is already sorted by descending
            # count, so the first entry is the most frequent value.
            counts = X[col].value_counts()
            if counts.empty:
                # All-NaN column or empty frame: nothing to measure (the
                # previous implementation raised IndexError here).
                continue
            if counts.iloc[0] / num_of_samples > threshold:
                self.cols_to_drop_.append(col)

        return self

    def transform(self, X, y = None):
        """Return a copy of ``X`` without the columns selected in ``fit``."""
        return X.drop(columns=self.cols_to_drop_)

class ReplaceOutliers(BaseEstimator, TransformerMixin):
    """Clip every column to ``[-factor * std, +factor * std]``.

    The limit is symmetric around zero (the column mean is not used) and the
    standard deviation is computed from the data passed to ``transform``, so
    ``fit`` has nothing to learn.

    Parameters
    ----------
    factor : float or one-element sequence of float
        Multiplier applied to each column's standard deviation to obtain the
        clipping limit. The one-element list form used so far (e.g. ``[50]``)
        is still accepted.
    """

    def __init__(self, factor):
        # Stored untouched so that get_params()/clone() round-trip the value.
        self.factor = factor

    def fit(self, X, y = None):
        """No-op; present for scikit-learn Pipeline compatibility."""
        return self

    def transform(self, X, y = None):
        """Return a copy of DataFrame ``X`` with each column clipped."""
        # np.ravel lets both a plain number and a one-element list work.
        factor = np.ravel(self.factor)[0]
        X2 = X.copy()
        for col in X2.columns:
            col_tres = np.std(X2[col]) * factor
            # Vectorized replacement for the former per-element
            # `col_tres * x / abs(x)`, which produced NaN (0/0) for zeros in
            # a zero-variance column. NaN inputs are left as NaN.
            X2[col] = X2[col].clip(lower=-col_tres, upper=col_tres)
        return X2
    
class CorrelatedFeaturesRemover(BaseEstimator, TransformerMixin):
    """Greedily remove features until no pairwise correlation exceeds a limit.

    While the largest off-diagonal correlation is above ``factor``, the
    feature ranked first by its maximum correlation is dropped. The signed
    correlation is compared, so strongly negative correlations are not
    treated as redundant.

    Parameters
    ----------
    factor : float or one-element sequence of float
        Correlation limit. The one-element list form used so far
        (e.g. ``[0.97]``) is still accepted.

    Attributes
    ----------
    features_ : pandas.Index
        Columns kept after ``fit``.
    """

    def __init__(self, factor):
        # Stored untouched so that get_params()/clone() round-trip the value.
        self.factor = factor

    @staticmethod
    def _to_frame(X):
        """Return a DataFrame copy of X; arrays get columns f0, f1, ..."""
        if isinstance(X, pd.DataFrame):
            return X.copy()
        values = np.array(X)
        return pd.DataFrame(values, columns=['f'+str(i) for i in range(values.shape[1])])

    def reduce_correlated_features(self, X):
        """Return a copy of ``X`` without the features judged redundant."""
        # np.ravel lets both a plain number and a one-element list work.
        threshold = np.ravel(self.factor)[0]
        X_ = self._to_frame(X)

        # A pairwise correlation does not depend on the other columns, so the
        # matrix is computed once and shrunk instead of being recomputed after
        # every removal.
        corr = X_.corr()
        corr_values = corr.to_numpy(copy=True)
        np.fill_diagonal(corr_values, 0)  # ignore self-correlation
        corr = pd.DataFrame(corr_values, index=corr.index, columns=corr.columns)

        dropped = []
        while not corr.empty:
            # Explicit axis: np.max(DataFrame) no longer reduces column-wise
            # in recent pandas versions.
            ranked = corr.max(axis=0).sort_values(ascending=False)
            # .iloc: `ranked[0]` would be a label lookup on integer columns.
            # A NaN maximum (e.g. constant columns) also ends the loop.
            if not ranked.iloc[0] > threshold:
                break
            feat_to_drop = ranked.index[0]
            dropped.append(feat_to_drop)
            corr = corr.drop(index=feat_to_drop, columns=feat_to_drop)
        return X_.drop(columns=dropped)

    def fit(self, X, y = None):
        """Remember which columns survive the correlation filter."""
        self.features_ = self.reduce_correlated_features(X).columns
        # return transformer object
        return self

    def transform(self, X, y = None):
        """Return ``X`` as a DataFrame restricted to the columns kept by ``fit``."""
        X_ = self._to_frame(X)
        if len(self.features_)>0:
            return X_[self.features_]
        # Nothing was selected during fit; hand back the data as a DataFrame.
        return X_

Read best models from files.

In [5]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that you produced yourself.
def _load_model(path):
    """Return the model object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

best_ridge_model = _load_model('part4_best_ridge_model.pickle')
best_logistic_model = _load_model('part4_best_logistic_model.pickle')
best_mlp_model = _load_model('part4_best_mlp_model.pickle')
best_rf_model = _load_model('part4_best_rf_model.pickle')
best_svc_model = _load_model('part4_best_svc_model.pickle')

Prepare indexes of training and testing sets for cross validation.

In [6]:
# Train / test split: fold 10 is held out for testing (roughly 90% / 10%),
# folds 1-9 are used for training.
is_test = df['fold'] == 10
train_X = X_all.loc[~is_test].copy()
train_df = df.loc[~is_test].copy()
train_Y = Y.loc[train_X.index].copy()
test_X = X_all.loc[is_test].copy()
# Fixed: the labels used to be assigned to `test_X`, overwriting the test
# features and leaving `test_Y` undefined.
test_Y = Y.loc[test_X.index].copy()

# NOTE(review): scikit-learn interprets the index arrays of a custom `cv`
# iterable as row positions. The label-based indexes built below match
# positions only if X_all has the default RangeIndex - TODO confirm.

# 3 groups for cross validation on the training folds: (1,2,3), (4,5,6), (7,8,9)
cv_groups = []
for first_fold in range(1, 10, 3):
    held_out = train_df['fold'].isin([first_fold, first_fold + 1, first_fold + 2])
    cv_groups.append((train_X.loc[~held_out].index, train_X.loc[held_out].index))

# 10 groups for cross validation: each fold is held out once
cv_groups_10_fold = []
for fold in range(1, 11):
    in_fold = df['fold'] == fold
    cv_groups_10_fold.append((X_all.loc[~in_fold].index, X_all.loc[in_fold].index))

We can see that the predictions aren't the same, so we can expect that combining them into one system can give better results than those from the individual models.

In [7]:
from sklearn.ensemble import VotingClassifier

# Majority-vote ('hard') ensemble of the five pipelines loaded above.
hard_voting_estimators = [
    ('ridge', best_ridge_model),
    ('lr', best_logistic_model),
    ('mlp', best_mlp_model),
    ('rf', best_rf_model),
    ('svc', best_svc_model),
]
eclf = VotingClassifier(estimators=hard_voting_estimators, voting='hard')

In [8]:
from sklearn.model_selection import cross_val_score

# Hard-voting ensemble on the 3 grouped splits: per-split scores, then their mean.
scores = cross_val_score(estimator=eclf, X=X_all, y=Y, cv=cv_groups, n_jobs=-1)
print(scores, np.mean(scores), sep="\n")

[0.67498138 0.71771553 0.73536585]
0.7093542571795486


In [9]:
# Hard-voting ensemble on the 10 per-fold splits: per-split scores, then their mean.
scores = cross_val_score(estimator=eclf, X=X_all, y=Y, cv=cv_groups_10_fold, n_jobs=-1)
print(scores, np.mean(scores), sep="\n")

[0.75143184 0.72184685 0.63351351 0.74242424 0.79487179 0.74848117
 0.75536993 0.71588089 0.78186275 0.78136201]
0.7427044982303604


A Voting Classifier with the voting parameter set to 'soft' uses the models' predicted probabilities. The Ridge classifier does not support this functionality, so we cannot use it this way. If we want SVC to return predicted probabilities, we need to set its parameter 'probability' to True. We will do it now.

In [7]:
# Show the current parameters of the SVC pipeline (steps and nested step parameters).
best_svc_model.get_params()

{'memory': None,
 'steps': [('dc', DropColumns(percentage_treshold=[0.9])),
  ('cfr', CorrelatedFeaturesRemover(factor=[0.97])),
  ('ro', ReplaceOutliers(factor=[50])),
  ('scaler', 'passthrough'),
  ('qt', QuantileTransformer(output_distribution='normal', random_state=44)),
  ('clf', SVC(C=1.467799267622069, random_state=44))],
 'verbose': False,
 'dc': DropColumns(percentage_treshold=[0.9]),
 'cfr': CorrelatedFeaturesRemover(factor=[0.97]),
 'ro': ReplaceOutliers(factor=[50]),
 'scaler': 'passthrough',
 'qt': QuantileTransformer(output_distribution='normal', random_state=44),
 'clf': SVC(C=1.467799267622069, random_state=44),
 'dc__percentage_treshold': [0.9],
 'cfr__factor': [0.97],
 'ro__factor': [50],
 'qt__copy': True,
 'qt__ignore_implicit_zeros': False,
 'qt__n_quantiles': 1000,
 'qt__output_distribution': 'normal',
 'qt__random_state': 44,
 'qt__subsample': 100000,
 'clf__C': 1.467799267622069,
 'clf__break_ties': False,
 'clf__cache_size': 200,
 'clf__class_weight': None,
 'c

In [8]:
# Turn on probability estimates for the pipeline's 'clf' step (the SVC), which
# soft voting requires.
# NOTE(review): set_params changes best_svc_model in place, and `eclf` was built
# from this same object - re-running the hard-voting cells after this one will
# use the modified SVC.
best_svc_model.set_params(clf__probability=True)

Pipeline(steps=[('dc', DropColumns(percentage_treshold=[0.9])),
                ('cfr', CorrelatedFeaturesRemover(factor=[0.97])),
                ('ro', ReplaceOutliers(factor=[50])), ('scaler', 'passthrough'),
                ('qt',
                 QuantileTransformer(output_distribution='normal',
                                     random_state=44)),
                ('clf',
                 SVC(C=1.467799267622069, probability=True, random_state=44))])

In [12]:
# Soft-voting ensemble; the ridge pipeline is left out of this one.
soft_voting_estimators = [
    ('lr', best_logistic_model),
    ('mlp', best_mlp_model),
    ('rf', best_rf_model),
    ('svc', best_svc_model),
]
eclf2 = VotingClassifier(
    estimators=soft_voting_estimators,
    voting='soft',
    n_jobs=-1,
)

In [13]:
# Soft-voting ensemble on the 3 grouped splits: per-split scores, then their mean.
scores = cross_val_score(estimator=eclf2, X=X_all, y=Y, cv=cv_groups, n_jobs=-1)
print(scores, np.mean(scores), sep="\n")

[0.67609829 0.71516915 0.7199187 ]
0.7037287130074286


In [14]:
# Soft-voting ensemble on the 10 per-fold splits: per-split scores, then their mean.
scores = cross_val_score(estimator=eclf2, X=X_all, y=Y, cv=cv_groups_10_fold, n_jobs=-1)
print(scores, np.mean(scores), sep="\n")

[0.73310424 0.72635135 0.63891892 0.75656566 0.80555556 0.74848117
 0.76610979 0.71091811 0.78921569 0.7921147 ]
0.7467335168076312


In [12]:
from sklearn.ensemble import StackingClassifier

# Stacked ensemble: a logistic regression is the final estimator on top of the
# four base pipelines.
stacking_lr_estimators = [
    ('lr', best_logistic_model),
    ('mlp', best_mlp_model),
    ('rf', best_rf_model),
    ('svc', best_svc_model),
]
eclf3 = StackingClassifier(
    estimators=stacking_lr_estimators,
    final_estimator=LogisticRegression(),
    n_jobs=-1,
)

In [16]:
# Stacking (logistic final estimator) on the 3 grouped splits: scores, then mean.
scores = cross_val_score(estimator=eclf3, X=X_all, y=Y, cv=cv_groups, n_jobs=-1)
print(scores, np.mean(scores), sep="\n")

[0.67237528 0.69734449 0.72398374]
0.6979011693226896


In [13]:
# Stacking (logistic final estimator) on the 10 per-fold splits: scores, then mean.
scores = cross_val_score(estimator=eclf3, X=X_all, y=Y, cv=cv_groups_10_fold, n_jobs=-1)
print(scores, np.mean(scores), sep="\n")

[0.72852234 0.7027027  0.62702703 0.75959596 0.79273504 0.72660996
 0.76610979 0.69851117 0.79534314 0.77419355]
0.7371350669476451


In [11]:
# Stacked ensemble: a random forest is the final estimator on top of the four
# base pipelines.
stacking_rf_estimators = [
    ('lr', best_logistic_model),
    ('mlp', best_mlp_model),
    ('rf', best_rf_model),
    ('svc', best_svc_model),
]
stacking_rf_final = RandomForestClassifier(random_state=44, n_jobs=-1)
eclf4 = StackingClassifier(
    estimators=stacking_rf_estimators,
    final_estimator=stacking_rf_final,
    n_jobs=-1,
)

In [14]:
# Stacking (random-forest final estimator) on the 3 grouped splits: scores, then mean.
scores = cross_val_score(estimator=eclf4, X=X_all, y=Y, cv=cv_groups, n_jobs=-1)
print(scores, np.mean(scores), sep="\n")

[0.6827997  0.67806475 0.71666667]
0.6925103732148302


In [15]:
# Stacking (random-forest final estimator) on the 10 per-fold splits: scores, then mean.
scores = cross_val_score(estimator=eclf4, X=X_all, y=Y, cv=cv_groups_10_fold, n_jobs=-1)
print(scores, np.mean(scores), sep="\n")

[0.71134021 0.73085586 0.63783784 0.74343434 0.78205128 0.71931956
 0.74821002 0.71960298 0.79166667 0.78136201]
0.7365680763309795
