In [1]:
# encoding: utf-8
import pandas as pd
import numpy as np
from sklearn import model_selection

from sklearn import tree #C4.5
from sklearn.ensemble import BaggingClassifier #Bagging
from sklearn.ensemble import AdaBoostClassifier #Boosting
from sklearn.ensemble import RandomForestClassifier #Random Forests
from sklearn.ensemble import GradientBoostingClassifier #XGBoost
from mlxtend.classifier import StackingClassifier #Stacking

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
import time
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler, StandardScaler

pd.options.display.max_colwidth = 150

data_glass = pd.read_csv("glass.data")

# to store the models and their results (to report)
models, results, tempos = list(), list(), list()

print('Glass:', np.shape(data_glass))

Glass: (214, 11)


In [2]:
data_glass = shuffle(data_glass)
data_glass.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,target
166,167,1.52151,11.03,1.71,1.56,73.44,0.58,11.62,0.0,0.0,5
64,65,1.52172,13.48,3.74,0.9,72.01,0.18,9.61,0.0,0.07,1
197,198,1.51727,14.7,0.0,2.34,73.28,0.0,8.95,0.66,0.0,7
123,124,1.51707,13.48,3.48,1.71,72.52,0.62,7.99,0.0,0.0,2
213,214,1.51711,14.23,0.0,2.08,73.36,0.0,8.62,1.67,0.0,7


In [3]:
y = data_glass['target']
data_glass = data_glass.drop(columns = ['target', 'a'])

In [4]:
# Normalizing the Glass dataset: standardize features by removing the mean and scaling to unit variance
scaler = StandardScaler()
scaler.fit(data_glass)
data_glass[:] = scaler.transform(data_glass)
data_glass.head()

Unnamed: 0,b,c,d,e,f,g,h,i,j
166,1.037897,-2.918706,-0.677213,0.231064,1.021135,0.127475,1.87561,-0.352877,-0.586451
64,1.10721,0.08856,0.733456,-1.093966,-0.829438,-0.487279,0.459942,-0.352877,0.133634
197,-0.361554,1.586056,-1.865511,1.797009,0.814078,-0.763919,-0.004904,0.977618,-0.586451
123,-0.427565,0.08856,0.552779,0.532207,-0.169443,0.188951,-0.681044,-0.352877,-0.586451
213,-0.414363,1.009152,-1.865511,1.275028,0.917606,-0.763919,-0.237327,3.013677,-0.586451


In [5]:
C45 = tree.DecisionTreeClassifier() 

clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = GaussianNB()
lr = LogisticRegression()

all_models=[C45, 
            BaggingClassifier(C45, max_samples=0.4),
            AdaBoostClassifier(C45, n_estimators=100),
            RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0),
            GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, subsample=1.0),
            clf1,
            clf2,
            StackingClassifier(classifiers=[clf1, clf2, C45], meta_classifier=lr)]

models.extend(all_models)

In [6]:
kf = KFold(n_splits=10, shuffle=True, random_state=2019)    

Running models for Glass dataset:

In [7]:
#For each model:
for model in all_models:
    print('\n')
    print('======= Model: ', model, '=======')
    # for each fold:
    accuracies = list()
    times = list()
    
    for i, (train_index, test_index) in enumerate(kf.split(data_glass)):
        #Builds the train and validation dataset, according to the current fold:
        y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
        X_train, X_valid = data_glass.iloc[train_index,:].copy(), data_glass.iloc[test_index,:].copy()
        start = time.time()
        model.fit(X_train, y_train)
        end = time.time()
        times.append(end - start)
        pred = model.predict(X_valid)
        # print(confusion_matrix(y_valid, pred, labels=[1,2,3,5,6,7]))
        acc = accuracy_score(pred, y_valid)
        accuracies.append(acc) 
        
    print('Final results:')
    print('Mean accuracy:', np.mean(accuracies))      
    print('Mean traning model time: ', np.mean(times))
    results.append(np.mean(accuracies))
    tempos.append(np.mean(times))
    




                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
Final results:
Mean accuracy: 0.6636363636363637
Mean traning model time:  0.0034397602081298827


                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
              



Final results:
Mean accuracy: 0.6484848484848484
Mean traning model time:  0.01843979358673096




In [8]:
for model in all_models:
    print('\n')
    print('======= Model: ', model, '=======' + "\n\n")



                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,




                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                       

In [9]:
data = {'Accuracy': results, 'Time': tempos, 'Model': models}
dataframe = pd.DataFrame(data=data)
dataframe = dataframe.sort_values(by=['Accuracy'], ascending=False)
dataframe

Unnamed: 0,Accuracy,Time,Model
4,0.739177,0.510136,"([DecisionTreeRegressor(criterion='friedman_mse', max_depth=3, max_features=None,\n max_leaf_nodes=None, min_impurity_decreas..."
1,0.716234,0.012695,"(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n max_features=None, max_leaf_nodes=None,\n ..."
5,0.700649,0.001242,"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n metric_params=None, n_jobs=None, n_neighbors=1, p=2..."
0,0.663636,0.00344,"DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n max_features=None, max_leaf_nodes=None,\n ..."
3,0.655195,0.093541,"(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,\n max_features='auto', max_leaf_nodes=None,\n ..."
2,0.654762,0.002944,"(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n max_features=None, max_leaf_nodes=None,\n ..."
7,0.648485,0.01844,"StackingClassifier(average_probas=False,\n classifiers=[KNeighborsClassifier(algorithm='auto',\n ..."
6,0.447186,0.00171,"GaussianNB(priors=None, var_smoothing=1e-09)"


In [10]:
dataframe.to_csv('glass_results.csv',index=False)