In [1]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Load the raw mushroom table that the mushroom cells in this notebook work from.
# NOTE(review): absolute, environment-specific path — this cell only runs where
# the CSV exists at exactly this location.
df1 = pd.read_csv('/workspaces/codespaces-jupyter/mushrooms.csv')

In [3]:
# Shuffled copy of the full table: frac=1 returns every row in random order.
# random_state pins the permutation so a fresh kernel run reproduces it.
df2 = df1.sample(frac=1, random_state=1)

In [4]:
# Sanity check: display (rows, columns) of the shuffled copy.
df2.shape

(8124, 23)

In [5]:
# Random half of the table (frac=0.5).
# random_state pins which rows are drawn so a fresh kernel run reproduces it.
df3 = df1.sample(frac=0.5, random_state=1)

In [6]:
# Sanity check: display (rows, columns) of the 50% sample.
df3.shape

(4062, 23)

In [7]:
# Display the column names of the loaded table.
df1.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [8]:
# Replace every column of df1 (target included) with integer codes, fitting a
# fresh LabelEncoder per column. This overwrites df1 in place; df2 and df3 were
# sampled before this cell, so they are not touched by it.
for column_name in df1.columns:
    encoded_values = LabelEncoder().fit_transform(df1[column_name])
    df1[column_name] = encoded_values

In [9]:
# Inspect dtypes and non-null counts after the label-encoding step.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   class                     8124 non-null   int64
 1   cap-shape                 8124 non-null   int64
 2   cap-surface               8124 non-null   int64
 3   cap-color                 8124 non-null   int64
 4   bruises                   8124 non-null   int64
 5   odor                      8124 non-null   int64
 6   gill-attachment           8124 non-null   int64
 7   gill-spacing              8124 non-null   int64
 8   gill-size                 8124 non-null   int64
 9   gill-color                8124 non-null   int64
 10  stalk-shape               8124 non-null   int64
 11  stalk-root                8124 non-null   int64
 12  stalk-surface-above-ring  8124 non-null   int64
 13  stalk-surface-below-ring  8124 non-null   int64
 14  stalk-color-above-ring    8124 non-null 

In [10]:
# Preview the first rows of the encoded table.
df1.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [11]:
# Separate the target column ('class') from the remaining feature columns.
Y = df1['class']
X = df1.drop(columns='class')

In [12]:
# Sanity check: display (rows, columns) of the feature matrix after dropping 'class'.
X.shape

(8124, 22)

In [13]:
# Hold out 30% of the rows for testing; random_state=1 fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

In [14]:
# Weak learner: a decision tree limited to depth 1, split by entropy.
model = DecisionTreeClassifier(
    max_depth=1,
    criterion='entropy',
)

In [15]:
# AdaBoost ensemble built on the weak learner `model`, with up to 400 boosting
# rounds and learning_rate=1.
# `estimator=` replaces `base_estimator=`, which scikit-learn deprecated in 1.2
# and removed in 1.4. random_state makes the fitted ensemble reproducible.
AdaBoost = AdaBoostClassifier(
    estimator=model,
    n_estimators=400,
    learning_rate=1,
    random_state=1,
)

In [16]:
# Fit the AdaBoost ensemble on the training split and keep the value returned by fit().
boost_model = AdaBoost.fit(X_train, Y_train)



In [17]:
# Predict labels for the held-out test rows with the fitted ensemble.
y_pred = boost_model.predict(X_test)

In [18]:
# NOTE: despite its name, `predictions` holds the value returned by
# metrics.accuracy_score for the test split, not the predicted labels.
predictions = metrics.accuracy_score(Y_test, y_pred)

In [20]:
# Report the AdaBoost test accuracy, scaled by 100 for display as a percentage.
print('The accuracy of the AdaBoost is: ', predictions*100,'%')

The accuracy of the AdaBoost is:  100.0 %


In [21]:
# Baseline for comparison with AdaBoost: one depth-1 entropy tree on its own.
# This cell rebinds `model`, `y_pred` and `predictions`, replacing the AdaBoost values.
model = DecisionTreeClassifier(max_depth=1, criterion='entropy').fit(X_train, Y_train)
y_pred = model.predict(X_test)
predictions = metrics.accuracy_score(y_true=Y_test, y_pred=y_pred)
print('The accuracy of the Decision Tree is: ', predictions*100,'%')


The accuracy of the Decision Tree is:  72.64150943396226 %


In [28]:
import warnings
from itertools import product

# NOTE(review): this silences every warning for the rest of the kernel session,
# not only the ones raised by this cell.
warnings.filterwarnings('ignore')

## Sweep a decision tree over criterion, max depth, min_samples_leaf and
## min_samples_split, and log the test accuracy of every combination.
# NOTE(review): accuracy is measured on X_test, so choosing the best row of
# this table amounts to tuning on the test split.

# Collect one record per combination and build the DataFrame once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# it per row copies the whole frame on every iteration.
records = []
for criterion, depth, min_samples_leaf, min_samples_split in product(
        ['gini', 'entropy', 'log_loss'], range(1, 10), range(1, 5), range(2, 5)):
    # random_state fixes the tree's tie-breaking so repeated runs log the same accuracy.
    model = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=depth,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        random_state=1,
    )
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    predictions = metrics.accuracy_score(Y_test, y_pred)
    records.append({
        'Criterion': criterion,
        'Depth': depth,
        'min_samples_leaf': min_samples_leaf,
        'min_samples_split': min_samples_split,
        'Accuracy': predictions*100,
    })

# Log of results, one row per parameter combination, in sweep order.
results = pd.DataFrame(
    records,
    columns=['Criterion', 'Depth', 'min_samples_leaf', 'min_samples_split', 'Accuracy'],
)
print(results)

    Criterion Depth min_samples_leaf min_samples_split   Accuracy
0        gini     1                1                 2  78.917145
1        gini     1                1                 3  78.917145
2        gini     1                1                 4  78.917145
3        gini     1                2                 2  78.917145
4        gini     1                2                 3  78.917145
..        ...   ...              ...               ...        ...
139  log_loss     4                3                 3  95.118950
140  log_loss     4                3                 4  95.118950
141  log_loss     4                4                 2  95.118950
142  log_loss     4                4                 3  95.118950
143  log_loss     4                4                 4  95.118950

[144 rows x 5 columns]


In [None]:
########################################################

In [32]:
from sklearn.datasets import make_classification

# Synthetic dataset for the voting-ensemble section: 1000 samples, 20 features
# (15 informative, 5 redundant), generated with a fixed seed.
x, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=1,
)

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
#import Pipeline
from sklearn.pipeline import Pipeline

In [33]:
from collections import Counter
# Count how many samples carry each label in the synthetic target `y`.
counter = Counter(y)

In [34]:
# Display the per-label counts to check class balance.
counter

Counter({0: 501, 1: 499})

In [35]:
# Sanity check: display the shape of the synthetic feature matrix.
x.shape

(1000, 20)

In [36]:
# Sanity check: display the shape of the synthetic target vector.
y.shape

(1000,)

In [38]:
# Base models for the voting ensemble.
# Starts as an empty list; later cells append (name, estimator) pairs to it.
models = []

In [41]:
# Candidate 1: a default DecisionTreeClassifier wrapped in a single-step pipeline.
# NOTE: re-running this cell appends a second 'decision' entry to `models`.
DT1 = Pipeline(steps=[('m', DecisionTreeClassifier())])
models.append(
    ('decision', DT1)
)

In [42]:
# Candidate 2: a default RandomForestClassifier wrapped in a single-step pipeline.
# NOTE: re-running this cell appends a second 'RandomForest' entry to `models`.
RF1 = Pipeline(steps=[('m', RandomForestClassifier())])
models.append(
    ('RandomForest', RF1)
)

In [43]:
# Candidate 3: a default SVC wrapped in a single-step pipeline.
# NOTE: re-running this cell appends a second 'svc' entry to `models`.
svc = Pipeline(steps=[('m', SVC())])
models.append(
    ('svc', svc)
)

In [44]:
# Define the voting ensemble over the (name, estimator) pairs collected in `models`.
# No `voting` argument is passed, so the classifier's default voting mode applies.
from sklearn.ensemble import VotingClassifier
ensemble = VotingClassifier(estimators=models)

In [45]:
# Display the (not yet fitted) voting ensemble to inspect its configuration.
ensemble