In [32]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [5]:
# Load the training set; it holds the features plus the "class" label column.
data = pd.read_csv("training_mush.csv")

In [6]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,2,0,3,1,5,1,0,0,9,1,...,3,7,0,2,1,4,3,5,0,0
1,2,0,4,0,5,1,0,1,10,0,...,7,4,0,2,1,0,7,4,0,1
2,2,0,3,0,2,1,0,0,7,0,...,0,4,0,2,1,2,1,5,1,1
3,0,0,3,0,5,1,1,0,2,0,...,7,7,0,2,2,4,7,3,1,0
4,2,3,3,1,5,1,0,0,10,1,...,3,6,0,2,1,4,2,5,0,0


In [8]:
# Separate the target from the predictors: "class" is the label,
# every other column is used as a feature.
y_train = data["class"]
X_train = data.drop(columns="class")

In [9]:
# Confirm that the "class" column is no longer among the features.
X_train.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,2,0,3,1,5,1,0,0,9,1,...,2,3,7,0,2,1,4,3,5,0
1,2,0,4,0,5,1,0,1,10,0,...,3,7,4,0,2,1,0,7,4,0
2,2,0,3,0,2,1,0,0,7,0,...,1,0,4,0,2,1,2,1,5,1
3,0,0,3,0,5,1,1,0,2,0,...,2,7,7,0,2,2,4,7,3,1
4,2,3,3,1,5,1,0,0,10,1,...,2,3,6,0,2,1,4,2,5,0


In [10]:
# Peek at the first few target labels.
y_train.head()

0    0
1    1
2    1
3    0
4    0
Name: class, dtype: int64

In [12]:
# Base estimator for the grid search; random_state is fixed so reruns are reproducible.
rf = RandomForestClassifier(random_state=0)

In [13]:
# Hyperparameter grid explored by the search:
# 5 * 6 * 7 * 4 = 840 candidate combinations.
parametrs = dict(
    n_estimators=range(10, 60, 10),
    max_depth=range(1, 12, 2),
    min_samples_leaf=range(1, 8),
    min_samples_split=range(2, 9, 2),
)

In [14]:
# Exhaustive search over the grid with 3-fold cross-validation;
# n_jobs=-1 lets scikit-learn use every available CPU core.
search = GridSearchCV(
    estimator=rf,
    param_grid=parametrs,
    cv=3,
    n_jobs=-1,
)

In [15]:
# Run the grid search on the training data (fits every parameter combination).
search.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': range(1, 12, 2),
                         'min_samples_leaf': range(1, 8),
                         'min_samples_split': range(2, 9, 2),
                         'n_estimators': range(10, 60, 10)})

In [19]:
# Parameter combination that scored best in cross-validation.
search.best_params_

{'max_depth': 9,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 10}

In [20]:
# Estimator selected by the search. Despite the name, this is a
# RandomForestClassifier (the estimator given to GridSearchCV), not a single tree.
best_tree = search.best_estimator_

In [22]:
# Importance score of each feature according to the selected forest.
feature_importances = best_tree.feature_importances_

In [23]:
# Pair every training column name with its importance score.
feature_importance_df = pd.DataFrame(
    {
        "features": X_train.columns.tolist(),
        "feature_importance": feature_importances,
    }
)

In [24]:
# Rank the features from most to least important.
feature_importance_df.sort_values(by="feature_importance", ascending=False)

Unnamed: 0,features,feature_importance
4,odor,0.188376
8,gill-color,0.103861
10,stalk-root,0.103793
19,spore-print-color,0.083564
6,gill-spacing,0.08084
18,ring-type,0.070726
3,bruises,0.070109
7,gill-size,0.068461
12,stalk-surface-below-ring,0.048296
20,population,0.043783


In [25]:
# Load the test features; the matching labels are read separately from testing_y_mush.csv.
data_test = pd.read_csv("testing_mush.csv")

In [26]:
# Preview the test features to check they have the same columns as X_train.
data_test.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,3,8,1,3,1,0,0,4,0,...,2,7,7,0,2,1,4,2,3,3
1,5,3,4,1,5,1,0,0,10,1,...,2,6,6,0,2,1,4,3,5,0
2,3,3,4,0,8,1,0,1,0,1,...,2,7,6,0,2,1,0,7,4,4
3,3,2,2,0,7,1,0,1,0,1,...,1,6,7,0,2,1,0,7,4,0
4,3,2,2,0,8,1,0,1,0,1,...,1,6,6,0,2,1,0,7,4,4


In [27]:
# Predict a class label for every test row with the selected forest.
predictions = best_tree.predict(data_test)

In [31]:
# Number of test rows predicted as class 1.
int((predictions == 1).sum())

976

In [33]:
# Load the ground-truth labels for the test set.
testing_y_mush = pd.read_csv("testing_y_mush.csv")

In [34]:
# Preview the true test labels (a DataFrame with a single "class" column).
testing_y_mush.head()

Unnamed: 0,class
0,0
1,0
2,1
3,1
4,1


In [35]:
# Confusion matrix on the test set: rows are true classes, columns are
# predicted classes. The "class" column is passed as a 1-D Series because
# confusion_matrix expects 1-D label arrays; handing it the whole
# single-column DataFrame only works through implicit flattening.
confusion_matrix(testing_y_mush["class"], predictions)

array([[1055,    0],
       [   0,  976]], dtype=int64)