In [1]:
import pandas as pd
column_names = ['class',
                'cap-shape',
                'cap-surface',
                'cap-color',
                'bruises?',
                'odor',
                'gill-attachment',
                'gill-spacing',
                'gill-size',
                'gill-color',
                'stalk-shape',
                'stalk-root',
                'stalk-surface-above-ring',
                'stalk-surface-below-ring',
                'stalk-color-above-ring',
                'stalk-color-below-ring',
                'veil-type',
                'veil-color',
                'ring-number',
                'ring-type',
                'spore-print-color',
                'population',
                'habitat']


In [2]:
df = pd.read_csv('agaricus-lepiota.data',header=None, names=column_names)
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
pip install category_encoders

Note: you may need to restart the kernel to use updated packages.


In [4]:
import category_encoders as ce

X = df.drop(columns='class') 
X = ce.OneHotEncoder(use_cat_names=True).fit_transform(X)
y = df['class'].replace({'p':0, 'e':1})

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.2, stratify=y)

In [6]:
import numpy as np
df = df.replace({'?':np.NaN})
print(df.isna().sum())

class                          0
cap-shape                      0
cap-surface                    0
cap-color                      0
bruises?                       0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64


In [7]:
missing_values = df['stalk-root'].replace(np.NaN,'m')
print(missing_values.head(10))

0    e
1    c
2    c
3    e
4    e
5    c
6    c
7    c
8    e
9    c
Name: stalk-root, dtype: object


In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import time

In [83]:
def logistic(X_test, y_test):
    logistic_reg = LogisticRegression()
    logistic_reg.fit(X_test, y_test)
    logistic_predictions = logistic_reg.predict(X_train)
    

In [84]:
def random(X_test, y_test):
    random_forest = RandomForestClassifier()
    random_forest.fit(X_test, y_test)
    random_predictions = random_forest.predict(X_train)


In [95]:
%%time
logistic(X_test, y_test)


Wall time: 27.3 ms


In [96]:
%%time
random(X_test, y_test)

Wall time: 156 ms


In [87]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

acc_log=accuracy_score(y_train, logistic_predictions)
pre_log=precision_score(y_train, logistic_predictions)
rec_log=recall_score(y_train, logistic_predictions)
acc_rand=accuracy_score(y_train, random_predictions)
pre_rand=precision_score(y_train, random_predictions)
rec_rand=recall_score(y_train, random_predictions)

print(accuracy_score(y_train, logistic_predictions))
print(precision_score(y_train, logistic_predictions))
print(recall_score(y_train, logistic_predictions))
print(accuracy_score(y_train, random_predictions))
print(precision_score(y_train, random_predictions))
print(recall_score(y_train, random_predictions))

0.9973842129558393
0.9949748743718593
1.0
1.0
1.0
1.0


In [12]:
from sklearn.decomposition import PCA
pca=PCA()
X_fit=pca.fit_transform(X_train)

In [99]:
def logistic_final(X_test, y_test):
    logistic_reg_final = LogisticRegression()
    logistic_reg_final.fit(X_test, y_test)
    logistic_predictions_final = logistic_reg_final.predict(X_train)

In [100]:
%%time
logistic_final(X_test, y_test)

Wall time: 20 ms


In [88]:
def rand_final(X_test, y_test):
    random_forest_final= RandomForestClassifier(random_state=42)
    random_forest_final.fit(X_test, y_test)
    random_predictions_final = random_forest_final.predict(X_fit)

In [102]:
%%time
rand_final(X_test, y_test)

Wall time: 148 ms


In [104]:
value=np.array([[acc_rand,accuracy_score(y_train, random_predictions_final)],
                [pre_rand,precision_score(y_train, random_predictions_final)],
                [rec_rand,recall_score(y_train, random_predictions_final)],
                ['156 ms','148 ms'],
                [acc_log,accuracy_score(y_train, logistic_predictions_final)],
                [pre_log,precision_score(y_train, logistic_predictions_final)],
                [rec_log,recall_score(y_train, logistic_predictions_final)],
                ['27.3 ms','20 ms']])

In [108]:
df1=pd.DataFrame(value,columns=['Full Data','PCA reduced'],index=['Accuracy','Precision','Recall','Time','Accuracy','Precision','Recall','Time'])
df2=pd.DataFrame()
#['Random Forest''Logistic Regression']
#['Accuracy','Precision','Recall','Time']

df1                

Unnamed: 0,Full Data,PCA reduced
Accuracy,1.0,0.4729958455146946
Precision,1.0,0.47396293027360986
Recall,1.0,0.15953654188948307
Time,156 ms,148 ms
Accuracy,0.9973842129558393,0.49099861517156485
Precision,0.9949748743718593,0.5110266159695818
Recall,1.0,0.39928698752228164
Time,27.3 ms,20 ms
