In [18]:
import pandas as pd
import wittgenstein as lw
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, balanced_accuracy_score
import time

In [19]:
def convert_label(char):
    return (char == 'e') # p

LABEL = 'Poisonous/Edible'

In [20]:
df = pd.read_csv('data/mushroom.csv')
print(df[LABEL].value_counts())
df[LABEL] = df[LABEL].map(convert_label) # swap label or convert if needed
print(df[LABEL].value_counts())
df.info()
# df.to_csv('out.csv', index=False)


e    4208
p    3916
Name: Poisonous/Edible, dtype: int64
True     4208
False    3916
Name: Poisonous/Edible, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Poisonous/Edible          8124 non-null   bool  
 1   Cap-shape                 8124 non-null   object
 2   Cap-surface               8124 non-null   object
 3   Cap-color                 8124 non-null   object
 4   Bruises?                  8124 non-null   object
 5   Odor                      8124 non-null   object
 6   Gill-attachment           8124 non-null   object
 7   Gill-spacing              8124 non-null   object
 8   Gill-size                 8124 non-null   object
 9   Gill-color                8124 non-null   object
 10  Stalk-shape               8124 non-null   object
 11  Stalk-root                8124 non-null   object
 12  Stalk-surf

In [21]:
train, test = train_test_split(df, test_size=.33, random_state=42)
# test.info()
print('Training set:')
print(train[LABEL].value_counts())
print('Test set:')
print(test[LABEL].value_counts())
X_test = test.drop(LABEL, axis=1)
y_test = test[LABEL]
# print(y_test)

Training set:
True     2830
False    2613
Name: Poisonous/Edible, dtype: int64
Test set:
True     1378
False    1303
Name: Poisonous/Edible, dtype: int64


In [22]:
start_time = time.time()
print('Job started at {}.'.format(time.ctime()))

clf = lw.RIPPER(random_state=42, k=4, prune_size=0.33, n_discretize_bins=40, verbosity=0) # n_discretize_bins=None, max_rules=2
# Or irep_clf = lw.IREP() to build a model using IREP
clf.fit(train, class_feat=LABEL) # Or pass X and y data to .fit

end_time = time.time()
elapsed = end_time - start_time
print('Job finished at {} in {} seconds.'.format(time.ctime(), round(elapsed, 1)))

print('Test accuracy: ', clf.score(X_test, y_test))

accuracy = clf.score(X_test, y_test, accuracy_score)
print('Scikit accuracy: ', accuracy)
balanced_accuracy = clf.score(X_test, y_test, balanced_accuracy_score)
print('Scikit balanced accuracy: ', balanced_accuracy)

precision = clf.score(X_test, y_test, precision_score)
recall = clf.score(X_test, y_test, recall_score)
print(f'precision: {precision} recall: {recall}')
print('last column', df.columns[-1:])

clf.out_model()
clf

Job started at Wed Apr 28 22:00:53 2021.
Job finished at Wed Apr 28 22:00:54 2021 in 0.7 seconds.
Test accuracy:  1.0
Scikit accuracy:  1.0
Scikit balanced accuracy:  1.0
precision: 1.0 recall: 1.0
last column Index(['Habitat'], dtype='object')
[[Odor=n ^ Stalk-shape=t] V
[Ring-type=p ^ Gill-size=b ^ Stalk-shape=e ^ Ring-number=o] V
[Odor=n ^ Stalk-root=? ^ Gill-size=b] V
[Odor=n ^ Bruises?=f ^ Stalk-surface-above-ring=s] V
[Gill-spacing=w ^ Stalk-shape=t] V
[Odor=n ^ Stalk-color-below-ring=n ^ Stalk-root=b] V
[Ring-number=t ^ Spore-print-color=w]]


<RIPPER(max_rules=None, k=4, prune_size=0.33, n_discretize_bins=40, verbosity=0, dl_allowance=64, max_rule_conds=None, random_state=42, max_total_conds=None)>

In [186]:
df['predict'] = ripper_clf.predict(df)
df.to_csv('out.csv', index=False)

In [187]:
df.info()
df.to_csv('out.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Loan_ID            614 non-null    object
 1   Gender             614 non-null    object
 2   Married            614 non-null    object
 3   Dependents         614 non-null    object
 4   Education          614 non-null    object
 5   Self_Employed      614 non-null    object
 6   ApplicantIncome    614 non-null    int64 
 7   CoapplicantIncome  614 non-null    int64 
 8   LoanAmount         614 non-null    int64 
 9   Loan_Amount_Term   614 non-null    int64 
 10  Property_Area      614 non-null    object
 11  Loan_Status        614 non-null    bool  
 12  predict            614 non-null    bool  
dtypes: bool(2), int64(4), object(7)
memory usage: 54.1+ KB
