In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_profiling
import time

from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [7]:
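# NOTE(review): unused. These column names do not occur in the mushroom feature
# list used below, so this looks like a leftover from another dataset's notebook.
#CATEGORICAL_COLUMNS = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']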

In [8]:
# Read the raw table from a CSV file in the working directory.
DATA_PATH = 'mushrooms.csv'
data = pd.read_csv(DATA_PATH)

In [9]:
# Impute numeric gaps with the column mean, then discard rows that still
# contain missing values and rows that are exact duplicates.
# numeric_only=True is required: on pandas >= 2.0 DataFrame.mean() raises a
# TypeError when the frame holds string columns instead of silently skipping
# them. Chaining (rather than inplace=True) keeps the step a single
# re-assignable expression.
data = (
    data
    .fillna(data.mean(numeric_only=True))
    .dropna()
    .drop_duplicates()
)

In [10]:
# Remove the 'veil-type' column.
# The axis is given by keyword: the positional form drop('veil-type', 1) was
# deprecated in pandas 1.3 and is rejected by pandas >= 2.0.
data = data.drop(columns='veil-type')
# data.profile_report()

In [11]:
# Binary target: the label 'p' becomes 0, every other label becomes 1.
# Caution: re-running this cell on already-encoded data maps every row to 1,
# because no value equals 'p' any more.
data['class'] = np.where(data['class'] != 'p', 1, 0)

In [12]:
# Three nested feature sets, keyed by version:
#   2 -> every column except the target 'class'
#   1 -> version 2 minus the five columns in excluded_v1
#   0 -> version 1 minus six further columns (see excluded_v0)
excluded_v1 = ['stalk-shape', 'ring-number', 'stalk-root', 'veil-color', 'gill-attachment']
excluded_v0 = excluded_v1 + ['habitat', 'stalk-color-below-ring', 'stalk-color-above-ring', 'cap-surface', 'cap-shape', 'cap-color']
all_features = list(data.drop('class', axis=1).columns)

# Keys are inserted in the order 2, 0, 1 so the dict displays as before.
features = {
    2: all_features,
    0: [name for name in all_features if name not in excluded_v0],
    1: [name for name in all_features if name not in excluded_v1],
}

In [13]:
# Display the feature sets (keyed by version) for a visual check.
features

{2: ['cap-shape',
  'cap-surface',
  'cap-color',
  'bruises',
  'odor',
  'gill-attachment',
  'gill-spacing',
  'gill-size',
  'gill-color',
  'stalk-shape',
  'stalk-root',
  'stalk-surface-above-ring',
  'stalk-surface-below-ring',
  'stalk-color-above-ring',
  'stalk-color-below-ring',
  'veil-color',
  'ring-number',
  'ring-type',
  'spore-print-color',
  'population',
  'habitat'],
 0: ['bruises',
  'odor',
  'gill-spacing',
  'gill-size',
  'gill-color',
  'stalk-surface-above-ring',
  'stalk-surface-below-ring',
  'ring-type',
  'spore-print-color',
  'population'],
 1: ['cap-shape',
  'cap-surface',
  'cap-color',
  'bruises',
  'odor',
  'gill-spacing',
  'gill-size',
  'gill-color',
  'stalk-surface-above-ring',
  'stalk-surface-below-ring',
  'stalk-color-above-ring',
  'stalk-color-below-ring',
  'ring-type',
  'spore-print-color',
  'population',
  'habitat']}

In [14]:
def encode_column(data, column):
    """Return a new frame in which ``column`` is replaced by one-hot columns.

    The indicator columns are produced by ``pd.get_dummies`` with
    ``prefix=column`` (so they are named ``<column>_<value>``) and are
    appended after the remaining columns. ``data`` itself is not modified.
    """
    indicators = pd.get_dummies(data[column], prefix=column)
    remaining = data.drop(column, axis=1)
    return pd.concat([remaining, indicators], axis=1)

In [15]:
# Seed shared by the split and the forest so that every re-run of this cell
# produces the same partitions and the same models.
RANDOM_STATE = 42
TARGET = 'class'

# One [fit_seconds, train_accuracy, test_accuracy] entry per
# (train_size, features_version) pair, appended in loop order.
records = []

# One-hot encode every column except the target. Selecting by name rather
# than by position keeps this correct even if 'class' is not the first column.
encoded_data = data
for category in data.columns:
    if category != TARGET:
        encoded_data = encode_column(encoded_data, category)

for train_size in [0.2, 0.5, 0.8]:

    train, test = train_test_split(
        encoded_data, test_size=1 - train_size, random_state=RANDOM_STATE
    )
    print(len(train), 'train examples')
    print(len(test), 'test examples')

    # 1-D targets: handing scikit-learn a single-column DataFrame as y makes
    # the forest emit a DataConversionWarning on every fit.
    train_Y = train[TARGET]
    test_Y = test[TARGET]

    for features_version in [0, 1, 2]:

        print("Train size: ", train_size, "Features version: ", features_version)

        # encode_column names the indicator columns '<feature>_<value>', so the
        # match includes the '_' separator; a bare prefix match would also
        # pick up any other feature whose name merely starts the same way.
        prefixes = tuple(feature + '_' for feature in features[features_version])
        columns_X = [col for col in encoded_data.columns if col.startswith(prefixes)]

        train_X = train[columns_X]
        test_X = test[columns_X]

        # Only the fit itself is timed.
        time_before = time.perf_counter()
        model = RandomForestClassifier(
            n_estimators=50, random_state=RANDOM_STATE
        ).fit(train_X, train_Y)
        time_taken = time.perf_counter() - time_before

        train_score = model.score(train_X, train_Y)
        test_score = model.score(test_X, test_Y)

        records.append([time_taken, train_score, test_score])

1624 train examples
6500 test examples
Train size:  0.2 Features version:  0
Train size:  0.2 Features version:  1




Train size:  0.2 Features version:  2
4062 train examples
4062 test examples
Train size:  0.5 Features version:  0




Train size:  0.5 Features version:  1
Train size:  0.5 Features version:  2




6499 train examples
1625 test examples
Train size:  0.8 Features version:  0
Train size:  0.8 Features version:  1




Train size:  0.8 Features version:  2




In [21]:
# Pair each [time, train score, test score] record with the (split, feature
# set) it belongs to; records were appended in this same nested-loop order.
grid = [
    (train_size, features_version)
    for train_size in [0.2, 0.5, 0.8]
    for features_version in [0, 1, 2]
]

results = []
for i, (train_size, features_version) in enumerate(grid):
    record = records[i]
    results.append({
        'split': train_size,
        'features': features_version,
        'time': round(record[0], 3),
        'train_acc': round(record[1], 3),
        'test_acc': round(record[2], 3),
    })

In [22]:
# Tabulate the per-run metrics; the bare name on the last line renders the table.
results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,split,features,time,train_acc,test_acc
0,0.2,0,0.067,1.0,1.0
1,0.2,1,0.072,1.0,1.0
2,0.2,2,0.07,1.0,1.0
3,0.5,0,0.093,1.0,1.0
4,0.5,1,0.101,1.0,1.0
5,0.5,2,0.103,1.0,1.0
6,0.8,0,0.113,1.0,1.0
7,0.8,1,0.127,1.0,1.0
8,0.8,2,0.126,1.0,1.0


In [23]:
# Write the results table to CSV and print it as a LaTeX tabular.
# NOTE(review): the file name says "Australia" although this notebook reads
# 'mushrooms.csv'. Confirm the intended name so this does not overwrite the
# results of a different experiment.
RESULTS_CSV = 'RF-Australia.csv'
results_df.to_csv(RESULTS_CSV)
print(results_df.to_latex())

\begin{tabular}{lrrrrr}
\toprule
{} &  split &  features &   time &  train\_acc &  test\_acc \\
\midrule
0 &    0.2 &         0 &  0.067 &        1.0 &       1.0 \\
1 &    0.2 &         1 &  0.072 &        1.0 &       1.0 \\
2 &    0.2 &         2 &  0.070 &        1.0 &       1.0 \\
3 &    0.5 &         0 &  0.093 &        1.0 &       1.0 \\
4 &    0.5 &         1 &  0.101 &        1.0 &       1.0 \\
5 &    0.5 &         2 &  0.103 &        1.0 &       1.0 \\
6 &    0.8 &         0 &  0.113 &        1.0 &       1.0 \\
7 &    0.8 &         1 &  0.127 &        1.0 &       1.0 \\
8 &    0.8 &         2 &  0.126 &        1.0 &       1.0 \\
\bottomrule
\end{tabular}

