In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_profiling
import time

from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [5]:
# Columns that are one-hot encoded before any model is fitted.
CATEGORICAL_COLUMNS = [
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises',
    'odor',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-shape',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'population',
    'habitat',
]

# Name of the target column.
LABEL_COLUMN = "class"

# Columns excluded from feature set 0 (the smallest feature set).
FIRST_TEST_COLUMNS = [
    'stalk-shape',
    'ring-number',
    'stalk-root',
    'veil-color',
    'gill-attachment',
    'habitat',
    'stalk-color-below-ring',
    'stalk-color-above-ring',
    'cap-surface',
    'cap-shape',
    'cap-color',
]

# Columns excluded from feature set 1; identical to the first five entries
# of FIRST_TEST_COLUMNS.
SECOND_TEST_COLUMNS = [
    'stalk-shape',
    'ring-number',
    'stalk-root',
    'veil-color',
    'gill-attachment',
]

In [6]:
# Lift the column cap first so the wide frame below is rendered without
# horizontally truncated columns. This is a session-wide pandas option.
pd.set_option("display.max_columns", None)

# Read the dataset from the notebook's working directory and render it.
data = pd.read_csv("mushrooms.csv")
display(data)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,e,?,s,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,e,?,s,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,e,?,s,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,t,?,s,k,w,w,p,w,o,e,w,v,l


In [7]:
# Impute missing numeric values with their column mean. `numeric_only=True`
# is needed because the frame holds string-coded columns: without it,
# pandas >= 2.0 raises TypeError when asked to average object columns
# (older versions silently skipped them).
data = data.fillna(data.mean(numeric_only=True))

# Drop rows that still contain NaN, then exact duplicate rows. The result is
# reassigned rather than mutated with inplace=True so the cell reads as a
# single chain and never mutates the intermediate frame in place.
data = data.dropna().drop_duplicates()

# NOTE(review): the rendered frame shows '?' values in 'stalk-root';
# presumably a missing-value marker. Neither step above touches those
# entries, so they are later one-hot encoded as a regular category - confirm
# this is intended.

In [8]:
# Turn the label into a 0/1 integer target: rows labelled "p" become 0 and
# every other value becomes 1.
data[LABEL_COLUMN] = data[LABEL_COLUMN].ne("p").astype(int)

In [9]:
# Feature sets keyed by version number:
#   2 -> every column except the label,
#   0 -> version 2 minus FIRST_TEST_COLUMNS,
#   1 -> version 2 minus SECOND_TEST_COLUMNS.
features = {2: data.drop(columns=[LABEL_COLUMN]).columns.tolist()}
features[0] = [name for name in features[2] if name not in FIRST_TEST_COLUMNS]
features[1] = [name for name in features[2] if name not in SECOND_TEST_COLUMNS]

In [10]:
def encode_column(data, column):
    """Return a copy of ``data`` with ``column`` replaced by indicator columns.

    The indicator columns come from ``pd.get_dummies`` and are named
    ``<column>_<value>``; they are appended after the remaining original
    columns. The input frame is not modified.
    """
    indicators = pd.get_dummies(data[column], prefix=column)
    remaining = data.drop(columns=[column])
    return pd.concat([remaining, indicators], axis=1)

In [11]:
# Seed shared by every split so that Restart & Run All reproduces the same
# partitions, and therefore the same scores.
RANDOM_STATE = 42

# One entry per (train_size, features_version) run, appended in loop order:
# [fit time in seconds, train score, test score].
# The scores come from LinearRegression.score, i.e. R^2, not accuracy.
records = []

# One-hot encode every categorical column; the label column is left as is.
encoded_data = data
for category in CATEGORICAL_COLUMNS:
    encoded_data = encode_column(encoded_data, category)

columns_Y = [LABEL_COLUMN]

for train_size in [0.2, 0.5, 0.8]:

    train, test = train_test_split(
        encoded_data, test_size=1 - train_size, random_state=RANDOM_STATE
    )
    print(len(train), 'train examples')
    print(len(test), 'test examples')

    for features_version in [0, 1, 2]:

        print("Train size: ", train_size, "Features version: ", features_version)

        # Keep a column when it is a selected feature itself, or an indicator
        # generated from one ("<feature>_<value>"). Matching on the trailing
        # "_" prevents a feature whose name is a prefix of another feature's
        # name from pulling in the other feature's indicators.
        selected = features[features_version]
        indicator_prefixes = tuple(f"{feature}_" for feature in selected)
        columns_X = [
            col for col in encoded_data
            if col in selected or col.startswith(indicator_prefixes)
        ]

        train_X = train[columns_X]
        train_Y = train[columns_Y]
        test_X = test[columns_X]
        test_Y = test[columns_Y]

        # Time only the fit itself.
        time_before = time.perf_counter()
        model = LinearRegression().fit(train_X, train_Y)
        time_taken = time.perf_counter() - time_before

        train_score = model.score(train_X, train_Y)
        test_score = model.score(test_X, test_Y)

        records.append([time_taken, train_score, test_score])

1624 train examples
6500 test examples
Train size:  0.2 Features version:  0
Train size:  0.2 Features version:  1
Train size:  0.2 Features version:  2
4062 train examples
4062 test examples
Train size:  0.5 Features version:  0
Train size:  0.5 Features version:  1
Train size:  0.5 Features version:  2
6499 train examples
1625 test examples
Train size:  0.8 Features version:  0
Train size:  0.8 Features version:  1
Train size:  0.8 Features version:  2


In [41]:
# Test scores below this floor are treated as a numerical blow-up of the
# linear fit and reported as missing instead of being printed in the table.
R2_BLOWUP_FLOOR = -1e6

# The same (train_size, features_version) grid, in the same order, that
# produced `records`.
run_grid = [
    (train_size, features_version)
    for train_size in [0.2, 0.5, 0.8]
    for features_version in [0, 1, 2]
]
if len(records) < len(run_grid):
    raise ValueError(
        f"expected {len(run_grid)} records, found {len(records)}; "
        "re-run the training cell first"
    )

# Build one summary dict per run. The 'train_acc' / 'test_acc' keys are kept
# for compatibility with the exported table; the values are the scores
# stored in `records`, rounded to three decimals.
results = []
for (train_size, features_version), record in zip(run_grid, records):
    time_taken, train_score, test_score = record

    # Mask by value rather than by a hard-coded run index: without a fixed
    # split, which run (if any) blows up changes between executions.
    test_score_ok = np.isfinite(test_score) and test_score >= R2_BLOWUP_FLOOR

    results.append({
        'split': train_size,
        'features': features_version,
        'time': round(time_taken, 3),
        'train_acc': round(train_score, 3),
        'test_acc': round(test_score, 3) if test_score_ok else None
    })

In [42]:
# Collect the per-run dictionaries into a table: one row per run, one column
# per dictionary key. The 'train_acc' / 'test_acc' columns hold the values
# returned by LinearRegression.score in the training cell.
results_df = pd.DataFrame(results)
# Bare last expression so the notebook renders the frame.
results_df

Unnamed: 0,split,features,time,train_acc,test_acc
0,0.2,0,0.047,0.986,0.987
1,0.2,1,0.007,0.996,
2,0.2,2,0.006,1.0,1.0
3,0.5,0,0.007,0.987,0.986
4,0.5,1,0.009,0.995,0.995
5,0.5,2,0.011,1.0,1.0
6,0.8,0,0.007,0.988,0.984
7,0.8,1,0.013,0.996,0.992
8,0.8,2,0.015,1.0,1.0


In [20]:
# Ad-hoc inspection of the second entry of `results`.
# NOTE(review): the saved output of this cell uses keys ('train_size',
# 'features_version', 'train_accuracy', 'test_accuracy') that differ from the
# ones built when `results` is assembled, and its execution count (20) is
# out of sequence - the output looks stale; re-run or remove this cell.
results[1]

{'train_size': 0.2,
 'features_version': 1,
 'time': 0.007311900000004812,
 'train_accuracy': 0.996,
 'test_accuracy': -2.316639000420653e+19}

In [43]:
# Render the LaTeX table once, persist the summary as CSV (index included),
# then print the LaTeX source so it can be copied from the cell output.
latex_table = results_df.to_latex()
results_df.to_csv('LR-Mushrooms.csv')
print(latex_table)

\begin{tabular}{lrrrrr}
\toprule
{} &  split &  features &   time &  train\_acc &  test\_acc \\
\midrule
0 &    0.2 &         0 &  0.047 &      0.986 &     0.987 \\
1 &    0.2 &         1 &  0.007 &      0.996 &       NaN \\
2 &    0.2 &         2 &  0.006 &      1.000 &     1.000 \\
3 &    0.5 &         0 &  0.007 &      0.987 &     0.986 \\
4 &    0.5 &         1 &  0.009 &      0.995 &     0.995 \\
5 &    0.5 &         2 &  0.011 &      1.000 &     1.000 \\
6 &    0.8 &         0 &  0.007 &      0.988 &     0.984 \\
7 &    0.8 &         1 &  0.013 &      0.996 &     0.992 \\
8 &    0.8 &         2 &  0.015 &      1.000 &     1.000 \\
\bottomrule
\end{tabular}

