# Binary Prediction of Poisonous Mushrooms

1. Exploratory data analysis
2. Data Preprocessing for Model.
3. Basic model building.
4. Model tuning.
5. Ensemble model building.
6. Results.

In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np

from tqdm import tqdm
import time 
from datetime import datetime

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

from sklearn.model_selection import train_test_split, cross_val_score , GridSearchCV
from sklearn.metrics import accuracy_score, matthews_corrcoef

from sklearn.tree import DecisionTreeClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier

## Exploratory data analysis. 

In [22]:
# Read the training and test CSVs from the local data/ directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Report the number of rows in each split.
for split_name, frame in (("training", train), ("test", test)):
    print(f"Size of the {split_name} set: {len(frame)}")

Size of the training set: 3116945
Size of the test set: 2077964


In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [80]:
# Summarise the columns and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   class                 object 
 2   cap-diameter          float64
 3   cap-shape             object 
 4   cap-surface           object 
 5   cap-color             object 
 6   does-bruise-or-bleed  object 
 7   gill-attachment       object 
 8   gill-spacing          object 
 9   gill-color            object 
 10  stem-height           float64
 11  stem-width            float64
 12  stem-root             object 
 13  stem-surface          object 
 14  stem-color            object 
 15  veil-type             object 
 16  veil-color            object 
 17  has-ring              object 
 18  ring-type             object 
 19  spore-print-color     object 
 20  habitat               object 
 21  season                object 
dtypes: float64(3), int64(1), object(18)
memory

In [81]:
# Count the missing values in each column of the training frame.
print(train.isnull().sum()) # Lots of null values in various columns.

id                            0
class                         0
cap-diameter                  4
cap-shape                    40
cap-surface              671023
cap-color                    12
does-bruise-or-bleed          8
gill-attachment          523936
gill-spacing            1258435
gill-color                   57
stem-height                   0
stem-width                    0
stem-root               2757023
stem-surface            1980861
stem-color                   38
veil-type               2957493
veil-color              2740947
has-ring                     24
ring-type                128880
spore-print-color       2849682
habitat                      45
season                        0
dtype: int64


## Feature engineering

In [23]:
# Derive the same three engineered columns on both splits so that train and
# test keep an identical set of feature columns.
for frame in (train, test):
    # Combined cap shape / cap surface label, joined with an underscore.
    frame['cap_shape_surface'] = frame['cap-shape'] + '_' + frame['cap-surface']
    # Cap diameter relative to stem height; the epsilon guards against a zero height.
    frame['cap_stem_ratio'] = frame['cap-diameter'] / (frame['stem-height'] + 1e-6)
    # Product of stem height and stem width.
    frame['stem_area'] = frame['stem-height'] * frame['stem-width']

# Data processing for the model.

In [24]:
# Columns routed through the numeric branch of the preprocessing pipeline
# (three raw measurements plus the two engineered numeric features).
numeric_features = [
    'cap-diameter',
    'stem-height',
    'stem-width',
    'cap_stem_ratio',
    'stem_area',
]

# Columns routed through the categorical branch (raw categorical columns plus
# the engineered 'cap_shape_surface' combination). Order is kept as-is because
# it determines the column order produced by the ColumnTransformer.
categorical_features = [
    'cap-shape',
    'cap-surface',
    'cap-color',
    'does-bruise-or-bleed',
    'gill-attachment',
    'gill-spacing',
    'gill-color',
    'stem-root',
    'stem-surface',
    'stem-color',
    'veil-type',
    'veil-color',
    'has-ring',
    'ring-type',
    'spore-print-color',
    'habitat',
    'season',
    'cap_shape_surface',
]

In [25]:
# Encode the target as integers: 'e' -> 0, 'p' -> 1
# (presumably edible / poisonous, going by the notebook title).
y = train['class'].map({'e': 0, 'p': 1})

# Every column except the row id and the label is a model input.
X = train.drop(columns=['id', 'class'])
X[categorical_features] = X[categorical_features].astype('category')

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_val.shape}")

Training set shape: (2493556, 23)
Testing set shape: (623389, 23)


In [26]:
# The test frame is scored later; remove the row id so its columns match the
# training features. `test` is rebound here, so the ids are no longer available
# from this variable afterwards.
test = test.drop(columns=['id'])

## Data pipeline

In [27]:
# Numeric branch: fill missing values with the column median, then standardise
# each column to zero mean and unit standard deviation.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace missing values with the literal string 'missing',
# cast everything to str, then one-hot encode. Categories that were not seen
# during fitting are ignored at transform time.
# Experiment note: SimpleImputer(strategy='most_frequent') was tried for the
# imputer step; the constant 'missing' fill gave better results.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('to_string', FunctionTransformer(lambda x: x.astype(str))),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Full pipeline wrapping the preprocessor.
# Experiment note: appending ('pca', PCA(n_components=5)) did not work better.
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
])

In [28]:
# Fit the preprocessing pipeline on the training split only, then apply the
# fitted transform to the validation split.
# NOTE: both names are rebound to their transformed versions, so this cell is
# not idempotent — re-running it would feed already-transformed data back into
# the pipeline. Re-run the split cell first if this cell must be repeated.
X_train = full_pipeline.fit_transform(X_train)
X_val = full_pipeline.transform(X_val)

In [29]:
# Apply the already-fitted pipeline to the test features. `test` is rebound to
# the transformed output, so this cell should only be run once per kernel session.
test = full_pipeline.transform(test)

# Basic Model building


## Classic ML.

In [20]:
def train_evaluate_model(model, X_train, y_train, X_test, y_test, cv=5, fit_params=None):
    """Fit ``model`` once and report train, hold-out and cross-validated accuracy.

    Parameters
    ----------
    model : estimator
        Scikit-learn style estimator exposing ``fit`` and ``predict``.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for ``test_accuracy``.
    cv : int, default 5
        Number of folds passed to ``cross_val_score``.
    fit_params : dict or None, default None
        Extra keyword arguments forwarded to every ``model.fit`` call, including
        the fits performed inside cross-validation. Needed for estimators whose
        configuration requires them, e.g. an ``eval_set`` for an XGBClassifier
        built with ``early_stopping_rounds``.

    Returns
    -------
    dict
        Keys: ``model`` (class name), ``train_accuracy``, ``test_accuracy``,
        ``cv_mean`` and ``cv_std``.
    """
    import inspect  # stdlib; imported here so the function is self-contained

    fit_params = dict(fit_params or {})

    # Fit the model (single fit; callers no longer need to pre-fit it)
    model.fit(X_train, y_train, **fit_params)

    # Make predictions
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    # Calculate metrics
    train_accuracy = accuracy_score(y_train, train_preds)
    test_accuracy = accuracy_score(y_test, test_preds)

    # Perform cross-validation. cross_val_score refits clones of the model, so
    # the same fit parameters must reach those fits as well.
    cv_kwargs = {}
    if fit_params:
        # scikit-learn renamed the `fit_params` argument to `params`; use
        # whichever one the installed version accepts.
        accepted = inspect.signature(cross_val_score).parameters
        cv_kwargs['params' if 'params' in accepted else 'fit_params'] = fit_params
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, **cv_kwargs)

    return {
        'model': model.__class__.__name__,
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'cv_mean': np.mean(cv_scores),
        'cv_std': np.std(cv_scores)
    }

models = [
    XGBClassifier(
        random_state=42, 
        n_estimators=50, 
        max_depth=2,
        min_child_weight=2,
        subsample=0.8,
        colsample_bytree=0.8,
        early_stopping_rounds=10
    ),
    LogisticRegression(random_state=42, max_iter=100),
    DecisionTreeClassifier(random_state=42, max_depth=10)
    # Candidates that are currently disabled:
    #'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100, max_depth=4),
    #'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=3),
    #SVC(random_state=42, max_iter=1000, tol=1e-3),
    #KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
]

results = []

# Initialize tqdm progress bar
for model in tqdm(models, desc="Training Models", unit="model"):
    start_time = time.time()  # Track the start time

    # The XGBoost model is configured with early_stopping_rounds, so each of its
    # fits needs a validation set to monitor; the other models take no extras.
    # Caveat: the same validation split also drives early stopping inside the
    # cross-validation fits, so cv_mean for XGBoost is slightly optimistic.
    if isinstance(model, XGBClassifier):
        fit_params = {'eval_set': [(X_val, y_val)]}
    else:
        fit_params = None

    # Train and evaluate the model (this performs the fit)
    model_results = train_evaluate_model(
        model, X_train, y_train, X_val, y_val, fit_params=fit_params
    )

    # Calculate the time taken
    end_time = time.time()
    model_results['time_taken'] = end_time - start_time

    # Append results
    results.append(model_results)

# Create and sort the results DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('test_accuracy', ascending=False).reset_index(drop=True)

print(results_df)

## Deep learning

In [None]:
# Funnel-shaped MLP: six ReLU hidden layers halving from 512 down to 16 units,
# followed by a single sigmoid unit for the binary output.
# NOTE(review): the following cell rebinds `model` to a larger network, so on a
# top-to-bottom run this definition is replaced before compilation.
hidden_units = [512, 256, 128, 64, 32, 16]

model = Sequential(
    [
        Dense(units=width, activation='relu', name=f'L{index}')
        for index, width in enumerate(hidden_units, start=1)
    ]
    + [Dense(units=1, activation='sigmoid', name='L7')]
)

In [None]:
# Larger variant of the network: wider layers (starting at 1024 units) and
# dropout after the two widest layers for regularisation.
model = Sequential()

# The two widest hidden layers are each followed by 50% dropout.
for layer_name, width in (('L1', 1024), ('L2', 512)):
    model.add(Dense(units=width, activation='relu', name=layer_name))
    model.add(Dropout(0.5))

# Remaining hidden layers, without dropout.
for layer_name, width in (('L3', 256), ('L4', 128), ('L5', 64), ('L6', 32), ('L7', 16)):
    model.add(Dense(units=width, activation='relu', name=layer_name))

# Output layer: one sigmoid unit.
model.add(Dense(units=1, activation='sigmoid', name='L8'))

In [None]:
# Adam optimiser with binary cross-entropy loss; accuracy is tracked as a metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 10 epochs and roll the weights
# back to the best epoch ('val_accuracy' could be monitored instead).
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train for up to 100 epochs, evaluating on the validation split after each one.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=100,
    callbacks=[early_stopping],
    validation_data=(X_val, y_val),
)

In [16]:
# Print the layer-by-layer summary of the network currently bound to `model`.
model.summary()

In [17]:
# Predicted probabilities from the network for both splits.
train_probabilities = model.predict(X_train)
val_probabilities = model.predict(X_val)

# Threshold at 0.5 to obtain hard 0/1 class predictions.
y_pred_train = (train_probabilities >= 0.5).astype(int)
y_pred_val = (val_probabilities >= 0.5).astype(int)

# Matthews correlation coefficient on each split.
mcc_train = matthews_corrcoef(y_train, y_pred_train)
mcc_val = matthews_corrcoef(y_val, y_pred_val)

print(f"Train Matthhews {mcc_train} ")
print(f"Val Matthhews {mcc_val} ")

[1m77924/77924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 535us/step
[1m19481/19481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 502us/step
Train Matthhews 0.9370928730648166 
Val Matthhews 0.9348270255740985 


In [18]:
# Score the test set with the trained network.
selected_model = model

# Raw sigmoid outputs (one row per test sample).
predictions = selected_model.predict(test)
print(predictions)

# Threshold at 0.5 to get 0/1 classes.
binary_predictions = (predictions >= 0.5).astype(int)
print(binary_predictions)

# Map classes back to the original labels: 0 -> 'e', 1 -> 'p'.
class_labels = np.where(binary_predictions != 0, 'p', 'e')
print(class_labels)

# Flatten the column vector so it can be used directly as a DataFrame column.
predictions = class_labels.flatten()

[1m64937/64937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 512us/step
[[6.2245363e-03]
 [9.9635690e-01]
 [9.5335340e-01]
 ...
 [8.9703065e-01]
 [4.9264405e-02]
 [9.1004840e-06]]
[[0]
 [1]
 [1]
 ...
 [1]
 [0]
 [0]]
[['e']
 ['p']
 ['p']
 ...
 ['p']
 ['e']
 ['e']]


# Make final predictions 

In [None]:
# Alternative submission path: predict with an entry of the `models` list
# instead of the neural network. This overwrites `predictions`.
# NOTE(review): index 1 is the LogisticRegression only as long as the order of
# the `models` list is unchanged.
selected_model = models[1] # LR
predictions = selected_model.predict(test)

In [None]:
# Show the raw predictions, then map them to labels: 0 -> 'e', anything else -> 'p'.
# NOTE(review): assumes `predictions` still holds numeric 0/1 values from the
# previous cell; if it already holds 'e'/'p' labels the comparison with 0 no
# longer matches and every row is mapped to 'p'.
print(predictions)
predictions = np.where(predictions == 0, 'e', 'p')
print(predictions)

# Prepare upload

In [19]:
# Tag used in the output filename to identify which model produced the file.
# NOTE(review): this tag matches the first network definition (512 -> ... -> 1);
# update it if `predictions` came from the larger network or from another model.
choosen_model_name = 'nn_512_256_128_64_32_16_1'

# NOTE(review): `predictions` is whatever the most recently executed prediction
# cell produced (the neural-network cell or the `models[1]` cell) — confirm it
# matches `choosen_model_name` before uploading.
submission = pd.DataFrame({
    # `test` no longer carries the ids, so read them back from the CSV;
    # only the 'id' column is parsed instead of the whole file.
    'id': pd.read_csv('data/test.csv', usecols=['id'])['id'],
    'class': predictions
})

# Get the current date and time
now = datetime.now()
# Format the date and time as a string
date_time_str = now.strftime("%Y%m%d_%H%M%S")

# Save the DataFrame to a CSV file with the date and time in the filename
submission.to_csv(f'output/submission_{choosen_model_name}_{date_time_str}.csv', index=False)

Pending:
1. Understand the features correctly.
2. See how we can create new ones with feature engineering.
3. Create a huge NN and leave it training for a whole night in Kaggle.