### Goal
uncover patterns in the data and use them to build a model to identify which mushrooms are edible or poisonous

followed this notebook: https://www.kaggle.com/code/annastasy/ps4e8-data-cleaning-and-eda-of-mushrooms#-1.-Importing-Required-Libraries-

### Import libraries and Dataset

In [32]:
#importing libraries
import pandas as pd
import numpy as np 
from scipy import stats
import seaborn as sns 
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.ensemble import IsolationForest

# Plot styling: a 13-colour "Spectral" palette installed through the seaborn theme.
palette = sns.color_palette("Spectral", n_colors=13)
sns.set_theme(
    style='darkgrid',
    context='notebook',
    palette=palette,
)

# Seed passed as random_state further down (IsolationForest, train_test_split).
rs = 101

In [33]:
# Read the competition files: training data, test data and the sample submission.
df_train, df_test, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e8/train.csv',
        '/kaggle/input/playground-series-s4e8/test.csv',
        '/kaggle/input/playground-series-s4e8/sample_submission.csv',
    )
)

### Explore on data

In [34]:
# Preview the first five rows of the training data.
df_train.head(5)

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [35]:
# Preview the first five rows of the test data.
df_test.head(5)

Unnamed: 0,id,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,3116945,8.64,x,,n,t,,,w,11.13,...,b,,w,u,w,t,g,,d,a
1,3116946,6.9,o,t,o,f,,c,y,1.27,...,,,n,,,f,f,,d,a
2,3116947,2.0,b,g,n,f,,c,n,6.18,...,,,n,,,f,f,,d,s
3,3116948,3.47,x,t,n,f,s,c,n,4.98,...,,,w,,n,t,z,,d,u
4,3116949,6.17,x,h,y,f,p,,y,6.73,...,,,y,,y,t,,,d,u


In [36]:
# Report the dimensions of both raw frames (columns first, then rows).
for frame, label in ((df_train, "train_df"), (df_test, "test_df")):
    n_rows, n_cols = frame.shape
    print(f"there are {n_cols} colums and {n_rows} rows in {label}")

there are 22 colums and 3116945 rows in train_df
there are 21 colums and 2077964 rows in test_df


In [37]:
# Print a heading, then let the notebook render the per-column dtypes as the cell output.
heading = "Colum names and data type of each column"
print(heading)
df_train.dtypes

Colum names and data type of each column


id                        int64
class                    object
cap-diameter            float64
cap-shape                object
cap-surface              object
cap-color                object
does-bruise-or-bleed     object
gill-attachment          object
gill-spacing             object
gill-color               object
stem-height             float64
stem-width              float64
stem-root                object
stem-surface             object
stem-color               object
veil-type                object
veil-color               object
has-ring                 object
ring-type                object
spore-print-color        object
habitat                  object
season                   object
dtype: object

In [38]:
# Count fully duplicated rows in the raw training frame.
n_duplicates = df_train.duplicated().sum()
print(f"There are {n_duplicates} duplicates in train dataset")

There are 0 duplicates in train dataset


In [39]:
# Percentage of missing values per column of the training frame.
print("Checking missing values")
missing_pct = df_train.isna().mean().mul(100)
print(missing_pct)

Checking missing values
id                       0.000000
class                    0.000000
cap-diameter             0.000128
cap-shape                0.001283
cap-surface             21.528227
cap-color                0.000385
does-bruise-or-bleed     0.000257
gill-attachment         16.809280
gill-spacing            40.373988
gill-color               0.001829
stem-height              0.000000
stem-width               0.000000
stem-root               88.452732
stem-surface            63.551362
stem-color               0.001219
veil-type               94.884350
veil-color              87.936970
has-ring                 0.000770
ring-type                4.134818
spore-print-color       91.425482
habitat                  0.001444
season                   0.000000
dtype: float64


### Data cleansing

Columns which have missing data
* spore-print-color       91.425482
* veil-type               94.884350
* veil-color              87.936970
* stem-root               88.452732
* stem-surface            63.551362
* gill-spacing            40.373988

In [40]:
# Drop the 'id' column, which is not used for prediction.
# errors='ignore' keeps the cell re-runnable: once 'id' is gone, a second
# execution is a no-op instead of raising KeyError.
df_train = df_train.drop(columns=['id'], errors='ignore')


In [41]:
# The cleaning steps operate on copies of the loaded frames.
df_train_cleaned, df_test_cleaned = df_train.copy(), df_test.copy()

# Name of the label column
target_column = 'class'

# Object-typed columns are the categorical features; the label is removed from that list
categorical_columns = (
    df_train_cleaned.select_dtypes(include=object).columns.drop(target_column)
)

# Every column that is not object-typed is treated as numerical
numerical_columns = df_train_cleaned.select_dtypes(exclude=object).columns

# Show the three groups
print("target column:", target_column)
print("\n categorical columns:", categorical_columns)
print("\n numerical columns:", numerical_columns)

target column: class

 categorical columns: Index(['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed',
       'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root',
       'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring',
       'ring-type', 'spore-print-color', 'habitat', 'season'],
      dtype='object')

 numerical columns: Index(['cap-diameter', 'stem-height', 'stem-width'], dtype='object')


### Deal with Infrequent Categories

In [42]:
# Number of distinct (non-null) values in each categorical column
unique_counts = df_train_cleaned[categorical_columns].nunique()
for column, num_unique in unique_counts.items():
    print(f"'{column}' has {num_unique} unique categories.")
    



'cap-shape' has 74 unique categories.
'cap-surface' has 83 unique categories.
'cap-color' has 78 unique categories.
'does-bruise-or-bleed' has 26 unique categories.
'gill-attachment' has 78 unique categories.
'gill-spacing' has 48 unique categories.
'gill-color' has 63 unique categories.
'stem-root' has 38 unique categories.
'stem-surface' has 60 unique categories.
'stem-color' has 59 unique categories.
'veil-type' has 22 unique categories.
'veil-color' has 24 unique categories.
'has-ring' has 23 unique categories.
'ring-type' has 40 unique categories.
'spore-print-color' has 32 unique categories.
'habitat' has 52 unique categories.
'season' has 4 unique categories.


In [None]:
# Show the ten most frequent values of every categorical column
for column in categorical_columns:
    top_counts = df_train_cleaned[column].value_counts().head(10)
    print(f"\n top value counts in '{column}' : \n {top_counts}")

Some categories appear far less often than the others. <p>
To fix this, group these rare categories together into a new category called "Unknown". <p>
The threshold is 70: a category that occurs 70 times or fewer is treated as rare.

In [None]:
def replace_infrequent_categories(df, column, threshold=70, replacement="Unknown"):
    """Collapse rare categories of ``df[column]`` into one placeholder label.

    A category is rare when it occurs ``threshold`` times or fewer in
    ``df[column]``. Missing values are not counted by ``value_counts`` and are
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. The column is overwritten in place.
    column : str
        Name of the column to clean.
    threshold : int, default 70
        Largest number of occurrences that still counts as rare.
    replacement : str, default "Unknown"
        Label written in place of rare categories. It uses the same spelling
        as the "Unknown" label used elsewhere in the notebook for missing
        values, so both end up in a single category.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be used in an assignment.
    """
    value_counts = df[column].value_counts()
    infrequent = value_counts[value_counts <= threshold].index

    # Vectorised membership test instead of a Python-level lambda per row
    is_infrequent = df[column].isin(infrequent)
    df[column] = df[column].mask(is_infrequent, replacement)
    return df

# Collapse rare categories in the train frame and in the test frame, column by column.
# NOTE(review): each frame is processed with its own value counts.
for col in categorical_columns:
    df_train_cleaned = replace_infrequent_categories(df_train_cleaned, col)
for col in categorical_columns:
    df_test_cleaned = replace_infrequent_categories(df_test_cleaned, col)

# Report how many categories remain per column in the train frame
print("replacement results")
remaining_counts = df_train_cleaned[categorical_columns].nunique()
for column, num_unique in remaining_counts.items():
    print(f"'{column}' has {num_unique} categories.")

### Fill Missing values in Numerical columns

1. check skewness of data <p>
   what is skewness: 데이터 분포의 비대칭성 측정
   * the closer the skewness is to 0, the more symmetric the distribution of the data <p>
    
2. use the median to fill NA values: since the skewness of every numerical column is greater than 1, the median is used to fill in any missing values

In [None]:
# Skewness of every numerical column in the cleaned training frame
print("The skewness of columns:")
skewness = df_train_cleaned[numerical_columns].skew()
print(skewness)

In [None]:
# Medians are computed on the training frame only and reused for the test frame.
medians = df_train_cleaned[numerical_columns].median()

# Fill missing numerical values in both frames with the training medians
for frame in (df_train_cleaned, df_test_cleaned):
    frame[numerical_columns] = frame[numerical_columns].fillna(medians)

### Fill Missing values in categorical columns

In [None]:
# Replace every remaining missing value in both frames with the label 'Unknown'
df_train_cleaned, df_test_cleaned = (
    frame.fillna("Unknown") for frame in (df_train_cleaned, df_test_cleaned)
)

In [None]:
# Count exact duplicate rows in each cleaned frame (they are removed in the next cell)
for frame, name in ((df_train_cleaned, "train"), (df_test_cleaned, "test")):
    print(f"There are {frame.duplicated().sum()} duplicates in {name} dataset.")

In [None]:
# Keep the first occurrence of each duplicated training row and discard the rest.
df_train_cleaned = df_train_cleaned.drop_duplicates(keep='first')

## Exploratory Data Analysis(EDA)

1. distribution of numerical features
2. distribution of categorical features
3. correlation in numerical features
4. correlation in categorical features
5. exploring outliers
6. distribution of a target variable

### Distribution of numerical Features

In [None]:
# One tall figure with a histogram per numerical column, stacked vertically.
plt.figure(figsize=(8, 15))

for position, column in enumerate(numerical_columns, start=1):
    ax = plt.subplot(3, 1, position)

    # Histogram of the column: kde=True overlays a density curve,
    # bins sets the number of intervals used by the histogram.
    sns.histplot(data=df_train_cleaned, x=column, kde=True, bins=20, ax=ax)
    ax.set_title(f'Distribution of {column}')

    # Remove / simplify the axis lines (spines) of the plot
    sns.despine()

plt.tight_layout()
plt.show()

The distributions of the numerical columns are right-skewed, with outliers. <p>
This means there are some unusually high values (outliers) that lie far away from the rest.
<p>
This suggests the data may not be normally distributed, which could impact our analysis and modeling results.


### Distribution of Categorical Features

In [None]:
# Draw one countplot per categorical column
for column in categorical_columns:
    # Rows labelled 'Unknown' in this column are left out of the plot
    is_known = df_train_cleaned[column] != 'Unknown'
    filtered_data = df_train_cleaned.loc[is_known]

    plt.figure(figsize=(8, 5))
    ax = sns.countplot(data=filtered_data, x=column)
    ax.set_title(f'Countplot of {column}')

    plt.tight_layout()
    plt.show()

### Exploring Correlations between Numerical Features

In [None]:
# Two-colour palette for the 'class' hue; it is reused by the pie chart later on.
class_colors = ["#5b81d4", "#b03e4d"]
custom_palette = sns.color_palette(class_colors)

# Pairwise plots of the training frame, coloured by class
pairplot = sns.pairplot(data=df_train_cleaned, hue='class', palette=custom_palette)
pairplot.figure.suptitle('Pairplot', y=1.02, fontsize=22)
plt.show()

According to the pairplot, poisonous mushrooms tend to have smaller caps and narrower stems.

In [None]:
# One violin plot per numerical column, split by class
for column in numerical_columns:
    plt.figure(figsize=(8, 6))
    ax = sns.violinplot(data=df_train_cleaned, x='class', y=column)
    ax.set_title(f'Distribution of {column} by class')

    plt.tight_layout()
    plt.show()

These plots show the presence of outliers.

### Exploring Correlations between categorical Features

In [None]:
# Plot a mosaic plot of every categorical column against the class,
# excluding rows whose value in that column is "Unknown".
for column in categorical_columns:
    filtered_data = df_train_cleaned.loc[df_train_cleaned[column] != 'Unknown']

    # mosaic() opens a figure of its own unless it is handed an Axes; creating
    # the Axes here and passing it in makes the requested figsize take effect
    # and avoids leaving an empty figure behind for every column.
    fig, ax = plt.subplots(figsize=(8, 6))
    mosaic(filtered_data, [column, 'class'], ax=ax)
    ax.set_title(f'Mosaic Plot of {column} and class')

    plt.tight_layout()
    plt.show()

Based on the visualizations above, it is evident that edible and poisonous mushrooms have distinct characteristics. <p>
    
For example,<p>
Edible mushrooms
* more prevalent in summer and winter <p>

Poisonous mushrooms
* more prevalent in autumn and spring


### Exploring Outliers

use z score to explore outliers
* z score는 데이터를 표준 정규 분포로 변환하여 평균을 0 표준 편차를 1로 만듬 -> 데이터의 원래 단위나 척도와 관계없이 일관된 비교가 가능해짐
* z score는 데이터 포인트가 평균에서 얼마나 떨어져 있는지를 표준 편차 단위로 나타냄 -> z score가 +-3을 넘는 데이터 포인트는 평균에서 많이 벗어난 것으로 간주되어 이상치로 식별됨
* z score를 사용하면 데이터가 특정 범위 내에 얼마나 몰려 있는지, 어떤 값들이 극단적으로 높은지 또는 낮은지 파악할 수 있음

In [None]:
# Calculate z-scores for the numerical columns of the training frame.
# stats.zscore is only documented to return an array-like, so the result is
# wrapped in a DataFrame explicitly; .describe() below then works regardless of
# what type SciPy hands back for DataFrame input.
numeric_features = df_train_cleaned[numerical_columns]
z_scores = pd.DataFrame(
    stats.zscore(numeric_features.to_numpy()),
    columns=numeric_features.columns,
    index=numeric_features.index,
)

# Descriptive statistics of the z-scores, rounded to 3 decimal places
z_scores.describe().round(3)

From the z score
* outliers in 3 numerical columns: cap-diameter, stem-height, stem-width
* Choose isolation forest for removing outliers


### Distribution of a Target Variable

In [None]:
# Class counts ordered by class label, with display labels in the same order
class_counts = df_train_cleaned['class'].value_counts().sort_index()
labels = ["Edible","Poisonous"]

# Pie chart of the class shares
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(class_counts, labels=labels, colors=custom_palette, autopct='%1.1f%%', startangle=90)
ax.set_title('Distribution of classes')
ax.axis('equal')

plt.show()

## Machine Learning

Preprocessing steps
1. Label Encoder initialization: start by creating an instance of `LabelEncoder`, which is used to convert categorical labels('edible', 'poisonous') into numerical values(like 0,1)
2. Convert Categorical Columns to 'Category' dtype: this is efficient for memory usage and makes it easier to apply specific transformations to these columns
3. Defining the Numerical Pipeline
* StandardScaler: this step standardizes the numerical features by removing the mean and scaling to unit variance
* Convert to float 32: this step converts the data type to `float32` to save memory similar to converting categorical columns to category dtype
4. Defining the Categorical pipeline: OrdinalEncoder: Encodes categorical feature as integers
* `handle_unknown=use_encoded_value` allows handling of unseen categories during transformation by assigning them a specific value(such as -1)
5. ColumnTransformer: combines the numerical and categorical pipeline into a single transformation step
6. Apply transformations: 
* `fit_transform` fits the preprocessor on the training data and applies the transformations.
* `transform` applies the same transformations to the test data.
Note that fit is not called on the test data, to avoid data leakage.

In [None]:
# Initialize the encoder for the target labels
label_encoder = LabelEncoder()

# Fit and transform the target variable.
# LabelEncoder works on a 1-D label vector, so the 'class' Series is passed
# rather than a one-column DataFrame.
train_encoded_target = label_encoder.fit_transform(df_train_cleaned['class'])

# Convert categorical columns to 'category' dtype
df_train_cleaned[categorical_columns] = df_train_cleaned[categorical_columns].astype('category')
df_test_cleaned[categorical_columns] = df_test_cleaned[categorical_columns].astype('category')

# Numerical pipeline: standardize, then cast to float32
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('convert_to_float32', FunctionTransformer(lambda x: x.astype(np.float32)))
])

# Categorical pipeline: integer codes; categories unseen during fit are encoded as -1
categorical_pipeline = Pipeline(steps=[
     ('ordinal', OrdinalEncoder(dtype=np.int32, handle_unknown='use_encoded_value', unknown_value=-1))
])

# Combine both numerical and categorical pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_columns),
        ('cat', categorical_pipeline, categorical_columns)
    ]
)

# Fit the preprocessor on the training frame only, then apply it to both frames
df_train_preprocessed = preprocessor.fit_transform(df_train_cleaned)
df_test_preprocessed = preprocessor.transform(df_test_cleaned)


***Difference between fit_transform and transform***

* fit_transform: fit()데이터를 기반으로 필요한 통계값을 계산하여 모델을 학습 -> 학습한 모델을 사용하여 데이터를 변환(보통 training 데이터에 많이 사용)
* transform: 이미 학습된 모델을 사용하여 데이터를 변환 (보통 테스트 데이터에 많이 사용)

**Isolation forest**

1. randomly sampling data and make many subsets
2. random으로 데이터 분할 -> build isolation tree
3. 데이터를 계속해서 분할하면서 특정 데이터 포인트가 얼마나 빨리 분리되는지 확인 -> 빨리 분리될수록 이상치일 확률 높음
4. 각 데이터 포인트에서 여러 트리에서 평균적으로 고립되는 수준을 계산(길이가 짧을수록 이상치일 가능성이 높음)

In [None]:
# Fit an Isolation Forest on the preprocessed training features; fit_predict labels outliers as -1
isolation_forest = IsolationForest(random_state=rs, contamination=0.02)
outlier_labels = isolation_forest.fit_predict(df_train_preprocessed)

# Keep only the rows (and their targets) that were not labelled as outliers
non_outliers_mask = outlier_labels != -1
df_train_preprocessed, train_encoded_target = (
    df_train_preprocessed[non_outliers_mask],
    train_encoded_target[non_outliers_mask],
)

In [None]:
# Features (X) and encoded target (y)
X, y = df_train_preprocessed, train_encoded_target

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=rs,
    test_size=0.2,
)

**XGBClassifier**

* Gradient Boosting algorithm: 여러개의 약한 학습기를(대부분 결정트리) 결합하여 강력한 모델 생성, 새로운 학습기는 이전 학습기의 오차를 줄이기 위해 학습
* Regularization: L1, L2 정규화를 지원하여 과적합 방지
* Can automatically handle missing values
* Supports parallelization
* tree pruning, max_depth
* Cross validation
* Suitable for large dataset

In [None]:
## Tuning Hyperparameters

# Base classifier for the search.
# * eval_metric='logloss': the target has two classes, and 'mlogloss' is the
#   multi-class metric.
# * random_state=rs: subsample / colsample_bytree below 1 draw random subsets,
#   so a fixed seed is needed for repeatable results.
# * use_label_encoder is not passed: the target is already integer-encoded
#   by LabelEncoder and the option is deprecated in XGBoost.
xgb = XGBClassifier(eval_metric='logloss', random_state=rs)

# Parameter grid: 3 * 3 * 4 * 3 * 2 * 3 = 648 candidates, i.e. 3240 fits with cv=5.
param_grid = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [1, 5, 7, 14],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.5, 0.9],
    'colsample_bytree': [0.4, 0.6, 0.8]
}

# Custom scoring function for MCC
def mcc_scorer(estimator, X, y):
    """Return the Matthews correlation coefficient of ``estimator`` on ``(X, y)``.

    Follows the ``scorer(estimator, X, y)`` callable form accepted by
    GridSearchCV's ``scoring`` argument.
    """
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y, y_pred)

# Set up the grid search
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid,
                           scoring=mcc_scorer,
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:\n", grid_search.best_params_)
print("Best MCC Score:\n", grid_search.best_score_)

# Predict on the held-out split
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate the model
print("Test MCC Score:", matthews_corrcoef(y_test, y_pred))

In [None]:
# Hyperparameters for the final model.
# * eval_metric='logloss': the target has two classes, and 'mlogloss' is the
#   multi-class metric.
# * random_state=rs: subsample / colsample_bytree below 1 draw random subsets,
#   so a fixed seed is needed for repeatable results.
# * use_label_encoder is not passed: the target is already integer-encoded
#   and the option is deprecated in XGBoost.
params = {
    'colsample_bytree': 0.4,
    'learning_rate': 0.1,
    'max_depth': 14,
    'min_child_weight': 1,
    'n_estimators': 200,
    'subsample': 0.9,
    'eval_metric': 'logloss',
    'random_state': rs
}

# Initialize the XGBClassifier with the defined parameters
xgb_model = XGBClassifier(**params)

# Fit the model to the training split
xgb_model.fit(X_train, y_train)

# Predict on the held-out split
y_pred = xgb_model.predict(X_test)

# Evaluate the model using the Matthews correlation coefficient
mcc = matthews_corrcoef(y_test, y_pred)
print("Matthews Correlation Coefficient:", mcc)


## Predictions and Submission

In [None]:
# Predict encoded classes for the competition test set, then map them back to the original labels
encoded_test_preds = xgb_model.predict(df_test_preprocessed)
test_preds = label_encoder.inverse_transform(encoded_test_preds)

In [None]:
# Build the submission table: test ids paired with the predicted class labels
submission_columns = {
    'id': df_test['id'],
    'class': test_preds,
}
output = pd.DataFrame(submission_columns)

# Write the file without the index column
output.to_csv('submission.csv', index=False)

output.head()