In [25]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.impute import SimpleImputer
import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from
# pandas / scikit-learn; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Read the training and test tables from CSV files in the working directory.
# NOTE(review): the saved outputs further down list no id column in either
# frame; confirm whether the CSVs carry one, since nothing visible drops it.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [8]:
# Display the first few rows of the training data
# NOTE(review): the saved output shows `class` and `cap-shape` as integers,
# while the test frame still holds letter codes for `cap-shape` and no earlier
# cell in this notebook encodes those columns. The output looks stale
# (out-of-order execution); re-run from a fresh kernel to confirm.
train_data.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,8.8,53,s,u,f,a,c,w,4.51,...,,,w,,,f,f,,d,a
1,1,4.51,71,h,o,f,a,c,n,4.79,...,,y,o,,,t,z,,d,w
2,0,6.94,53,s,b,f,x,c,w,6.85,...,,s,n,,,f,f,,l,w
3,0,3.88,53,y,g,f,s,,g,4.16,...,,,w,,,f,f,,d,u
4,0,5.85,71,l,w,f,d,,w,3.37,...,,,w,,,f,f,,g,a


In [20]:
# Column dtypes and memory footprint of the training frame.
# NOTE(review): the saved output reports `class` and `cap-shape` as int64 even
# though `cap-shape` is object in the test frame; verify on a fresh run.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 21 columns):
 #   Column                Dtype  
---  ------                -----  
 0   class                 int64  
 1   cap-diameter          float64
 2   cap-shape             int64  
 3   cap-surface           object 
 4   cap-color             object 
 5   does-bruise-or-bleed  object 
 6   gill-attachment       object 
 7   gill-spacing          object 
 8   gill-color            object 
 9   stem-height           float64
 10  stem-width            float64
 11  stem-root             object 
 12  stem-surface          object 
 13  stem-color            object 
 14  veil-type             object 
 15  veil-color            object 
 16  has-ring              object 
 17  ring-type             object 
 18  spore-print-color     object 
 19  habitat               object 
 20  season                object 
dtypes: float64(3), int64(2), object(16)
memory usage: 499.4+ MB


In [17]:
# Preview the first rows of the test frame (it has no `class` column).
test_data.head()

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,8.64,x,,n,t,,,w,11.13,17.12,b,,w,u,w,t,g,,d,a
1,6.9,o,t,o,f,,c,y,1.27,10.75,,,n,,,f,f,,d,a
2,2.0,b,g,n,f,,c,n,6.18,3.14,,,n,,,f,f,,d,s
3,3.47,x,t,n,f,s,c,n,4.98,8.51,,,w,,n,t,z,,d,u
4,6.17,x,h,y,f,p,,y,6.73,13.7,,,y,,y,t,,,d,u


In [18]:
# Column dtypes and memory footprint of the test frame.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2077964 entries, 0 to 2077963
Data columns (total 20 columns):
 #   Column                Dtype  
---  ------                -----  
 0   cap-diameter          float64
 1   cap-shape             object 
 2   cap-surface           object 
 3   cap-color             object 
 4   does-bruise-or-bleed  object 
 5   gill-attachment       object 
 6   gill-spacing          object 
 7   gill-color            object 
 8   stem-height           float64
 9   stem-width            float64
 10  stem-root             object 
 11  stem-surface          object 
 12  stem-color            object 
 13  veil-type             object 
 14  veil-color            object 
 15  has-ring              object 
 16  ring-type             object 
 17  spore-print-color     object 
 18  habitat               object 
 19  season                object 
dtypes: float64(3), object(17)
memory usage: 317.1+ MB


In [19]:
# List the feature names available at prediction time.
test_data.columns

Index(['cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
       'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color',
       'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color',
       'habitat', 'season'],
      dtype='object')

In [26]:
# Preprocess the data
def preprocess_data(data, reference=None):
    """Impute missing values and integer-encode categorical columns.

    Numeric columns are cast to float64 and their missing entries are replaced
    by the median. Every other column is treated as categorical: missing
    entries are replaced by the most frequent value (the smallest one on
    ties) and the values are then mapped to integer codes that follow the
    sorted order of the known categories.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform. It is NOT modified; a processed copy is returned.
    reference : pandas.DataFrame, optional
        Frame from which the fill values and the category -> code mapping are
        learned. Pass the raw training frame when transforming the test frame
        so that both share the same statistics and the same codes. When
        omitted, everything is learned from ``data`` itself.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with float64 numeric columns and int64 categorical
        codes. A category that does not occur in the fitting frame is encoded
        as -1.

    Raises
    ------
    KeyError
        If ``reference`` is given but lacks one of the columns of ``data``.
    ValueError
        If a column of the fitting frame has no observed (non-missing) values,
        so no fill value can be derived.
    """
    fit_frame = data if reference is None else reference
    # Work on a copy so the caller's frame (and any earlier display of it)
    # stays valid and re-running the calling cell is idempotent.
    processed = data.copy()

    for col in processed.columns:
        if col not in fit_frame.columns:
            raise KeyError(f"column {col!r} is missing from the reference frame")

        if pd.api.types.is_numeric_dtype(processed[col].dtype):
            # Impute missing values with the median of the fitting column.
            fill_value = fit_frame[col].median()
            if pd.isna(fill_value):
                raise ValueError(f"column {col!r} has no observed values to impute from")
            processed[col] = processed[col].astype('float64').fillna(fill_value)
            continue

        # Categorical column: impute with the most frequent value.
        fit_values = fit_frame[col].astype(object)
        modes = fit_values.mode()  # sorted, so ties resolve to the smallest value
        if modes.empty:
            raise ValueError(f"column {col!r} has no observed values to impute from")
        filled = processed[col].astype(object).fillna(modes.iloc[0])

        # Encode categorical variables: codes follow the sorted categories of
        # the fitting column; values outside that set become -1.
        categories = sorted(fit_values.dropna().unique())
        codes = pd.Categorical(filled, categories=categories).codes
        processed[col] = codes.astype('int64')

    return processed

In [27]:
# Run the shared preprocessing over the training table and the test table.
# NOTE(review): each call learns its fill values and label codes from the
# frame it is given, so the integer for a given category is not guaranteed to
# match between train and test — confirm the encodings line up.
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Target is the `class` column; every remaining column is a feature.
y = train_data['class']
X = train_data.drop('class', axis=1)

# Hold out 20% of the rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [28]:
# Standardise features: statistics are learned on the training split only and
# then applied unchanged to the validation split and the test table.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
# Note: from here on `test_data` is the scaler's transformed output rather
# than the preprocessed DataFrame.
test_data = scaler.transform(test_data)

# Fit a 100-tree random forest with a fixed seed.
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train, y_train)

In [29]:
# Predict for the hold-out split and for the (already scaled) test table.
y_pred = clf.predict(X_val)
test_predictions = clf.predict(test_data)

# Report the Matthews correlation coefficient on the hold-out split.
mcc = matthews_corrcoef(y_true=y_val, y_pred=y_pred)
print(f'Matthews Correlation Coefficient: {mcc}')

Matthews Correlation Coefficient: 0.9826141794934214


In [30]:
# Assemble the submission table: one row per test prediction.
# NOTE(review): ids are generated as 0..n-1 in test row order — TODO confirm
# this matches the ids expected by the submission format.
submission = pd.DataFrame({'id': np.arange(len(test_predictions)), 'class': test_predictions})

# Convert numeric predictions back to letters: 0 -> 'e', anything else -> 'p'.
# NOTE(review): presumes the label encoding assigned 0 to 'e' — verify.
submission['class'] = np.where(submission['class'].eq(0), 'e', 'p')

submission.to_csv('submission.csv', index=False)
print(submission.head())

   id class
0   0     e
1   1     p
2   2     p
3   3     p
4   4     p
