In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler

# Column roles. 'id' is a row identifier and 'class' is the target; neither
# may be fed to the model as a feature.
ID_COL = 'id'
TARGET_COL = 'class'

# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Examine the structure and types of data in each column.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None").
train_df.info()
print(train_df.describe())

# Separate identifier, target and features. The raw frames are left intact so
# the cell can be re-run and the test ids stay available for the submission.
X = train_df.drop(columns=[ID_COL, TARGET_COL])
# NOTE(review): assumes test.csv carries the same 'id' column as train.csv and
# no 'class' column -- confirm against the data.
X_test = test_df.drop(columns=[ID_COL])

# Encode the target on its own: LabelEncoder operates on a 1-D label vector
# and cannot be used as a step inside a ColumnTransformer.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df[TARGET_COL])

# Feature groups, computed after the target/id have been removed so that
# 'class' is never one-hot encoded as an input.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()

# Handle missing values and encode features inside the pipeline, so every
# statistic (means, category vocabularies, scaling) is learned from training
# rows only -- both for the test set and within each cross-validation fold.
numeric_prep = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
])
categorical_prep = Pipeline([
    # Missing categories become their own explicit level.
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    # Categories that appear only at predict time are encoded as all-zeros
    # instead of raising.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
ct = ColumnTransformer([
    ('num', numeric_prep, numerical_cols),
    ('cat', categorical_prep, categorical_cols),
])

# Split the Training Data (stratified to keep the class balance in both parts)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Select a Model and Train
model = Pipeline([
    ('prep', ct),
    ('clf', LogisticRegression(max_iter=10000)),
])

# Evaluate the Model: 5-fold CV on the training part (cross_val_score clones
# the pipeline, so preprocessing is refit per fold) ...
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='matthews_corrcoef')
print("Matthews Correlation Coefficient: ", scores.mean())

# ... then a single fit checked against the held-out validation part.
model.fit(X_train, y_train)
val_mcc = matthews_corrcoef(y_val, model.predict(X_val))
print("Validation Matthews Correlation Coefficient: ", val_mcc)

# Make Predictions on the Test Set
test_pred = model.predict(X_test)

# Prepare the Submission File. Ids come from the untouched test frame and the
# integer predictions are mapped back through the fitted encoder rather than a
# hard-coded {0: 'e', 1: 'p'} table.
submission_df = pd.DataFrame({
    'id': test_df[ID_COL],
    'class': label_encoder.inverse_transform(test_pred),
})

# Save the Submission File
submission_df.to_csv('submission.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   class                 object 
 2   cap-diameter          float64
 3   cap-shape             object 
 4   cap-surface           object 
 5   cap-color             object 
 6   does-bruise-or-bleed  object 
 7   gill-attachment       object 
 8   gill-spacing          object 
 9   gill-color            object 
 10  stem-height           float64
 11  stem-width            float64
 12  stem-root             object 
 13  stem-surface          object 
 14  stem-color            object 
 15  veil-type             object 
 16  veil-color            object 
 17  has-ring              object 
 18  ring-type             object 
 19  spore-print-color     object 
 20  habitat               object 
 21  season                object 
dtypes: float64(3), int64(1), object(18)
memory