# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import confusion_matrix

# Onehot encoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier


In [2]:
train0 = pd.read_csv('/Users/merlesteffen/Documents/GitHub/MushroomClassification/Data/train.csv')
test0 = pd.read_csv('/Users/merlesteffen/Documents/GitHub/MushroomClassification/Data/test.csv')

In [3]:
train_id = train0.pop('Id')
test_id = test0.pop('Id')
target = train0.pop('poisonous')

# Split Data

In [4]:
X = train0.copy()
y = target.copy()


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

# Encode

In [6]:
cat_pipe = make_pipeline(
    OneHotEncoder(sparse_output=False),
    StandardScaler()
)

In [7]:
cat_pipe.fit(X_train)

In [20]:
# Transform the original data using the pipeline
X_train= cat_pipe.transform(X_train)
X_test = cat_pipe.transform(X_test)



ValueError: X has 42 features, but OneHotEncoder is expecting 6 features as input.

In [10]:
y = LabelEncoder().fit_transform(y)

In [11]:
clf = RandomForestClassifier(n_estimators=100, random_state=0)

In [12]:
clf.fit(X_train, y_train)

In [13]:
y_pred = clf.predict(X_test)

In [16]:
plt.figure(num=None, figsize=(10,8), dpi=80, facecolor='w', edgecolor='k')

# Extract feature importances from the classifier and match with column names
feat_importances = pd.Series(clf.feature_importances_, index= X.columns)

# Sort the values for better visualization
feat_importances = feat_importances.sort_values()

feat_importances.plot(kind='barh')
plt.title("Feature Importance")
plt.show()

ValueError: Length of values (42) does not match length of index (6)

<Figure size 800x640 with 0 Axes>

# Model Selection

In [None]:

# Linear Models
log_model = LogisticRegression()
ridge_classifier = RidgeClassifier()

# Support Vector Machines
linear_svc = LinearSVC()
svm_rbf = SVC(kernel='rbf')
svm_linear = SVC(kernel='linear')
svm_poly = SVC(kernel='poly')

# Tree-based Models
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()
gradient_boosting = GradientBoostingClassifier()
extra_trees = ExtraTreesClassifier()

# Neural Networks
nn_model = MLPClassifier(hidden_layer_sizes=(128, 128))

# Bayesian Models
gaussian_nb = GaussianNB()

# Nearest Neighbors
k_neighbors = KNeighborsClassifier()

# Discriminant Analysis
quadratic_discriminant = QuadraticDiscriminantAnalysis()

# Ensemble - For the sake of example, let's consider a Voting Classifier with two models: logistic regression and SVM with RBF kernel.
# Of course, you can replace these models with any other classifiers or add more classifiers to the ensemble.
voting_classifier = VotingClassifier(estimators=[('lr', log_model), ('svm_rbf', svm_rbf)], voting='hard')

# Now, all these models are initialized and can be trained using their respective 'fit' methods.

In [None]:
models = [log_model, ridge_classifier, linear_svc, svm_rbf, svm_linear, svm_poly, decision_tree, random_forest, gradient_boosting, extra_trees, nn_model, gaussian_nb, k_neighbors, quadratic_discriminant, voting_classifier]

# Train

In [None]:
np.sum(y) / len(y)

In [None]:
# Define a function to compute false negatives
def false_negatives(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return fn

# Train all models
for model in models:
    model.fit(X_train, y_train)

# Evaluate all models
for model in models:
    y_pred_model = model.predict(X_test)
    fn = false_negatives(y_test, y_pred_model)
    print(f"---{type(model).__name__} FN: {fn}")
    cm_log = confusion_matrix(y_test, y_pred_model)
    print(f"Confusion Matrix for {model}:\n", cm_log)


In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve

# For Logistic Regression
y_prob_log = log_model.predict_proba(X_test)[:,1] # probabilities of the positive class

# ROC
fpr_log, tpr_log, thresholds_roc = roc_curve(y_test, y_prob_log)
roc_auc_log = auc(fpr_log, tpr_log)

# PR Curve
precision_log, recall_log, thresholds_pr = precision_recall_curve(y_test, y_prob_log)

# You can then plot these metrics against each other to visualize the curves.

In [None]:
import matplotlib.pyplot as plt

# For Logistic Regression ROC Curve
fpr_log, tpr_log, _ = roc_curve(y_test, y_prob_log)
roc_auc_log = auc(fpr_log, tpr_log)

plt.figure()
plt.plot(fpr_log, tpr_log, color='darkorange', label=f'ROC curve (area = {roc_auc_log:.2f})')
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Logistic Regression')
plt.legend(loc="lower right")
plt.show()

# For Logistic Regression PR Curve
precision_log, recall_log, _ = precision_recall_curve(y_test, y_prob_log)

plt.figure()
plt.plot(recall_log, precision_log, color='darkorange')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('PR Curve for Logistic Regression')
plt.show()

In [None]:
print(f"---Logistic Regression: {log_model.score(X_test, y_test)}")
print(f"Support Vector Machine: {svm_model.score(X_test, y_test)}")
print(f"--------Neural Network: {nn_model.score(X_test, y_test)}")

# Choosing one model

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    'degree': [2, 3, 4]  # Only used when kernel is 'poly'
}

grid_search = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, scoring="recall")
grid_search.fit(X_train, y_train)

# Best parameters
print(grid_search.best_params_)


## Final model with parameters

# Visualisation

In [None]:
corr = train0.corr()

sns.heatmap(corr)

# Submit

In [None]:
X_transformed = cat_pipe.transform(X)

In [None]:
gaussian_nb.fit(X_transformed,y)

In [None]:
columns_order = ['cap.shape', 'cap.color', 'bruises', 'stalk.color.above.ring', 'stalk.color.below.ring', 'population']
test0 = test0[columns_order]
test0

In [None]:
test_transformed = cat_pipe.transform(test0)

In [None]:
y_predict = gaussian_nb.predict(test_transformed)

In [None]:
submission_file = pd.DataFrame({
    'Id':test_id,
    'poisonous': y_predict
})

In [None]:
submission_file.head()

In [None]:
submission_file.to_csv('/Users/merlesteffen/Documents/GitHub/MushroomClassification/Data/submissions/submission_merle_2.csv',index=False)