In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


In [2]:
mushroom_data = pd.read_csv('/content/drive/MyDrive/house_princing_kaggle_data/mushroom_cleaned.csv')

In [3]:
mushroom_data

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
0,1372,2,2,10,3.807467,1545,11,1.804273,1
1,1461,2,2,10,3.807467,1557,11,1.804273,1
2,1371,2,2,10,3.612496,1566,11,1.804273,1
3,1261,6,2,10,3.787572,1566,11,1.804273,1
4,1305,6,2,10,3.711971,1464,11,0.943195,1
...,...,...,...,...,...,...,...,...,...
54030,73,5,3,2,0.887740,569,12,0.943195,1
54031,82,2,3,2,1.186164,490,12,0.943195,1
54032,82,5,3,2,0.915593,584,12,0.888450,1
54033,79,2,3,2,1.034963,491,12,0.888450,1


In [4]:
X = mushroom_data.drop('class', axis=1)
y = mushroom_data['class']

In [5]:
# verificando valores nulos
missing_values = X.isnull().sum()
missing_values

cap-diameter       0
cap-shape          0
gill-attachment    0
gill-color         0
stem-height        0
stem-width         0
stem-color         0
season             0
dtype: int64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
model = Pipeline([
    ('classifier', LogisticRegression(max_iter=1000,random_state=42))
])

In [9]:

model.fit(X_train, y_train)

In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [11]:
y_pred = model.predict(X_test)

In [12]:
acerto = accuracy_score(y_test, y_pred)

In [13]:
acerto

0.6431942259646526

In [15]:
confusion_matrix(y_test, y_pred)

array([[2705, 2167],
       [1689, 4246]])

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
verificacao_bestparametros = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'saga']
}

In [18]:
grid_search = GridSearchCV(model, verificacao_bestparametros, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [19]:
grid_search.best_params_

{'classifier__C': 10, 'classifier__solver': 'liblinear'}

In [20]:
model_update1 = grid_search.best_estimator_
y_pred2 = model_update1.predict(X_test)
accuracy_score(y_test, y_pred)

0.6431942259646526

In [21]:
#criando ensembles
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [22]:
random_forest = RandomForestClassifier(random_state=42)
svm = SVC(probability=True,random_state=42)

In [23]:
ensemble_model = VotingClassifier(estimators=[
    ('lr', model_update1.named_steps['classifier']),
    ('rf', random_forest),
    ('svc', svm)
], voting='soft')

In [24]:
ensemble_model.fit(X_train, y_train)

In [25]:
y_pred3 = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred3)
print("Acurácia do ensemble:", accuracy)

Acurácia do ensemble: 0.9724252799111687


In [26]:
matrix_confusion = confusion_matrix(y_test, y_pred3)

matrix_confusion

array([[4684,  188],
       [ 110, 5825]])

In [27]:
class_report = classification_report(y_test, y_pred3)

print(class_report)

              precision    recall  f1-score   support

           0       0.98      0.96      0.97      4872
           1       0.97      0.98      0.98      5935

    accuracy                           0.97     10807
   macro avg       0.97      0.97      0.97     10807
weighted avg       0.97      0.97      0.97     10807



In [28]:
from sklearn.metrics import mean_squared_error

In [29]:
ms = mean_squared_error(y_test, y_pred3)

In [30]:
rmse = ms**0.5


In [31]:
rmse

0.1660563762366002