In [1]:
import joblib
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Process Dataset Breast Cancer

In [2]:
# Load the breast cancer dataset and keep only its DataFrame view, so the
# name `data_breast_cancer` is never bound to the intermediate Bunch object.
data_breast_cancer = load_breast_cancer(as_frame=True).frame

# Export the full frame (features + 'target') to CSV, including the row index.
# The path is relative to the working directory.
data_breast_cancer.to_csv('test/data_breast_cancer.csv', index=True)

In [3]:
# Features: every column except the label column 'target'.
X_breast_cancer = data_breast_cancer.drop(columns='target')
# Labels: the 'target' column on its own.
y_breast_cancer = data_breast_cancer['target']

# Reproducible split (fixed random_state) with test_size=0.2.
X_train, X_test, y_train, y_test = train_test_split(
    X_breast_cancer,
    y_breast_cancer,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Hyper-parameter candidates explored by the grid search. 'bootstrap' and
# 'oob_score' each have a single candidate (True), so only 'max_depth' and
# 'max_samples' actually vary; 'oob_score' is also set on the base estimator.
grid = {
    'max_depth': [2, 5],
    'bootstrap': [True],
    'oob_score': [True],
    'max_samples': [0.8, 0.9],
}

# Base estimator with a fixed seed so repeated runs give the same forest.
classifier = RandomForestClassifier(oob_score=True, random_state=42)

# Grid search with cv=5 on the training split; `fit` returns the fitted
# search object, so the best estimator can be read straight off the result.
grid_classifier = GridSearchCV(estimator=classifier, param_grid=grid, cv=5)
rf = grid_classifier.fit(X_train, y_train).best_estimator_

# Report the winning parameter combination and its out-of-bag score.
print('Parameters of best prediction model:', grid_classifier.best_params_, sep='\n')
print('OOB accuracy of prediction model:', rf.oob_score_, sep='\n')

Parameters of best prediction model:
{'bootstrap': True, 'max_depth': 5, 'max_samples': 0.9, 'oob_score': True}
OOB accuracy of prediction model:
0.9626373626373627


In [5]:
# Persist the selected random forest (`rf`) to disk with joblib.
# NOTE(review): 'breat' in the file name looks like a typo for 'breast'; it is
# left unchanged here because other code may load the model from this exact path.
filename_model = 'test/random_forest_breat_cancer.joblib'

# Open the target inside a context manager so the file handle is flushed and
# closed as soon as the dump finishes, instead of leaking an open handle for
# the lifetime of the kernel.
with open(filename_model, 'wb') as model_file:
    joblib.dump(rf, model_file)

# Process Dataset Iris

In [7]:
# Load the iris dataset and keep only its DataFrame view, so the name
# `data_iris` is never bound to the intermediate Bunch object.
data_iris = load_iris(as_frame=True).frame

# Export the full frame (features + 'target') to CSV, including the row index.
# The path is relative to the working directory.
data_iris.to_csv('test/data_iris.csv', index=True)

In [8]:
# Features: every column except the label column 'target'.
X_iris = data_iris.drop(columns='target')
# Labels: the 'target' column on its own.
y_iris = data_iris['target']

# Reproducible split (fixed random_state) with test_size=0.2.
# Note: this rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X_iris,
    y_iris,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Hyper-parameter candidates explored by the grid search. 'bootstrap' and
# 'oob_score' each have a single candidate (True), so only 'max_depth' and
# 'max_samples' actually vary; 'oob_score' is also set on the base estimator.
grid = {
    'max_depth': [2, 5],
    'bootstrap': [True],
    'oob_score': [True],
    'max_samples': [0.8, 0.9],
}

# Base estimator with a fixed seed so repeated runs give the same forest.
classifier = RandomForestClassifier(oob_score=True, random_state=42)

# Grid search with cv=5 on the training split; `fit` returns the fitted
# search object, so the best estimator can be read straight off the result.
grid_classifier = GridSearchCV(estimator=classifier, param_grid=grid, cv=5)
rf = grid_classifier.fit(X_train, y_train).best_estimator_

# Report the winning parameter combination and its out-of-bag score.
print('Parameters of best prediction model:', grid_classifier.best_params_, sep='\n')
print('OOB accuracy of prediction model:', rf.oob_score_, sep='\n')

Parameters of best prediction model:
{'bootstrap': True, 'max_depth': 2, 'max_samples': 0.8, 'oob_score': True}
OOB accuracy of prediction model:
0.9416666666666667


In [10]:
# Persist the selected random forest (`rf`) to disk with joblib.
filename_model = 'test/random_forest_iris.joblib'

# Open the target inside a context manager so the file handle is flushed and
# closed as soon as the dump finishes, instead of leaking an open handle for
# the lifetime of the kernel.
with open(filename_model, 'wb') as model_file:
    joblib.dump(rf, model_file)