In [1]:
import pandas as pd

In [2]:
df_cancer = pd.read_csv('cancer.csv', sep=';', index_col=0)
df_cancer.head()


Unnamed: 0,clump,ucellsize,ucellshape,mgadhesion,sepics,bnuclei,bchromatin,normnucl,mitoses,classe
1000025,5,1,1,1,2,1,3,1,1,2
1002945,5,4,4,5,7,10,3,2,1,2
1015425,3,1,1,1,2,2,3,1,1,2
1016277,6,8,8,1,3,4,3,7,1,2
1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
from sklearn.model_selection import train_test_split
X = df_cancer.iloc[:,:-1]
y = df_cancer['classe']
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.25,random_state=1,stratify=y)

In [4]:
#instanciation de la forêt
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [5]:
rf.fit(X_train,y_train)

RandomForestClassifier()

In [6]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [7]:
Y_pred = rf.predict(X_test)

In [8]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, Y_pred)
cm

array([[107,   4],
       [  3,  57]], dtype=int64)

In [9]:
#rapport de prédiction
from sklearn.metrics import classification_report
print(classification_report(y_test,Y_pred))

              precision    recall  f1-score   support

           2       0.97      0.96      0.97       111
           4       0.93      0.95      0.94        60

    accuracy                           0.96       171
   macro avg       0.95      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



In [10]:
rf.score(X_train, y_train)

1.0

In [11]:
rf.score(X,y)

0.9897510980966325

In [12]:
#importance des variables
impVarFirst={"Variable":df_cancer.columns[:-1],"Importance":rf.feature_importances_}
print(pd.DataFrame(impVarFirst).sort_values(by="Importance",ascending=False))

     Variable  Importance
2  ucellshape    0.246983
1   ucellsize    0.225153
5     bnuclei    0.224379
6  bchromatin    0.095446
7    normnucl    0.086121
4      sepics    0.058487
0       clump    0.039549
3  mgadhesion    0.018134
8     mitoses    0.005748


In [16]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'min_samples_split': [3, 5, 10], 
    'n_estimators' : [100, 300],
    'max_depth': [3, 5, 15],
    'max_features': [3, 5, 9]
}

grid = GridSearchCV(rf, param_grid, cv=10, scoring='accuracy')
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [3, 5, 15], 'max_features': [3, 5, 9],
                         'min_samples_split': [3, 5, 10],
                         'n_estimators': [100, 300]},
             scoring='accuracy')

In [17]:
grid.best_estimator_

RandomForestClassifier(max_depth=15, max_features=3, min_samples_split=3,
                       n_estimators=300)

In [18]:
grid.best_score_

0.9765460030165913

In [19]:
best_rf = grid.best_estimator_

In [20]:
Y_pred = best_rf.predict(X_test)

In [21]:
print(classification_report(y_test,Y_pred))

              precision    recall  f1-score   support

           2       0.99      0.95      0.97       111
           4       0.92      0.98      0.95        60

    accuracy                           0.96       171
   macro avg       0.96      0.97      0.96       171
weighted avg       0.97      0.96      0.97       171

