# Test of Random Forest

In [14]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

In [15]:
# Read the labelled training data; the first CSV column is used as the index.
dataset = pd.read_csv('datasets/new_train.csv', index_col=0)
# Feature matrix: every column except 'label'.
X = dataset.drop(columns=['label']).values
# Target vector: the 'label' column.
y = dataset.loc[:, 'label'].values

In [16]:
# Standardize each feature column with a StandardScaler.
# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train/test split done in the following cell, so the held-out rows contribute
# to the scaling statistics. Confirm this is acceptable for the evaluation.
sc = StandardScaler().fit(X)
X = sc.transform(X)

In [17]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Baseline model: a 10-tree random forest with the entropy split criterion
# and a fixed seed, trained on the training split.
classifier = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    random_state=0,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [12]:
# Predict labels for the held-out split with the baseline forest.
y_pred = classifier.predict(X_test)
# Last expression of the cell: accuracy of those predictions against y_test.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9242122058236937

# Apply Grid Search to Random Forest

In [29]:
# Grid search over random-forest hyper-parameters, scored by accuracy with
# 10-fold cross-validation on the training split and run on all CPU cores.
# Only n_estimators varies in this grid; the other entries are single-valued.
parameters = {
    'max_features': [8],
    'n_estimators': np.arange(68, 73),  # 68, 69, 70, 71, 72
    'min_samples_leaf': [1],
    'max_depth': [15],
}
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
# fit() returns the search object itself, so no rebinding is needed.
grid_search.fit(X_train, y_train)
# Best mean cross-validated score, then the parameters that achieved it.
print(grid_search.best_score_, grid_search.best_params_, sep='\n')

0.9410038960425456
{'max_depth': 15, 'max_features': 8, 'min_samples_leaf': 1, 'n_estimators': 70}


0.8815086411588178
{'max_depth': 8, 'max_features': 8, 'min_samples_leaf': 10, 'n_estimators': 10}
0.8880415113760961
{'max_depth': 8, 'max_features': 8, 'min_samples_leaf': 5, 'n_estimators': 50}
0.9199086779561565
{'max_depth': 10, 'max_features': 8, 'min_samples_leaf': 3, 'n_estimators': 30}
0.9405052175145385
{'max_depth': 20, 'max_features': 8, 'min_samples_leaf': 1, 'n_estimators': 40}
0.940654818586634
{'max_depth': 20, 'max_features': 8, 'min_samples_leaf': 1, 'n_estimators': 50}
0.9410038960425456
{'max_depth': 15, 'max_features': 8, 'min_samples_leaf': 1, 'n_estimators': 70}
0.9410038960425456
{'max_depth': 15, 'max_features': 8, 'min_samples_leaf': 1, 'n_estimators': 70}

# Prediction on Test set

In [30]:
# Read the training and test CSVs for the final model; in both files the
# first column is used as the index.
dataset_train = pd.read_csv('datasets/new_train.csv', index_col=0)
dataset_test = pd.read_csv('datasets/new_test.csv', index_col=0)
# Rebinding X_train / y_train / X_test replaces the earlier 80/20 split:
# the model is now trained on every training row.
X_train = dataset_train.drop(columns=['label']).values
y_train = dataset_train.loc[:, 'label'].values
# All columns of the test file are used as features.
X_test = dataset_test.values

In [31]:
# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [33]:
# Final model, trained on the full training set with the hyper-parameters
# selected by the grid search above:
#   max_depth=15, max_features=8, min_samples_leaf=1, n_estimators=70.
# min_samples_leaf previously read 2 here, which matched neither the grid
# search result nor the estimator repr recorded as this cell's output.
classifier = RandomForestClassifier(
    n_estimators=70,
    max_depth=15,
    max_features=8,
    min_samples_leaf=1,
    criterion='entropy',
    random_state=0,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=15, max_features=8,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=70,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [34]:
# Predict a label for every row of the standardized test features using the
# final forest.
y_pred = classifier.predict(X=X_test)

In [35]:
# Wrap the predictions in a single-column DataFrame named 'label' and write
# it to CSV with the row index stored under the header 'Id'.
# NOTE(review): the index written here is the DataFrame's own fresh 0-based
# index, not the index read from new_test.csv. Confirm that this is the Id
# the submission format expects.
y_pred = pd.DataFrame(data=y_pred, columns=['label'])
y_pred.to_csv(
    "datasets/random_forest_submission.csv",
    index=True,
    index_label='Id',
)