In [27]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
train = pd.read_csv("data/train.csv")



In [28]:
labels = train.loc[:,train.columns == 'outcome_group']
features = train.loc[:,train.columns != 'outcome_group']

In [3]:
train_features, test_features, train_labels, test_labels = train_test_split(features,labels, test_size=0.25, random_state = 42)

In [4]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (12909, 12)
Training Labels Shape: (12909, 1)
Testing Features Shape: (4303, 12)
Testing Labels Shape: (4303, 1)


In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
rf_base = RandomForestClassifier(random_state = 42, verbose=True)

In [30]:
# Number of trees in Random Forest
rf_n_estimators = [int(x) for x in np.linspace(200, 1000, 5)]
rf_n_estimators.append(1500)
rf_n_estimators.append(2000)

# Maximum number of levels in tree
rf_max_depth = [int(x) for x in np.linspace(5, 55, 11)]
# Add the default as a possible value
rf_max_depth.append(None)

# Number of features to consider at every split
rf_max_features = ['auto', 'sqrt', 'log2']

# Criterion to split on
rf_criterion = ['gini', 'entropy']

# Minimum number of samples required to split a node
rf_min_samples_split = [int(x) for x in np.linspace(2, 10, 9)]

# Minimum decrease in impurity required for split to happen
rf_min_impurity_decrease = [0.0, 0.05, 0.1]

# Method of selecting samples for training each tree
rf_bootstrap = [True, False]

# Create the grid
rf_grid = {'n_estimators': rf_n_estimators,
               'max_depth': rf_max_depth,
               'max_features': rf_max_features,
               'criterion': rf_criterion,
               'min_samples_split': rf_min_samples_split,
               'min_impurity_decrease': rf_min_impurity_decrease,
               'bootstrap': rf_bootstrap}
rf_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1500, 2000],
 'max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, None],
 'max_features': ['auto', 'sqrt', 'log2'],
 'criterion': ['gini', 'entropy'],
 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
 'min_impurity_decrease': [0.0, 0.05, 0.1],
 'bootstrap': [True, False]}

In [35]:
from sklearn.metrics import f1_score, make_scorer
scoring = {"f1": make_scorer(f1_score , average='macro')}
rf_random = RandomizedSearchCV(estimator= rf_base, param_distributions= rf_grid, scoring=scoring, refit="f1", n_iter =65,cv= 5,verbose = 3,random_state= 42, n_jobs=-1)

In [50]:
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(features)
rf_random.fit(X_std, labels.values.ravel())

Fitting 5 folds for each of 65 candidates, totalling 325 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1500 out of 1500 | elapsed:   35.2s finished


RandomizedSearchCV(cv=5,
                   estimator=RandomForestClassifier(random_state=42,
                                                    verbose=True),
                   n_iter=65, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': [5, 10, 15, 20, 25, 30, 35,
                                                      40, 45, 50, 55, None],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_impurity_decrease': [0.0, 0.05,
                                                                  0.1],
                                        'min_samples_split': [2, 3, 4, 5, 6, 7,
                                                              8, 9, 10],
                                        'n_estimators': [

In [51]:
rf_random.best_score_

{'n_estimators': 1500,
 'min_samples_split': 2,
 'min_impurity_decrease': 0.0,
 'max_features': 'auto',
 'max_depth': 20,
 'criterion': 'entropy',
 'bootstrap': False}

In [38]:
rf_best = RandomForestClassifier(n_estimators=1500, min_samples_split= 2,min_impurity_decrease=0.0, max_features='auto',max_depth=20, criterion= 'entropy', bootstrap=False, random_state=42)
rf_best.fit(X_std, labels.values.ravel())

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=20,
                       n_estimators=1500, random_state=42)

In [42]:
predictions = rf_best.predict(test_features.values)
predictions
from sklearn.metrics import f1_score
f1_score(test_labels,predictions,average='macro')

ValueError: could not convert string to float: '2020-04-07'

In [39]:
test = pd.read_csv("data/test.csv")
Y_std = StandardScaler().fit_transform(test.values)
predictions = rf_best.predict(Y_std)

In [40]:
preds = []
for i in predictions:
    n = int(i)
    preds.append(str(n))
preds

['2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '2',
 '1',
 '2',
 '1',
 '1',
 '2',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '2',
 '1',
 '1',
 '1',
 '1',
 '2',
 '1',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '2',
 '2',
 '1',
 '0',
 '2',
 '1',
 '2',
 '0',
 '1',
 '2',
 '2',
 '1',
 '1',
 '1',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '2',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '2',
 '1',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '2',
 '1',
 '1',
 '1',
 '1',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '2',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '2',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '2',
 '1',
 '1',
 '1',
 '2',
 '2',
 '1',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '2',
 '2'

In [41]:
import csv
def create_submission_file(y_preds, file_name):
    with open(file_name, 'w') as csvfile:
        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        wr.writerow(["Id", "Prediction"])
        for i, pred in enumerate(y_preds):
            wr.writerow([str(i), str(pred)])
create_submission_file(preds, 'submission.csv')