# Before proceeding first replace all 'placeholder' tokens with your local path

### Initialization

In [0]:
# Import the required dependencies
import numpy as np
import pandas as pd

Load both the training data and the test data.

In [0]:
# Load the training data
# training_data = pd.read_csv(r'PLACEHOLDER\train.csv') # Uncomment when using jupyter
training_data = pd.read_csv('train.csv') # Uncomment when using Google Colab

# Load the test data
# training_data = pd.read_csv(r'PLACEHOLDER\test.csv') # Uncomment when using jupyter
test_data = pd.read_csv('test.csv') # Uncomment when using Google Colab

Separate the training data into dependent and independent variables: 
1. where list x contains all the features
2. and y contains the target value

In [0]:
# Seperate the training data into two list
# Where list x contains all data for generating the results
x = training_data.iloc[:, 1:].values
# And list y contains the expected results
y = training_data.iloc[:, 0].values

Use the StandardScaler from sklearn for standardizing the train and test input data.

In [0]:
# Standardize the input data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(x)

Split the training data into training and testing data, using a seed for the random_state to assist with reproducability.

In [0]:
# Splitting the training data into training and testing data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=5)

## Comparing models


We import the models

In [0]:
# Importing models for comparison
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

models = [('Logistic Regression', LogisticRegression(max_iter=600)),
         ('KNN', KNeighborsClassifier()),
         ('Support vector machine', SVC()),
         ('Naive Bayes', GaussianNB()),
         ('Decision tree', DecisionTreeClassifier()),
         ('Random forest', RandomForestClassifier())
         ]

We use cross validation to determine the performance of each model

In [0]:
# Cross validation for comparing models with default parameters
from sklearn.model_selection import cross_val_score

for name,model in models:
    scores = cross_val_score(model, x, y, cv=10)
    print(name, scores.mean())

Logistic Regression 0.7376751773049646
KNN 0.7544716312056737
Support vector machine 0.7851297872340426
Naive Bayes 0.6059737588652483
Decision tree 0.7339411347517731
Random forest 0.7992602836879433



### Create the final model

Import the RandomForestClassifier model and set a number of hyperparameters.

These hyperparameters have been discovered with the help of CrossValidation.

In [0]:
# Import the model and set hyperparameters
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=1200, bootstrap=False, random_state=173, 
                                    min_samples_split=15, min_samples_leaf=1, max_features='sqrt',
                                   max_depth=44)

Train the model with the previously splitted training data, also provides the model with the expected result.

In [0]:
# Train the model on the training data
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=44, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=15,
                       min_weight_fraction_leaf=0.0, n_estimators=1200,
                       n_jobs=None, oob_score=False, random_state=173,
                       verbose=0, warm_start=False)

Using the trained model, predict the results for the previously splitted test data.

In [0]:
# Predict the test results
y_predictions = classifier.predict(x_test)

Calculate the test results using the confusion_matrix and accuracy_score from sklearn.

In [0]:
# Checking testing results with confusion matrix and accuracy
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_predictions)
print(cm)
# Calculate the accuracy
accuracy_score(y_test, y_predictions)

[[270  82]
 [ 75 324]]


0.7909454061251664

### Optimization

Hyperparameter tuning using Randomized Search Cross Validation

Providing the possible paramater combinations

In [0]:
from sklearn.model_selection import RandomizedSearchCV

# Num of trees in random forest
n_estimators = [int(x) for x in np.linspace(1, 2000, num=21)]
# Num of features to consider at every split
max_features = ['auto', 'sqrt']
# Max number of level in tre e
max_depth = [int(x) for x in np.linspace(1, 110, num=11)]
max_depth.append(None)
# Min number of samples required to split a node
min_samples_split=[2, 5, 10, 15, 100]
# Min number of samples reaquired at each leaf node
min_samples_leaf=[1, 2, 5, 10]
# Method of collecting samples for training each tree
bootstrap = [True, False]

# Create random grid
random_grid={'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'bootstrap': bootstrap,
            'random_state': [77, 173, 546]}

Calling the Randomized Search CV method with the provided paramaters

In [0]:
# Create a default random forest model
rf = RandomForestClassifier()
# Creates multiple estimators within the RandomizedSearchCV entity, all with different parameters
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, 
                               n_iter=300, cv=5, verbose=2, random_state=173, 
                               n_jobs=-1)

rf_random.fit(x_train, y_train)

At first we were using train_test_split to train the model. However,
doing that for the final model is not a good idea, since the model will miss out on valuable training data. For the final tune up, we decided to use the whole dataset for training. 

In [0]:
classifier.fit(x,y)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=44, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=15,
                       min_weight_fraction_leaf=0.0, n_estimators=1200,
                       n_jobs=None, oob_score=False, random_state=173,
                       verbose=0, warm_start=False)

### Use the model with the test data

Use the StandardScaler from sklearn for standardizing the test input data.

In [0]:
# Standardize the test data
test_data = sc.fit_transform(test_data)

Using the previously trained model to predict the result for the specified data.

In [0]:
# Predict the outcome of the test data
y_pred_prob = classifier.predict_proba(test_data)
y_pred_prob

array([[0.16612842, 0.83387158],
       [0.14186329, 0.85813671],
       [0.41122774, 0.58877226],
       ...,
       [0.06283969, 0.93716031],
       [0.30385915, 0.69614085],
       [0.72874247, 0.27125753]])

The predicted probability returns a 2 dimensional array, in which only the values of the second index are of importance.

Creates a new list containing solely the values of the second index.

In [0]:
# Create the predicted probability by taking the float data of index '1' from the y_pred_prob
predicted_probs = ["%f" % x[1] for x in y_pred_prob]

Create a pandas dataframe from the predicted probabilitys.

In [0]:
# Create a pandas dataframe including a header
test_df = pd.DataFrame({'MoleculeId': np.arange(1, len(predicted_probs)+1),
                        'PredictedProbability': predicted_probs})

Writes the created dataframe to a (new) file on your local drive.

In [0]:
# Write the resulting dataframe to the local drive
# test_df.to_csv(r'PLACEHOLDER\rf_predictions.csv', index=False) # Uncomment when using Jupyter
test_df.to_csv('rf_predictions.csv', index=False) # Uncomment when using Google Colab